diff --git a/CMakeLists.txt b/CMakeLists.txt index fcc50e8..895fb28 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -48,6 +48,10 @@ # -DENABLE_IBDIAGS_COMPAT=True (default False) # Include obsolete scripts. These scripts are replaced by C programs with # a different interface now. +# -DNO_MAN_PAGES=1 (default 0, build/install the man pages) +# Disable man pages. Allows rdma-core to be built and installed +# (without man pages) when neither pandoc/rst2man nor the pandoc-prebuilt +# directory are available. cmake_minimum_required(VERSION 2.8.11 FATAL_ERROR) project(rdma-core C) @@ -68,7 +72,7 @@ endif() set(PACKAGE_NAME "RDMA") # See Documentation/versioning.md -set(PACKAGE_VERSION "29.0") +set(PACKAGE_VERSION "32.0") # When this is changed the values in these files need changing too: # debian/control # debian/libibverbs1.symbols @@ -174,7 +178,15 @@ include(RDMA_DoFixup) include(publish_headers) include(rdma_functions) include(pyverbs_functions) -include(rdma_man) +if (NO_MAN_PAGES) + # define empty stub functions to omit man page processing + function(rdma_man_pages) + endfunction() + function(rdma_alias_man_pages) + endfunction() +else() + include(rdma_man) +endif() if (NOT DEFINED ENABLE_STATIC) set(ENABLE_STATIC "OFF" CACHE BOOL "Produce static linking libraries as well as shared libraries.") @@ -423,9 +435,12 @@ if (CYTHON_EXECUTABLE) string(STRIP ${py_path} CMAKE_PYTHON_SO_SUFFIX) endif() -# Look for pandoc and rst2man for making manual pages -FIND_PACKAGE(pandoc) -FIND_PACKAGE(rst2man) +set(NO_MAN_PAGES "OFF" CACHE BOOL "Disable build/install of man pages") +if (NOT NO_MAN_PAGES) + # Look for pandoc and rst2man for making manual pages + FIND_PACKAGE(pandoc) + FIND_PACKAGE(rst2man) +endif () #------------------------- # Find libraries @@ -442,6 +457,7 @@ if (ENABLE_RESOLVE_NEIGH) # FIXME use of pkgconfig is discouraged pkg_check_modules(NL libnl-3.0 libnl-route-3.0 REQUIRED) include_directories(${NL_INCLUDE_DIRS}) + link_directories(${NL_LIBRARY_DIRS}) set(NL_KIND 3) else() set(NL_KIND 0) @@ -559,6 +575,13 @@ if (NOT NL_KIND EQUAL 0) endif() RDMA_AddOptCFlag(CMAKE_C_FLAGS HAVE_C_WREDUNDANT_DECLS "-Wredundant-decls") +# Support of getrandom() was added to glibc in version 2.25 +CHECK_C_SOURCE_COMPILES(" + #include + int main(int argc,const char *argv[]) {char buf[64]; return getrandom(buf, 64, GRND_NONBLOCK);}" + HAVE_GLIBC_GETRANDOM) +RDMA_DoFixup("${HAVE_GLIBC_GETRANDOM}" "sys/random.h") + #------------------------- # Build Prep # Write out a git ignore file to the build directory if it isn't the source @@ -702,18 +725,22 @@ else() message(STATUS " netlink/route/link.h and net/if.h NOT co-includable (old headers)") endif() endif() -if (NOT PANDOC_FOUND) - if (NOT EXISTS "${CMAKE_SOURCE_DIR}/buildlib/pandoc-prebuilt") - message(STATUS " pandoc NOT found and NO prebuilt man pages. 'install' disabled") - else() - message(STATUS " pandoc NOT found (using prebuilt man pages)") +if (NO_MAN_PAGES) + message(STATUS " man pages NOT built") +else() + if (NOT PANDOC_FOUND) + if (NOT EXISTS "${CMAKE_SOURCE_DIR}/buildlib/pandoc-prebuilt") + message(STATUS " pandoc NOT found and NO prebuilt man pages. 'install' disabled") + else() + message(STATUS " pandoc NOT found (using prebuilt man pages)") + endif() endif() -endif() -if (NOT RST2MAN_FOUND) - if (NOT EXISTS "${CMAKE_SOURCE_DIR}/buildlib/pandoc-prebuilt") - message(STATUS " rst2man NOT found and NO prebuilt man pages. 
'install' disabled") - else() - message(STATUS " rst2man NOT found (using prebuilt man pages)") + if (NOT RST2MAN_FOUND) + if (NOT EXISTS "${CMAKE_SOURCE_DIR}/buildlib/pandoc-prebuilt") + message(STATUS " rst2man NOT found and NO prebuilt man pages. 'install' disabled") + else() + message(STATUS " rst2man NOT found (using prebuilt man pages)") + endif() endif() endif() if (NOT CYTHON_EXECUTABLE) diff --git a/Documentation/pyverbs.md b/Documentation/pyverbs.md index 3577171..0616d02 100755 --- a/Documentation/pyverbs.md +++ b/Documentation/pyverbs.md @@ -175,6 +175,8 @@ with d.Context(name='mlx5_0') as ctx: ##### Memory window The following example shows the equivalent of creating a type 1 memory window. It includes opening a device and allocating the necessary PD. +The user should unbind or close the memory window before being able to +deregister an MR that the MW is bound to. ```python import pyverbs.device as d from pyverbs.pd import PD @@ -533,12 +535,12 @@ import pyverbs.cm_enums as ce cap = QPCap(max_recv_wr=1) qp_init_attr = QPInitAttr(cap=cap) -server = '11.137.14.124' +addr = '11.137.14.124' port = '7471' # Passive side -sai = AddrInfo(server, port, ce.RDMA_PS_TCP, ce.RAI_PASSIVE) +sai = AddrInfo(src=addr, src_service=port, port_space=ce.RDMA_PS_TCP, flags=ce.RAI_PASSIVE) sid = CMID(creator=sai, qp_init_attr=qp_init_attr) sid.listen() # listen for incoming connection requests new_id = sid.get_request() # check if there are any connection requests @@ -546,9 +548,9 @@ new_id.accept() # new_id is connected to remote peer and ready to communicate # Active side -cai = AddrInfo(server, port, ce.RDMA_PS_TCP) +cai = AddrInfo(src=addr, dst=addr, dst_service=port, port_space=ce.RDMA_PS_TCP) cid = CMID(creator=cai, qp_init_attr=qp_init_attr) -cid.connect() # send connection request to server +cid.connect() # send connection request to passive addr ``` ##### ParentDomain @@ -611,3 +613,58 @@ rate_limit_inbox = (5).to_bytes(length=4, byteorder='big', signed=True) pp = Mlx5PP(ctx, rate_limit_inbox) pp.close() ``` + +##### MLX5 UAR +User Access Region (UAR) is part of PCI address space that is mapped for direct +access to the HCA from the CPU. +The UAR is needed for some device commands over the DevX interface. +The following code snippet demonstrates how to allocate and free an +mlx5dv_devx_uar. +```python +from pyverbs.providers.mlx5.mlx5dv import Mlx5UAR +from pyverbs.device import Context + +ctx = Context(name='rocep0s8f0') +uar = Mlx5UAR(ctx) +uar.close() +``` + +##### Import device, PD and MR +Importing a device, PD and MR enables processes to share their context and then +share PDs and MRs that is associated with. +A process creates a device and then uses some of the Linux systems calls to dup +its 'cmd_fd' member which lets other process to obtain ownership. +Once other process obtains the 'cmd_fd' it can import the device, then PD(s) and +MR(s) to share these objects. +Like in C, Pyverbs users are responsible for unimporting the imported objects +(which will also close the Pyverbs instance in our case) after they finish using +them, and they have to sync between the different processes in order to +coordinate the closure of the objects. +Unlike in C, closing the underlying objects is currently supported only via the +"original" object (meaning only by the process that creates them) and not via +the imported object. This limitation is made because currently there's no +reference or relation between different Pyverbs objects in different processes. 
+But it's doable and might be added in the future.
+Here is a demonstration of importing a device, PD and MR in one process.
+```python
+from pyverbs.device import Context
+from pyverbs.pd import PD
+from pyverbs.mr import MR
+import pyverbs.enums as e
+import os
+
+ctx = Context(name='ibp0s8f0')
+pd = PD(ctx)
+mr = MR(pd, 100, e.IBV_ACCESS_LOCAL_WRITE)
+cmd_fd_dup = os.dup(ctx.cmd_fd)
+imported_ctx = Context(cmd_fd=cmd_fd_dup)
+imported_pd = PD(imported_ctx, handle=pd.handle)
+imported_mr = MR(imported_pd, handle=mr.handle)
+# MRs can be created as usual on the imported PD
+secondary_mr = MR(imported_pd, 100, e.IBV_ACCESS_REMOTE_READ)
+# Must manually unimport the imported objects (which closes the objects and
+# frees other resources they use) before closing the "original" objects.
+# This prevents unexpected behaviours caused by the GC.
+imported_mr.unimport()
+imported_pd.unimport()
+```
diff --git a/Documentation/testing.md b/Documentation/testing.md
index 54e6c35..b1f3e1a 100644
--- a/Documentation/testing.md
+++ b/Documentation/testing.md
@@ -127,13 +127,6 @@ Ran 14 tests in 0.152s

 OK
 ```
-We're using 'parametrize' as it instantiates the TestCase for us.
-'parametrize' can accept arguments as well (device name, IB port, GID index and
-PKey index):
-```
-suite = unittest.TestSuite()
-suite.addTest(RDMATestCase.parametrize(YourTestCase, dev_name='devname'))
-```

 ## Writing Tests
 The following section explains how to add a new test, using tests/test_odp.py
diff --git a/buildlib/azure-pipelines-release.yml b/buildlib/azure-pipelines-release.yml
index fd5e4a1..e2c69ea 100644
--- a/buildlib/azure-pipelines-release.yml
+++ b/buildlib/azure-pipelines-release.yml
@@ -11,7 +11,7 @@ trigger:
 resources:
   containers:
     - container: azp
-      image: ucfconsort.azurecr.io/rdma-core/azure_pipelines:25.0
+      image: ucfconsort.azurecr.io/rdma-core/azure_pipelines:29.0
       endpoint: ucfconsort_registry

 stages:
@@ -30,7 +30,7 @@ stages:
       set -e
       mkdir build-pandoc artifacts
       cd build-pandoc
-      CC=gcc-9 cmake -GNinja ..
+      CC=gcc-10 cmake -GNinja ..
       ninja docs
       cd ..

diff --git a/buildlib/azure-pipelines.yml b/buildlib/azure-pipelines.yml
index 697d21d..f2a86b4 100644
--- a/buildlib/azure-pipelines.yml
+++ b/buildlib/azure-pipelines.yml
@@ -22,7 +22,7 @@ pr:
 resources:
   containers:
     - container: azp
-      image: ucfconsort.azurecr.io/rdma-core/azure_pipelines:28.0
+      image: ucfconsort.azurecr.io/rdma-core/azure_pipelines:29.0
       endpoint: ucfconsort_registry
     - container: centos6
       image: ucfconsort.azurecr.io/rdma-core/centos6:25.0
@@ -34,11 +34,14 @@ resources:
       image: ucfconsort.azurecr.io/rdma-core/centos8:25.0
       endpoint: ucfconsort_registry
     - container: fedora
-      image: ucfconsort.azurecr.io/rdma-core/fc31:25.0
+      image: ucfconsort.azurecr.io/rdma-core/fc32:31.0
       endpoint: ucfconsort_registry
     - container: xenial
       image: ucfconsort.azurecr.io/rdma-core/ubuntu-16.04:28.0
       endpoint: ucfconsort_registry
+    - container: bionic
+      image: ucfconsort.azurecr.io/rdma-core/ubuntu-18.04:29.0
+      endpoint: ucfconsort_registry
     - container: leap
       image: ucfconsort.azurecr.io/rdma-core/opensuse-15.0:25.0
       endpoint: ucfconsort_registry
@@ -61,18 +64,18 @@ stages:

     - bash: |
         set -e
-        mkdir build-gcc9
-        cd build-gcc9
-        CC=gcc-9 cmake -GNinja .. -DIOCTL_MODE=both -DENABLE_STATIC=1 -DENABLE_WERROR=1
+        mkdir build-gcc10
+        cd build-gcc10
+        CC=gcc-10 cmake -GNinja ..
-DIOCTL_MODE=both -DENABLE_STATIC=1 -DENABLE_WERROR=1 ninja - displayName: gcc 9.1 Compile + displayName: gcc 10.0 Compile - task: PythonScript@0 displayName: Check Build Script inputs: scriptPath: buildlib/check-build - arguments: --src .. --cc gcc-9 - workingDirectory: build-gcc9 + arguments: --src .. --cc gcc-10 + workingDirectory: build-gcc10 pythonInterpreter: /usr/bin/python3 # Run sparse on the subdirectories which are sparse clean @@ -95,17 +98,17 @@ stages: set -e mkdir build-clang cd build-clang - CC=clang-9 CFLAGS="-m32" cmake -GNinja .. -DIOCTL_MODE=both -DNO_PYVERBS=1 -DENABLE_WERROR=1 + CC=clang-10 CFLAGS="-m32" cmake -GNinja .. -DIOCTL_MODE=both -DNO_PYVERBS=1 -DENABLE_WERROR=1 ninja - displayName: clang 9.0 32-bit Compile + displayName: clang 10.0 32-bit Compile - bash: | set -e mv util/udma_barrier.h util/udma_barrier.h.old echo "#error Fail" >> util/udma_barrier.h - cd build-gcc9 + cd build-gcc10 rm CMakeCache.txt - CC=gcc-9 cmake -GNinja .. -DIOCTL_MODE=both -DENABLE_WERROR=1 + CC=gcc-10 cmake -GNinja .. -DIOCTL_MODE=both -DENABLE_WERROR=1 ninja mv ../util/udma_barrier.h.old ../util/udma_barrier.h displayName: Simulate non-coherent DMA Platform Compile @@ -114,27 +117,27 @@ stages: set -e mkdir build-arm64 cd build-arm64 - CC=aarch64-linux-gnu-gcc-8 cmake -GNinja .. -DIOCTL_MODE=both -DNO_PYVERBS=1 -DENABLE_WERROR=1 + CC=aarch64-linux-gnu-gcc-9 cmake -GNinja .. -DIOCTL_MODE=both -DNO_PYVERBS=1 -DENABLE_WERROR=1 ninja - displayName: gcc 8.3 ARM64 Compile + displayName: gcc 9.3 ARM64 Compile - bash: | set -e mkdir build-ppc64el cd build-ppc64el - CC=powerpc64le-linux-gnu-gcc-8 cmake -GNinja .. -DIOCTL_MODE=both -DNO_PYVERBS=1 -DENABLE_WERROR=1 + CC=powerpc64le-linux-gnu-gcc-9 cmake -GNinja .. -DIOCTL_MODE=both -DNO_PYVERBS=1 -DENABLE_WERROR=1 ninja - displayName: gcc 8.3 PPC64EL Compile + displayName: gcc 9.3 PPC64EL Compile - bash: | set -e sed -i -e 's/ninja \(.*\)-v/ninja \1/g' debian/rules - debian/rules CC=clang-9 EXTRA_CMAKE_FLAGS="-DCMAKE_BUILD_TYPE=Debug -DENABLE_WERROR=1" build - displayName: clang 9.0 Bionic Build + debian/rules CC=clang-10 EXTRA_CMAKE_FLAGS="-DCMAKE_BUILD_TYPE=Debug -DENABLE_WERROR=1" build + displayName: clang 10.0 Bionic Build - bash: | set -e fakeroot debian/rules binary - displayName: clang 9.0 Bionic .deb Build + displayName: clang 10.0 Bionic .deb Build - bash: | set -e lintian ../*.deb @@ -153,7 +156,7 @@ stages: set -e mkdir build-pandoc artifacts cd build-pandoc - CC=gcc-9 cmake -GNinja .. + CC=gcc-10 cmake -GNinja .. 
ninja docs cd ../artifacts # FIXME: Check Build.SourceBranch for tag consistency @@ -220,6 +223,8 @@ stages: matrix: xenial: CONTAINER: xenial + bionic: + CONTAINER: bionic container: $[ variables['CONTAINER'] ] steps: - checkout: none diff --git a/buildlib/cbuild b/buildlib/cbuild index 62f83a6..f9f8d2e 100755 --- a/buildlib/cbuild +++ b/buildlib/cbuild @@ -234,10 +234,10 @@ class centos8(Environment): " ".join(sorted(self.pkgs)))); return res; -class fc31(Environment): - docker_parent = "fedora:31"; +class fc32(Environment): + docker_parent = "fedora:32"; pkgs = centos8.pkgs - name = "fc31"; + name = "fc32"; specfile = "redhat/rdma-core.spec"; ninja_cmd = "ninja-build"; is_rpm = True; @@ -257,7 +257,7 @@ class APTEnvironment(Environment): build_python = True; def get_docker_file(self,tmpdir): res = DockerFile(self.docker_parent); - res.lines.append("RUN apt-get update; apt-get install -y --no-install-recommends %s && apt-get clean && rm -rf /usr/share/doc/ /usr/lib/debug /var/lib/apt/lists/"%( + res.lines.append("RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --no-install-recommends %s && apt-get clean && rm -rf /usr/share/doc/ /usr/lib/debug /var/lib/apt/lists/"%( " ".join(sorted(self.pkgs)))); return res; @@ -268,6 +268,15 @@ class APTEnvironment(Environment): with open(os.path.join(sld,name),"w") as F: F.write(content + "\n"); + def fix_https(self,tmpdir): + """The ubuntu image does not include ca-certificates, so if we want to use + HTTPS disable certificate validation.""" + cfgd = os.path.join(tmpdir,"etc","apt","apt.conf.d") + if not os.path.isdir(cfgd): + os.makedirs(cfgd) + with open(os.path.join(cfgd,"01nossl"),"w") as F: + F.write('Acquire::https { Verify-Peer "false"; };') + def add_ppa(self,tmpdir,srcline,keyid): gpgd = os.path.join(tmpdir,"etc","apt","trusted.gpg.d"); if not os.path.isdir(gpgd): @@ -324,6 +333,15 @@ class bionic(APTEnvironment): }; name = "ubuntu-18.04"; aliases = {"bionic", "ubuntu"}; + to_azp = True + +class focal(APTEnvironment): + docker_parent = "ubuntu:20.04" + pkgs = bionic.pkgs | { + 'dh-python', + } + name = "ubuntu-20.04"; + aliases = {"focal", "ubuntu"}; class jessie(APTEnvironment): docker_parent = "debian:8" @@ -398,21 +416,21 @@ class tumbleweed(ZypperEnvironment): # ------------------------------------------------------------------------- class azure_pipelines(APTEnvironment): - docker_parent = "ubuntu:18.04" + docker_parent = "ubuntu:20.04" pkgs = { "abi-compliance-checker", "abi-dumper", "ca-certificates", - "clang-9", + "clang-10", "cmake", "cython3", "debhelper", + "dh-python", "dh-systemd", "dpkg-dev", "fakeroot", - "gcc-9", + "gcc-10", "git", - "python2.7", "libc6-dev", "libnl-3-dev", "libnl-route-3-dev", @@ -432,7 +450,7 @@ class azure_pipelines(APTEnvironment): "valgrind", } | { # 32 bit build support - "libgcc-9-dev:i386", + "libgcc-10-dev:i386", "libc6-dev:i386", "libnl-3-dev:i386", "libnl-route-3-dev:i386", @@ -440,8 +458,8 @@ class azure_pipelines(APTEnvironment): "libudev-dev:i386", } | { # ARM 64 cross compiler - "gcc-8-aarch64-linux-gnu", - "libgcc-8-dev:arm64", + "gcc-9-aarch64-linux-gnu", + "libgcc-9-dev:arm64", "libc6-dev:arm64", "libnl-3-dev:arm64", "libnl-route-3-dev:arm64", @@ -449,8 +467,8 @@ class azure_pipelines(APTEnvironment): "libudev-dev:arm64", } | { # PPC 64 cross compiler - "gcc-8-powerpc64le-linux-gnu", - "libgcc-8-dev:ppc64el", + "gcc-9-powerpc64le-linux-gnu", + "libgcc-9-dev:ppc64el", "libc6-dev:ppc64el", "libnl-3-dev:ppc64el", "libnl-route-3-dev:ppc64el", @@ -462,23 +480,32 @@ class 
azure_pipelines(APTEnvironment): aliases = {"azp"} def get_docker_file(self,tmpdir): - res = bionic.get_docker_file(self,tmpdir); + res = focal.get_docker_file(self,tmpdir); + self.fix_https(tmpdir) self.add_ppa(tmpdir, - "deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu bionic main", + "deb http://ppa.launchpad.net/ubuntu-toolchain-r/test/ubuntu focal main", "60C317803A41BA51845E371A1E9377A2BA9EF27F"); self.add_ppa(tmpdir, - "deb [arch=amd64] http://apt.llvm.org/bionic/ llvm-toolchain-bionic-8 main", + "deb [arch=amd64] https://apt.llvm.org/focal/ llvm-toolchain-focal-10 main", "15CF4D18AF4F7421"); self.add_source_list(tmpdir,"arm64.list", - """deb [arch=arm64,ppc64el] http://ports.ubuntu.com/ bionic main universe -deb [arch=arm64,ppc64el] http://ports.ubuntu.com/ bionic-security main universe -deb [arch=arm64,ppc64el] http://ports.ubuntu.com/ bionic-updates main universe"""); + """deb [arch=arm64,ppc64el] http://ports.ubuntu.com/ focal main universe +deb [arch=arm64,ppc64el] http://ports.ubuntu.com/ focal-security main universe +deb [arch=arm64,ppc64el] http://ports.ubuntu.com/ focal-updates main universe"""); res.lines.insert(1,"ADD etc/ /etc/"); res.lines.insert(1,"RUN dpkg --add-architecture i386 &&" "dpkg --add-architecture ppc64el &&" "dpkg --add-architecture arm64 &&" "sed -i -e 's/^deb /deb [arch=amd64,i386] /g' /etc/apt/sources.list"); + + + # There is some bug in APT where it doesn't order the install + # properly. Probably related to multi-arch.. Resolve it by early + # installing these two packages. + res.lines[-1] = res.lines[-1].replace("update && DEBIAN_FRONTEND=noninteractive apt-get", + "update && apt-get install -y --no-install-recommends libgcc-s1:i386 libgcc-s1:ppc64el && DEBIAN_FRONTEND=noninteractive apt-get") + return res; # ------------------------------------------------------------------------- @@ -491,9 +518,10 @@ environments = [centos6(), amazonlinux2(), xenial(), bionic(), + focal(), jessie(), stretch(), - fc31(), + fc32(), leap(), tumbleweed(), debian_experimental(), diff --git a/buildlib/centos6.spec b/buildlib/centos6.spec index 199930b..d8b69e6 100644 --- a/buildlib/centos6.spec +++ b/buildlib/centos6.spec @@ -1,5 +1,5 @@ Name: rdma-core -Version: 29.0 +Version: 32.0 Release: 1%{?dist} Summary: RDMA core userspace libraries and daemons diff --git a/buildlib/check-build b/buildlib/check-build index ab8524e..4e52d0d 100755 --- a/buildlib/check-build +++ b/buildlib/check-build @@ -84,7 +84,7 @@ def get_symbol_vers(fn,exported=True): def check_lib_symver(args,fn): g = re.match(r"lib([^.]+)\.so\.(\d+)\.(\d+)\.(.*)",fn); if g.group(4) != args.PACKAGE_VERSION: - raise ValueError("Shared Library filename %r does not have the package version %r (%r)%"( + raise ValueError("Shared Library filename %r does not have the package version %r (%r)"%( fn,args.PACKAGE_VERSION,g.groups())); # umad/etc used the wrong symbol version name when they moved to soname 3.0 diff --git a/buildlib/config.h.in b/buildlib/config.h.in index 5f42d65..e22e136 100644 --- a/buildlib/config.h.in +++ b/buildlib/config.h.in @@ -19,8 +19,8 @@ #define RS_CONF_DIR "@CMAKE_INSTALL_FULL_SYSCONFDIR@/rdma/rsocket" #define IWPM_CONFIG_FILE "@CMAKE_INSTALL_FULL_SYSCONFDIR@/iwpmd.conf" -#define SRP_DEAMON_CONFIG_FILE "@CMAKE_INSTALL_FULL_SYSCONFDIR@/srp_daemon.conf" -#define SRP_DEAMON_LOCK_PREFIX "@CMAKE_INSTALL_FULL_RUNDIR@/srp_daemon" +#define SRP_DAEMON_CONFIG_FILE "@CMAKE_INSTALL_FULL_SYSCONFDIR@/srp_daemon.conf" +#define SRP_DAEMON_LOCK_PREFIX "@CMAKE_INSTALL_FULL_RUNDIR@/srp_daemon" 
#define ACM_CONF_DIR "@CMAKE_INSTALL_FULL_SYSCONFDIR@/rdma"
 #define IBACM_LIB_PATH "@ACM_PROVIDER_DIR@"
diff --git a/buildlib/fixup-include/sys-random.h b/buildlib/fixup-include/sys-random.h
new file mode 100644
index 0000000..47b0b81
--- /dev/null
+++ b/buildlib/fixup-include/sys-random.h
@@ -0,0 +1,13 @@
+#ifndef _FIXUP_SYS_RANDOM_H
+#define _FIXUP_SYS_RANDOM_H
+
+#include <sys/types.h>
+
+/* Flags for use with getrandom. */
+#define GRND_NONBLOCK 0x01
+
+static inline ssize_t getrandom(void *buf, size_t buflen, unsigned int flags)
+{
+	return -1;
+}
+#endif
diff --git a/buildlib/gen-sparse.py b/buildlib/gen-sparse.py
index 3b8c77e..be5056b 100755
--- a/buildlib/gen-sparse.py
+++ b/buildlib/gen-sparse.py
@@ -70,6 +70,8 @@ def get_buildlib_patches(dfn):
     def add_to_dict(d,lst):
         for I in lst:
             nh = norm_header(I)
+            if nh is None:
+                continue;
             assert nh not in d
             d[nh] = (I, find_system_header(args,nh))
diff --git a/buildlib/pandoc-prebuilt/00b1d0691cdea71ca370160f85854622bfef1e92 b/buildlib/pandoc-prebuilt/00b1d0691cdea71ca370160f85854622bfef1e92
index 9378a25..7686465 100644
--- a/buildlib/pandoc-prebuilt/00b1d0691cdea71ca370160f85854622bfef1e92
+++ b/buildlib/pandoc-prebuilt/00b1d0691cdea71ca370160f85854622bfef1e92
@@ -1,4 +1,4 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
 .TH "ibv_flow_action_esp" "3" "" "" ""
 .hy
@@ -9,147 +9,132 @@ ibv_flow_action_esp \- Flow action esp for verbs
 .IP
 .nf
 \f[C]
-#include\ <infiniband/verbs.h>
+#include <infiniband/verbs.h>

-struct\ ibv_flow_action\ *
-ibv_create_flow_action_esp(struct\ ibv_context\ *ctx,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_flow_action_esp\ *esp);
+struct ibv_flow_action *
+ibv_create_flow_action_esp(struct ibv_context *ctx,
+               struct ibv_flow_action_esp *esp);

 int
-ibv_modify_flow_action_esp(struct\ ibv_flow_action\ *action,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_flow_action_esp\ *esp);
+ibv_modify_flow_action_esp(struct ibv_flow_action *action,
+               struct ibv_flow_action_esp *esp);

-int\ ibv_destroy_flow_action(struct\ ibv_flow_action\ *action);
-\f[]
+int ibv_destroy_flow_action(struct ibv_flow_action *action);
+\f[R]
 .fi
 .SH DESCRIPTION
 .PP
 An IPSEC ESP flow steering action allows a flow steering rule to
 decrypt or encrypt a packet after matching.
 Each action contains the necessary information for this operation in the
-\f[I]params\f[] argument.
+\f[I]params\f[R] argument.
 .PP
 After the crypto operation the packet will continue to be processed by
 flow steering rules until it reaches a final action of discard or
 delivery.
 .PP
 After the action is created, then it should be associated with a
-\f[I]struct ibv_flow_attr\f[] using \f[I]struct
-ibv_flow_spec_action_handle\f[] flow specification.
+\f[I]struct ibv_flow_attr\f[R] using \f[I]struct
+ibv_flow_spec_action_handle\f[R] flow specification.
 Each action can be associated with multiple flows, and
-\f[I]ibv_modify_flow_action_esp\f[] will alter all associated flows
+\f[I]ibv_modify_flow_action_esp\f[R] will alter all associated flows
 simultaneously.
 .SH ARGUMENTS
 .TP
-.B \f[I]ctx\f[]
+.B \f[I]ctx\f[R]
 RDMA device context to create the action on.
-.RS
-.RE
 .TP
-.B \f[I]esp\f[]
+.B \f[I]esp\f[R]
 ESP parameters and key material for the action.
-.RS
-.RE
 .TP
-.B \f[I]action\f[]
+.B \f[I]action\f[R]
 Existing action to modify ESP parameters.
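The fixup header above stubs getrandom() to always fail, so code built against it needs a runtime fallback. A minimal sketch, not part of the patch and with a hypothetical helper name, of how a caller can pair getrandom() with a /dev/urandom fallback so the same code works on glibc both before and after 2.25:

```c
/* Sketch only: pairs getrandom() (real on glibc >= 2.25, stubbed to -1 by
 * the fixup header above on older glibc) with a /dev/urandom fallback. */
#include <sys/random.h> /* or the buildlib fixup-include shown above */
#include <fcntl.h>
#include <unistd.h>

static ssize_t get_random_bytes(void *buf, size_t len)
{
	ssize_t n = getrandom(buf, len, GRND_NONBLOCK);

	if (n == (ssize_t)len)
		return n;

	/* getrandom() unavailable (fixup stub) or short read: fall back. */
	int fd = open("/dev/urandom", O_RDONLY);
	if (fd < 0)
		return -1;
	n = read(fd, buf, len);
	close(fd);
	return n;
}
```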
-.RS -.RE -.SS \f[I]action\f[] Argument +.SS \f[I]action\f[R] Argument .IP .nf \f[C] -struct\ ibv_flow_action_esp\ { -\ \ \ \ struct\ ibv_flow_action_esp_attr\ *esp_attr; +struct ibv_flow_action_esp { + struct ibv_flow_action_esp_attr *esp_attr; -\ \ \ \ /*\ See\ Key\ Material\ */ -\ \ \ \ uint16_t\ \ \ \ \ \ \ \ keymat_proto; -\ \ \ \ uint16_t\ \ \ \ \ \ \ \ keymat_len; -\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ *keymat_ptr; + /* See Key Material */ + uint16_t keymat_proto; + uint16_t keymat_len; + void *keymat_ptr; -\ \ \ \ /*\ See\ Replay\ Protection\ */ -\ \ \ \ uint16_t\ \ \ \ \ \ \ \ replay_proto; -\ \ \ \ uint16_t\ \ \ \ \ \ \ \ replay_len; -\ \ \ \ void\ \ \ \ \ \ \ \ \ \ \ *replay_ptr; + /* See Replay Protection */ + uint16_t replay_proto; + uint16_t replay_len; + void *replay_ptr; -\ \ \ \ struct\ ibv_flow_action_esp_encap\ *esp_encap; + struct ibv_flow_action_esp_encap *esp_encap; -\ \ \ \ uint32_t\ \ \ \ \ \ \ \ comp_mask; -\ \ \ \ uint32_t\ \ \ \ \ \ \ \ esn; + uint32_t comp_mask; + uint32_t esn; }; -\f[] +\f[R] .fi .TP -.B \f[I]comp_mask\f[] +.B \f[I]comp_mask\f[R] Bitmask specifying what fields in the structure are valid. -.RS -.RE .TP -.B \f[I]esn\f[] +.B \f[I]esn\f[R] The starting value of the ESP extended sequence number. -Valid only if \f[I]IBV_FLOW_ACTION_ESP_MASK_ESN\f[] is set in -\f[I]comp_mask\f[]. +Valid only if \f[I]IBV_FLOW_ACTION_ESP_MASK_ESN\f[R] is set in +\f[I]comp_mask\f[R]. .RS .PP -The 32 bits of \f[I]esn\f[] will be used to compute the full 64 bit ESN +The 32 bits of \f[I]esn\f[R] will be used to compute the full 64 bit ESN required for the AAD construction. .PP -When in \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[] mode, the -implementation will automatically track rollover of the lower 32 bits of -the ESN. -However, an update of the window is required once every 2^31 sequences. +When in \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[R] mode, +the implementation will automatically track rollover of the lower 32 +bits of the ESN. +However, an update of the window is required once every 2\[ha]31 +sequences. .PP -When in \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[] mode this +When in \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[R] mode this value is automatically incremended and it is also used for anti\-replay checks. .RE .TP -.B \f[I]esp_attr\f[] -See \f[I]ESP Attributes\f[]. +.B \f[I]esp_attr\f[R] +See \f[I]ESP Attributes\f[R]. May be NULL on modify. -.RS -.RE .TP -.B \f[I]keymat_proto\f[], \f[I]keymat_len\f[], \f[I]keymat_ptr\f[] +.B \f[I]keymat_proto\f[R], \f[I]keymat_len\f[R], \f[I]keymat_ptr\f[R] Describe the key material and encryption standard to use. May be NULL on modify. -.RS -.RE .TP -.B \f[I]replay_proto\f[], \f[I]replay_len\f[], \f[I]replay_ptr\f[] +.B \f[I]replay_proto\f[R], \f[I]replay_len\f[R], \f[I]replay_ptr\f[R] Describe the replay protection scheme used to manage sequence numbers and prevent replay attacks. This field is only valid in full offload mode. May be NULL on modify. -.RS -.RE .TP -.B \f[I]esp_encap\f[] +.B \f[I]esp_encap\f[R] Describe the encapsulation of ESP packets such as the IP tunnel and/or UDP encapsulation. This field is only valid in full offload mode. May be NULL on modify. 
-.RS -.RE .SS ESP attributes .IP .nf \f[C] -struct\ ibv_flow_action_esp_attr\ { -\ \ \ \ uint32_t\ \ \ spi; -\ \ \ \ uint32_t\ \ \ seq; -\ \ \ \ uint32_t\ \ \ tfc_pad; -\ \ \ \ uint32_t\ \ \ flags; -\ \ \ \ uint64_t\ \ \ hard_limit_pkts; +struct ibv_flow_action_esp_attr { + uint32_t spi; + uint32_t seq; + uint32_t tfc_pad; + uint32_t flags; + uint64_t hard_limit_pkts; }; -\f[] +\f[R] .fi .TP -.B \f[I]flags\f[] -A bitwise OR of the various \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS\f[] +.B \f[I]flags\f[R] +A bitwise OR of the various \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS\f[R] described below. .RS .TP -.B \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_DECRYPT\f[], \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT\f[] +.B \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_DECRYPT\f[R], \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ENCRYPT\f[R] The action will decrypt or encrypt a packet using the provided keying material. .RS @@ -161,13 +146,13 @@ steering rule. .RE .SS Full Offload Mode .PP -When \f[I]esp_attr\f[] flag -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[] is set the ESP +When \f[I]esp_attr\f[R] flag +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[R] is set the ESP header and trailer are added and removed automatically during the cipher operation. -In this case the \f[I]esn\f[] and \f[I]spi\f[] are used to populate and -check the ESP header, and any information from the \f[I]keymat\f[] (eg a -IV) is placed in the headers and otherwise handled automatically. +In this case the \f[I]esn\f[R] and \f[I]spi\f[R] are used to populate +and check the ESP header, and any information from the \f[I]keymat\f[R] +(eg a IV) is placed in the headers and otherwise handled automatically. .PP For decrypt the hardware will perform anti\-replay. .PP @@ -179,52 +164,44 @@ packets protected by the SA defined in this action. The following members of the esp_attr are used only in full offload mode: .TP -.B \f[I]spi\f[] +.B \f[I]spi\f[R] The value for the ESP Security Parameters Index. It is only used for -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLAOD\f[]. -.RS -.RE +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLAOD\f[R]. .TP -.B \f[I]seq\f[] +.B \f[I]seq\f[R] The initial 32 lower bytes of the sequence number. This is the value of the ESP sequence number. It is only used for -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLAOD\f[]. -.RS -.RE +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLAOD\f[R]. .TP -.B \f[I]tfc_pad\f[] +.B \f[I]tfc_pad\f[R] The length of Traffic Flow Confidentiality Padding as specified by RFC4303. If it is set to zero no additional padding is added. It is only used for -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLAOD\f[]. -.RS -.RE +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLAOD\f[R]. .TP -.B \f[I]hard_limit_pkts\f[] +.B \f[I]hard_limit_pkts\f[R] The hard lifetime of the SA measured in number of packets. As specified by RFC4301. After this limit is reached the action will drop future packets to prevent breaking the crypto. It is only used for -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLAOD\f[]. -.RS -.RE +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLAOD\f[R]. .SS Inline Crypto Mode .PP -When \f[I]esp_attr\f[] flag -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[] is set the user +When \f[I]esp_attr\f[R] flag +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[R] is set the user must providate packets with additional headers. .PP For encrypt the packet must contain a fully populated IPSEC packet except the data payload is left un\-encrypted and there is no IPsec trailer. 
If the IV must be unpredictable, then a flag should indicate the -transofrmation such as \f[I]IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ\f[]. +transofrmation such as \f[I]IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ\f[R]. .PP -\f[I]IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ\f[] means that the IV is +\f[I]IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ\f[R] means that the IV is incremented sequentually. If the IV algorithm is supported by HW, then it could provide support for LSO offload with ESP inline crypto. @@ -236,12 +213,12 @@ padding is added and the ICV is added as well. For decrypt the packet is authenticated and decrypted in\-place, resulting in a decrypted IPSEC packet with no trailer. The result of decryption and authentication can be retrieved from an -extended CQ via the \f[I]ibv_wc_read_XXX(3)\f[] function. +extended CQ via the \f[I]ibv_wc_read_XXX(3)\f[R] function. .PP This mode must be combined with the flow steering including -\f[I]IBV_FLOW_SPEC_IPV4\f[] and \f[I]IBV_FLOW_SPEC_ESP\f[] to match the -outer packet headers to ensure that the action is only applied to IPSEC -packets with the correct identifiers. +\f[I]IBV_FLOW_SPEC_IPV4\f[R] and \f[I]IBV_FLOW_SPEC_ESP\f[R] to match +the outer packet headers to ensure that the action is only applied to +IPSEC packets with the correct identifiers. .PP For inline crypto, we have some special requirements to maintain a stateless ESN while maintaining the same parameters as software. @@ -249,116 +226,112 @@ The system supports offloading a portion of the IPSEC flow, enabling a single flow to be split between multiple NICs. .SS Determining the ESN for Ingress Packets .PP -We require a "modify" command once every 2^31 packets. +We require a \[lq]modify\[rq] command once every 2\[ha]31 packets. This modify command allows the implementation in HW to be stateless, as follows: .IP .nf \f[C] -\ \ \ \ \ \ \ \ \ \ \ ESN\ 1\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ESN\ 2\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ ESN\ 3 + ESN 1 ESN 2 ESN 3 |\-\-\-\-\-\-\-\-\-\-\-\-\-*\-\-\-\-\-\-\-\-\-\-\-\-\-|\-\-\-\-\-\-\-\-\-\-\-\-\-*\-\-\-\-\-\-\-\-\-\-\-\-\-|\-\-\-\-\-\-\-\-\-\-\-\-\-* -^\ \ \ \ \ \ \ \ \ \ \ \ \ ^\ \ \ \ \ \ \ \ \ \ \ \ \ ^\ \ \ \ \ \ \ \ \ \ \ \ \ ^\ \ \ \ \ \ \ \ \ \ \ \ \ ^\ \ \ \ \ \ \ \ \ \ \ \ \ ^ -\f[] +\[ha] \[ha] \[ha] \[ha] \[ha] \[ha] +\f[R] .fi .PP -^ \- marks where command invoked to update the SA ESN state machine. +\[ha] \- marks where command invoked to update the SA ESN state machine. .PD 0 .P .PD -| \- marks the start of the ESN scope (0\-2^32\-1). -At this point move SA ESN "new_window" bit to zero and increment ESN. +| \- marks the start of the ESN scope (0\-2\[ha]32\-1). +At this point move SA ESN \[lq]new_window\[rq] bit to zero and increment +ESN. .PD 0 .P .PD -* \- marks the middle of the ESN scope (2^31). -At this point move SA ESN "new_window" bit to one. +* \- marks the middle of the ESN scope (2\[ha]31). +At this point move SA ESN \[lq]new_window\[rq] bit to one. 
.PP For decryption the implementation uses the following state machine to determine ESN: .IP .nf \f[C] -if\ (!overlap)\ { -\ \ \ \ use\ esn\ //\ regardless\ of\ packet.seq -}\ else\ {\ //\ new_window -\ \ \ \ if\ (packet.seq\ >=\ 2^31) -\ \ \ \ \ \ \ \ use\ esn -\ \ \ \ else\ //\ packet.seq\ <\ 2^31 -\ \ \ \ \ \ \ \ use\ esn+1 +if (!overlap) { + use esn // regardless of packet.seq +} else { // new_window + if (packet.seq >= 2\[ha]31) + use esn + else // packet.seq < 2\[ha]31 + use esn+1 } -\f[] +\f[R] .fi .PP -This mechanism is controlled by the \f[I]esp_attr\f[] flag: +This mechanism is controlled by the \f[I]esp_attr\f[R] flag: .TP -.B \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW\f[] +.B \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_ESN_NEW_WINDOW\f[R] This flag is only used to provide stateless ESN support for inline crypto. It is used only for -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[] and -\f[I]IBV_FLOW_ACTION_ESP_MASK_ESN\f[]. +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[R] and +\f[I]IBV_FLOW_ACTION_ESP_MASK_ESN\f[R]. .RS .PP Setting this flag indicates that the bottom of the replay window is -between 2^31 \- 2^32. +between 2\[ha]31 \- 2\[ha]32. .RE -.SS Key Material for AES GCM (\f[I]IBV_ACTION_ESP_KEYMAT_AES_GCM\f[]) +.SS Key Material for AES GCM (\f[I]IBV_ACTION_ESP_KEYMAT_AES_GCM\f[R]) .PP The AES GCM crypto algorithm as defined by RFC4106. -This struct is to be provided in \f[I]keymat_ptr\f[] when -\f[I]keymat_proto\f[] is set to \f[I]IBV_ACTION_ESP_KEYMAT_AES_GCM\f[]. +This struct is to be provided in \f[I]keymat_ptr\f[R] when +\f[I]keymat_proto\f[R] is set to +\f[I]IBV_ACTION_ESP_KEYMAT_AES_GCM\f[R]. .IP .nf \f[C] -struct\ ibv_flow_action_esp_aes_keymat_aes_gcm\ { -\ \ \ \ uint64_t\ \ \ iv; -\ \ \ \ uint32_t\ \ \ iv_algo;\ /*\ Use\ enum\ ib_uverbs_flow_action_esp_aes_gcm_keymat_iv_algo\ */ +struct ibv_flow_action_esp_aes_keymat_aes_gcm { + uint64_t iv; + uint32_t iv_algo; /* Use enum ib_uverbs_flow_action_esp_aes_gcm_keymat_iv_algo */ -\ \ \ \ uint32_t\ \ \ salt; -\ \ \ \ uint32_t\ \ \ icv_len; + uint32_t salt; + uint32_t icv_len; -\ \ \ \ uint32_t\ \ \ key_len; -\ \ \ \ uint32_t\ \ \ aes_key[256\ /\ 32]; + uint32_t key_len; + uint32_t aes_key[256 / 32]; }; -\f[] +\f[R] .fi .TP -.B \f[I]iv\f[] +.B \f[I]iv\f[R] The starting value for the initialization vector used only with -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[] encryption as +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[R] encryption as defined in RFC4106. This field is ignored for -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[]. +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_INLINE_CRYPTO\f[R]. .RS .PP For a given key, the IV MUST NOT be reused. .RE .TP -.B \f[I]iv_algo\f[] +.B \f[I]iv_algo\f[R] The algorithm used to transform/generate new IVs with -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[] encryption. +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[R] encryption. .RS .PP -The only supported value is \f[I]IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ\f[] +The only supported value is \f[I]IB_UVERBS_FLOW_ACTION_IV_ALGO_SEQ\f[R] to generate sequantial IVs. .RE .TP -.B \f[I]salt\f[] +.B \f[I]salt\f[R] The salt as defined by RFC4106. -.RS -.RE .TP -.B \f[I]icv_len\f[] +.B \f[I]icv_len\f[R] The length of the Integrity Check Value in bytes as defined by RFC4106. -.RS -.RE .TP -.B \f[I]aes_key\f[], \f[I]key_len\f[] +.B \f[I]aes_key\f[R], \f[I]key_len\f[R] The cipher key data. It must be either 16, 24 or 32 bytes as defined by RFC4106. 
-.RS -.RE -.SS Bitmap Replay Protection (\f[I]IBV_FLOW_ACTION_ESP_REPLAY_BMP\f[]) +.SS Bitmap Replay Protection (\f[I]IBV_FLOW_ACTION_ESP_REPLAY_BMP\f[R]) .PP A shifting bitmap is used to identify which packets have already been transmitted. @@ -366,20 +339,22 @@ Each bit in the bitmap represents a packet, it is set if a packet with this ESP sequence number has been received and it passed authentication. If a packet with the same sequence is received, then the bit is already set, causing replay protection to drop the packet. -The bitmap represents a window of \f[I]size\f[] sequence numbers. +The bitmap represents a window of \f[I]size\f[R] sequence numbers. If a newer sequence number is received, then the bitmap will shift to represent this as in RFC6479. -The replay window cannot shift more than 2^31 sequence numbers forward. +The replay window cannot shift more than 2\[ha]31 sequence numbers +forward. .PP -This struct is to be provided in \f[I]replay_ptr\f[] when -\f[I]reply_proto\f[] is set to \f[I]IBV_FLOW_ACTION_ESP_REPLAY_BMP\f[]. +This struct is to be provided in \f[I]replay_ptr\f[R] when +\f[I]reply_proto\f[R] is set to +\f[I]IBV_FLOW_ACTION_ESP_REPLAY_BMP\f[R]. In this mode reply_ptr and reply_len should point to a struct -ibv_flow_action_esp_replay_bmp containing: \f[I]size\f[] : The size of +ibv_flow_action_esp_replay_bmp containing: \f[I]size\f[R] : The size of the bitmap. .SS ESP Encapsulation .PP -An \f[I]esp_encap\f[] specification is required when \f[I]eps_attr\f[] -flags \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TUNNEL\f[] is set. +An \f[I]esp_encap\f[R] specification is required when \f[I]eps_attr\f[R] +flags \f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_TUNNEL\f[R] is set. It is used to provide the fields for the encapsulation header that is added/removed to/from packets. Tunnel and Transport mode are defined as in RFC4301. @@ -387,30 +362,30 @@ UDP encapsulation of ESP can be specified by providing the appropriate UDP header. .PP This setting is only used in -\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[] mode. +\f[I]IB_UVERBS_FLOW_ACTION_ESP_FLAGS_FULL_OFFLOAD\f[R] mode. .IP .nf \f[C] -struct\ ibv_flow_action_esp_encap\ { -\ \ \ \ void\ \ \ \ \ \ \ \ *val;\ \ \ \ \ \ \ \ /*\ pointer\ to\ struct\ ibv_flow_xxxx_filter\ */ -\ \ \ \ struct\ ibv_flow_action_esp_encap\ \ \ *next_ptr; -\ \ \ \ uint16_t\ \ \ \ len;\ \ \ \ \ \ \ \ \ \ \ \ /*\ Len\ of\ mask\ and\ pointer\ (separately)\ */ -\ \ \ \ uint16_t\ \ \ \ type;\ \ \ \ \ \ \ \ \ \ \ /*\ Use\ flow_spec\ enum\ */ +struct ibv_flow_action_esp_encap { + void *val; /* pointer to struct ibv_flow_xxxx_filter */ + struct ibv_flow_action_esp_encap *next_ptr; + uint16_t len; /* Len of mask and pointer (separately) */ + uint16_t type; /* Use flow_spec enum */ }; -\f[] +\f[R] .fi .PP Each link in the list specifies a network header in the same manner as the flow steering API. -The header should be selected from a supported header in \[aq]enum -ibv_flow_spec_type\[aq]. +The header should be selected from a supported header in `enum +ibv_flow_spec_type'. .SH RETURN VALUE .PP -Upon success \f[I]ibv_create_flow_action_esp\f[] will return a new -\f[I]struct ibv_flow_action\f[] object, on error NULL will be returned +Upon success \f[I]ibv_create_flow_action_esp\f[R] will return a new +\f[I]struct ibv_flow_action\f[R] object, on error NULL will be returned and errno will be set. .PP -Upon success \f[I]ibv_modify_action_esp\f[] will return 0. +Upon success \f[I]ibv_modify_action_esp\f[R] will return 0. On error the value of errno will be returned. 
If ibv_modify_flow_action fails, it is guaranteed that the last action
 still holds.
@@ -419,5 +394,5 @@ applied on all packets until this point and the new one is applied on
 all packets from this point and on.
 .SH SEE ALSO
 .PP
-\f[I]ibv_create_flow(3)\f[], \f[I]ibv_destroy_action(3)\f[], \f[I]RFC
-4106\f[]
+\f[I]ibv_create_flow(3)\f[R], \f[I]ibv_destroy_action(3)\f[R], \f[I]RFC
+4106\f[R]
diff --git a/buildlib/pandoc-prebuilt/03680fe180ea50ca7a257bae4e9229a77c5bee39 b/buildlib/pandoc-prebuilt/03680fe180ea50ca7a257bae4e9229a77c5bee39
new file mode 100644
index 0000000..310d12a
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/03680fe180ea50ca7a257bae4e9229a77c5bee39
@@ -0,0 +1,61 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "RDMA_GET_REMOTE_ECE" "3" "2020\-02\-02" "librdmacm" "Librdmacm Programmer\[cq]s Manual"
+.hy
+.SH NAME
+.PP
+rdma_get_remote_ece \- Get remote ECE parameters as received from the
+peer.
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <rdma/rdma_cma.h>
+
+int rdma_get_remote_ece(struct rdma_cm_id *id, struct ibv_ece *ece);
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]rdma_get_remote_ece()\f[R] gets the ECE parameters as they were
+received from the communication peer.
+.PP
+This function is supposed to be used by the users of external QPs.
+The call needs to be performed before replying to the peer; it allows
+the passive side to know the ECE options of the other side.
+.PP
+Since the QP is external and RDMA_CM doesn\[cq]t manage it, the peer
+needs to call the libibverbs API by itself.
+.PP
+Usual flow for the passive side will be:
+.IP \[bu] 2
+ibv_create_qp() <\- create data QP.
+.IP \[bu] 2
+ece = rdma_get_remote_ece() <\- get ECE options from remote peer
+.IP \[bu] 2
+ibv_set_ece(ece) <\- set local ECE options with data received from the
+peer.
+.IP \[bu] 2
+ibv_modify_qp() <\- enable data QP.
+.IP \[bu] 2
+rdma_set_local_ece(ece) <\- set desired ECE options after respective
+libibverbs provider masked unsupported options.
+.IP \[bu] 2
+rdma_accept()/rdma_establish()/rdma_reject_ece()
+.SH ARGUMENTS
+.TP
+.B *id
+RDMA communication identifier.
+.TP
+.B *ece
+ECE struct to be filled.
+.SH RETURN VALUE
+.PP
+\f[B]rdma_get_remote_ece()\f[R] returns 0 on success, or \-1 on error.
+If an error occurs, errno will be set to indicate the failure reason.
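For reference, a minimal C sketch, not part of the patch, of the passive-side flow listed in the rdma_get_remote_ece page above. The helper name is hypothetical, the QP state transitions and rdma_conn_param contents are elided, and retrieving the masked result via ibv_query_ece() before replying is an assumption about how the final options are obtained:

```c
/* Sketch: passive-side ECE handshake for an externally created QP. */
#include <rdma/rdma_cma.h>
#include <infiniband/verbs.h>

static int accept_with_ece(struct rdma_cm_id *id, struct ibv_qp *qp)
{
	struct ibv_ece ece;
	struct rdma_conn_param param = {0}; /* fill qp_num etc. for real use */

	/* ECE options the peer sent in its connection request. */
	if (rdma_get_remote_ece(id, &ece))
		return -1;

	/* Apply them to the external QP; the provider masks what it
	 * does not support during the modify calls. */
	if (ibv_set_ece(qp, &ece))
		return -1;

	/* ... ibv_modify_qp() to INIT/RTR/RTS goes here ... */

	/* Report the (possibly masked) options back through the reply. */
	if (ibv_query_ece(qp, &ece) || rdma_set_local_ece(id, &ece))
		return -1;

	return rdma_accept(id, &param);
}
```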
+.SH SEE ALSO
+.PP
+\f[B]rdma_cm\f[R](7), rdma_set_local_ece(3)
+.SH AUTHOR
+.PP
+Leon Romanovsky
diff --git a/buildlib/pandoc-prebuilt/0ac7ce8cf9db88f5ac3aa4e5afdeae555017e1fc b/buildlib/pandoc-prebuilt/0ac7ce8cf9db88f5ac3aa4e5afdeae555017e1fc
index c906920..5b687b9 100644
--- a/buildlib/pandoc-prebuilt/0ac7ce8cf9db88f5ac3aa4e5afdeae555017e1fc
+++ b/buildlib/pandoc-prebuilt/0ac7ce8cf9db88f5ac3aa4e5afdeae555017e1fc
@@ -1,4 +1,4 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
 .TH "mlx5dv_devx_qp[/cq/srq/wq/ind_tbl]_modify / query" "3" "" "" ""
 .hy
@@ -28,30 +28,30 @@ mlx5dv_devx_ind_tbl_query \- Queries a verbs indirection table via DEVX
 .IP
 .nf
 \f[C]
-#include\ <infiniband/mlx5dv.h>
-int\ mlx5dv_devx_qp_modify(struct\ ibv_qp\ *qp,\ const\ void\ *in,\ size_t\ inlen,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
-int\ mlx5dv_devx_qp_query(struct\ ibv_qp\ *qp,\ const\ void\ *in,\ size_t\ inlen,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
-int\ mlx5dv_devx_cq_modify(struct\ ibv_cq\ *cq,\ const\ void\ *in,\ size_t\ inlen,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
-int\ mlx5dv_devx_cq_query(struct\ ibv_cq\ *cq,\ const\ void\ *in,\ size_t\ inlen,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
-int\ mlx5dv_devx_srq_modify(struct\ ibv_srq\ *srq,\ const\ void\ *in,\ size_t\ inlen,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
-int\ mlx5dv_devx_srq_query(struct\ ibv_srq\ *srq,\ const\ void\ *in,\ size_t\ inlen,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
-int\ mlx5dv_devx_wq_modify(struct\ ibv_wq\ *wq,\ const\ void\ *in,\ size_t\ inlen,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
-int\ mlx5dv_devx_wq_query(struct\ ibv_wq\ *wq,\ const\ void\ *in,\ size_t\ inlen,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
-int\ mlx5dv_devx_ind_tbl_modify(struct\ ibv_rwq_ind_table\ *ind_tbl,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ const\ void\ *in,\ size_t\ inlen,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
-int\ mlx5dv_devx_ind_tbl_query(struct\ ibv_rwq_ind_table\ *ind_tbl,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ const\ void\ *in,\ size_t\ inlen,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen);
-\f[]
+#include <infiniband/mlx5dv.h>
+int mlx5dv_devx_qp_modify(struct ibv_qp *qp, const void *in, size_t inlen,
+                          void *out, size_t outlen);
+int mlx5dv_devx_qp_query(struct ibv_qp *qp, const void *in, size_t inlen,
+                         void *out, size_t outlen);
+int mlx5dv_devx_cq_modify(struct ibv_cq *cq, const void *in, size_t inlen,
+                          void *out, size_t outlen);
+int mlx5dv_devx_cq_query(struct ibv_cq *cq, const void *in, size_t inlen,
+                         void *out, size_t outlen);
+int mlx5dv_devx_srq_modify(struct ibv_srq *srq, const void *in, size_t inlen,
+                           void *out, size_t outlen);
+int mlx5dv_devx_srq_query(struct ibv_srq *srq, const void *in, size_t inlen,
+                          void *out, size_t outlen);
+int mlx5dv_devx_wq_modify(struct ibv_wq *wq, const void *in, size_t inlen,
+                          void *out, size_t outlen);
+int mlx5dv_devx_wq_query(struct ibv_wq *wq, const void *in, size_t inlen,
+                         void *out, size_t outlen);
+int mlx5dv_devx_ind_tbl_modify(struct ibv_rwq_ind_table *ind_tbl,
+                               const void *in, size_t inlen,
+                               void *out, size_t outlen);
+int mlx5dv_devx_ind_tbl_query(struct ibv_rwq_ind_table *ind_tbl,
+                              const void *in, size_t inlen,
+                              void *out, size_t outlen);
+\f[R]
 .fi
 .SH DESCRIPTION
 .PP
@@ -66,43 +66,34 @@ commands can be activated with minimal to none kernel changes.
 The above APIs enables modifying/querying a verb object via the DEVX
 interface.
 This enables interoperability between verbs and DEVX.
-As such an application can use the create method from verbs (e.g.
-ibv_create_qp) and modify and query the created object via DEVX (e.g.
-mlx5dv_devx_qp_modify).
+As such an application can use the create method from verbs
+(e.g.\ ibv_create_qp) and modify and query the created object via DEVX
+(e.g.\ mlx5dv_devx_qp_modify).
 .SH ARGUMENTS
 .TP
-.B \f[I]qp/cq/wq/srq/ind_tbl\f[]
+.B \f[I]qp/cq/wq/srq/ind_tbl\f[R]
 The ibv_xxx object to issue the action on.
-.RS
-.RE
 .TP
-.B \f[I]in\f[]
-A buffer which contains the command\[aq]s input data provided in a
+.B \f[I]in\f[R]
+A buffer which contains the command\[cq]s input data provided in a
 device specification format.
-.RS
-.RE
 .TP
-.B \f[I]inlen\f[]
-The size of \f[I]in\f[] buffer in bytes.
-.RS
-.RE
+.B \f[I]inlen\f[R]
+The size of \f[I]in\f[R] buffer in bytes.
 .TP
-.B \f[I]out\f[]
-A buffer which contains the command\[aq]s output data according to the
+.B \f[I]out\f[R]
+A buffer which contains the command\[cq]s output data according to the
 device specification format.
-.RS
-.RE
 .TP
-.B \f[I]outlen\f[]
-The size of \f[I]out\f[] buffer in bytes.
-.RS
-.RE
+.B \f[I]outlen\f[R]
+The size of \f[I]out\f[R] buffer in bytes.
 .SH RETURN VALUE
 .PP
 Upon success 0 is returned or the value of errno on a failure.
 .SH SEE ALSO
 .PP
-\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_obj_create\f[]
-.SH AUTHOR
+\f[B]mlx5dv_open_device\f[R], \f[B]mlx5dv_devx_obj_create\f[R]
+.PP
+#AUTHOR
 .PP
 Yishai Hadas
diff --git a/buildlib/pandoc-prebuilt/15509ed914ee358ac026220db5abf5f9fe1737de b/buildlib/pandoc-prebuilt/15509ed914ee358ac026220db5abf5f9fe1737de
index 7a629ad..9a6ed43 100644
--- a/buildlib/pandoc-prebuilt/15509ed914ee358ac026220db5abf5f9fe1737de
+++ b/buildlib/pandoc-prebuilt/15509ed914ee358ac026220db5abf5f9fe1737de
@@ -1,31 +1,31 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "IBV_GET_DEVICE_NAME" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.TH "IBV_GET_DEVICE_NAME" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
 .hy
 .SH NAME
 .PP
-ibv_get_device_name \- get an RDMA device\[aq]s name
+ibv_get_device_name \- get an RDMA device\[cq]s name
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <infiniband/verbs.h>
+#include <infiniband/verbs.h>

-const\ char\ *ibv_get_device_name(struct\ ibv_device\ *device);
-\f[]
+const char *ibv_get_device_name(struct ibv_device *device);
+\f[R]
 .fi
 .SH DESCRIPTION
 .PP
-\f[B]ibv_get_device_name()\f[] returns a human\-readable name associated
-with the RDMA device \f[I]device\f[].
+\f[B]ibv_get_device_name()\f[R] returns a human\-readable name
+associated with the RDMA device \f[I]device\f[R].
 .SH RETURN VALUE
 .PP
-\f[B]ibv_get_device_name()\f[] returns a pointer to the device name, or
+\f[B]ibv_get_device_name()\f[R] returns a pointer to the device name, or
 NULL if the request fails.
.SH SEE ALSO
 .PP
-\f[B]ibv_get_device_guid\f[](3), \f[B]ibv_get_device_list\f[](3),
-\f[B]ibv_open_device\f[](3)
+\f[B]ibv_get_device_guid\f[R](3), \f[B]ibv_get_device_list\f[R](3),
+\f[B]ibv_open_device\f[R](3)
 .SH AUTHOR
 .PP
 Dotan Barak
diff --git a/buildlib/pandoc-prebuilt/162c504f31acfe07dc7c4eed5abd1c873ac2f734 b/buildlib/pandoc-prebuilt/162c504f31acfe07dc7c4eed5abd1c873ac2f734
new file mode 100644
index 0000000..5ace0f3
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/162c504f31acfe07dc7c4eed5abd1c873ac2f734
@@ -0,0 +1,94 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "mlx5dv_create_flow" "3" "2018\-9\-19" "mlx5" "mlx5 Programmer\[cq]s Manual"
+.hy
+.SH NAME
+.PP
+mlx5dv_create_flow \- creates a steering flow rule
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <infiniband/mlx5dv.h>
+
+struct ibv_flow *
+mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher,
+                   struct mlx5dv_flow_match_parameters *match_value,
+                   size_t num_actions,
+                   struct mlx5dv_flow_action_attr actions_attr[])
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]mlx5dv_create_flow()\f[R] creates a steering flow rule with the
+ability to specify specific driver properties.
+.SH ARGUMENTS
+.PP
+Please see \f[I]mlx5dv_create_flow_matcher(3)\f[R] for
+\f[I]flow_matcher\f[R] and \f[I]match_value\f[R].
+.TP
+.B \f[I]num_actions\f[R]
+Specifies how many actions are passed in \f[I]actions_attr\f[R]
+.SS \f[I]actions_attr\f[R]
+.IP
+.nf
+\f[C]
+struct mlx5dv_flow_action_attr {
+	enum mlx5dv_flow_action_type type;
+	union {
+		struct ibv_qp *qp;
+		struct ibv_counters *counter;
+		struct ibv_flow_action *action;
+		uint32_t tag_value;
+		struct mlx5dv_devx_obj *obj;
+	};
+};
+\f[R]
+.fi
+.TP
+.B \f[I]type\f[R]
+MLX5DV_FLOW_ACTION_DEST_IBV_QP The QP passed will receive the matched
+packets.
+MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION The flow action to be applied.
+MLX5DV_FLOW_ACTION_TAG Flow tag to be provided in work completion.
+MLX5DV_FLOW_ACTION_DEST_DEVX The DEVX destination object for the matched
+packets.
+MLX5DV_FLOW_ACTION_COUNTERS_DEVX The DEVX counter object for the matched
+packets.
+MLX5DV_FLOW_ACTION_DEFAULT_MISS Steer the packet to the default miss
+destination.
+MLX5DV_FLOW_ACTION_DROP Action is dropping the matched packet.
+.TP
+.B \f[I]qp\f[R]
+QP passed, to be used with \f[I]type\f[R]
+\f[I]MLX5DV_FLOW_ACTION_DEST_IBV_QP\f[R].
+.TP
+.B \f[I]action\f[R]
+Flow action, to be used with \f[I]type\f[R]
+\f[I]MLX5DV_FLOW_ACTION_IBV_FLOW_ACTION\f[R] see
+\f[I]mlx5dv_create_flow_action_modify_header(3)\f[R] and
+\f[I]mlx5dv_create_flow_action_packet_reformat(3)\f[R].
+.TP
+.B \f[I]tag_value\f[R]
+tag value to be passed in the work completion, to be used with
+\f[I]type\f[R] \f[I]MLX5DV_FLOW_ACTION_TAG\f[R] see
+\f[I]ibv_create_cq_ex(3)\f[R].
+.TP
+.B \f[I]obj\f[R]
+DEVX object, to be used with \f[I]type\f[R]
+\f[I]MLX5DV_FLOW_ACTION_DEST_DEVX\f[R] or by
+\f[I]MLX5DV_FLOW_ACTION_COUNTERS_DEVX\f[R].
+.SH RETURN VALUE
+.PP
+\f[B]mlx5dv_create_flow\f[R] returns a pointer to the created flow rule,
+on error NULL will be returned and errno will be set.
+.SH SEE ALSO
+.PP
+\f[I]mlx5dv_create_flow_action_modify_header(3)\f[R],
+\f[I]mlx5dv_create_flow_action_packet_reformat(3)\f[R],
+\f[I]mlx5dv_create_flow_matcher(3)\f[R], \f[I]mlx5dv_create_qp(3)\f[R],
+\f[I]ibv_create_qp_ex(3)\f[R] \f[I]ibv_create_cq_ex(3)\f[R]
+\f[I]ibv_create_counters(3)\f[R]
+.SH AUTHOR
+.PP
+Mark Bloch
diff --git a/buildlib/pandoc-prebuilt/177697d3a0829174dba026d04c8eba60b5a27007 b/buildlib/pandoc-prebuilt/177697d3a0829174dba026d04c8eba60b5a27007
new file mode 100644
index 0000000..328f9ca
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/177697d3a0829174dba026d04c8eba60b5a27007
@@ -0,0 +1,63 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "RDMA_SET_LOCAL_ECE" "3" "2020\-02\-02" "librdmacm" "Librdmacm Programmer\[cq]s Manual"
+.hy
+.SH NAME
+.PP
+rdma_set_local_ece \- Set local ECE parameters to be used for REQ/REP
+communication.
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <rdma/rdma_cma.h>
+
+int rdma_set_local_ece(struct rdma_cm_id *id, struct ibv_ece *ece);
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]rdma_set_local_ece()\f[R] sets the local ECE parameters.
+.PP
+This function is supposed to be used by the users of external QPs.
+The call needs to be performed before replying to the peer; it
+configures RDMA_CM with the desired ECE options.
+.PP
+Since the QP is external and RDMA_CM doesn\[cq]t manage it, the peer
+needs to call the libibverbs API by itself.
+.PP
+Usual flow for the active side will be:
+.IP \[bu] 2
+ibv_create_qp() <\- create data QP.
+.IP \[bu] 2
+ece = ibv_get_ece() <\- get ECE from libibverbs provider.
+.IP \[bu] 2
+rdma_set_local_ece(ece) <\- set desired ECE options.
+.IP \[bu] 2
+rdma_connect() <\- send connection request
+.IP \[bu] 2
+ece = rdma_get_remote_ece() <\- get ECE options from remote peer
+.IP \[bu] 2
+ibv_set_ece(ece) <\- set local ECE options with data received from the
+peer.
+.IP \[bu] 2
+ibv_modify_qp() <\- enable data QP.
+.IP \[bu] 2
+rdma_accept()/rdma_establish()/rdma_reject_ece()
+.SH ARGUMENTS
+.TP
+.B \f[I]id\f[R]
+RDMA communication identifier.
+.TP
+.B *ece
+ECE parameters.
+.SH RETURN VALUE
+.PP
+\f[B]rdma_set_local_ece()\f[R] returns 0 on success, or \-1 on error.
+If an error occurs, errno will be set to indicate the failure reason.
+.SH SEE ALSO
+.PP
+\f[B]rdma_cm\f[R](7), rdma_get_remote_ece(3)
+.SH AUTHOR
+.PP
+Leon Romanovsky
diff --git a/buildlib/pandoc-prebuilt/1c3f51131206bb1a7ed34fbdc897910d313df687 b/buildlib/pandoc-prebuilt/1c3f51131206bb1a7ed34fbdc897910d313df687
index 98d7d66..628b7c7 100644
--- a/buildlib/pandoc-prebuilt/1c3f51131206bb1a7ed34fbdc897910d313df687
+++ b/buildlib/pandoc-prebuilt/1c3f51131206bb1a7ed34fbdc897910d313df687
@@ -1,4 +1,4 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
 .TH "mlx5dv_flow_action_esp" "3" "" "" ""
 .hy
@@ -9,13 +9,13 @@ mlx5dv_flow_action_esp \- Flow action esp for mlx5 provider
 .IP
 .nf
 \f[C]
-#include\ <infiniband/mlx5dv.h>
+#include <infiniband/mlx5dv.h>

-struct\ ibv_flow_action\ *
-mlx5dv_create_flow_action_esp(struct\ ibv_context\ *ctx,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_flow_action_esp_attr\ *esp,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_flow_action_esp\ *mlx5_attr);
-\f[]
+struct ibv_flow_action *
+mlx5dv_create_flow_action_esp(struct ibv_context *ctx,
+                  struct ibv_flow_action_esp_attr *esp,
+                  struct mlx5dv_flow_action_esp *mlx5_attr);
+\f[R]
 .fi
 .SH DESCRIPTION
 .PP
 Create an IPSEC ESP flow steering action.
.PD 0
 .P
 .PD
-This verb is identical to \f[I]ibv_create_flow_action_esp\f[] verb, but
+This verb is identical to \f[I]ibv_create_flow_action_esp\f[R] verb, but
 allows mlx5 specific flags.
 .SH ARGUMENTS
 .PP
-Please see \f[I]ibv_flow_action_esp(3)\f[] man page for \f[I]ctx\f[] and
-\f[I]esp\f[].
-.SS \f[I]mlx5_attr\f[] argument
+Please see \f[I]ibv_flow_action_esp(3)\f[R] man page for \f[I]ctx\f[R]
+and \f[I]esp\f[R].
+.SS \f[I]mlx5_attr\f[R] argument
 .IP
 .nf
 \f[C]
-struct\ mlx5dv_flow_action_esp\ {
-\ \ \ \ uint64_t\ comp_mask;\ \ /*\ Use\ enum\ mlx5dv_flow_action_esp_mask\ */
-\ \ \ \ uint32_t\ action_flags;\ /*\ Use\ enum\ mlx5dv_flow_action_flags\ */
+struct mlx5dv_flow_action_esp {
+    uint64_t comp_mask;  /* Use enum mlx5dv_flow_action_esp_mask */
+    uint32_t action_flags; /* Use enum mlx5dv_flow_action_flags */
 };
-\f[]
+\f[R]
 .fi
 .TP
-.B \f[I]comp_mask\f[]
+.B \f[I]comp_mask\f[R]
 Bitmask specifying what fields in the structure are valid (\f[I]enum
-mlx5dv_flow_action_esp_mask\f[]).
-.RS
-.RE
+mlx5dv_flow_action_esp_mask\f[R]).
 .TP
-.B \f[I]action_flags\f[]
+.B \f[I]action_flags\f[R]
 A bitwise OR of the various values described below.
 .RS
 .PP
-\f[I]MLX5DV_FLOW_ACTION_FLAGS_REQUIRE_METADATA\f[]:
+\f[I]MLX5DV_FLOW_ACTION_FLAGS_REQUIRE_METADATA\f[R]:
 .PD 0
 .P
 .PD
@@ -77,4 +75,4 @@ There is no need to call modify to update the ESN window on egress when
 this DV is used.
 .SH SEE ALSO
 .PP
-\f[I]ibv_flow_action_esp(3)\f[], \f[I]RFC 4106\f[]
+\f[I]ibv_flow_action_esp(3)\f[R], \f[I]RFC 4106\f[R]
diff --git a/buildlib/pandoc-prebuilt/2082c9e75706a10a0c0c9925f5108736249d8368 b/buildlib/pandoc-prebuilt/2082c9e75706a10a0c0c9925f5108736249d8368
index d896f39..ad85484 100644
--- a/buildlib/pandoc-prebuilt/2082c9e75706a10a0c0c9925f5108736249d8368
+++ b/buildlib/pandoc-prebuilt/2082c9e75706a10a0c0c9925f5108736249d8368
@@ -1,27 +1,27 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
 .\"
-.TH "ibv_create_counters" "3" "2018\-04\-02" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.TH "ibv_create_counters" "3" "2018\-04\-02" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
 .hy
 .SH NAME
 .PP
-\f[B]ibv_create_counters\f[], \f[B]ibv_destroy_counters\f[] \- Create or
-destroy a counters handle
+\f[B]ibv_create_counters\f[R], \f[B]ibv_destroy_counters\f[R] \- Create
+or destroy a counters handle
 .SH SYNOPSIS
 .IP
 .nf
 \f[C]
-#include\ <infiniband/verbs.h>
+#include <infiniband/verbs.h>

-struct\ ibv_counters\ *
-ibv_create_counters(struct\ ibv_context\ *context,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_counters_init_attr\ *init_attr);
+struct ibv_counters *
+ibv_create_counters(struct ibv_context *context,
+                    struct ibv_counters_init_attr *init_attr);

-int\ ibv_destroy_counters(struct\ ibv_counters\ *counters);
-\f[]
+int ibv_destroy_counters(struct ibv_counters *counters);
+\f[R]
 .fi
 .SH DESCRIPTION
 .PP
-\f[B]ibv_create_counters\f[]() creates a new counters handle for the
+\f[B]ibv_create_counters\f[R]() creates a new counters handle for the
 RDMA device context.
 .PP
 An ibv_counters handle can be attached to a verbs resource (e.g.: QP,
@@ -29,71 +29,60 @@ WQ, Flow) statically when these are created.
 .PP
 For example attach an ibv_counters statically to a Flow (struct
 ibv_flow) during creation of a new Flow by calling
-\f[B]ibv_create_flow()\f[].
+\f[B]ibv_create_flow()\f[R].
 .PP
 Counters are cleared upon creation and values will be monotonically
 increasing.
.PP -\f[B]ibv_destroy_counters\f[]() releases the counters handle, user +\f[B]ibv_destroy_counters\f[R]() releases the counters handle, user should detach the counters object before destroying it. .SH ARGUMENTS .TP -.B \f[I]context\f[] +.B \f[I]context\f[R] RDMA device context to create the counters on. -.RS -.RE .TP -.B \f[I]init_attr\f[] +.B \f[I]init_attr\f[R] Is an ibv_counters_init_attr struct, as defined in verbs.h. -.RS -.RE -.SS \f[I]init_attr\f[] Argument +.SS \f[I]init_attr\f[R] Argument .IP .nf \f[C] -struct\ ibv_counters_init_attr\ { -\ \ \ \ int\ comp_mask; +struct ibv_counters_init_attr { + int comp_mask; }; -\f[] +\f[R] .fi .TP -.B \f[I]comp_mask\f[] +.B \f[I]comp_mask\f[R] Bitmask specifying what fields in the structure are valid. -.RS -.RE .SH RETURN VALUE .PP -\f[B]ibv_create_counters\f[]() returns a pointer to the allocated +\f[B]ibv_create_counters\f[R]() returns a pointer to the allocated ibv_counters object, or NULL if the request fails (and sets errno to indicate the failure reason) .PP -\f[B]ibv_destroy_counters\f[]() returns 0 on success, or the value of +\f[B]ibv_destroy_counters\f[R]() returns 0 on success, or the value of errno on failure (which indicates the failure reason) .SH ERRORS .TP .B EOPNOTSUPP -\f[B]ibv_create_counters\f[]() is not currently supported on this device -(ENOSYS may sometimes be returned by old versions of libibverbs). -.RS -.RE +\f[B]ibv_create_counters\f[R]() is not currently supported on this +device (ENOSYS may sometimes be returned by old versions of libibverbs). .TP .B ENOMEM -\f[B]ibv_create_counters\f[]() could not create ibv_counters object, not -enough memory -.RS -.RE +\f[B]ibv_create_counters\f[R]() could not create ibv_counters object, +not enough memory .TP .B EINVAL -invalid parameter supplied \f[B]ibv_destroy_counters\f[]() -.RS -.RE +invalid parameter supplied \f[B]ibv_destroy_counters\f[R]() .SH EXAMPLE .PP -An example of use of ibv_counters is shown in \f[B]ibv_read_counters\f[] +An example of use of ibv_counters is shown in +\f[B]ibv_read_counters\f[R] .SH SEE ALSO .PP -\f[B]ibv_attach_counters_point_flow\f[], \f[B]ibv_read_counters\f[], -\f[B]ibv_create_flow\f[] +\f[B]ibv_attach_counters_point_flow\f[R], \f[B]ibv_read_counters\f[R], +\f[B]ibv_create_flow\f[R] .SH AUTHORS .PP Raed Salem diff --git a/buildlib/pandoc-prebuilt/23046225aae54879fdd2d044ba307096e412d64c b/buildlib/pandoc-prebuilt/23046225aae54879fdd2d044ba307096e412d64c index 0f691a3..0240441 100644 --- a/buildlib/pandoc-prebuilt/23046225aae54879fdd2d044ba307096e412d64c +++ b/buildlib/pandoc-prebuilt/23046225aae54879fdd2d044ba307096e412d64c @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_alloc_var / mlx5dv_free_var" "3" "" "" "" .hy @@ -11,13 +11,13 @@ mlx5dv_free_var \- Frees a VAR .IP .nf \f[C] -#include\ +#include -struct\ mlx5dv_var\ * -mlx5dv_alloc_var(struct\ ibv_context\ *context,\ uint32_t\ flags); +struct mlx5dv_var * +mlx5dv_alloc_var(struct ibv_context *context, uint32_t flags); -void\ mlx5dv_free_var(struct\ mlx5dv_var\ *dv_var); -\f[] +void mlx5dv_free_var(struct mlx5dv_var *dv_var); +\f[R] .fi .SH DESCRIPTION .PP @@ -29,51 +29,41 @@ device driver, the VAR information is needed for few commands related to Virtio. .SH ARGUMENTS .TP -.B \f[I]context\f[] +.B \f[I]context\f[R] RDMA device context to work on. -.RS -.RE .TP -.B \f[I]flags\f[] +.B \f[I]flags\f[R] Allocation flags for the UAR. 
-.RS -.RE .SS dv_var .IP .nf \f[C] -struct\ mlx5dv_var\ { -\ \ \ \ uint32_t\ page_id; -\ \ \ \ uint32_t\ length; -\ \ \ \ off_t\ mmap_off; -\ \ \ \ uint64_t\ comp_mask; +struct mlx5dv_var { + uint32_t page_id; + uint32_t length; + off_t mmap_off; + uint64_t comp_mask; }; -\f[] +\f[R] .fi .TP -.B \f[I]page_id\f[] +.B \f[I]page_id\f[R] The device page id to be used. -.RS -.RE .TP -.B \f[I]length\f[] +.B \f[I]length\f[R] The mmap length parameter to be used for mapping a VA to the allocated VAR entry. -.RS -.RE .TP -.B \f[I]mmap_off\f[] +.B \f[I]mmap_off\f[R] The mmap offset parameter to be used for mapping a VA to the allocated VAR entry. -.RS -.RE .SH RETURN VALUE .PP -Upon success \f[I]mlx5dv_alloc_var\f[] returns a pointer to the created +Upon success \f[I]mlx5dv_alloc_var\f[R] returns a pointer to the created VAR ,on error NULL will be returned and errno will be set. .SH SEE ALSO .PP -\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_obj_create\f[] +\f[B]mlx5dv_open_device\f[R], \f[B]mlx5dv_devx_obj_create\f[R] .SH AUTHOR .PP Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/23660644c7d16519530ca5d9fe12f0f800e1f1c0 b/buildlib/pandoc-prebuilt/23660644c7d16519530ca5d9fe12f0f800e1f1c0 index 8fa3384..7b9b876 100644 --- a/buildlib/pandoc-prebuilt/23660644c7d16519530ca5d9fe12f0f800e1f1c0 +++ b/buildlib/pandoc-prebuilt/23660644c7d16519530ca5d9fe12f0f800e1f1c0 @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_open_device" "3" "" "" "" .hy @@ -9,61 +9,55 @@ mlx5dv_open_device \- Open an RDMA device context for the mlx5 provider .IP .nf \f[C] -#include\ +#include -struct\ ibv_context\ * -mlx5dv_open_device(struct\ ibv_device\ *device,\ struct\ mlx5dv_context_attr\ *attr); -\f[] +struct ibv_context * +mlx5dv_open_device(struct ibv_device *device, struct mlx5dv_context_attr *attr); +\f[R] .fi .SH DESCRIPTION .PP Open an RDMA device context with specific mlx5 provider attributes. .SH ARGUMENTS .TP -.B \f[I]device\f[] +.B \f[I]device\f[R] RDMA device to open. -.RS -.RE -.SS \f[I]attr\f[] argument +.SS \f[I]attr\f[R] argument .IP .nf \f[C] -struct\ mlx5dv_context_attr\ { -\ \ \ \ \ \ \ \ uint32_t\ flags; -\ \ \ \ \ \ \ \ uint64_t\ comp_mask; +struct mlx5dv_context_attr { + uint32_t flags; + uint64_t comp_mask; }; -\f[] +\f[R] .fi .TP -.B \f[I]flags\f[] +.B \f[I]flags\f[R] .IP .nf \f[C] -A\ bitwise\ OR\ of\ the\ various\ values\ described\ below. +A bitwise OR of the various values described below. *MLX5DV_CONTEXT_FLAGS_DEVX*: -Allocate\ a\ DEVX\ context -\f[] +Allocate a DEVX context +\f[R] .fi -.RS -.RE .TP -.B \f[I]comp_mask\f[] +.B \f[I]comp_mask\f[R] .IP .nf \f[C] -Bitmask\ specifying\ what\ fields\ in\ the\ structure\ are\ valid -\f[] +Bitmask specifying what fields in the structure are valid +\f[R] .fi -.RS -.RE .SH RETURN VALUE .PP Returns a pointer to the allocated device context, or NULL if the request fails. 
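A minimal sketch of mlx5dv_open_device() as documented above; it assumes the caller already holds a struct ibv_device (for example from ibv_get_device_list()) and requests only the DEVX flag from the flags description.

```c
#include <infiniband/mlx5dv.h>

/* Open a DEVX-enabled context on an mlx5 device. */
static struct ibv_context *open_devx_context(struct ibv_device *device)
{
	struct mlx5dv_context_attr attr = {
		.flags = MLX5DV_CONTEXT_FLAGS_DEVX, /* allocate a DEVX context */
		.comp_mask = 0,
	};

	if (!mlx5dv_is_supported(device))
		return NULL; /* not implemented by the mlx5 provider */
	return mlx5dv_open_device(device, &attr);
}
```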
.SH SEE ALSO
.PP
-\f[I]ibv_open_device(3)\f[]
+\f[I]ibv_open_device(3)\f[R]
.SH AUTHOR
.PP
Yishai Hadas
diff --git a/buildlib/pandoc-prebuilt/27eaa419d1bb46824097bffdf0ae970f24c1f0eb b/buildlib/pandoc-prebuilt/27eaa419d1bb46824097bffdf0ae970f24c1f0eb
new file mode 100644
index 0000000..5692b32
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/27eaa419d1bb46824097bffdf0ae970f24c1f0eb
@@ -0,0 +1,48 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "mlx5dv_query_qp_lag_port" "3" "" "" ""
+.hy
+.SH NAME
+.PP
+mlx5dv_query_qp_lag_port \- Query the lag port information of a given QP
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <infiniband/mlx5dv.h>

+int mlx5dv_query_qp_lag_port(struct ibv_qp *qp, uint8_t *port_num,
+                             uint8_t *active_port_num);
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+This API returns the configured and active port num of a given QP in
+mlx5 devices.
+.PP
+The active port num indicates which port the QP sends traffic out of
+in a LAG configuration.
+.PP
+A num_lag_ports field of struct mlx5dv_context greater than 1 means
+LAG is supported on this device.
+.SH ARGUMENTS
+.TP
+.B \f[I]qp\f[R]
+The ibv_qp object to issue the action on.
+.TP
+.B \f[I]port_num\f[R]
+The configured port num of the QP.
+.TP
+.B \f[I]active_port_num\f[R]
+The current port num of the QP, which may differ from the configured
+value because of the bonding status.
+.SH RETURN VALUE
+.PP
+0 on success; EOPNOTSUPP if not in LAG mode, or other errno value on
+other failures.
+.SH SEE ALSO
+.PP
+\f[I]mlx5dv_modify_qp_lag_port(3)\f[R]
+.SH AUTHOR
+.PP
+Aharon Landau
diff --git a/buildlib/pandoc-prebuilt/2a9899c3a62b0c9164f7f76f08930a7e80ea9e51 b/buildlib/pandoc-prebuilt/2a9899c3a62b0c9164f7f76f08930a7e80ea9e51
index 9618af6..f611ee6 100644
--- a/buildlib/pandoc-prebuilt/2a9899c3a62b0c9164f7f76f08930a7e80ea9e51
+++ b/buildlib/pandoc-prebuilt/2a9899c3a62b0c9164f7f76f08930a7e80ea9e51
@@ -1,4 +1,4 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
.\"
.TH "mlx5dv_devx_get_event" "3" "" "" ""
.hy
@@ -9,64 +9,58 @@ mlx5dv_devx_get_event \- Get an asynchronous event.
.IP
.nf
\f[C]
-#include\ <infiniband/mlx5dv.h>
+#include <infiniband/mlx5dv.h>

-struct\ mlx5dv_devx_async_event_hdr\ {
-\ \ \ \ uint64_t\ \ \ \ cookie;
-\ \ \ \ uint8_t\ \ \ \ \ out_data[];
+struct mlx5dv_devx_async_event_hdr {
+    uint64_t cookie;
+    uint8_t out_data[];
};

-ssize_t\ mlx5dv_devx_get_event(struct\ mlx5dv_devx_event_channel\ *event_channel,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_devx_async_event_hdr\ *event_data,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ event_resp_len)
-\f[]
+ssize_t mlx5dv_devx_get_event(struct mlx5dv_devx_event_channel *event_channel,
+                              struct mlx5dv_devx_async_event_hdr *event_data,
+                              size_t event_resp_len)
+\f[R]
.fi
.SH DESCRIPTION
.PP
-Get a device event on the given \f[I]event_channel\f[].
+Get a device event on the given \f[I]event_channel\f[R].
After posting a successful subscription over the event channel by
calling mlx5dv_devx_subscribe_devx_event(), the application should use
this API to get the response once an event has occurred.
.PP
-Upon response the \f[I]cookie\f[] that was supplied upon the
-subscription is returned and the \f[I]out_data\f[] includes the data
+Upon response the \f[I]cookie\f[R] that was supplied upon the
+subscription is returned and the \f[I]out_data\f[R] includes the data
itself.
-The \f[I]out_data\f[] may be omitted in case the channel was created
+The \f[I]out_data\f[R] may be omitted in case the channel was created
with the omit data flag.
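For the mlx5dv_query_qp_lag_port(3) page added above, a small usage sketch; qp is assumed to live on an mlx5 device whose num_lag_ports is greater than 1.

```c
#include <stdio.h>
#include <infiniband/mlx5dv.h>

/* Print the configured and currently active LAG port of a QP. */
static void print_lag_port(struct ibv_qp *qp)
{
	uint8_t configured, active;
	int ret = mlx5dv_query_qp_lag_port(qp, &configured, &active);

	if (ret) /* EOPNOTSUPP when the device is not in LAG mode */
		fprintf(stderr, "lag port query failed: %d\n", ret);
	else
		printf("configured port %u, active port %u\n", configured, active);
}
```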
.PP The application must supply a large enough buffer to hold the event according to the device specification, the buffer size is given by the -input \f[I]event_resp_len\f[] parameter. +input \f[I]event_resp_len\f[R] parameter. .SH ARGUMENTS .TP -.B \f[I]event_channel\f[] +.B \f[I]event_channel\f[R] .IP .nf \f[C] -The\ channel\ to\ get\ the\ event\ over. -\f[] +The channel to get the event over. +\f[R] .fi -.RS -.RE .TP -.B \f[I]event_data\f[] +.B \f[I]event_data\f[R] The output data from the asynchronous event. -.RS -.RE .TP -.B \f[I]event_resp_len\f[] +.B \f[I]event_resp_len\f[R] The output buffer size to hold the response. -.RS -.RE .SH RETURN VALUE .PP -Upon success \f[I]mlx5dv_devx_get_event\f[] will return the number of +Upon success \f[I]mlx5dv_devx_get_event\f[R] will return the number of bytes read, otherwise \-1 will be returned and errno was set. .SH NOTES .PP -In case the \f[I]event_channel\f[] was created with the omit data flag, +In case the \f[I]event_channel\f[R] was created with the omit data flag, events having the same type may be combined per subscription and be -reported once with the matching \f[I]cookie\f[]. +reported once with the matching \f[I]cookie\f[R]. In that mode of work, ordering is not preserved between those events to other on this channel. .PP @@ -75,8 +69,9 @@ is preserved, however, events might be loose as of lack of kernel memory, in that case EOVERFLOW will be reported. .SH SEE ALSO .PP -\f[I]mlx5dv_open_device(3)\f[], -\f[I]mlx5dv_devx_subscribe_devx_event(3)\f[] -.SH AUTHOR +\f[I]mlx5dv_open_device(3)\f[R], +\f[I]mlx5dv_devx_subscribe_devx_event(3)\f[R] +.PP +#AUTHOR .PP Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/2cd4402b920e0a57d92dcf281f2091ee6e4ac141 b/buildlib/pandoc-prebuilt/2cd4402b920e0a57d92dcf281f2091ee6e4ac141 index 9713cfd..ea3e6eb 100644 --- a/buildlib/pandoc-prebuilt/2cd4402b920e0a57d92dcf281f2091ee6e4ac141 +++ b/buildlib/pandoc-prebuilt/2cd4402b920e0a57d92dcf281f2091ee6e4ac141 @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_RESIZE_CQ" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_RESIZE_CQ" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -9,33 +9,33 @@ ibv_resize_cq \- resize a completion queue (CQ) .IP .nf \f[C] -#include\ +#include -int\ ibv_resize_cq(struct\ ibv_cq\ *cq,\ int\ cqe); -\f[] +int ibv_resize_cq(struct ibv_cq *cq, int cqe); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_resize_cq()\f[] resizes the completion queue (CQ) \f[I]cq\f[] -to have at least \f[I]cqe\f[] entries. -\f[I]cqe\f[] must be at least the number of unpolled entries in the CQ -\f[I]cq\f[]. -If \f[I]cqe\f[] is a valid value less than the current CQ size, -\f[B]ibv_resize_cq()\f[] may not do anything, since this function is +\f[B]ibv_resize_cq()\f[R] resizes the completion queue (CQ) \f[I]cq\f[R] +to have at least \f[I]cqe\f[R] entries. +\f[I]cqe\f[R] must be at least the number of unpolled entries in the CQ +\f[I]cq\f[R]. +If \f[I]cqe\f[R] is a valid value less than the current CQ size, +\f[B]ibv_resize_cq()\f[R] may not do anything, since this function is only guaranteed to resize the CQ to a size at least as big as the requested size. .SH RETURN VALUE .PP -\f[B]ibv_resize_cq()\f[] returns 0 on success, or the value of errno on +\f[B]ibv_resize_cq()\f[R] returns 0 on success, or the value of errno on failure (which indicates the failure reason). 
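The ibv_resize_cq() contract described above fits in a few lines; a sketch, noting that the provider may round the size up and that cq->cqe is updated to the actual value.

```c
#include <infiniband/verbs.h>

/* Grow an existing CQ; the provider may choose a larger size. */
static int grow_cq(struct ibv_cq *cq, int wanted)
{
	int ret = ibv_resize_cq(cq, wanted);

	if (ret)
		return ret; /* an errno value naming the reason */
	/* cq->cqe now holds the size actually assigned */
	return 0;
}
```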
.SH NOTES .PP -\f[B]ibv_resize_cq()\f[] may assign a CQ size greater than or equal to +\f[B]ibv_resize_cq()\f[R] may assign a CQ size greater than or equal to the requested size. -The cqe member of \f[I]cq\f[] will be updated to the actual size. +The cqe member of \f[I]cq\f[R] will be updated to the actual size. .SH SEE ALSO .PP -\f[B]ibv_create_cq\f[](3), \f[B]ibv_destroy_cq\f[](3) +\f[B]ibv_create_cq\f[R](3), \f[B]ibv_destroy_cq\f[R](3) .SH AUTHOR .PP Dotan Barak diff --git a/buildlib/pandoc-prebuilt/2d8bf0753443ec6498bc7a90d728d90110707533 b/buildlib/pandoc-prebuilt/2d8bf0753443ec6498bc7a90d728d90110707533 index 7db2a37..f3db2a7 100644 --- a/buildlib/pandoc-prebuilt/2d8bf0753443ec6498bc7a90d728d90110707533 +++ b/buildlib/pandoc-prebuilt/2d8bf0753443ec6498bc7a90d728d90110707533 @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_devx_subscribe_devx_event, mlx5dv_devx_subscribe_devx_event_fd" "3" "" "" "" .hy @@ -13,68 +13,57 @@ for device events to signal eventfd. .IP .nf \f[C] -#include\ +#include -int\ mlx5dv_devx_subscribe_devx_event(struct\ mlx5dv_devx_event_channel\ *dv_event_channel, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_devx_obj\ *obj, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ events_sz, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ events_num[], -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ cookie) +int mlx5dv_devx_subscribe_devx_event(struct mlx5dv_devx_event_channel *dv_event_channel, + struct mlx5dv_devx_obj *obj, + uint16_t events_sz, + uint16_t events_num[], + uint64_t cookie) -int\ mlx5dv_devx_subscribe_devx_event_fd(struct\ mlx5dv_devx_event_channel\ *dv_event_channel, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ fd, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_devx_obj\ *obj, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ event_num) -\f[] +int mlx5dv_devx_subscribe_devx_event_fd(struct mlx5dv_devx_event_channel *dv_event_channel, + int fd, + struct mlx5dv_devx_obj *obj, + uint16_t event_num) +\f[R] .fi .SH DESCRIPTION .PP Subscribe over a DEVX event channel for device events. .SH ARGUMENTS .TP -.B \f[I]dv_event_channel\f[] +.B \f[I]dv_event_channel\f[R] Event channel to subscribe over. -.RS -.RE .TP -.B \f[I]fd\f[] +.B \f[I]fd\f[R] A file descriptor that previously was opened by the eventfd() system call. -.RS -.RE .TP -.B \f[I]obj\f[] -DEVX object that \f[I]events_num\f[] relates to, can be NULL for +.B \f[I]obj\f[R] +DEVX object that \f[I]events_num\f[R] relates to, can be NULL for unaffiliated events. -.RS -.RE .TP -.B \f[I]events_sz\f[] -Size of the \f[I]events_num\f[] buffer that holds the events to +.B \f[I]events_sz\f[R] +Size of the \f[I]events_num\f[R] buffer that holds the events to subscribe for. -.RS -.RE .TP -.B \f[I]events_num\f[] +.B \f[I]events_num\f[R] Holds the required event numbers to subscribe for, numbers are according to the device specification. -.RS -.RE .TP -.B \f[I]cookie\f[] +.B \f[I]cookie\f[R] The value to be returned back when reading the event, can be used as an ID for application use. -.RS -.RE .SH NOTES .PP -When mlx5dv_devx_subscribe_devx_event_fd will be used the \f[I]fd\f[] +When mlx5dv_devx_subscribe_devx_event_fd will be used the \f[I]fd\f[R] will be signaled once an event has occurred. 
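To illustrate the eventfd variant from the NOTES above, a hedged sketch; the event channel ch is assumed to come from mlx5dv_devx_create_event_channel(), and event_num is a device-specification event number the caller already knows.

```c
#include <sys/eventfd.h>
#include <infiniband/mlx5dv.h>

/* Arrange for an eventfd to be signaled when 'event_num' fires on 'obj'. */
static int watch_event(struct mlx5dv_devx_event_channel *ch,
		       struct mlx5dv_devx_obj *obj, uint16_t event_num)
{
	int fd = eventfd(0, EFD_CLOEXEC);

	if (fd < 0)
		return -1;
	/* fd is signaled once the subscribed event occurs */
	return mlx5dv_devx_subscribe_devx_event_fd(ch, fd, obj, event_num);
}
```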
.SH SEE ALSO .PP -\f[I]mlx5dv_open_device(3)\f[], -\f[I]mlx5dv_devx_create_event_channel(3)\f[], -\f[I]mlx5dv_devx_get_event(3)\f[] -.SH AUTHOR +\f[I]mlx5dv_open_device(3)\f[R], +\f[I]mlx5dv_devx_create_event_channel(3)\f[R], +\f[I]mlx5dv_devx_get_event(3)\f[R] +.PP +#AUTHOR .PP Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/34cf0e59f60dd9af279902148ab5180325339afc b/buildlib/pandoc-prebuilt/34cf0e59f60dd9af279902148ab5180325339afc index 6d544c5..c087241 100644 --- a/buildlib/pandoc-prebuilt/34cf0e59f60dd9af279902148ab5180325339afc +++ b/buildlib/pandoc-prebuilt/34cf0e59f60dd9af279902148ab5180325339afc @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_INC_RKEY" "3" "2015\-01\-29" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_INC_RKEY" "3" "2015\-01\-29" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -9,24 +9,24 @@ ibv_inc_rkey \- creates a new rkey from the given one .IP .nf \f[C] -#include\ +#include -uint32_t\ ibv_inc_rkey(uint32_t\ rkey); -\f[] +uint32_t ibv_inc_rkey(uint32_t rkey); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_inc_rkey()\f[] Increases the 8 LSB of \f[I]rkey\f[] and returns -the new value. +\f[B]ibv_inc_rkey()\f[R] Increases the 8 LSB of \f[I]rkey\f[R] and +returns the new value. .SH RETURN VALUE .PP -\f[B]ibv_inc_rkey()\f[] returns the new rkey. +\f[B]ibv_inc_rkey()\f[R] returns the new rkey. .SH NOTES .PP The verb generates a new rkey that is different from the previous one on its tag part but has the same index (bits 0xffffff00). A use case for this verb can be to create a new rkey from a Memory -window\[aq]s rkey when binding it to a Memory region. +window\[cq]s rkey when binding it to a Memory region. .SH AUTHORS .PP Majd Dibbiny , Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/35de5f25ab929eed324046bc74a2a953f3b8a47b b/buildlib/pandoc-prebuilt/35de5f25ab929eed324046bc74a2a953f3b8a47b index bb41e29..dda40b7 100644 --- a/buildlib/pandoc-prebuilt/35de5f25ab929eed324046bc74a2a953f3b8a47b +++ b/buildlib/pandoc-prebuilt/35de5f25ab929eed324046bc74a2a953f3b8a47b @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_ATTACH_MCAST" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_ATTACH_MCAST" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -10,28 +10,28 @@ ibv_attach_mcast, ibv_detach_mcast \- attach and detach a queue pair .IP .nf \f[C] -#include\ +#include -int\ ibv_attach_mcast(struct\ ibv_qp\ *qp,\ const\ union\ ibv_gid\ *gid,\ uint16_t\ lid); +int ibv_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); -int\ ibv_detach_mcast(struct\ ibv_qp\ *qp,\ const\ union\ ibv_gid\ *gid,\ uint16_t\ lid); -\f[] +int ibv_detach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_attach_mcast()\f[] attaches the QP \f[I]qp\f[] to the multicast -group having MGID \f[I]gid\f[] and MLID \f[I]lid\f[]. +\f[B]ibv_attach_mcast()\f[R] attaches the QP \f[I]qp\f[R] to the +multicast group having MGID \f[I]gid\f[R] and MLID \f[I]lid\f[R]. .PP -\f[B]ibv_detach_mcast()\f[] detaches the QP \f[I]qp\f[] to the multicast -group having MGID \f[I]gid\f[] and MLID \f[I]lid\f[]. +\f[B]ibv_detach_mcast()\f[R] detaches the QP \f[I]qp\f[R] to the +multicast group having MGID \f[I]gid\f[R] and MLID \f[I]lid\f[R]. 
.SH RETURN VALUE
.PP
-\f[B]ibv_attach_mcast()\f[] and \f[B]ibv_detach_mcast()\f[] returns 0 on
-success, or the value of errno on failure (which indicates the failure
-reason).
+\f[B]ibv_attach_mcast()\f[R] and \f[B]ibv_detach_mcast()\f[R] return 0
+on success, or the value of errno on failure (which indicates the
+failure reason).
.SH NOTES
.PP
-Only QPs of Transport Service Type \f[B]IBV_QPT_UD\f[] may be attached
+Only QPs of Transport Service Type \f[B]IBV_QPT_UD\f[R] may be attached
to multicast groups.
.PP
If a QP is attached to the same multicast group multiple times, the QP
@@ -39,11 +39,11 @@ will still receive a single copy of a multicast message.
.PP
In order to receive multicast messages, a join request for the multicast
group must be sent to the subnet administrator (SA), so that the
-fabric\[aq]s multicast routing is configured to deliver messages to the
+fabric\[cq]s multicast routing is configured to deliver messages to the
local port.
.SH SEE ALSO
.PP
-\f[B]ibv_create_qp\f[](3)
+\f[B]ibv_create_qp\f[R](3)
.SH AUTHOR
.PP
Dotan Barak
diff --git a/buildlib/pandoc-prebuilt/41bbb0bed7a781be59e8c0dcd8b7278af2ce6882 b/buildlib/pandoc-prebuilt/41bbb0bed7a781be59e8c0dcd8b7278af2ce6882
index f313d1f..9e3e197 100644
--- a/buildlib/pandoc-prebuilt/41bbb0bed7a781be59e8c0dcd8b7278af2ce6882
+++ b/buildlib/pandoc-prebuilt/41bbb0bed7a781be59e8c0dcd8b7278af2ce6882
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
.\"
-.TH "UMAD_INIT" "3" "May 21, 2007" "OpenIB" "OpenIB Programmer\[aq]s Manual"
+.TH "UMAD_INIT" "3" "May 21, 2007" "OpenIB" "OpenIB Programmer\[cq]s Manual"
.hy
.SH NAME
.PP
@@ -9,30 +9,31 @@ umad_init, umad_done \- perform library initialization and finalization
.IP
.nf
\f[C]
-#include\ <infiniband/umad.h>
+#include <infiniband/umad.h>

-int\ umad_init(void);
+int umad_init(void);

-int\ umad_done(void);
-\f[]
+int umad_done(void);
+\f[R]
.fi
.SH DESCRIPTION
.PP
-\f[B]umad_init()\f[] and \f[B]umad_done()\f[] do nothing.
+\f[B]umad_init()\f[R] and \f[B]umad_done()\f[R] do nothing.
.SH RETURN VALUE
.PP
Always 0.
.SH COMPATIBILITY
.PP
-Versions prior to release 18 of the library require \f[B]umad_init()\f[]
-to be called prior to using any other library functions.
+Versions prior to release 18 of the library require
+\f[B]umad_init()\f[R] to be called prior to using any other library
+functions.
Old versions could return a failure code of \-1 from
-\f[B]umad_init()\f[].
+\f[B]umad_init()\f[R].
.PP
For compatibility, applications should continue to call
-\f[B]umad_init()\f[], and check the return code, prior to calling other
-\f[B]umad_\f[] functions.
-If \f[B]umad_init()\f[] returns an error, then no further use of the
+\f[B]umad_init()\f[R], and check the return code, prior to calling other
+\f[B]umad_\f[R] functions.
+If \f[B]umad_init()\f[R] returns an error, then no further use of the
umad library should be attempted.
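The compatibility advice above reduces to a small pattern; a sketch:

```c
#include <stdio.h>
#include <infiniband/umad.h>

int main(void)
{
	/* Recent libibumad always returns 0 here, but old releases could
	 * fail, so the return code is still checked. */
	if (umad_init() < 0) {
		fprintf(stderr, "libibumad initialization failed\n");
		return 1;
	}
	/* ... use other umad_* functions ... */
	umad_done();
	return 0;
}
```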
.SH AUTHORS .PP diff --git a/buildlib/pandoc-prebuilt/42f038a5f87713ae1079c61615b27d0a41336faa b/buildlib/pandoc-prebuilt/42f038a5f87713ae1079c61615b27d0a41336faa index de933fe..f1762dc 100644 --- a/buildlib/pandoc-prebuilt/42f038a5f87713ae1079c61615b27d0a41336faa +++ b/buildlib/pandoc-prebuilt/42f038a5f87713ae1079c61615b27d0a41336faa @@ -1,7 +1,7 @@ .\"t -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_WR API" "3" "2018\-11\-27" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_WR API" "3" "2018\-11\-27" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -34,78 +34,78 @@ ibv_wr_set_xrc_srqn \- Attach an XRC SRQN to the last work request .IP .nf \f[C] -#include\ +#include -void\ ibv_wr_abort(struct\ ibv_qp_ex\ *qp); -int\ ibv_wr_complete(struct\ ibv_qp_ex\ *qp); -void\ ibv_wr_start(struct\ ibv_qp_ex\ *qp); +void ibv_wr_abort(struct ibv_qp_ex *qp); +int ibv_wr_complete(struct ibv_qp_ex *qp); +void ibv_wr_start(struct ibv_qp_ex *qp); -void\ ibv_wr_atomic_cmp_swp(struct\ ibv_qp_ex\ *qp,\ uint32_t\ rkey, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_addr,\ uint64_t\ compare, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ swap); -void\ ibv_wr_atomic_fetch_add(struct\ ibv_qp_ex\ *qp,\ uint32_t\ rkey, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_addr,\ uint64_t\ add); +void ibv_wr_atomic_cmp_swp(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint64_t compare, + uint64_t swap); +void ibv_wr_atomic_fetch_add(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, uint64_t add); -void\ ibv_wr_bind_mw(struct\ ibv_qp_ex\ *qp,\ struct\ ibv_mw\ *mw,\ uint32_t\ rkey, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ const\ struct\ ibv_mw_bind_info\ *bind_info); -void\ ibv_wr_local_inv(struct\ ibv_qp_ex\ *qp,\ uint32_t\ invalidate_rkey); +void ibv_wr_bind_mw(struct ibv_qp_ex *qp, struct ibv_mw *mw, uint32_t rkey, + const struct ibv_mw_bind_info *bind_info); +void ibv_wr_local_inv(struct ibv_qp_ex *qp, uint32_t invalidate_rkey); -void\ ibv_wr_rdma_read(struct\ ibv_qp_ex\ *qp,\ uint32_t\ rkey, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_addr); -void\ ibv_wr_rdma_write(struct\ ibv_qp_ex\ *qp,\ uint32_t\ rkey, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_addr); -void\ ibv_wr_rdma_write_imm(struct\ ibv_qp_ex\ *qp,\ uint32_t\ rkey, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_addr,\ __be32\ imm_data); +void ibv_wr_rdma_read(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr); +void ibv_wr_rdma_write(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr); +void ibv_wr_rdma_write_imm(struct ibv_qp_ex *qp, uint32_t rkey, + uint64_t remote_addr, __be32 imm_data); -void\ ibv_wr_send(struct\ ibv_qp_ex\ *qp); -void\ ibv_wr_send_imm(struct\ ibv_qp_ex\ *qp,\ __be32\ imm_data); -void\ ibv_wr_send_inv(struct\ ibv_qp_ex\ *qp,\ uint32_t\ invalidate_rkey); -void\ ibv_wr_send_tso(struct\ ibv_qp_ex\ *qp,\ void\ *hdr,\ uint16_t\ hdr_sz, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ mss); +void ibv_wr_send(struct ibv_qp_ex *qp); +void ibv_wr_send_imm(struct ibv_qp_ex *qp, __be32 imm_data); +void ibv_wr_send_inv(struct ibv_qp_ex *qp, uint32_t invalidate_rkey); +void ibv_wr_send_tso(struct ibv_qp_ex *qp, void *hdr, uint16_t hdr_sz, + uint16_t mss); -void\ ibv_wr_set_inline_data(struct\ ibv_qp_ex\ *qp,\ void\ *addr,\ size_t\ length); -void\ ibv_wr_set_inline_data_list(struct\ 
ibv_qp_ex\ *qp,\ size_t\ num_buf, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ const\ struct\ ibv_data_buf\ *buf_list); -void\ ibv_wr_set_sge(struct\ ibv_qp_ex\ *qp,\ uint32_t\ lkey,\ uint64_t\ addr, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ length); -void\ ibv_wr_set_sge_list(struct\ ibv_qp_ex\ *qp,\ size_t\ num_sge, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ const\ struct\ ibv_sge\ *sg_list); +void ibv_wr_set_inline_data(struct ibv_qp_ex *qp, void *addr, size_t length); +void ibv_wr_set_inline_data_list(struct ibv_qp_ex *qp, size_t num_buf, + const struct ibv_data_buf *buf_list); +void ibv_wr_set_sge(struct ibv_qp_ex *qp, uint32_t lkey, uint64_t addr, + uint32_t length); +void ibv_wr_set_sge_list(struct ibv_qp_ex *qp, size_t num_sge, + const struct ibv_sge *sg_list); -void\ ibv_wr_set_ud_addr(struct\ ibv_qp_ex\ *qp,\ struct\ ibv_ah\ *ah, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ remote_qpn,\ uint32_t\ remote_qkey); -void\ ibv_wr_set_xrc_srqn(struct\ ibv_qp_ex\ *qp,\ uint32_t\ remote_srqn); -\f[] +void ibv_wr_set_ud_addr(struct ibv_qp_ex *qp, struct ibv_ah *ah, + uint32_t remote_qpn, uint32_t remote_qkey); +void ibv_wr_set_xrc_srqn(struct ibv_qp_ex *qp, uint32_t remote_srqn); +\f[R] .fi .SH DESCRIPTION .PP The verbs work request API (ibv_wr_*) allows efficient posting of work to a send queue using function calls instead of the struct based -\f[I]ibv_post_send()\f[] scheme. +\f[I]ibv_post_send()\f[R] scheme. This approach is designed to minimize CPU branching and locking during the posting process. .PP This API is intended to be used to access additional functionality -beyond what is provided by \f[I]ibv_post_send()\f[]. +beyond what is provided by \f[I]ibv_post_send()\f[R]. .PP -WRs batches of \f[I]ibv_post_send()\f[] and this API WRs batches can +WRs batches of \f[I]ibv_post_send()\f[R] and this API WRs batches can interleave together just if they are not posted within the critical region of each other. -(A critical region in this API formed by \f[I]ibv_wr_start()\f[] and -\f[I]ibv_wr_complete()\f[]/\f[I]ibv_wr_abort()\f[]) +(A critical region in this API formed by \f[I]ibv_wr_start()\f[R] and +\f[I]ibv_wr_complete()\f[R]/\f[I]ibv_wr_abort()\f[R]) .SH USAGE .PP To use these APIs the QP must be created using ibv_create_qp_ex() which -allows setting the \f[B]IBV_QP_INIT_ATTR_SEND_OPS_FLAGS\f[] in -\f[I]comp_mask\f[]. -The \f[I]send_ops_flags\f[] should be set to the OR of the work request +allows setting the \f[B]IBV_QP_INIT_ATTR_SEND_OPS_FLAGS\f[R] in +\f[I]comp_mask\f[R]. +The \f[I]send_ops_flags\f[R] should be set to the OR of the work request types that will be posted to the QP. .PP If the QP does not support all the requested work request types then QP creation will fail. .PP Posting work requests to the QP is done within the critical region -formed by \f[I]ibv_wr_start()\f[] and -\f[I]ibv_wr_complete()\f[]/\f[I]ibv_wr_abort()\f[] (see CONCURRENCY +formed by \f[I]ibv_wr_start()\f[R] and +\f[I]ibv_wr_complete()\f[R]/\f[I]ibv_wr_abort()\f[R] (see CONCURRENCY below). .PP Each work request is created by calling a WR builder function (see the @@ -115,9 +115,9 @@ followed by allowed/required setter functions described below. The WR builder and setter combination can be called multiple times to efficiently post multiple work requests within a single critical region. .PP -Each WR builder will use the \f[I]wr_id\f[] member of \f[I]struct -ibv_qp_ex\f[] to set the value to be returned in the completion. 
-Some operations will also use the \f[I]wr_flags\f[] member to influence +Each WR builder will use the \f[I]wr_id\f[R] member of \f[I]struct +ibv_qp_ex\f[R] to set the value to be returned in the completion. +Some operations will also use the \f[I]wr_flags\f[R] member to influence operation (see Flags below). These values should be set before invoking the WR builder function. .PP @@ -125,28 +125,28 @@ For example a simple send could be formed as follows: .IP .nf \f[C] -qpx\->wr_id\ =\ 1; +qpx\->wr_id = 1; ibv_wr_send(qpx); -ibv_wr_set_sge(qpx,\ lkey,\ &data,\ sizeof(data)); -\f[] +ibv_wr_set_sge(qpx, lkey, &data, sizeof(data)); +\f[R] .fi .PP The section WORK REQUESTS describes the various WR builders and setters in details. .PP -Posting work is completed by calling \f[I]ibv_wr_complete()\f[] or -\f[I]ibv_wr_abort()\f[]. -No work is executed to the queue until \f[I]ibv_wr_complete()\f[] +Posting work is completed by calling \f[I]ibv_wr_complete()\f[R] or +\f[I]ibv_wr_abort()\f[R]. +No work is executed to the queue until \f[I]ibv_wr_complete()\f[R] returns success. -\f[I]ibv_wr_abort()\f[] will discard all work prepared since -\f[I]ibv_wr_start()\f[]. +\f[I]ibv_wr_abort()\f[R] will discard all work prepared since +\f[I]ibv_wr_start()\f[R]. .SH WORK REQUESTS .PP Many of the operations match the opcodes available for -\f[I]ibv_post_send()\f[]. +\f[I]ibv_post_send()\f[R]. Each operation has a WR builder function, a list of allowed setters, and -a flag bit to request the operation with \f[I]send_ops_flags\f[] in -\f[I]struct ibv_qp_init_attr_ex\f[] (see the EXAMPLE below). +a flag bit to request the operation with \f[I]send_ops_flags\f[R] in +\f[I]struct ibv_qp_init_attr_ex\f[R] (see the EXAMPLE below). .PP .TS tab(@); @@ -268,45 +268,35 @@ only through the same RDMA hardware. It is not atomic with writes performed by the CPU, or by other RDMA hardware in the system. .TP -.B \f[I]ibv_wr_atomic_cmp_swp()\f[] -If the remote 64 bit memory location specified by \f[I]rkey\f[] and -\f[I]remote_addr\f[] equals \f[I]compare\f[] then set it to -\f[I]swap\f[]. -.RS -.RE +.B \f[I]ibv_wr_atomic_cmp_swp()\f[R] +If the remote 64 bit memory location specified by \f[I]rkey\f[R] and +\f[I]remote_addr\f[R] equals \f[I]compare\f[R] then set it to +\f[I]swap\f[R]. .TP -.B \f[I]ibv_wr_atomic_fetch_add()\f[] -Add \f[I]add\f[] to the 64 bit memory location specified \f[I]rkey\f[] -and \f[I]remote_addr\f[]. -.RS -.RE +.B \f[I]ibv_wr_atomic_fetch_add()\f[R] +Add \f[I]add\f[R] to the 64 bit memory location specified \f[I]rkey\f[R] +and \f[I]remote_addr\f[R]. .SS Memory Windows .PP Memory window type 2 operations (See man page for ibv_alloc_mw). .TP -.B \f[I]ibv_wr_bind_mw()\f[] -Bind a MW type 2 specified by \f[B]mw\f[], set a new \f[B]rkey\f[] and -set its properties by \f[B]bind_info\f[]. -.RS -.RE +.B \f[I]ibv_wr_bind_mw()\f[R] +Bind a MW type 2 specified by \f[B]mw\f[R], set a new \f[B]rkey\f[R] and +set its properties by \f[B]bind_info\f[R]. .TP -.B \f[I]ibv_wr_local_inv()\f[] -Invalidate a MW type 2 which is associated with \f[B]rkey\f[]. -.RS -.RE +.B \f[I]ibv_wr_local_inv()\f[R] +Invalidate a MW type 2 which is associated with \f[B]rkey\f[R]. .SS RDMA .TP -.B \f[I]ibv_wr_rdma_read()\f[] -Read from the remote memory location specified \f[I]rkey\f[] and -\f[I]remote_addr\f[]. +.B \f[I]ibv_wr_rdma_read()\f[R] +Read from the remote memory location specified \f[I]rkey\f[R] and +\f[I]remote_addr\f[R]. 
The number of bytes to read, and the local location to store the data, is determined by the DATA buffers set after this call. -.RS -.RE .TP -.B \f[I]ibv_wr_rdma_write()\f[], \f[I]ibv_wr_rdma_write_imm()\f[] -Write to the remote memory location specified \f[I]rkey\f[] and -\f[I]remote_addr\f[]. +.B \f[I]ibv_wr_rdma_write()\f[R], \f[I]ibv_wr_rdma_write_imm()\f[R] +Write to the remote memory location specified \f[I]rkey\f[R] and +\f[I]remote_addr\f[R]. The number of bytes to read, and the local location to get the data, is determined by the DATA buffers set after this call. .RS @@ -316,7 +306,7 @@ IBV_WC_RECV_RDMA_WITH_IMM containing the 32 bits of immediate data. .RE .SS Message Send .TP -.B \f[I]ibv_wr_send()\f[], \f[I]ibv_wr_send_imm()\f[] +.B \f[I]ibv_wr_send()\f[R], \f[I]ibv_wr_send_imm()\f[R] Send a message. The number of bytes to send, and the local location to get the data, is determined by the DATA buffers set after this call. @@ -326,58 +316,46 @@ The _imm version causes the remote side to get a IBV_WC_RECV_RDMA_WITH_IMM containing the 32 bits of immediate data. .RE .TP -.B \f[I]ibv_wr_send_inv()\f[] -The data transfer is the same as for \f[I]ibv_wr_send()\f[], however the -remote side will invalidate the MR specified by \f[I]invalidate_rkey\f[] -before delivering a completion. -.RS -.RE +.B \f[I]ibv_wr_send_inv()\f[R] +The data transfer is the same as for \f[I]ibv_wr_send()\f[R], however +the remote side will invalidate the MR specified by +\f[I]invalidate_rkey\f[R] before delivering a completion. .TP -.B \f[I]ibv_wr_send_tso()\f[] +.B \f[I]ibv_wr_send_tso()\f[R] Produce multiple SEND messages using TCP Segmentation Offload. The SGE points to a TCP Stream buffer which will be segmented into MSS size SENDs. The hdr includes the entire network headers up to and including the TCP header and is prefixed before each segment. -.RS -.RE .SS QP Specific setters .PP Certain QP types require each post to be accompanied by additional setters, these setters are mandatory for any operation listing a QP setter in the above table. .TP -.B \f[I]UD\f[] QPs -\f[I]ibv_wr_set_ud_addr()\f[] must be called to set the destination +.B \f[I]UD\f[R] QPs +\f[I]ibv_wr_set_ud_addr()\f[R] must be called to set the destination address of the work. -.RS -.RE .TP -.B \f[I]XRC_SEND\f[] QPs -\f[I]ibv_wr_set_xrc_srqn()\f[] must be called to set the destination +.B \f[I]XRC_SEND\f[R] QPs +\f[I]ibv_wr_set_xrc_srqn()\f[R] must be called to set the destination SRQN field. -.RS -.RE .SS DATA transfer setters .PP For work that requires to transfer data one of the following setters should be called once after the WR builder: .TP -.B \f[I]ibv_wr_set_sge()\f[] +.B \f[I]ibv_wr_set_sge()\f[R] Transfer data to/from a single buffer given by the lkey, addr and length. -This is equivalent to \f[I]ibv_wr_set_sge_list()\f[] with a single +This is equivalent to \f[I]ibv_wr_set_sge_list()\f[R] with a single element. -.RS -.RE .TP -.B \f[I]ibv_wr_set_sge_list()\f[] +.B \f[I]ibv_wr_set_sge_list()\f[R] Transfer data to/from a list of buffers, logically concatenated together. Each buffer is specified by an element in an array of \f[I]struct -ibv_sge\f[]. -.RS -.RE +ibv_sge\f[R]. .PP Inline setters will copy the send data during the setter and allows the caller to immediately re\-use the buffer. @@ -386,57 +364,45 @@ Generally this copy is done in a way that optimizes SEND latency and is suitable for small messages. The provider will limit the amount of data it can support in a single operation. 
-This limit is requested in the \f[I]max_inline_data\f[] member of -\f[I]struct ibv_qp_init_attr\f[]. +This limit is requested in the \f[I]max_inline_data\f[R] member of +\f[I]struct ibv_qp_init_attr\f[R]. Valid only for SEND and RDMA_WRITE. .TP -.B \f[I]ibv_wr_set_inline_data()\f[] +.B \f[I]ibv_wr_set_inline_data()\f[R] Copy send data from a single buffer given by the addr and length. -This is equivalent to \f[I]ibv_wr_set_inline_data_list()\f[] with a +This is equivalent to \f[I]ibv_wr_set_inline_data_list()\f[R] with a single element. -.RS -.RE .TP -.B \f[I]ibv_wr_set_inline_data_list()\f[] +.B \f[I]ibv_wr_set_inline_data_list()\f[R] Copy send data from a list of buffers, logically concatenated together. Each buffer is specified by an element in an array of \f[I]struct -ibv_inl_data\f[]. -.RS -.RE +ibv_inl_data\f[R]. .SS Flags .PP -A bit mask of flags may be specified in \f[I]wr_flags\f[] to control the -behavior of the work request. +A bit mask of flags may be specified in \f[I]wr_flags\f[R] to control +the behavior of the work request. .TP -.B \f[B]IBV_SEND_FENCE\f[] +.B \f[B]IBV_SEND_FENCE\f[R] Do not start this work request until prior work has completed. -.RS -.RE .TP -.B \f[B]IBV_SEND_IP_CSUM\f[] +.B \f[B]IBV_SEND_IP_CSUM\f[R] Offload the IPv4 and TCP/UDP checksum calculation -.RS -.RE .TP -.B \f[B]IBV_SEND_SIGNALED\f[] +.B \f[B]IBV_SEND_SIGNALED\f[R] A completion will be generated in the completion queue for the operation. -.RS -.RE .TP -.B \f[B]IBV_SEND_SOLICTED\f[] +.B \f[B]IBV_SEND_SOLICTED\f[R] Set the solicted bit in the RDMA packet. This informs the other side to generate a completion event upon receiving the RDMA operation. -.RS -.RE .SH CONCURRENCY .PP -The provider will provide locking to ensure that \f[I]ibv_wr_start()\f[] -and \f[I]ibv_wr_complete()/abort()\f[] form a per\-QP critical section -where no other threads can enter. +The provider will provide locking to ensure that +\f[I]ibv_wr_start()\f[R] and \f[I]ibv_wr_complete()/abort()\f[R] form a +per\-QP critical section where no other threads can enter. .PP -If an \f[I]ibv_td\f[] is provided during QP creation then no locking +If an \f[I]ibv_td\f[R] is provided during QP creation then no locking will be performed and it is up to the caller to ensure that only one thread can be within the critical region at a time. .SH RETURN VALUE @@ -446,41 +412,41 @@ The individual APIs do not return a failure indication to avoid branching. .PP If a failure is detected during operation, for instance due to an -invalid argument, then \f[I]ibv_wr_complete()\f[] will return failure +invalid argument, then \f[I]ibv_wr_complete()\f[R] will return failure and the entire posting will be aborted. 
.SH EXAMPLE .IP .nf \f[C] -/*\ create\ RC\ QP\ type\ and\ specify\ the\ required\ send\ opcodes\ */ -qp_init_attr_ex.qp_type\ =\ IBV_QPT_RC; -qp_init_attr_ex.comp_mask\ |=\ IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; -qp_init_attr_ex.send_ops_flags\ |=\ IBV_QP_EX_WITH_RDMA_WRITE; -qp_init_attr_ex.send_ops_flags\ |=\ IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; +/* create RC QP type and specify the required send opcodes */ +qp_init_attr_ex.qp_type = IBV_QPT_RC; +qp_init_attr_ex.comp_mask |= IBV_QP_INIT_ATTR_SEND_OPS_FLAGS; +qp_init_attr_ex.send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE; +qp_init_attr_ex.send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM; -ibv_qp\ *qp\ =\ ibv_create_qp_ex(ctx,\ qp_init_attr_ex); -ibv_qp_ex\ *qpx\ =\ ibv_qp_to_qp_ex(qp); +ibv_qp *qp = ibv_create_qp_ex(ctx, qp_init_attr_ex); +ibv_qp_ex *qpx = ibv_qp_to_qp_ex(qp); ibv_wr_start(qpx); -/*\ create\ 1st\ WRITE\ WR\ entry\ */ -qpx\->wr_id\ =\ my_wr_id_1; -ibv_wr_rdma_write(qpx,\ rkey,\ remote_addr_1); -ibv_wr_set_sge(qpx,\ lkey,\ local_addr_1,\ length_1); +/* create 1st WRITE WR entry */ +qpx\->wr_id = my_wr_id_1; +ibv_wr_rdma_write(qpx, rkey, remote_addr_1); +ibv_wr_set_sge(qpx, lkey, local_addr_1, length_1); -/*\ create\ 2nd\ WRITE_WITH_IMM\ WR\ entry\ */ -qpx\->wr_id\ =\ my_wr_id_2; -qpx\->wr_flags\ =\ IBV_SEND_SIGNALED; -ibv_wr_rdma_write_imm(qpx,\ rkey,\ remote_addr_2,\ htonl(0x1234)); -ibv_set_wr_sge(qpx,\ lkey,\ local_addr_2,\ length_2); +/* create 2nd WRITE_WITH_IMM WR entry */ +qpx\->wr_id = my_wr_id_2; +qpx\->wr_flags = IBV_SEND_SIGNALED; +ibv_wr_rdma_write_imm(qpx, rkey, remote_addr_2, htonl(0x1234)); +ibv_set_wr_sge(qpx, lkey, local_addr_2, length_2); -/*\ Begin\ processing\ WRs\ */ -ret\ =\ ibv_wr_complete(qpx); -\f[] +/* Begin processing WRs */ +ret = ibv_wr_complete(qpx); +\f[R] .fi .SH SEE ALSO .PP -\f[B]ibv_post_send\f[](3), \f[B]ibv_create_qp_ex(3)\f[]. +\f[B]ibv_post_send\f[R](3), \f[B]ibv_create_qp_ex(3)\f[R]. .SH AUTHOR .PP Jason Gunthorpe Guy Levi diff --git a/buildlib/pandoc-prebuilt/4a2fda3e7e3b15e84396f81e6aae0bde38dcfb98 b/buildlib/pandoc-prebuilt/4a2fda3e7e3b15e84396f81e6aae0bde38dcfb98 index 2f20af5..319c1f3 100644 --- a/buildlib/pandoc-prebuilt/4a2fda3e7e3b15e84396f81e6aae0bde38dcfb98 +++ b/buildlib/pandoc-prebuilt/4a2fda3e7e3b15e84396f81e6aae0bde38dcfb98 @@ -1,34 +1,34 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_QUERY_GID" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_QUERY_GID" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP -ibv_query_gid \- query an InfiniBand port\[aq]s GID table +ibv_query_gid \- query an InfiniBand port\[cq]s GID table .SH SYNOPSIS .IP .nf \f[C] -#include\ +#include -int\ ibv_query_gid(struct\ ibv_context\ *context, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint8_t\ port_num, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ index, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ union\ ibv_gid\ *gid); -\f[] +int ibv_query_gid(struct ibv_context *context, + uint8_t port_num, + int index, + union ibv_gid *gid); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_query_gid()\f[] returns the GID value in entry \f[I]index\f[] -of port \f[I]port_num\f[] for device context \f[I]context\f[] through -the pointer \f[I]gid\f[]. +\f[B]ibv_query_gid()\f[R] returns the GID value in entry \f[I]index\f[R] +of port \f[I]port_num\f[R] for device context \f[I]context\f[R] through +the pointer \f[I]gid\f[R]. .SH RETURN VALUE .PP -\f[B]ibv_query_gid()\f[] returns 0 on success, and \-1 on error. 
+\f[B]ibv_query_gid()\f[R] returns 0 on success, and \-1 on error. .SH SEE ALSO .PP -\f[B]ibv_open_device\f[](3), \f[B]ibv_query_device\f[](3), -\f[B]ibv_query_pkey\f[](3), \f[B]ibv_query_port\f[](3) +\f[B]ibv_open_device\f[R](3), \f[B]ibv_query_device\f[R](3), +\f[B]ibv_query_pkey\f[R](3), \f[B]ibv_query_port\f[R](3) .SH AUTHOR .PP Dotan Barak diff --git a/buildlib/pandoc-prebuilt/4aefe6ec699efe9cbeab3f78569c8ed5da970a2e b/buildlib/pandoc-prebuilt/4aefe6ec699efe9cbeab3f78569c8ed5da970a2e index f5f2c77..9a68b9a 100644 --- a/buildlib/pandoc-prebuilt/4aefe6ec699efe9cbeab3f78569c8ed5da970a2e +++ b/buildlib/pandoc-prebuilt/4aefe6ec699efe9cbeab3f78569c8ed5da970a2e @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_create_flow_action_packet_reformat" "3" "" "" "" .hy @@ -10,15 +10,15 @@ for mlx5 provider .IP .nf \f[C] -#include\ +#include -struct\ ibv_flow_action\ * -mlx5dv_create_flow_action_packet_reformat(struct\ ibv_context\ *ctx, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ data_sz, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *data, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ mlx5dv_flow_action_packet_reformat_type\ reformat_type, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ mlx5dv_flow_table_type\ ft_type) -\f[] +struct ibv_flow_action * +mlx5dv_create_flow_action_packet_reformat(struct ibv_context *ctx, + size_t data_sz, + void *data, + enum mlx5dv_flow_action_packet_reformat_type reformat_type, + enum mlx5dv_flow_table_type ft_type) +\f[R] .fi .SH DESCRIPTION .PP @@ -26,42 +26,36 @@ Create a packet reformat flow steering action. It allows adding/removing packet headers. .SH ARGUMENTS .TP -.B \f[I]ctx\f[] +.B \f[I]ctx\f[R] .IP .nf \f[C] -RDMA\ device\ context\ to\ create\ the\ action\ on. -\f[] +RDMA device context to create the action on. +\f[R] .fi -.RS -.RE .TP -.B \f[I]data_sz\f[] +.B \f[I]data_sz\f[R] .IP .nf \f[C] -The\ size\ of\ *data*\ buffer. -\f[] +The size of *data* buffer. +\f[R] .fi -.RS -.RE .TP -.B \f[I]data\f[] +.B \f[I]data\f[R] .IP .nf \f[C] -A\ buffer\ which\ contains\ headers\ in\ case\ the\ actions\ requires\ them. -\f[] +A buffer which contains headers in case the actions requires them. +\f[R] .fi -.RS -.RE .TP -.B \f[I]reformat_type\f[] +.B \f[I]reformat_type\f[R] .IP .nf \f[C] -The\ reformat\ type\ to\ be\ create.\ Use\ enum\ mlx5dv_flow_action_packet_reformat_type. -\f[] +The reformat type to be create. Use enum mlx5dv_flow_action_packet_reformat_type. +\f[R] .fi .RS .PP @@ -69,33 +63,33 @@ MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TUNNEL_TO_L2: Decap a generic L2 tunneled packet up to inner L2. .PP MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L2_TUNNEL: Generic encap, -\f[I]data\f[] should contain the encapsulating headers. +\f[I]data\f[R] should contain the encapsulating headers. .PP MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L3_TUNNEL_TO_L2: Will do decap where the inner packet starts from L3. -\f[I]data\f[] should be MAC or MAC + vlan (14 or 18 bytes) to be +\f[I]data\f[R] should be MAC or MAC + vlan (14 or 18 bytes) to be appended to the packet after the decap action. .PP MLX5DV_FLOW_ACTION_PACKET_REFORMAT_TYPE_L2_TO_L3_TUNNEL: Will do encap where is L2 of the original packet will not be included. -\f[I]data\f[] should be the encapsulating header. +\f[I]data\f[R] should be the encapsulating header. 
.RE
.TP
-.B \f[I]ft_type\f[]
+.B \f[I]ft_type\f[R]
.IP
.nf
\f[C]
-It\ defines\ the\ flow\ table\ type\ to\ which\ the\ packet\ reformat\ action
-\f[]
+It defines the flow table type to which the packet reformat action
+\f[R]
.fi
.RS
will be attached.
.RE
.SH RETURN VALUE
.PP
-Upon success \f[I]mlx5dv_create_flow_action_packet_reformat\f[] will
-return a new \f[I]struct ibv_flow_action\f[] object, on error NULL will
+Upon success \f[I]mlx5dv_create_flow_action_packet_reformat\f[R] will
+return a new \f[I]struct ibv_flow_action\f[R] object, on error NULL will
be returned and errno will be set.
.SH SEE ALSO
.PP
-\f[I]ibv_create_flow(3)\f[], \f[I]ibv_create_flow_action(3)\f[]
+\f[I]ibv_create_flow(3)\f[R], \f[I]ibv_create_flow_action(3)\f[R]
diff --git a/buildlib/pandoc-prebuilt/50f6e71e397cf5410c315ea80d7bd3a967080647 b/buildlib/pandoc-prebuilt/50f6e71e397cf5410c315ea80d7bd3a967080647
new file mode 100644
index 0000000..f7ca790
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/50f6e71e397cf5410c315ea80d7bd3a967080647
@@ -0,0 +1,31 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "IBV_GET_DEVICE_INDEX" "3" "2020\-04\-22" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_get_device_index \- get an RDMA device index
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <infiniband/verbs.h>

+int ibv_get_device_index(struct ibv_device *device);
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_get_device_index()\f[R] returns the stable IB device index as
+it is assigned by the kernel.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_get_device_index()\f[R] returns an index, or \-1 if the kernel
+doesn\[cq]t support device indexes.
+.SH SEE ALSO
+.PP
+\f[B]ibv_get_device_name\f[R](3), \f[B]ibv_get_device_guid\f[R](3),
+\f[B]ibv_get_device_list\f[R](3), \f[B]ibv_open_device\f[R](3)
+.SH AUTHOR
+.PP
+Leon Romanovsky
diff --git a/buildlib/pandoc-prebuilt/5149808f7f084016a7e6ee8afa1b217bcd78f25d b/buildlib/pandoc-prebuilt/5149808f7f084016a7e6ee8afa1b217bcd78f25d
new file mode 100644
index 0000000..ad35642
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/5149808f7f084016a7e6ee8afa1b217bcd78f25d
@@ -0,0 +1,35 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "IBV_GET_SRQ_NUM" "3" "2013\-06\-26" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_get_srq_num \- return srq number associated with the given shared
+receive queue (SRQ)
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <infiniband/verbs.h>

+int ibv_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num);
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_get_srq_num()\f[R] returns the srq number associated with the
+given XRC shared receive queue. The argument \f[I]srq\f[R] is an ibv_srq
+struct, as defined in <infiniband/verbs.h>.
+\f[I]srq_num\f[R] is an output parameter that holds the returned srq
+number.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_get_srq_num()\f[R] returns 0 on success, or the value of errno
+on failure (which indicates the failure reason).
+.SH SEE ALSO +.PP +\f[B]ibv_alloc_pd\f[R](3), \f[B]ibv_create_srq_ex\f[R](3), +\f[B]ibv_modify_srq\f[R](3) +.SH AUTHOR +.PP +Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/515c4ddf52e644f7f347c40deac43d5ddb5bb19d b/buildlib/pandoc-prebuilt/515c4ddf52e644f7f347c40deac43d5ddb5bb19d index b857381..9969d9b 100644 --- a/buildlib/pandoc-prebuilt/515c4ddf52e644f7f347c40deac43d5ddb5bb19d +++ b/buildlib/pandoc-prebuilt/515c4ddf52e644f7f347c40deac43d5ddb5bb19d @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "EFADV_CREATE_DRIVER_QP" "3" "2019\-01\-23" "efa" "EFA Direct Verbs Manual" .hy @@ -8,16 +8,16 @@ efadv_create_driver_qp \- Create EFA specific Queue Pair # SYNOPSIS .IP .nf \f[C] -#include\ +#include -struct\ ibv_qp\ *efadv_create_driver_qp(struct\ ibv_pd\ *ibvpd, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_qp_init_attr\ *attr, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ driver_qp_type); -\f[] +struct ibv_qp *efadv_create_driver_qp(struct ibv_pd *ibvpd, + struct ibv_qp_init_attr *attr, + uint32_t driver_qp_type); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]efadv_create_driver_qp()\f[] Create device\-specific Queue Pairs. +\f[B]efadv_create_driver_qp()\f[R] Create device\-specific Queue Pairs. .PP Scalable Reliable Datagram (SRD) transport provides reliable out\-of\-order delivery, transparently utilizing multiple network paths @@ -26,7 +26,7 @@ Its interface is similar to UD, in particular it supports message size up to MTU, with error handling extended to support reliable communication. .TP -.B \f[I]driver_qp_type\f[] +.B \f[I]driver_qp_type\f[R] The type of QP to be created: .RS .PP @@ -38,7 +38,7 @@ efadv_create_driver_qp() returns a pointer to the created QP, or NULL if the request fails. .SH SEE ALSO .PP -\f[B]efadv\f[](7) +\f[B]efadv\f[R](7) .SH AUTHORS .PP Gal Pressman diff --git a/buildlib/pandoc-prebuilt/532c7a2d93d5555e2b0b1403669a2be45ec851d4 b/buildlib/pandoc-prebuilt/532c7a2d93d5555e2b0b1403669a2be45ec851d4 new file mode 100644 index 0000000..e1d28c2 --- /dev/null +++ b/buildlib/pandoc-prebuilt/532c7a2d93d5555e2b0b1403669a2be45ec851d4 @@ -0,0 +1,86 @@ +.\" Automatically generated by Pandoc 2.5 +.\" +.TH "IBV_GET_DEVICE_LIST" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual" +.hy +.SH NAME +.PP +ibv_get_device_list, ibv_free_device_list \- get and release list of +available RDMA devices +.SH SYNOPSIS +.IP +.nf +\f[C] +#include + +struct ibv_device **ibv_get_device_list(int *num_devices); + +void ibv_free_device_list(struct ibv_device **list); +\f[R] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_get_device_list()\f[R] returns a NULL\-terminated array of RDMA +devices currently available. +The argument \f[I]num_devices\f[R] is optional; if not NULL, it is set +to the number of devices returned in the array. +.PP +\f[B]ibv_free_device_list()\f[R] frees the array of devices +\f[I]list\f[R] returned by \f[B]ibv_get_device_list()\f[R]. +.SH RETURN VALUE +.PP +\f[B]ibv_get_device_list()\f[R] returns the array of available RDMA +devices, or sets \f[I]errno\f[R] and returns NULL if the request fails. +If no devices are found then \f[I]num_devices\f[R] is set to 0, and +non\-NULL is returned. +.PP +\f[B]ibv_free_device_list()\f[R] returns no value. +.SH ERRORS +.TP +.B \f[B]EPERM\f[R] +Permission denied. +.TP +.B \f[B]ENOSYS\f[R] +No kernel support for RDMA. +.TP +.B \f[B]ENOMEM\f[R] +Insufficient memory to complete the operation. 
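Tying together the ibv_get_device_list(3) and ibv_get_device_index(3) pages added above, a minimal enumeration sketch (dynamically linked, per the STATIC LINKING note that follows):

```c
#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
	int num;
	struct ibv_device **list = ibv_get_device_list(&num);

	if (!list)
		return 1; /* errno holds the reason, e.g. ENOSYS */
	for (int i = 0; i < num; i++)
		printf("%s: index %d\n", ibv_get_device_name(list[i]),
		       ibv_get_device_index(list[i]));
	ibv_free_device_list(list);
	return 0;
}
```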
+.SH NOTES +.PP +Client code should open all the devices it intends to use with +\f[B]ibv_open_device()\f[R] before calling +\f[B]ibv_free_device_list()\f[R]. +Once it frees the array with \f[B]ibv_free_device_list()\f[R], it will +be able to use only the open devices; pointers to unopened devices will +no longer be valid. +.PP +Setting the environment variable \f[B]IBV_SHOW_WARNINGS\f[R] will cause +warnings to be emitted to stderr if a kernel verbs device is discovered, +but no corresponding userspace driver can be found for it. +.SH STATIC LINKING +.PP +If \f[B]libibverbs\f[R] is statically linked to the application then all +provider drivers must also be statically linked. +The library will not load dynamic providers when static linking is used. +.PP +To link the providers set the \f[B]RDMA_STATIC_PROVIDERS\f[R] define to +the comma separated list of desired providers when compiling the +application. +The special keyword `all' will statically link all supported +\f[B]libibverbs\f[R] providers. +.PP +This is intended to be used along with \f[B]pkg\-config(1)\f[R] to setup +the proper flags for \f[B]libibverbs\f[R] linking. +.PP +If this is not done then \f[B]ibv_get_device_list\f[R] will always +return an empty list. +.PP +Using only dynamic linking for \f[B]libibverbs\f[R] applications is +strongly recommended. +.SH SEE ALSO +.PP +\f[B]ibv_fork_init\f[R](3), \f[B]ibv_get_device_guid\f[R](3), +\f[B]ibv_get_device_name\f[R](3), \f[B]ibv_get_device_index\f[R](3), +\f[B]ibv_open_device\f[R](3) +.SH AUTHOR +.PP +Dotan Barak diff --git a/buildlib/pandoc-prebuilt/561ea56de897d453a681e70cd2be7d0c0335e784 b/buildlib/pandoc-prebuilt/561ea56de897d453a681e70cd2be7d0c0335e784 index 95140f7..92886c3 100644 --- a/buildlib/pandoc-prebuilt/561ea56de897d453a681e70cd2be7d0c0335e784 +++ b/buildlib/pandoc-prebuilt/561ea56de897d453a681e70cd2be7d0c0335e784 @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_create_flow_action_modify_header" "3" "" "" "" .hy @@ -10,14 +10,14 @@ mlx5 provider .IP .nf \f[C] -#include\ +#include -struct\ ibv_flow_action\ * -mlx5dv_create_flow_action_modify_header(struct\ ibv_context\ *ctx, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ actions_sz, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ actions[], -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ mlx5dv_flow_table_type\ ft_type) -\f[] +struct ibv_flow_action * +mlx5dv_create_flow_action_modify_header(struct ibv_context *ctx, + size_t actions_sz, + uint64_t actions[], + enum mlx5dv_flow_table_type ft_type) +\f[R] .fi .SH DESCRIPTION .PP @@ -25,24 +25,17 @@ Create a modify header flow steering action, it allows mutating a packet header. .SH ARGUMENTS .TP -.B \f[I]ctx\f[] +.B \f[I]ctx\f[R] RDMA device context to create the action on. -.RS -.RE .TP -.B \f[I]actions_sz\f[] -The size of \f[I]actions\f[] buffer in bytes. -.RS -.RE +.B \f[I]actions_sz\f[R] +The size of \f[I]actions\f[R] buffer in bytes. .TP -.B \f[I]actions\f[] +.B \f[I]actions\f[R] A buffer which contains modify actions provided in device spec format -(i.e. -be64). -.RS -.RE +(i.e.\ be64). .TP -.B \f[I]ft_type\f[] +.B \f[I]ft_type\f[R] Defines the flow table type to which the modify header action will be attached. 
.RS @@ -53,9 +46,9 @@ MLX5DV_FLOW_TABLE_TYPE_NIC_TX: TX FLOW TABLE .RE .SH RETURN VALUE .PP -Upon success \f[I]mlx5dv_create_flow_action_modify_header\f[] will -return a new \f[I]struct ibv_flow_action\f[] object, on error NULL will +Upon success \f[I]mlx5dv_create_flow_action_modify_header\f[R] will +return a new \f[I]struct ibv_flow_action\f[R] object, on error NULL will be returned and errno will be set. .SH SEE ALSO .PP -\f[I]ibv_create_flow(3)\f[], \f[I]ibv_create_flow_action(3)\f[] +\f[I]ibv_create_flow(3)\f[R], \f[I]ibv_create_flow_action(3)\f[R] diff --git a/buildlib/pandoc-prebuilt/58748d44e47709c08982c6349ef8fc8891398ef3 b/buildlib/pandoc-prebuilt/58748d44e47709c08982c6349ef8fc8891398ef3 index dbc26ee..9293b06 100644 --- a/buildlib/pandoc-prebuilt/58748d44e47709c08982c6349ef8fc8891398ef3 +++ b/buildlib/pandoc-prebuilt/58748d44e47709c08982c6349ef8fc8891398ef3 @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_devx_create_cmd_comp, mlx5dv_devx_destroy_cmd_comp, get_async" "3" "" "" "" .hy @@ -15,26 +15,26 @@ completion. .IP .nf \f[C] -#include\ +#include -struct\ mlx5dv_devx_cmd_comp\ { -\ \ \ \ int\ fd; +struct mlx5dv_devx_cmd_comp { + int fd; }; -struct\ mlx5dv_devx_cmd_comp\ * -mlx5dv_devx_create_cmd_comp(struct\ ibv_context\ *context) +struct mlx5dv_devx_cmd_comp * +mlx5dv_devx_create_cmd_comp(struct ibv_context *context) -void\ mlx5dv_devx_destroy_cmd_comp(struct\ mlx5dv_devx_cmd_comp\ *cmd_comp) +void mlx5dv_devx_destroy_cmd_comp(struct mlx5dv_devx_cmd_comp *cmd_comp) -struct\ mlx5dv_devx_async_cmd_hdr\ { -\ \ \ \ uint64_t\ \ \ \ wr_id; -\ \ \ \ uint8_t\ \ \ \ \ out_data[]; +struct mlx5dv_devx_async_cmd_hdr { + uint64_t wr_id; + uint8_t out_data[]; }; -int\ mlx5dv_devx_get_async_cmd_comp(struct\ mlx5dv_devx_cmd_comp\ *cmd_comp, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_devx_async_cmd_hdr\ *cmd_resp, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ cmd_resp_len) -\f[] +int mlx5dv_devx_get_async_cmd_comp(struct mlx5dv_devx_cmd_comp *cmd_comp, + struct mlx5dv_devx_async_cmd_hdr *cmd_resp, + size_t cmd_resp_len) +\f[R] .fi .SH DESCRIPTION .PP @@ -47,48 +47,42 @@ This lets an application run asynchronously without blocking and once the response is ready read it from this object. .PP The response can be read by the mlx5dv_devx_get_async_cmd_comp() API, -upon response the \f[I]wr_id\f[] that was supplied upon the asynchronous -command is returned and the \f[I]out_data\f[] includes the data itself. +upon response the \f[I]wr_id\f[R] that was supplied upon the +asynchronous command is returned and the \f[I]out_data\f[R] includes the +data itself. The application must supply a large enough buffer to match any command -that was issued on the \f[I]cmd_comp\f[], its size is given by the input -\f[I]cmd_resp_len\f[] parameter. +that was issued on the \f[I]cmd_comp\f[R], its size is given by the +input \f[I]cmd_resp_len\f[R] parameter. .SH ARGUMENTS .TP -.B \f[I]context\f[] +.B \f[I]context\f[R] .IP .nf \f[C] -RDMA\ device\ context\ to\ create\ the\ action\ on. -\f[] +RDMA device context to create the action on. +\f[R] .fi -.RS -.RE .TP -.B \f[I]cmd_comp\f[] +.B \f[I]cmd_comp\f[R] The command completion object. -.RS -.RE .TP -.B \f[I]cmd_resp\f[] +.B \f[I]cmd_resp\f[R] The output data from the asynchronous command. -.RS -.RE .TP -.B \f[I]cmd_resp_len\f[] +.B \f[I]cmd_resp_len\f[R] The output buffer size to hold the response. 
-.RS -.RE .SH RETURN VALUE .PP -Upon success \f[I]mlx5dv_devx_create_cmd_comp\f[] will return a new -\f[I]struct mlx5dv_devx_cmd_comp\f[] object, on error NULL will be +Upon success \f[I]mlx5dv_devx_create_cmd_comp\f[R] will return a new +\f[I]struct mlx5dv_devx_cmd_comp\f[R] object, on error NULL will be returned and errno will be set. .PP -Upon success \f[I]mlx5dv_devx_get_async_cmd_comp\f[] will return 0, +Upon success \f[I]mlx5dv_devx_get_async_cmd_comp\f[R] will return 0, otherwise errno will be returned. .SH SEE ALSO .PP -\f[I]mlx5dv_open_device(3)\f[], \f[I]mlx5dv_devx_obj_create(3)\f[] -.SH AUTHOR +\f[I]mlx5dv_open_device(3)\f[R], \f[I]mlx5dv_devx_obj_create(3)\f[R] +.PP +#AUTHOR .PP Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/624de381c4dd90a5061dfb899e33d1aff4f8af1c b/buildlib/pandoc-prebuilt/624de381c4dd90a5061dfb899e33d1aff4f8af1c index 174184c..43c1687 100644 --- a/buildlib/pandoc-prebuilt/624de381c4dd90a5061dfb899e33d1aff4f8af1c +++ b/buildlib/pandoc-prebuilt/624de381c4dd90a5061dfb899e33d1aff4f8af1c @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_is_supported" "3" "" "" "" .hy @@ -10,10 +10,10 @@ mlx5 provider .IP .nf \f[C] -#include\ +#include -bool\ mlx5dv_is_supported(struct\ ibv_device\ *device); -\f[] +bool mlx5dv_is_supported(struct ibv_device *device); +\f[R] .fi .SH DESCRIPTION .PP @@ -21,16 +21,14 @@ mlx5dv functions may be called only if this function returns true for the RDMA device. .SH ARGUMENTS .TP -.B \f[I]device\f[] +.B \f[I]device\f[R] RDMA device to check. -.RS -.RE .SH RETURN VALUE .PP Returns true if device is implemented by mlx5 provider. .SH SEE ALSO .PP -\f[I]mlx5dv(7)\f[] +\f[I]mlx5dv(7)\f[R] .SH AUTHOR .PP Artemy Kovalyov diff --git a/buildlib/pandoc-prebuilt/6465ebc6a2cda4fc52fa1badb65b07e7effaca6b b/buildlib/pandoc-prebuilt/6465ebc6a2cda4fc52fa1badb65b07e7effaca6b index 07353f7..8f993f1 100644 --- a/buildlib/pandoc-prebuilt/6465ebc6a2cda4fc52fa1badb65b07e7effaca6b +++ b/buildlib/pandoc-prebuilt/6465ebc6a2cda4fc52fa1badb65b07e7effaca6b @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_create_mkey / mlx5dv_destroy_mkey" "3" "" "" "" .hy @@ -11,24 +11,24 @@ mlx5dv_create_mkey \- Destroys an indirect mkey .IP .nf \f[C] -#include\ +#include -struct\ mlx5dv_mkey_init_attr\ { -\ \ \ \ struct\ ibv_pd\ \ \ *pd; -\ \ \ \ uint32_t\ \ \ \ create_flags; -\ \ \ \ uint16_t\ \ \ \ max_entries; +struct mlx5dv_mkey_init_attr { + struct ibv_pd *pd; + uint32_t create_flags; + uint16_t max_entries; }; -struct\ mlx5dv_mkey\ { -\ \ \ \ uint32_t\ \ \ \ lkey; -\ \ \ \ uint32_t\ \ \ \ rkey; +struct mlx5dv_mkey { + uint32_t lkey; + uint32_t rkey; }; -struct\ mlx5dv_mkey\ * -mlx5dv_create_mkey(struct\ mlx5dv_mkey_init_attr\ *mkey_init_attr); +struct mlx5dv_mkey * +mlx5dv_create_mkey(struct mlx5dv_mkey_init_attr *mkey_init_attr); -int\ mlx5dv_destroy_mkey(struct\ mlx5dv_mkey\ *mkey); -\f[] +int mlx5dv_destroy_mkey(struct mlx5dv_mkey *mkey); +\f[R] .fi .SH DESCRIPTION .PP @@ -37,42 +37,38 @@ Create / destroy an indirect mkey. Create an indirect mkey to enable application uses its specific device functionality. .SH ARGUMENTS -.SS mkey_init_attr +.PP +##mkey_init_attr## .TP -.B \f[I]pd\f[] +.B \f[I]pd\f[R] ibv protection domain. -.RS -.RE .TP -.B \f[I]create_flags\f[] +.B \f[I]create_flags\f[R] MLX5DV_MKEY_INIT_ATTR_FLAGS_INDIRECT: Indirect mkey is being created. 
-.RS -.RE .TP -.B \f[I]max_entries\f[] +.B \f[I]max_entries\f[R] Requested max number of pointed entries by this indirect mkey. -The function will update the \f[I]mkey_init_attr\->max_entries\f[] with +The function will update the \f[I]mkey_init_attr\->max_entries\f[R] with the actual mkey value that was created; it will be greater than or equal to the value requested. -.RS -.RE .SH RETURN VALUE .PP -Upon success \f[I]mlx5dv_create_mkey\f[] will return a new \f[I]struct -mlx5dv_mkey\f[] on error NULL will be returned and errno will be set. +Upon success \f[I]mlx5dv_create_mkey\f[R] will return a new \f[I]struct +mlx5dv_mkey\f[R] on error NULL will be returned and errno will be set. .PP Upon success destroy 0 is returned or the value of errno on a failure. .SH Notes .PP To let this functionality works a DEVX context should be opened by using -\f[I]mlx5dv_open_device\f[]. +\f[I]mlx5dv_open_device\f[R]. .PP -The created indirect mkey can`t work with scatter to CQE feature, -consider \f[I]mlx5dv_create_qp()\f[] with +The created indirect mkey can\[ga]t work with scatter to CQE feature, +consider \f[I]mlx5dv_create_qp()\f[R] with MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE for small messages. .SH SEE ALSO .PP -\f[B]mlx5dv_open_device\f[](3), \f[B]mlx5dv_create_qp\f[](3) -.SH AUTHOR +\f[B]mlx5dv_open_device\f[R](3), \f[B]mlx5dv_create_qp\f[R](3) +.PP +#AUTHOR .PP Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/6962baf519ab44a4635fd03f70c3033b30b7467e b/buildlib/pandoc-prebuilt/6962baf519ab44a4635fd03f70c3033b30b7467e index 6737fcc..a9c3c71 100644 --- a/buildlib/pandoc-prebuilt/6962baf519ab44a4635fd03f70c3033b30b7467e +++ b/buildlib/pandoc-prebuilt/6962baf519ab44a4635fd03f70c3033b30b7467e @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx4dv_set_context_attr" "3" "" "" "" .hy @@ -9,12 +9,12 @@ mlx4dv_set_context_attr \- Set context attributes .IP .nf \f[C] -#include\ +#include -int\ mlx4dv_set_context_attr(struct\ ibv_context\ *context, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ mlx4dv_set_ctx_attr_type\ attr_type, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *attr); -\f[] +int mlx4dv_set_context_attr(struct ibv_context *context, + enum mlx4dv_set_ctx_attr_type attr_type, + void *attr); +\f[R] .fi .SH DESCRIPTION .PP @@ -22,70 +22,55 @@ mlx4dv_set_context_attr gives the ability to set vendor specific attributes on the RDMA context. .SH ARGUMENTS .TP -.B \f[I]context\f[] +.B \f[I]context\f[R] RDMA device context to work on. -.RS -.RE .TP -.B \f[I]attr_type\f[] +.B \f[I]attr_type\f[R] The type of the provided attribute. -.RS -.RE .TP -.B \f[I]attr\f[] +.B \f[I]attr\f[R] Pointer to the attribute to be set. 
## attr_type -.RS -.RE .IP .nf \f[C] -enum\ mlx4dv_set_ctx_attr_type\ { -\ \ \ \ /*\ Attribute\ type\ uint8_t\ */ -\ \ \ \ MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ\ \ \ \ =\ 0, -\ \ \ \ MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS\ \ =\ 1, +enum mlx4dv_set_ctx_attr_type { + /* Attribute type uint8_t */ + MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ = 0, + MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS = 1, }; -\f[] +\f[R] .fi .TP -.B \f[I]MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ\f[] +.B \f[I]MLX4DV_SET_CTX_ATTR_LOG_WQS_RANGE_SZ\f[R] Change the LOG WQs Range size for RSS -.RS -.RE .TP -.B \f[I]MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS\f[] +.B \f[I]MLX4DV_SET_CTX_ATTR_BUF_ALLOCATORS\f[R] Provide an external buffer allocator -.RS -.RE .IP .nf \f[C] -struct\ mlx4dv_ctx_allocators\ { -\ \ \ \ void\ *(*alloc)(size_t\ size,\ void\ *priv_data); -\ \ \ \ void\ (*free)(void\ *ptr,\ void\ *priv_data); -\ \ \ \ void\ *data; +struct mlx4dv_ctx_allocators { + void *(*alloc)(size_t size, void *priv_data); + void (*free)(void *ptr, void *priv_data); + void *data; }; -\f[] +\f[R] .fi .TP -.B \f[I]alloc\f[] +.B \f[I]alloc\f[R] Function used for buffer allocation instead of libmlx4 internal method -.RS -.RE .TP -.B \f[I]free\f[] +.B \f[I]free\f[R] Function used to free buffers allocated by alloc function -.RS -.RE .TP -.B \f[I]data\f[] +.B \f[I]data\f[R] Metadata that can be used by alloc and free functions -.RS -.RE .SH RETURN VALUE .PP Returns 0 on success, or the value of errno on failure (which indicates the failure reason). -.SH AUTHOR +.PP +#AUTHOR .PP Majd Dibbiny diff --git a/buildlib/pandoc-prebuilt/697d7ae1cfe1af4b9264377df95979884266183b b/buildlib/pandoc-prebuilt/697d7ae1cfe1af4b9264377df95979884266183b new file mode 100644 index 0000000..2e79163 --- /dev/null +++ b/buildlib/pandoc-prebuilt/697d7ae1cfe1af4b9264377df95979884266183b @@ -0,0 +1,39 @@ +.\" Automatically generated by Pandoc 2.5 +.\" +.TH "ibv_import_device" "3" "2020\-5\-3" "libibverbs" "Libibverbs Programmer\[cq]s Manual" +.hy +.SH NAME +.PP +ibv_import_device \- import a device from a given command FD +.SH SYNOPSIS +.IP +.nf +\f[C] +#include + +struct ibv_context *ibv_import_device(int cmd_fd); +\f[R] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_import_device()\f[R] returns an \f[I]ibv_context\f[R] pointer +that is associated with the given \f[I]cmd_fd\f[R]. +.PP +The \f[I]cmd_fd\f[R] is obtained from the ibv_context cmd_fd member, +which must be dup\[cq]d (eg by dup(), SCM_RIGHTS, etc) before being +passed to ibv_import_device(). +.PP +Once the \f[I]ibv_context\f[R] usage has been ended +\f[I]ibv_close_device()\f[R] should be called. +This call may cleanup whatever is needed/opposite of the import +including closing the command FD. +.SH RETURN VALUE +.PP +\f[B]ibv_import_device()\f[R] returns a pointer to the allocated RDMA +context, or NULL if the request fails. 
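For illustration (a sketch, not part of the generated page), a minimal C example of the import flow described above. It dup()s the cmd_fd in-process for brevity; a real application would hand the FD to another process, e.g. via SCM_RIGHTS:
```c
/* Sketch: import a verbs context from a dup'd command FD.
 * Assumes 'ctx' is an existing ibv_context in this process. */
#include <unistd.h>
#include <infiniband/verbs.h>

struct ibv_context *import_from(struct ibv_context *ctx)
{
    int fd = dup(ctx->cmd_fd);          /* duplicate the command FD */
    if (fd < 0)
        return NULL;

    struct ibv_context *imported = ibv_import_device(fd);
    if (!imported)
        close(fd);                      /* import failed; release the FD */
    return imported;                    /* later released via ibv_close_device() */
}
```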
+.SH SEE ALSO +.PP +\f[B]ibv_open_device\f[R](3), \f[B]ibv_close_device\f[R](3), +.SH AUTHOR +.PP +Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/6a82b0bc695f8fd980a86aefaf5890804a010761 b/buildlib/pandoc-prebuilt/6a82b0bc695f8fd980a86aefaf5890804a010761 index 9868ffe..9797163 100644 --- a/buildlib/pandoc-prebuilt/6a82b0bc695f8fd980a86aefaf5890804a010761 +++ b/buildlib/pandoc-prebuilt/6a82b0bc695f8fd980a86aefaf5890804a010761 @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "ibv_alloc_null_mr" "3" "2018\-6\-1" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "ibv_alloc_null_mr" "3" "2018\-6\-1" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -9,15 +9,15 @@ ibv_alloc_null_mr \- allocate a null memory region (MR) .IP .nf \f[C] -#include\ +#include -struct\ ibv_mr\ *ibv_alloc_null_mr(struct\ ibv_pd\ *pd); -\f[] +struct ibv_mr *ibv_alloc_null_mr(struct ibv_pd *pd); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_alloc_null_mr()\f[] allocates a null memory region (MR) that is -associated with the protection domain \f[I]pd\f[]. +\f[B]ibv_alloc_null_mr()\f[R] allocates a null memory region (MR) that +is associated with the protection domain \f[I]pd\f[R]. .PP A null MR discards all data written to it, and always returns 0 on read. It has the maximum length and only the lkey is valid, the MR is not @@ -29,19 +29,19 @@ This provides a way to avoid PCI bus transfers by using a scatter/gather list in commands if applications do not intend to access the data, or need data to be 0 filled. .PP -Specifically upon \f[B]ibv_post_send()\f[] the device skips PCI read -cycles and upon \f[B]ibv_post_recv()\f[] the device skips PCI write +Specifically upon \f[B]ibv_post_send()\f[R] the device skips PCI read +cycles and upon \f[B]ibv_post_recv()\f[R] the device skips PCI write cycles which finally improves performance. .PP -\f[B]ibv_dereg_mr()\f[] deregisters the MR. +\f[B]ibv_dereg_mr()\f[R] deregisters the MR. The use of ibv_rereg_mr() or ibv_bind_mw() with this MR is invalid. .SH RETURN VALUE .PP -\f[B]ibv_alloc_null_mr()\f[] returns a pointer to the allocated MR, or +\f[B]ibv_alloc_null_mr()\f[R] returns a pointer to the allocated MR, or NULL if the request fails. 
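A short sketch of the discard use case described above (assumptions: `pd` and `qp` already exist; error handling is minimal):
```c
/* Sketch: receive into a null MR so the payload is discarded
 * without PCI write cycles. */
#include <infiniband/verbs.h>

int post_discard_recv(struct ibv_pd *pd, struct ibv_qp *qp, uint32_t len)
{
    struct ibv_mr *mr = ibv_alloc_null_mr(pd);
    if (!mr)
        return -1;

    struct ibv_sge sge = {
        .addr   = 0,          /* address is ignored; written data is dropped */
        .length = len,
        .lkey   = mr->lkey,   /* only the lkey of a null MR is valid */
    };
    struct ibv_recv_wr wr = { .sg_list = &sge, .num_sge = 1 }, *bad;
    int ret = ibv_post_recv(qp, &wr, &bad);
    /* the MR must stay alive while WRs reference it;
     * free it eventually with ibv_dereg_mr() */
    return ret;
}
```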
.SH SEE ALSO .PP -\f[B]ibv_reg_mr\f[](3), \f[B]ibv_dereg_mr\f[](3), +\f[B]ibv_reg_mr\f[R](3), \f[B]ibv_dereg_mr\f[R](3), .SH AUTHOR .PP Yonatan Cohen diff --git a/buildlib/pandoc-prebuilt/6ae20730dd330526b00e52b24456be5bc4e07a9a b/buildlib/pandoc-prebuilt/6ae20730dd330526b00e52b24456be5bc4e07a9a index 9f960a9..e8f5655 100644 --- a/buildlib/pandoc-prebuilt/6ae20730dd330526b00e52b24456be5bc4e07a9a +++ b/buildlib/pandoc-prebuilt/6ae20730dd330526b00e52b24456be5bc4e07a9a @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "mlx5dv_alloc_dm" "3" "2018\-9\-1" "mlx5" "mlx5 Programmer\[aq]s Manual" +.TH "mlx5dv_alloc_dm" "3" "2018\-9\-1" "mlx5" "mlx5 Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -9,33 +9,33 @@ mlx5dv_alloc_dm \- allocates device memory (DM) .IP .nf \f[C] -#include\ +#include -struct\ ibv_dm\ *mlx5dv_alloc_dm(struct\ ibv_context\ *context, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_alloc_dm_attr\ *dm_attr, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_alloc_dm_attr\ *mlx5_dm_attr) -\f[] +struct ibv_dm *mlx5dv_alloc_dm(struct ibv_context *context, + struct ibv_alloc_dm_attr *dm_attr, + struct mlx5dv_alloc_dm_attr *mlx5_dm_attr) +\f[R] .fi .SH DESCRIPTION .PP -\f[B]mlx5dv_alloc_dm()\f[] allocates device memory (DM) with specific +\f[B]mlx5dv_alloc_dm()\f[R] allocates device memory (DM) with specific driver properties. .SH ARGUMENTS .PP -Please see \f[I]ibv_alloc_dm(3)\f[] man page for \f[I]context\f[] and -\f[I]dm_attr\f[]. +Please see \f[I]ibv_alloc_dm(3)\f[R] man page for \f[I]context\f[R] and +\f[I]dm_attr\f[R]. .SS mlx5_dm_attr .IP .nf \f[C] -struct\ mlx5dv_alloc_dm_attr\ { -\ \ \ \ enum\ mlx5dv_alloc_dm_type\ type; -\ \ \ \ uint64_t\ comp_mask; +struct mlx5dv_alloc_dm_attr { + enum mlx5dv_alloc_dm_type type; + uint64_t comp_mask; }; -\f[] +\f[R] .fi .TP -.B \f[I]type\f[] +.B \f[I]type\f[R] The device memory type user wishes to allocate: .RS .PP @@ -56,18 +56,16 @@ Can be used for direct table and header modification rules creation when allocated by a privileged user. .RE .TP -.B \f[I]comp_mask\f[] +.B \f[I]comp_mask\f[R] Bitmask specifying what fields in the structure are valid: Currently reserved and should be set to 0. -.RS -.RE .SH RETURN VALUE .PP -\f[B]mlx5dv_alloc_dm()\f[] returns a pointer to the created DM, on error -NULL will be returned and errno will be set. +\f[B]mlx5dv_alloc_dm()\f[R] returns a pointer to the created DM, on +error NULL will be returned and errno will be set. 
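A hedged sketch of a MEMIC allocation with this API; the 4 KB length is a placeholder and the call assumes the device actually exposes device memory:
```c
/* Sketch: allocate on-chip (MEMIC) device memory.
 * 'ctx' is an open mlx5 ibv_context. */
#include <infiniband/mlx5dv.h>

struct ibv_dm *alloc_memic(struct ibv_context *ctx)
{
    struct ibv_alloc_dm_attr dm_attr = { .length = 4096 };
    struct mlx5dv_alloc_dm_attr mlx5_attr = {
        .type = MLX5DV_DM_TYPE_MEMIC,   /* on-chip device memory */
    };
    return mlx5dv_alloc_dm(ctx, &dm_attr, &mlx5_attr); /* NULL + errno on error */
}
```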
.SH SEE ALSO .PP -\f[B]ibv_alloc_dm\f[](3), +\f[B]ibv_alloc_dm\f[R](3), .SH AUTHOR .PP Ariel Levkovich diff --git a/buildlib/pandoc-prebuilt/6de8298d2452a2503f893112a0955baf560008c1 b/buildlib/pandoc-prebuilt/6de8298d2452a2503f893112a0955baf560008c1 index a09b93e..aebb0ac 100644 --- a/buildlib/pandoc-prebuilt/6de8298d2452a2503f893112a0955baf560008c1 +++ b/buildlib/pandoc-prebuilt/6de8298d2452a2503f893112a0955baf560008c1 @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_RATE_TO_MULT" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_RATE_TO_MULT" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -13,34 +13,35 @@ enumeration .IP .nf \f[C] -#include\ +#include -int\ ibv_rate_to_mult(enum\ ibv_rate\ rate); +int ibv_rate_to_mult(enum ibv_rate rate); -enum\ ibv_rate\ mult_to_ibv_rate(int\ mult); -\f[] +enum ibv_rate mult_to_ibv_rate(int mult); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_rate_to_mult()\f[] converts the IB transmission rate -enumeration \f[I]rate\f[] to a multiple of 2.5 Gbit/sec (the base rate). -For example, if \f[I]rate\f[] is \f[B]IBV_RATE_5_GBPS\f[], the value 2 +\f[B]ibv_rate_to_mult()\f[R] converts the IB transmission rate +enumeration \f[I]rate\f[R] to a multiple of 2.5 Gbit/sec (the base +rate). +For example, if \f[I]rate\f[R] is \f[B]IBV_RATE_5_GBPS\f[R], the value 2 will be returned (5 Gbit/sec = 2 * 2.5 Gbit/sec). .PP -\f[B]mult_to_ibv_rate()\f[] converts the multiplier value (of 2.5 -Gbit/sec) \f[I]mult\f[] to an IB transmission rate enumeration. -For example, if \f[I]mult\f[] is 2, the rate enumeration -\f[B]IBV_RATE_5_GBPS\f[] will be returned. +\f[B]mult_to_ibv_rate()\f[R] converts the multiplier value (of 2.5 +Gbit/sec) \f[I]mult\f[R] to an IB transmission rate enumeration. +For example, if \f[I]mult\f[R] is 2, the rate enumeration +\f[B]IBV_RATE_5_GBPS\f[R] will be returned. .SH RETURN VALUE .PP -\f[B]ibv_rate_to_mult()\f[] returns the multiplier of the base rate 2.5 +\f[B]ibv_rate_to_mult()\f[R] returns the multiplier of the base rate 2.5 Gbit/sec. .PP -\f[B]mult_to_ibv_rate()\f[] returns the enumeration representing the IB +\f[B]mult_to_ibv_rate()\f[R] returns the enumeration representing the IB transmission rate. .SH SEE ALSO .PP -\f[B]ibv_query_port\f[](3) +\f[B]ibv_query_port\f[R](3) .SH AUTHOR .PP Dotan Barak diff --git a/buildlib/pandoc-prebuilt/70a57d40639cd86df012d1d8de4460ea2e550676 b/buildlib/pandoc-prebuilt/70a57d40639cd86df012d1d8de4460ea2e550676 new file mode 100644 index 0000000..8e330c3 --- /dev/null +++ b/buildlib/pandoc-prebuilt/70a57d40639cd86df012d1d8de4460ea2e550676 @@ -0,0 +1,128 @@ +.\" Automatically generated by Pandoc 2.5 +.\" +.TH "mlx5dv_create_qp" "3" "2018\-9\-1" "mlx5" "mlx5 Programmer\[cq]s Manual" +.hy +.SH NAME +.PP +mlx5dv_create_qp \- creates a queue pair (QP) +.SH SYNOPSIS +.IP +.nf +\f[C] +#include + +struct ibv_qp *mlx5dv_create_qp(struct ibv_context *context, + struct ibv_qp_init_attr_ex *qp_attr, + struct mlx5dv_qp_init_attr *mlx5_qp_attr) +\f[R] +.fi +.SH DESCRIPTION +.PP +\f[B]mlx5dv_create_qp()\f[R] creates a queue pair (QP) with specific +driver properties. +.SH ARGUMENTS +.PP +Please see \f[I]ibv_create_qp_ex(3)\f[R] man page for \f[I]context\f[R] +and \f[I]qp_attr\f[R]. 
+.SS mlx5_qp_attr
+.IP
+.nf
+\f[C]
+struct mlx5dv_qp_init_attr {
+        uint64_t comp_mask;
+        uint32_t create_flags;
+        struct mlx5dv_dc_init_attr dc_init_attr;
+        uint64_t send_ops_flags;
+};
+\f[R]
+.fi
+.TP
+.B \f[I]comp_mask\f[R]
+Bitmask specifying what fields in the structure are valid:
+MLX5DV_QP_INIT_ATTR_MASK_QP_CREATE_FLAGS: valid values in
+\f[I]create_flags\f[R] MLX5DV_QP_INIT_ATTR_MASK_DC: valid values in
+\f[I]dc_init_attr\f[R] MLX5DV_QP_INIT_ATTR_MASK_SEND_OPS_FLAGS: valid
+values in \f[I]send_ops_flags\f[R]
+.TP
+.B \f[I]create_flags\f[R]
+A bitwise OR of the various values described below.
+.RS
+.PP
+MLX5DV_QP_CREATE_TUNNEL_OFFLOADS: Enable offloading such as checksum and
+LRO for incoming tunneling traffic.
+.PP
+MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_UC: Allow receiving loopback
+unicast traffic.
+.PP
+MLX5DV_QP_CREATE_TIR_ALLOW_SELF_LOOPBACK_MC: Allow receiving loopback
+multicast traffic.
+.PP
+MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE: Disable the scatter to CQE
+feature, which is enabled by default.
+.PP
+MLX5DV_QP_CREATE_ALLOW_SCATTER_TO_CQE: Allow scatter to CQE for the
+requester even if the QP was not configured to signal all WRs.
+.PP
+MLX5DV_QP_CREATE_PACKET_BASED_CREDIT_MODE: Set the QP to work in
+end\-to\-end packet\-based credit mode, instead of the default
+message\-based credits (IB spec.
+section 9.7.7.2).
+.PD 0
+.P
+.PD
+It is the application\[cq]s responsibility to make sure that the peer QP
+is configured with the same mode.
+.RE
+.TP
+.B \f[I]dc_init_attr\f[R]
+DC init attributes.
+.SS \f[I]dc_init_attr\f[R]
+.IP
+.nf
+\f[C]
+struct mlx5dv_dc_init_attr {
+        enum mlx5dv_dc_type dc_type;
+        uint64_t dct_access_key;
+};
+\f[R]
+.fi
+.TP
+.B \f[I]dc_type\f[R]
+MLX5DV_DCTYPE_DCT QP type: Target DC.
+MLX5DV_DCTYPE_DCI QP type: Initiator DC.
+.TP
+.B \f[I]dct_access_key\f[R]
+Used to create a DCT QP.
+.TP
+.B \f[I]send_ops_flags\f[R]
+A bitwise OR of the various values described below.
+.RS
+.PP
+MLX5DV_QP_EX_WITH_MR_INTERLEAVED: Enables the mlx5dv_wr_mr_interleaved()
+work request on this QP.
+.PP
+MLX5DV_QP_EX_WITH_MR_LIST: Enables the mlx5dv_wr_mr_list() work request
+on this QP.
+.RE
+.SH NOTES
+.PP
+\f[B]mlx5dv_qp_ex_from_ibv_qp_ex()\f[R] is used to get \f[I]struct
+mlx5dv_qp_ex\f[R] for accessing the send ops interfaces when
+IBV_QP_INIT_ATTR_SEND_OPS_FLAGS is used.
+.PP
+The MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE flag should be set in cases
+where the IOVA doesn\[cq]t match the process\[cq] VA and the message
+payload size is small enough to trigger the scatter to CQE feature.
+.PP
+When device memory is used, IBV_SEND_INLINE and scatter to CQE should
+not be used, as the memcpy is not possible.
+.SH RETURN VALUE
+.PP
+\f[B]mlx5dv_create_qp()\f[R] returns a pointer to the created QP, on
+error NULL will be returned and errno will be set.
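For illustration (a sketch, not part of the generated page), creating a DC initiator (DCI) QP with this API; `ctx`, `pd` and `cq` are assumed to exist, and the capacities are placeholder values:
```c
/* Sketch: create a DCI using mlx5dv_create_qp(). */
#include <infiniband/mlx5dv.h>

struct ibv_qp *create_dci(struct ibv_context *ctx, struct ibv_pd *pd,
                          struct ibv_cq *cq)
{
    struct ibv_qp_init_attr_ex attr = {
        .qp_type   = IBV_QPT_DRIVER,   /* DC QPs use the driver QP type */
        .send_cq   = cq,
        .recv_cq   = cq,
        .pd        = pd,
        .comp_mask = IBV_QP_INIT_ATTR_PD,
        .cap       = { .max_send_wr = 16, .max_send_sge = 1 },
    };
    struct mlx5dv_qp_init_attr mlx5_attr = {
        .comp_mask    = MLX5DV_QP_INIT_ATTR_MASK_DC,
        .dc_init_attr = { .dc_type = MLX5DV_DCTYPE_DCI },
    };
    return mlx5dv_create_qp(ctx, &attr, &mlx5_attr); /* NULL + errno on error */
}
```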
+.SH SEE ALSO +.PP +\f[B]ibv_query_device_ex\f[R](3), \f[B]ibv_create_qp_ex\f[R](3), +.SH AUTHOR +.PP +Yonatan Cohen diff --git a/buildlib/pandoc-prebuilt/71b9f30576194f743a340f6eaef13c674b4019d5 b/buildlib/pandoc-prebuilt/71b9f30576194f743a340f6eaef13c674b4019d5 index 6788c04..7d4f925 100644 --- a/buildlib/pandoc-prebuilt/71b9f30576194f743a340f6eaef13c674b4019d5 +++ b/buildlib/pandoc-prebuilt/71b9f30576194f743a340f6eaef13c674b4019d5 @@ -1,34 +1,34 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_QUERY_PKEY" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_QUERY_PKEY" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP -ibv_query_pkey \- query an InfiniBand port\[aq]s P_Key table +ibv_query_pkey \- query an InfiniBand port\[cq]s P_Key table .SH SYNOPSIS .IP .nf \f[C] -#include\ +#include -int\ ibv_query_pkey(struct\ ibv_context\ *context, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint8_t\ port_num, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ index, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ *pkey); -\f[] +int ibv_query_pkey(struct ibv_context *context, + uint8_t port_num, + int index, + uint16_t *pkey); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_query_pkey()\f[] returns the P_Key value (in network byte -order) in entry \f[I]index\f[] of port \f[I]port_num\f[] for device -context \f[I]context\f[] through the pointer \f[I]pkey\f[]. +\f[B]ibv_query_pkey()\f[R] returns the P_Key value (in network byte +order) in entry \f[I]index\f[R] of port \f[I]port_num\f[R] for device +context \f[I]context\f[R] through the pointer \f[I]pkey\f[R]. .SH RETURN VALUE .PP -\f[B]ibv_query_pkey()\f[] returns 0 on success, and \-1 on error. +\f[B]ibv_query_pkey()\f[R] returns 0 on success, and \-1 on error. .SH SEE ALSO .PP -\f[B]ibv_open_device\f[](3), \f[B]ibv_query_device\f[](3), -\f[B]ibv_query_gid\f[](3), \f[B]ibv_query_port\f[](3) +\f[B]ibv_open_device\f[R](3), \f[B]ibv_query_device\f[R](3), +\f[B]ibv_query_gid\f[R](3), \f[B]ibv_query_port\f[R](3) .SH AUTHOR .PP Dotan Barak diff --git a/buildlib/pandoc-prebuilt/77100c5a6ee765b51de802a3368e8e9fdea4914c b/buildlib/pandoc-prebuilt/77100c5a6ee765b51de802a3368e8e9fdea4914c new file mode 100644 index 0000000..e48d89a --- /dev/null +++ b/buildlib/pandoc-prebuilt/77100c5a6ee765b51de802a3368e8e9fdea4914c @@ -0,0 +1,31 @@ +.\" Automatically generated by Pandoc 2.5 +.\" +.TH "IBV_GET_DEVICE_GUID" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual" +.hy +.SH NAME +.PP +ibv_get_device_guid \- get an RDMA device\[cq]s GUID +.SH SYNOPSIS +.IP +.nf +\f[C] +#include + +uint64_t ibv_get_device_guid(struct ibv_device *device); +\f[R] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_get_device_guid()\f[R] returns the Global Unique IDentifier +(GUID) of the RDMA device \f[I]device\f[R]. +.SH RETURN VALUE +.PP +\f[B]ibv_get_device_guid()\f[R] returns the GUID of the device in +network byte order. 
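For illustration, a small self-contained program using this call; since the GUID is returned in network byte order, it is converted with be64toh() for printing:
```c
/* Sketch: print the GUID of every RDMA device on the system. */
#include <endian.h>
#include <stdio.h>
#include <infiniband/verbs.h>

int main(void)
{
    int num = 0;
    struct ibv_device **list = ibv_get_device_list(&num);

    for (int i = 0; i < num; i++)
        printf("%s: 0x%016llx\n", ibv_get_device_name(list[i]),
               (unsigned long long)be64toh(ibv_get_device_guid(list[i])));
    if (list)
        ibv_free_device_list(list);
    return 0;
}
```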
+.SH SEE ALSO +.PP +\f[B]ibv_get_device_index\f[R](3), \f[B]ibv_get_device_list\f[R](3), +\f[B]ibv_get_device_name\f[R](3), \f[B]ibv_open_device\f[R](3) +.SH AUTHOR +.PP +Dotan Barak diff --git a/buildlib/pandoc-prebuilt/771e81c03946e49b29d803afc6498a1c2c346ce8 b/buildlib/pandoc-prebuilt/771e81c03946e49b29d803afc6498a1c2c346ce8 index 196e922..2cd01d0 100644 --- a/buildlib/pandoc-prebuilt/771e81c03946e49b29d803afc6498a1c2c346ce8 +++ b/buildlib/pandoc-prebuilt/771e81c03946e49b29d803afc6498a1c2c346ce8 @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "MLX5DV_DUMP API" "3" "2019\-11\-18" "mlx5" "mlx5 Programmer\[aq]s Manual" +.TH "MLX5DV_DUMP API" "3" "2019\-11\-18" "mlx5" "mlx5 Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -15,13 +15,13 @@ mlx5dv_dump_dr_rule \- Dump DR Rule .IP .nf \f[C] -#include\ +#include -int\ mlx5dv_dump_dr_domain(FILE\ *fout,\ struct\ mlx5dv_dr_domain\ *domain); -int\ mlx5dv_dump_dr_table(FILE\ *fout,\ struct\ mlx5dv_dr_table\ *table); -int\ mlx5dv_dump_dr_matcher(FILE\ *fout,\ struct\ mlx5dv_dr_matcher\ *matcher); -int\ mlx5dv_dump_dr_rule(FILE\ *fout,\ struct\ mlx5dv_dr_rule\ *rule); -\f[] +int mlx5dv_dump_dr_domain(FILE *fout, struct mlx5dv_dr_domain *domain); +int mlx5dv_dump_dr_table(FILE *fout, struct mlx5dv_dr_table *table); +int mlx5dv_dump_dr_matcher(FILE *fout, struct mlx5dv_dr_matcher *matcher); +int mlx5dv_dump_dr_rule(FILE *fout, struct mlx5dv_dr_rule *rule); +\f[R] .fi .SH DESCRIPTION .PP @@ -29,16 +29,16 @@ The Dump API (mlx5dv_dump_*) allows the dumping of the existing rdma\-core resources to the provided file. The output file format is vendor specific. .PP -\f[I]mlx5dv_dump_dr_domain()\f[] dumps a DR Domain object properties to +\f[I]mlx5dv_dump_dr_domain()\f[R] dumps a DR Domain object properties to a specified file. .PP -\f[I]mlx5dv_dump_dr_table()\f[] dumps a DR Table object properties to a +\f[I]mlx5dv_dump_dr_table()\f[R] dumps a DR Table object properties to a specified file. .PP -\f[I]mlx5dv_dump_dr_matcher()\f[] dumps a DR Matcher object properties +\f[I]mlx5dv_dump_dr_matcher()\f[R] dumps a DR Matcher object properties to a specified file. .PP -\f[I]mlx5dv_dump_dr_rule()\f[] dumps a DR Rule object properties to a +\f[I]mlx5dv_dump_dr_rule()\f[R] dumps a DR Rule object properties to a specified file. .SH RETURN VALUE .PP diff --git a/buildlib/pandoc-prebuilt/77296d8207743f01a23a4bd7b5c59c7b84c454e7 b/buildlib/pandoc-prebuilt/77296d8207743f01a23a4bd7b5c59c7b84c454e7 index 9076a2d..01156a6 100644 --- a/buildlib/pandoc-prebuilt/77296d8207743f01a23a4bd7b5c59c7b84c454e7 +++ b/buildlib/pandoc-prebuilt/77296d8207743f01a23a4bd7b5c59c7b84c454e7 @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_devx_alloc_uar / mlx5dv_devx_free_uar" "3" "" "" "" .hy @@ -11,13 +11,13 @@ mlx5dv_devx_free_uar \- Frees a DEVX UAR .IP .nf \f[C] -#include\ +#include -struct\ mlx5dv_devx_uar\ *mlx5dv_devx_alloc_uar(struct\ ibv_context\ *context, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ flags); +struct mlx5dv_devx_uar *mlx5dv_devx_alloc_uar(struct ibv_context *context, + uint32_t flags); -void\ mlx5dv_devx_free_uar(struct\ mlx5dv_devx_uar\ *devx_uar); -\f[] +void mlx5dv_devx_free_uar(struct mlx5dv_devx_uar *devx_uar); +\f[R] .fi .SH DESCRIPTION .PP @@ -29,59 +29,48 @@ device driver, the UAR information is needed for few commands as of QP creation. 
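An editorial sketch of the allocate/free cycle (not part of the generated page); `ctx` is assumed to be a DEVX-capable mlx5 context:
```c
/* Sketch: allocate a non-cached DEVX UAR and free it. */
#include <infiniband/mlx5dv.h>

int probe_uar(struct ibv_context *ctx)
{
    struct mlx5dv_devx_uar *uar =
        mlx5dv_devx_alloc_uar(ctx, MLX5DV_UAR_ALLOC_TYPE_NC);
    if (!uar)
        return -1;            /* errno holds the failure reason */
    /* uar->reg_addr / uar->page_id would be fed into device commands */
    mlx5dv_devx_free_uar(uar);
    return 0;
}
```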
.SH ARGUMENTS .TP -.B \f[I]context\f[] +.B \f[I]context\f[R] RDMA device context to work on. -.RS -.RE .TP -.B \f[I]flags\f[] +.B \f[I]flags\f[R] Allocation flags for the UAR. MLX5DV_UAR_ALLOC_TYPE_BF: Allocate UAR with Blueflame properties. MLX5DV_UAR_ALLOC_TYPE_NC: Allocate UAR with non\-cache properties. -.RS -.RE .SS devx_uar .IP .nf \f[C] -struct\ mlx5dv_devx_uar\ { -\ \ \ \ void\ *reg_addr; -\ \ \ \ void\ *base_addr; -\ \ \ \ uint32_t\ page_id; -\ \ \ \ off_t\ mmap_off; -\ \ \ \ uint64_t\ comp_mask; +struct mlx5dv_devx_uar { + void *reg_addr; + void *base_addr; + uint32_t page_id; + off_t mmap_off; + uint64_t comp_mask; }; -\f[] +\f[R] .fi .TP -.B \f[I]reg_addr\f[] +.B \f[I]reg_addr\f[R] The write address of DB/BF. -.RS -.RE .TP -.B \f[I]base_addr\f[] +.B \f[I]base_addr\f[R] The base address of the UAR. -.RS -.RE .TP -.B \f[I]page_id\f[] +.B \f[I]page_id\f[R] The device page id to be used. -.RS -.RE .TP -.B \f[I]mmap_off\f[] +.B \f[I]mmap_off\f[R] The mmap offset parameter to be used for re\-mapping, to be used by a secondary process. -.RS -.RE .SH RETURN VALUE .PP -Upon success \f[I]mlx5dv_devx_alloc_uar\f[] will return a new -\f[I]struct mlx5dv_devx_uar\f[], on error NULL will be returned and +Upon success \f[I]mlx5dv_devx_alloc_uar\f[R] will return a new +\f[I]struct mlx5dv_devx_uar\f[R], on error NULL will be returned and errno will be set. .SH SEE ALSO .PP -\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_obj_create\f[] -.SH AUTHOR +\f[B]mlx5dv_open_device\f[R], \f[B]mlx5dv_devx_obj_create\f[R] +.PP +#AUTHOR .PP Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/77e091fce9252614b7c6136f15917606746eac44 b/buildlib/pandoc-prebuilt/77e091fce9252614b7c6136f15917606746eac44 index 7b8a484..17e0d62 100644 --- a/buildlib/pandoc-prebuilt/77e091fce9252614b7c6136f15917606746eac44 +++ b/buildlib/pandoc-prebuilt/77e091fce9252614b7c6136f15917606746eac44 @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_devx_query_eqn" "3" "" "" "" .hy @@ -9,11 +9,11 @@ mlx5dv_devx_query_eqn \- Query EQN for a given vector id. .IP .nf \f[C] -#include\ +#include -int\ mlx5dv_devx_query_eqn(struct\ ibv_context\ *context,\ uint32_t\ vector, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ *eqn); -\f[] +int mlx5dv_devx_query_eqn(struct ibv_context *context, uint32_t vector, + uint32_t *eqn); +\f[R] .fi .SH DESCRIPTION .PP @@ -25,27 +25,22 @@ device driver, the EQN information is needed for few commands such as CQ creation. .SH ARGUMENTS .TP -.B \f[I]context\f[] +.B \f[I]context\f[R] RDMA device context to work on. -.RS -.RE .TP -.B \f[I]vector\f[] +.B \f[I]vector\f[R] Completion vector number. -.RS -.RE .TP -.B \f[I]eqn\f[] +.B \f[I]eqn\f[R] The device EQ number which relates to the given input vector. -.RS -.RE .SH RETURN VALUE .PP returns 0 on success, or the value of errno on failure (which indicates the failure reason). 
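A one-call sketch for completeness; querying vector 0 is an arbitrary choice:
```c
/* Sketch: fetch the EQ number needed to create a CQ over DEVX. */
#include <stdint.h>
#include <infiniband/mlx5dv.h>

int query_eqn0(struct ibv_context *ctx, uint32_t *eqn)
{
    /* returns 0 on success, or an errno value on failure */
    return mlx5dv_devx_query_eqn(ctx, 0, eqn);
}
```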
.SH SEE ALSO .PP -\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_obj_create\f[] -.SH AUTHOR +\f[B]mlx5dv_open_device\f[R], \f[B]mlx5dv_devx_obj_create\f[R] +.PP +#AUTHOR .PP Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/7d3edfef629d9dea0b4104ab062e4db1ce3aa45f b/buildlib/pandoc-prebuilt/7d3edfef629d9dea0b4104ab062e4db1ce3aa45f new file mode 100644 index 0000000..b23ad4e --- /dev/null +++ b/buildlib/pandoc-prebuilt/7d3edfef629d9dea0b4104ab062e4db1ce3aa45f @@ -0,0 +1,121 @@ +.\" Automatically generated by Pandoc 2.5 +.\" +.TH "IBV_QUERY_GID_EX" "3" "2020\-04\-24" "libibverbs" "Libibverbs Programmer\[cq]s Manual" +.hy +.SH NAME +.PP +ibv_query_gid_ex \- Query an InfiniBand port\[cq]s GID table entry +.SH SYNOPSIS +.IP +.nf +\f[C] +#include + +int ibv_query_gid_ex(struct ibv_context *context, + uint32_t port_num, + uint32_t gid_index, + struct ibv_gid_entry *entry, + uint32_t flags); +\f[R] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_query_gid_ex()\f[R] returns the GID entry at \f[I]entry\f[R] +for \f[I]gid_index\f[R] of port \f[I]port_num\f[R] for device context +\f[I]context\f[R]. +.SH ARGUMENTS +.TP +.B \f[I]context\f[R] +The context of the device to query. +.TP +.B \f[I]port_num\f[R] +The number of port to query its GID table. +.TP +.B \f[I]gid_index\f[R] +The index of the GID table entry to query. +.TP +.B ## \f[I]entry\f[R] Argument +An ibv_gid_entry struct, as defined in . +.RS +.IP +.nf +\f[C] +struct ibv_gid_entry { + union ibv_gid gid; + uint32_t gid_index; + uint32_t port_num; + uint32_t gid_type; + uint32_t ndev_ifindex; +}; +\f[R] +.fi +.PP +\f[I]gid\f[R] +.RE +.IP +.nf +\f[C] + The GID entry. +\f[R] +.fi +.RS +.PP +\f[I]gid_index\f[R] +.RE +.IP +.nf +\f[C] + The GID table index of this entry. +\f[R] +.fi +.RS +.PP +\f[I]port_num\f[R] +.RE +.IP +.nf +\f[C] + The port number that this GID belongs to. +\f[R] +.fi +.RS +.PP +\f[I]gid_type\f[R] +.RE +.IP +.nf +\f[C] + enum ibv_gid_type, can be one of IBV_GID_TYPE_IB, IBV_GID_TYPE_ROCE_V1 or IBV_GID_TYPE_ROCE_V2. +\f[R] +.fi +.RS +.PP +\f[I]ndev_ifindex\f[R] +.RE +.IP +.nf +\f[C] + The interface index of the net device associated with this GID. + It is 0 if there is no net device associated with it. +\f[R] +.fi +.TP +.B \f[I]flags\f[R] +Extra fields to query post \f[I]ndev_ifindex\f[R], for now must be 0. +.SH RETURN VALUE +.PP +\f[B]ibv_query_gid_ex()\f[R] returns 0 on success or errno value on +error. +.SH ERRORS +.TP +.B ENODATA +\f[I]gid_index\f[R] is within the GID table size of port +\f[I]port_num\f[R] but there is no data in this index. +.SH SEE ALSO +.PP +\f[B]ibv_open_device\f[R](3), \f[B]ibv_query_device\f[R](3), +\f[B]ibv_query_pkey\f[R](3), \f[B]ibv_query_port\f[R](3), +\f[B]ibv_query_gid_table\f[R](3) +.SH AUTHOR +.PP +Parav Pandit diff --git a/buildlib/pandoc-prebuilt/8ffcb0db55efb46a50559f39000fd7b8b82d57cc b/buildlib/pandoc-prebuilt/8ffcb0db55efb46a50559f39000fd7b8b82d57cc index 292d3a7..e500d72 100644 --- a/buildlib/pandoc-prebuilt/8ffcb0db55efb46a50559f39000fd7b8b82d57cc +++ b/buildlib/pandoc-prebuilt/8ffcb0db55efb46a50559f39000fd7b8b82d57cc @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "RDMA_INIT_QP_ATTR" "3" "2018\-12\-31" "librdmacm" "Librdmacm Programmer\[aq]s Manual" +.TH "RDMA_INIT_QP_ATTR" "3" "2018\-12\-31" "librdmacm" "Librdmacm Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -9,44 +9,38 @@ rdma_init_qp_attr \- Returns qp attributes of a rdma_cm_id. 
.IP .nf \f[C] -#include\ +#include -int\ rdma_init_qp_attr(struct\ rdma_cm_id\ *id, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_qp_attr\ *qp_attr, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ *qp_attr_mask); -\f[] +int rdma_init_qp_attr(struct rdma_cm_id *id, + struct ibv_qp_attr *qp_attr, + int *qp_attr_mask); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]rdma_init_qp_attr()\f[] returns qp attributes of a rdma_cm_id. +\f[B]rdma_init_qp_attr()\f[R] returns qp attributes of a rdma_cm_id. .PP Information about qp attributes and qp attributes mask is returned -through the \f[I]qp_attr\f[] and \f[I]qp_attr_mask\f[] parameters. +through the \f[I]qp_attr\f[R] and \f[I]qp_attr_mask\f[R] parameters. .PP For details on the qp_attr structure, see ibv_modify_qp. .SH ARGUMENTS .TP -.B \f[I]id\f[] +.B \f[I]id\f[R] RDMA identifier. -.RS -.RE .TP -.B \f[I]qp_attr\f[] +.B \f[I]qp_attr\f[R] A reference to a qp attributes struct containing response information. -.RS -.RE .TP -.B \f[I]qp_attr_mask\f[] +.B \f[I]qp_attr_mask\f[R] A reference to a qp attributes mask containing response information. -.RS -.RE .SH RETURN VALUE .PP -\f[B]rdma_init_qp_attr()\f[] returns 0 on success, or \-1 on error. +\f[B]rdma_init_qp_attr()\f[R] returns 0 on success, or \-1 on error. If an error occurs, errno will be set to indicate the failure reason. .SH SEE ALSO .PP -\f[B]rdma_cm\f[](7), \f[B]ibv_modify_qp\f[](3) +\f[B]rdma_cm\f[R](7), \f[B]ibv_modify_qp\f[R](3) .SH AUTHOR .PP Danit Goldberg diff --git a/buildlib/pandoc-prebuilt/91431482687147a89ce2729e56c0ae743a2ecf97 b/buildlib/pandoc-prebuilt/91431482687147a89ce2729e56c0ae743a2ecf97 new file mode 100644 index 0000000..54b3354 --- /dev/null +++ b/buildlib/pandoc-prebuilt/91431482687147a89ce2729e56c0ae743a2ecf97 @@ -0,0 +1,67 @@ +.\" Automatically generated by Pandoc 2.5 +.\" +.TH "IBV_QUERY_GID_TABLE" "3" "2020\-04\-24" "libibverbs" "Libibverbs Programmer\[cq]s Manual" +.hy +.SH NAME +.PP +ibv_query_gid_table \- query an InfiniBand device\[cq]s GID table +.SH SYNOPSIS +.IP +.nf +\f[C] +#include + +ssize_t ibv_query_gid_table(struct ibv_context *context, + struct ibv_gid_entry *entries, + size_t max_entries, + uint32_t flags); +\f[R] +.fi +.SH DESCRIPTION +.PP +\f[B]ibv_query_gid_table()\f[R] returns the valid GID table entries of +the RDMA device context \f[I]context\f[R] at the pointer +\f[I]entries\f[R]. +.PP +A caller must allocate \f[I]entries\f[R] array for the GID table entries +it desires to query. +This API returns only valid GID table entries. +.PP +A caller must pass non zero number of entries at \f[I]max_entries\f[R] +that corresponds to the size of \f[I]entries\f[R] array. +.PP +\f[I]entries\f[R] array must be allocated such that it can contain all +the valid GID table entries of the device. +If there are more valid GID entries than the provided value of +\f[I]max_entries\f[R] and \f[I]entries\f[R] array, the call will fail. +For example, if a RDMA device \f[I]context\f[R] has a total of 10 valid +GID entries, \f[I]entries\f[R] should be allocated for at least 10 +entries, and \f[I]max_entries\f[R] should be set appropriately. +.SH ARGUMENTS +.TP +.B \f[I]context\f[R] +The context of the device to query. +.TP +.B \f[I]entries\f[R] +Array of ibv_gid_entry structs where the GID entries are returned. +Please see \f[B]ibv_query_gid_ex\f[R](3) man page for +\f[I]ibv_gid_entry\f[R]. +.TP +.B \f[I]max_entries\f[R] +Maximum number of entries that can be returned. +.TP +.B \f[I]flags\f[R] +Extra fields to query post \f[I]entries\->ndev_ifindex\f[R], for now +must be 0. 
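A minimal sketch of the calling convention described above; the array size of 16 is an assumption about the device's GID table, and the call fails if more valid entries exist:
```c
/* Sketch: dump the valid GID table entries of a device. */
#include <stdio.h>
#include <infiniband/verbs.h>

void dump_gids(struct ibv_context *ctx)
{
    struct ibv_gid_entry entries[16];
    ssize_t n = ibv_query_gid_table(ctx, entries, 16, 0);

    for (ssize_t i = 0; i < n; i++)
        printf("port %u index %u type %u\n", entries[i].port_num,
               entries[i].gid_index, entries[i].gid_type);
}
```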
+.SH RETURN VALUE +.PP +\f[B]ibv_query_gid_table()\f[R] returns the number of entries that were +read on success or negative errno value on error. +Number of entries returned is <= max_entries. +.SH SEE ALSO +.PP +\f[B]ibv_open_device\f[R](3), \f[B]ibv_query_device\f[R](3), +\f[B]ibv_query_port\f[R](3), \f[B]ibv_query_gid_ex\f[R](3) +.SH AUTHOR +.PP +Parav Pandit diff --git a/buildlib/pandoc-prebuilt/93db87fd38bdb056c0480494965253ba2b75235d b/buildlib/pandoc-prebuilt/93db87fd38bdb056c0480494965253ba2b75235d new file mode 100644 index 0000000..ad66446 --- /dev/null +++ b/buildlib/pandoc-prebuilt/93db87fd38bdb056c0480494965253ba2b75235d @@ -0,0 +1,380 @@ +.\" Automatically generated by Pandoc 2.5 +.\" +.TH "MLX5DV_DR API" "3" "2019\-03\-28" "mlx5" "mlx5 Programmer\[cq]s Manual" +.hy +.SH NAME +.PP +mlx5dv_dr_domain_create, mlx5dv_dr_domain_sync, +mlx5dv_dr_domain_destroy, mlx5dv_dr_domain_set_reclaim_device_memory \- +Manage flow domains +.PP +mlx5dv_dr_table_create, mlx5dv_dr_table_destroy \- Manage flow tables +.PP +mlx5dv_dr_matcher_create, mlx5dv_dr_matcher_destroy \- Manage flow +matchers +.PP +mlx5dv_dr_rule_create, mlx5dv_dr_rule_destroy \- Manage flow rules +.PP +mlx5dv_dr_action_create_drop \- Create drop action +.PP +mlx5dv_dr_action_create_default_miss \- Create default miss action +.PP +mlx5dv_dr_action_create_tag \- Create tag actions +.PP +mlx5dv_dr_action_create_dest_ibv_qp, mlx5dv_dr_action_create_dest_table, +mlx5dv_dr_action_create_dest_vport, +mlx5dv_dr_action_create_dest_devx_tir \- Create packet destination +actions +.PP +mlx5dv_dr_action_create_dest_array \- Create destination array action +.PP +mlx5dv_dr_action_create_packet_reformat \- Create packet reformat +actions +.PP +mlx5dv_dr_action_create_modify_header \- Create modify header actions +.PP +mlx5dv_dr_action_create_flow_counter \- Create devx flow counter actions +.PP +mlx5dv_dr_action_create_flow_meter, mlx5dv_dr_action_modify_flow_meter +\- Create and modify meter action +.PP +mlx5dv_dr_action_create_flow_sampler \- Create flow sampler action +.PP +mlx5dv_dr_action_destroy \- Destroy actions +.SH SYNOPSIS +.IP +.nf +\f[C] +#include + +struct mlx5dv_dr_domain *mlx5dv_dr_domain_create( + struct ibv_context *ctx, + enum mlx5dv_dr_domain_type type); + +int mlx5dv_dr_domain_sync( + struct mlx5dv_dr_domain *domain, + uint32_t flags); + +int mlx5dv_dr_domain_destroy(struct mlx5dv_dr_domain *domain); + +void mlx5dv_dr_domain_set_reclaim_device_memory( + struct mlx5dv_dr_domain *dmn, + bool enable); + +struct mlx5dv_dr_table *mlx5dv_dr_table_create( + struct mlx5dv_dr_domain *domain, + uint32_t level); + +int mlx5dv_dr_table_destroy(struct mlx5dv_dr_table *table); + +struct mlx5dv_dr_matcher *mlx5dv_dr_matcher_create( + struct mlx5dv_dr_table *table, + uint16_t priority, + uint8_t match_criteria_enable, + struct mlx5dv_flow_match_parameters *mask); + +int mlx5dv_dr_matcher_destroy(struct mlx5dv_dr_matcher *matcher); + +struct mlx5dv_dr_rule *mlx5dv_dr_rule_create( + struct mlx5dv_dr_matcher *matcher, + struct mlx5dv_flow_match_parameters *value, + size_t num_actions, + struct mlx5dv_dr_action *actions[]); + +void mlx5dv_dr_rule_destroy(struct mlx5dv_dr_rule *rule); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_drop(void); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_default_miss(void); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_tag( + uint32_t tag_value); + +struct mlx5dv_dr_action *mlx5dv_dr_action_create_dest_ibv_qp( + struct ibv_qp *ibqp); + +struct mlx5dv_dr_action 
*mlx5dv_dr_action_create_dest_table(
+        struct mlx5dv_dr_table *table);
+
+struct mlx5dv_dr_action *mlx5dv_dr_action_create_dest_vport(
+        struct mlx5dv_dr_domain *domain,
+        uint32_t vport);
+
+struct mlx5dv_dr_action *mlx5dv_dr_action_create_dest_devx_tir(
+        struct mlx5dv_devx_obj *devx_obj);
+
+struct mlx5dv_dr_action *mlx5dv_dr_action_create_packet_reformat(
+        struct mlx5dv_dr_domain *domain,
+        uint32_t flags,
+        enum mlx5dv_flow_action_packet_reformat_type reformat_type,
+        size_t data_sz, void *data);
+
+struct mlx5dv_dr_action *mlx5dv_dr_action_create_modify_header(
+        struct mlx5dv_dr_domain *domain,
+        uint32_t flags,
+        size_t actions_sz,
+        __be64 actions[]);
+
+struct mlx5dv_dr_action *mlx5dv_dr_action_create_flow_counter(
+        struct mlx5dv_devx_obj *devx_obj,
+        uint32_t offset);
+
+struct mlx5dv_dr_action *
+mlx5dv_dr_action_create_flow_meter(struct mlx5dv_dr_flow_meter_attr *attr);
+
+int mlx5dv_dr_action_modify_flow_meter(struct mlx5dv_dr_action *action,
+        struct mlx5dv_dr_flow_meter_attr *attr,
+        __be64 modify_field_select);
+
+struct mlx5dv_dr_action *
+mlx5dv_dr_action_create_flow_sampler(struct mlx5dv_dr_flow_sampler_attr *attr);
+
+struct mlx5dv_dr_action *
+mlx5dv_dr_action_create_dest_array(struct mlx5dv_dr_domain *domain,
+        size_t num_dest,
+        struct mlx5dv_dr_action_dest_attr *dests[]);
+
+int mlx5dv_dr_action_destroy(struct mlx5dv_dr_action *action);
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+The Direct Rule API (mlx5dv_dr_*) allows verbs applications complete
+access to the device\[cq]s packet steering functionality.
+.PP
+Steering flow rules are the combination of attributes with a match
+pattern and a list of actions.
+Rules can have several distinct actions (such as counting,
+encapsulating, decapsulating before redirecting packets to a particular
+queue or port, etc.).
+To manage the rule execution order for packet matching by HW, multiple
+flow tables in an ordered chain and multiple flow matchers sorted by
+priority are defined.
+.SS Domain
+.PP
+\f[I]mlx5dv_dr_domain_create()\f[R] creates a DR domain object to be
+used with \f[I]mlx5dv_dr_table_create()\f[R] and
+\f[I]mlx5dv_dr_action_create_*()\f[R].
+.PP
+A domain should be destroyed by calling
+\f[I]mlx5dv_dr_domain_destroy()\f[R] once all dependent resources are
+released.
+.PP
+The device supports the following domain types:
+.PP
+\f[B]MLX5DV_DR_DOMAIN_TYPE_NIC_RX\f[R] Manage Ethernet packets received
+on the NIC.
+Packets in this domain can be dropped, dispatched to QPs, modified or
+redirected to additional tables inside the domain.
+Default behavior: Drop packet.
+.PP
+\f[B]MLX5DV_DR_DOMAIN_TYPE_NIC_TX\f[R] Manage Ethernet packets
+transmitted on the NIC.
+Packets in this domain can be dropped, modified or redirected to
+additional tables inside the domain.
+Default behavior: Forward packet to NIC vport (to eSwitch or wire).
+.PP
+\f[B]MLX5DV_DR_DOMAIN_TYPE_FDB\f[R] Manage Ethernet packets in the
+eSwitch Forwarding Data Base for packets received from wire or from any
+other vport.
+Packets in this domain can be dropped, dispatched to a vport, modified
+or redirected to additional tables inside the domain.
+Default behavior: Forward packet to eSwitch manager vport.
+.PP
+\f[I]mlx5dv_dr_domain_sync()\f[R] is used to flush the rule submission
+queue.
+By default, rules in a domain are updated in HW asynchronously.
+\f[B]flags\f[R] should be a set of type \f[I]enum
+mlx5dv_dr_domain_sync_flags\f[R]:
+.PP
+\f[B]MLX5DV_DR_DOMAIN_SYNC_FLAGS_SW\f[R]: block until completion of all
+software queued tasks.
+.PP
+\f[B]MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW\f[R]: clear the steering HW cache to
+ensure the next packet hits the latest rules, in addition to the SW SYNC
+handling.
+.PP
+\f[B]MLX5DV_DR_DOMAIN_SYNC_FLAGS_MEM\f[R]: sync device memory to free
+cached memory.
+.PP
+\f[I]mlx5dv_dr_domain_set_reclaim_device_memory()\f[R] is used to enable
+the reclaiming of device memory back to the system when not in use; by
+default this feature is disabled.
+.SS Table
+.PP
+\f[I]mlx5dv_dr_table_create()\f[R] creates a DR table in the
+\f[B]domain\f[R], at the appropriate \f[B]level\f[R], and can be used
+with \f[I]mlx5dv_dr_matcher_create()\f[R] and
+\f[I]mlx5dv_dr_action_create_dest_table()\f[R].
+All packets start traversing the steering domain tree at table
+\f[B]level\f[R] zero (0).
+Using rules and actions, packets can be redirected to other tables in
+the domain.
+.PP
+A table should be destroyed by calling
+\f[I]mlx5dv_dr_table_destroy()\f[R] once all dependent resources are
+released.
+.SS Matcher
+.PP
+\f[I]mlx5dv_dr_matcher_create()\f[R] creates a matcher object in
+\f[B]table\f[R], at sorted \f[B]priority\f[R] (a lower value is checked
+first).
+A matcher can hold multiple rules, all with an identical \f[B]mask\f[R]
+of type \f[I]struct mlx5dv_flow_match_parameters\f[R], which represents
+the exact attributes to be compared by HW steering.
+The \f[B]match_criteria_enable\f[R] and \f[B]mask\f[R] are defined in a
+device spec format.
+Only the fields that were masked in the \f[I]matcher\f[R] should be
+filled by the rule in \f[I]mlx5dv_dr_rule_create()\f[R].
+.PP
+A matcher should be destroyed by calling
+\f[I]mlx5dv_dr_matcher_destroy()\f[R] once all dependent resources are
+released.
+.SS Actions
+.PP
+A set of action creation APIs is defined by
+\f[I]mlx5dv_dr_action_create_*()\f[R].
+All actions are created as \f[I]struct mlx5dv_dr_action\f[R].
+An action should be destroyed by calling
+\f[I]mlx5dv_dr_action_destroy()\f[R] once all dependent rules are
+destroyed.
+.PP
+When an action handle is reused for multiple rules, the same action will
+be executed.
+e.g.: action `count' will count multiple flow rules on the same HW flow
+counter context.
+action `drop' will drop packets of different rules from any matcher.
+.PP
+Action: Drop \f[I]mlx5dv_dr_action_create_drop\f[R] creates a
+terminating action which drops packets.
+Cannot be mixed with Destination actions.
+.PP
+Action: Default miss \f[I]mlx5dv_dr_action_create_default_miss\f[R]
+creates a terminating action which will execute the default behavior
+based on the domain type.
+.PP
+Action: Tag \f[I]mlx5dv_dr_action_create_tag\f[R] creates a
+non\-terminating action which tags packets with \f[B]tag_value\f[R].
+The \f[B]tag_value\f[R] is available in the CQE of the packet received.
+Valid only on domain type NIC_RX.
+.PP
+Action: Destination \f[I]mlx5dv_dr_action_create_dest_ibv_qp\f[R]
+creates a terminating action delivering the packet to a QP, defined by
+\f[B]ibqp\f[R].
+Valid only on domain type NIC_RX.
+\f[I]mlx5dv_dr_action_create_dest_table\f[R] creates a forwarding action
+to another flow table, defined by \f[B]table\f[R].
+The destination \f[B]table\f[R] must be from the same domain with a
+level higher than zero.
+\f[I]mlx5dv_dr_action_create_dest_vport\f[R] creates a forwarding action
+to a \f[B]vport\f[R] on the same \f[B]domain\f[R].
+Valid only on domain type FDB.
+\f[I]mlx5dv_dr_action_create_dest_devx_tir\f[R] creates a terminating
+action delivering the packet to a TIR, defined by \f[B]devx_obj\f[R].
+Valid only on domain type NIC_RX.
+.PP
+Action: Array \f[I]mlx5dv_dr_action_create_dest_array\f[R] creates an
+action which replicates a packet to multiple destinations.
+\f[B]num_dest\f[R] defines the number of replication destinations.
+Each \f[B]dests\f[R] destination array entry can be of a different
+\f[B]type\f[R].
+Use type MLX5DV_DR_ACTION_DEST for direct forwarding to an action
+destination.
+Use type MLX5DV_DR_ACTION_DEST_REFORMAT when a reformat action should be
+performed on the packet before it is forwarded to the destination
+action.
+.PP
+Action: Packet Reformat
+\f[I]mlx5dv_dr_action_create_packet_reformat\f[R] creates a packet
+reformat context and action in the \f[B]domain\f[R].
+The \f[B]reformat_type\f[R], \f[B]data_sz\f[R] and \f[B]data\f[R] are
+defined in \f[I]man mlx5dv_create_flow_action_packet_reformat\f[R].
+.PP
+Action: Modify Header \f[I]mlx5dv_dr_action_create_modify_header\f[R]
+creates a modify header context and action in the \f[B]domain\f[R].
+The \f[B]actions_sz\f[R] and \f[B]actions\f[R] are defined in \f[I]man
+mlx5dv_create_flow_action_modify_header\f[R].
+.PP
+Action: Flow Count \f[I]mlx5dv_dr_action_create_flow_counter\f[R]
+creates a flow counter action from a DEVX flow counter object, based on
+\f[B]devx_obj\f[R] and a specific counter index from \f[B]offset\f[R] in
+the counter bulk.
+.PP
+Action: Meter \f[I]mlx5dv_dr_action_create_flow_meter\f[R] creates a
+meter action based on the flow meter parameters.
+The parameters are according to the device specification.
+\f[I]mlx5dv_dr_action_modify_flow_meter\f[R] modifies an existing flow
+meter \f[B]action\f[R] based on \f[B]modify_field_select\f[R].
+\f[B]modify_field_select\f[R] is according to the device specification.
+.PP
+Action: Sampler \f[I]mlx5dv_dr_action_create_flow_sampler\f[R] creates a
+sampler action, allowing duplication and sampling of a portion of
+traffic.
+Packets steered to the sampler action will be sampled with an
+approximate probability of 1/sample_ratio provided in \f[B]attr\f[R],
+and the sample_actions provided in \f[B]attr\f[R] will be executed over
+them.
+All original packets will be steered to default_next_table in
+\f[B]attr\f[R].
+A modify header format SET_ACTION data can be provided in the action of
+\f[B]attr\f[R], which can be executed on packets before going to the
+default flow table.
+On some devices, this is required to set a register value.
+.PP
+Action Flags: action \f[B]flags\f[R] can be set to one of the types of
+\f[I]enum mlx5dv_dr_action_flags\f[R]:
+.PP
+\f[B]MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL\f[R]: is used to indicate that
+the action is targeted for a flow table at level=0 (ROOT) of the
+specific domain.
+.SS Rule
+.PP
+\f[I]mlx5dv_dr_rule_create()\f[R] creates a HW steering rule entry in
+\f[B]matcher\f[R].
+The \f[B]value\f[R] of type \f[I]struct
+mlx5dv_flow_match_parameters\f[R] holds the exact attribute values of
+the steering rule to be matched, in a device spec format.
+Only the fields that were masked in the \f[I]matcher\f[R] should be
+filled.
+HW will perform the set of \f[B]num_actions\f[R] from the
+\f[B]action\f[R] array of type \f[I]struct mlx5dv_dr_action\f[R], once a
+packet matches the exact \f[B]value\f[R] of the rule (referred to as a
+`hit').
+.PP
+\f[I]mlx5dv_dr_rule_destroy()\f[R] destroys the rule.
+.SH RETURN VALUE
+.PP
+The create API calls will return a pointer to the relevant object:
+table, matcher, action, rule.
+On failure, NULL will be returned and errno will be set.
+.PP
+The destroy API calls will return 0 on success, or the value of errno on
+failure (which indicates the failure reason).
+.SH LIMITATIONS
+.PP
+An application can verify whether a feature is supported by \f[I]trial
+and error\f[R].
+No capabilities are exposed, as the combination of all the exposed
+options is way too large to define.
+.PP
+Tables are sizeless by definition.
+They are expected to grow and shrink to accommodate all rules, according
+to driver capabilities.
+Once a limit is reached, an error is returned.
+.PP
+Matchers of the same priority, in the same table, have an undefined
+order.
+.PP
+A rule with a value pattern identical to another rule on a given matcher
+is rejected.
+.PP
+The IP version in the matcher mask and rule should be equal and set to
+4, 6 or 0.
+.SH SEE ALSO
+.PP
+\f[B]mlx5dv_open_device(3)\f[R],
+\f[B]mlx5dv_create_flow_action_packet_reformat(3)\f[R],
+\f[B]mlx5dv_create_flow_action_modify_header(3)\f[R].
+.SH AUTHOR
+.PP
+Alex Rosenbaum Alex Vesker
diff --git a/buildlib/pandoc-prebuilt/972a32a8debfec8e394c32769fd0d69e06a946ef b/buildlib/pandoc-prebuilt/972a32a8debfec8e394c32769fd0d69e06a946ef
new file mode 100644
index 0000000..b4b8825
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/972a32a8debfec8e394c32769fd0d69e06a946ef
@@ -0,0 +1,72 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "IBV_SET_ECE" "3" "2020\-01\-22" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_set_ece \- set ECE options and use them for the QP configuration
+stage.
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <infiniband/verbs.h>
+
+int ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece);
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_set_ece()\f[R] sets ECE options and uses them for the QP
+configuration stage.
+.PP
+The desired ECE options will be used during the various modify QP stages
+based on the supported options in the relevant QP state.
+.SH ARGUMENTS
+.TP
+.B \f[I]qp\f[R]
+The queue pair (QP) associated with the ECE options.
+.TP
+.B \f[I]ece\f[R]
+The requested ECE values.
+This is an IN/OUT field; the accepted options will be returned in this
+field.
+.IP
+.nf
+\f[C]
+struct ibv_ece {
+        uint32_t vendor_id;
+        uint32_t options;
+        uint32_t comp_mask;
+};
+\f[R]
+.fi
+.TP
+.B \f[I]vendor_id\f[R]
+Unique identifier of the provider vendor on the network.
+Providers will set their IEEE OUI here to distinguish themselves in a
+non\-homogeneous network.
+.TP
+.B \f[I]options\f[R]
+Provider specific attributes which are supported or need to be enabled
+by ECE users.
+.TP
+.B \f[I]comp_mask\f[R]
+Bitmask specifying what fields in the structure are valid.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_set_ece()\f[R] returns 0 when the call was successful, or the
+errno value which indicates the failure reason.
+.TP
+.B \f[I]EOPNOTSUPP\f[R]
+libibverbs or the provider driver doesn\[cq]t support the ibv_set_ece()
+verb.
+.TP
+.B \f[I]EINVAL\f[R]
+In one of the following: the QP is invalid, or the ECE options are
+invalid.
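A hedged sketch of the IN/OUT convention described above; `remote` stands for ECE values obtained out of band from the peer:
```c
/* Sketch: apply the peer's ECE options to a QP before modify QP. */
#include <infiniband/verbs.h>

int apply_ece(struct ibv_qp *qp, const struct ibv_ece *remote)
{
    struct ibv_ece ece = *remote;   /* IN/OUT: accepted options come back */
    int ret = ibv_set_ece(qp, &ece);
    if (ret)
        return ret;                 /* e.g. EOPNOTSUPP or EINVAL */
    /* ece.options now holds what the provider accepted */
    return 0;
}
```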
+.SH SEE ALSO +.PP +\f[B]ibv_query_ece\f[R](3), +.SH AUTHOR +.PP +Leon Romanovsky diff --git a/buildlib/pandoc-prebuilt/983dc82fa7ae24ca010e5d6e9d76e86725041150 b/buildlib/pandoc-prebuilt/983dc82fa7ae24ca010e5d6e9d76e86725041150 index bb86789..f674b1a 100644 --- a/buildlib/pandoc-prebuilt/983dc82fa7ae24ca010e5d6e9d76e86725041150 +++ b/buildlib/pandoc-prebuilt/983dc82fa7ae24ca010e5d6e9d76e86725041150 @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_REQ_NOTIFY_CQ" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_REQ_NOTIFY_CQ" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -10,38 +10,38 @@ queue (CQ) .IP .nf \f[C] -#include\ +#include -int\ ibv_req_notify_cq(struct\ ibv_cq\ *cq,\ int\ solicited_only); -\f[] +int ibv_req_notify_cq(struct ibv_cq *cq, int solicited_only); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_req_notify_cq()\f[] requests a completion notification on the -completion queue (CQ) \f[I]cq\f[]. +\f[B]ibv_req_notify_cq()\f[R] requests a completion notification on the +completion queue (CQ) \f[I]cq\f[R]. .PP -Upon the addition of a new CQ entry (CQE) to \f[I]cq\f[], a completion +Upon the addition of a new CQ entry (CQE) to \f[I]cq\f[R], a completion event will be added to the completion channel associated with the CQ. -If the argument \f[I]solicited_only\f[] is zero, a completion event is +If the argument \f[I]solicited_only\f[R] is zero, a completion event is generated for any new CQE. -If \f[I]solicited_only\f[] is non\-zero, an event is only generated for -a new CQE with that is considered "solicited." A CQE is solicited if it -is a receive completion for a message with the Solicited Event header -bit set, or if the status is not successful. +If \f[I]solicited_only\f[R] is non\-zero, an event is only generated for +a new CQE with that is considered \[lq]solicited.\[rq] A CQE is +solicited if it is a receive completion for a message with the Solicited +Event header bit set, or if the status is not successful. All other successful receive completions, or any successful send completion is unsolicited. .SH RETURN VALUE .PP -\f[B]ibv_req_notify_cq()\f[] returns 0 on success, or the value of errno -on failure (which indicates the failure reason). +\f[B]ibv_req_notify_cq()\f[R] returns 0 on success, or the value of +errno on failure (which indicates the failure reason). .SH NOTES .PP -The request for notification is "one shot." Only one completion event -will be generated for each call to \f[B]ibv_req_notify_cq()\f[]. +The request for notification is \[lq]one shot.\[rq] Only one completion +event will be generated for each call to \f[B]ibv_req_notify_cq()\f[R]. 
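A minimal sketch of the one-shot re-arm pattern the NOTES describe; `ch` is assumed to be the completion channel the CQ was created with:
```c
/* Sketch: wait for one CQ event, acknowledge it, and re-arm. */
#include <infiniband/verbs.h>

int wait_for_cqe(struct ibv_comp_channel *ch, struct ibv_cq *cq)
{
    struct ibv_cq *ev_cq;
    void *ev_ctx;

    if (ibv_req_notify_cq(cq, 0))       /* arm: any new CQE raises an event */
        return -1;
    if (ibv_get_cq_event(ch, &ev_cq, &ev_ctx))
        return -1;
    ibv_ack_cq_events(ev_cq, 1);
    return ibv_req_notify_cq(ev_cq, 0); /* re-arm before polling the CQ */
}
```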
.SH SEE ALSO .PP -\f[B]ibv_create_comp_channel\f[](3), \f[B]ibv_create_cq\f[](3), -\f[B]ibv_get_cq_event\f[](3) +\f[B]ibv_create_comp_channel\f[R](3), \f[B]ibv_create_cq\f[R](3), +\f[B]ibv_get_cq_event\f[R](3) .SH AUTHOR .PP Dotan Barak diff --git a/buildlib/pandoc-prebuilt/a87ace1daaff861a7854e9773f09a3467c40f02e b/buildlib/pandoc-prebuilt/a87ace1daaff861a7854e9773f09a3467c40f02e index 4c02006..81a9285 100644 --- a/buildlib/pandoc-prebuilt/a87ace1daaff861a7854e9773f09a3467c40f02e +++ b/buildlib/pandoc-prebuilt/a87ace1daaff861a7854e9773f09a3467c40f02e @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_devx_obj_create / destroy / modify /query / general" "3" "" "" "" .hy @@ -21,23 +21,23 @@ interface .IP .nf \f[C] -#include\ +#include -struct\ mlx5dv_devx_obj\ * -mlx5dv_devx_obj_create(struct\ ibv_context\ *context,\ const\ void\ *in,\ size_t\ inlen, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen); -int\ mlx5dv_devx_obj_query(struct\ mlx5dv_devx_obj\ *obj,\ const\ void\ *in,\ size_t\ inlen, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen); -int\ mlx5dv_devx_obj_query_async(struct\ mlx5dv_devx_obj\ *obj,\ const\ void\ *in, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ inlen,\ size_t\ outlen, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ wr_id, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_devx_cmd_comp\ *cmd_comp); -int\ mlx5dv_devx_obj_modify(struct\ mlx5dv_devx_obj\ *obj,\ const\ void\ *in,\ size_t\ inlen, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen); -int\ mlx5dv_devx_obj_destroy(struct\ mlx5dv_devx_obj\ *obj); -int\ mlx5dv_devx_general_cmd(struct\ ibv_context\ *context,\ const\ void\ *in,\ size_t\ inlen, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *out,\ size_t\ outlen); -\f[] +struct mlx5dv_devx_obj * +mlx5dv_devx_obj_create(struct ibv_context *context, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_obj_query(struct mlx5dv_devx_obj *obj, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_obj_query_async(struct mlx5dv_devx_obj *obj, const void *in, + size_t inlen, size_t outlen, + uint64_t wr_id, + struct mlx5dv_devx_cmd_comp *cmd_comp); +int mlx5dv_devx_obj_modify(struct mlx5dv_devx_obj *obj, const void *in, size_t inlen, + void *out, size_t outlen); +int mlx5dv_devx_obj_destroy(struct mlx5dv_devx_obj *obj); +int mlx5dv_devx_general_cmd(struct ibv_context *context, const void *in, size_t inlen, + void *out, size_t outlen); +\f[R] .fi .SH DESCRIPTION .PP @@ -68,11 +68,11 @@ which is not related to an object such as query device capabilities. The mlx5dv_devx_obj_query_async() API is similar to the query object API, however, it runs asynchronously without blocking. The input includes an mlx5dv_devx_cmd_comp object and an identifier -named \[aq]wr_id\[aq] for this command. +named `wr_id' for this command. The response should be read upon success with the mlx5dv_devx_get_async_cmd_comp() API. -The \[aq]wr_id\[aq] that was supplied as an input is returned as part of -the response to let application knows for which command the response is +The `wr_id' that was supplied as an input is returned as part of the +response to let application knows for which command the response is related to. .PP An application can gradually migrate to use DEVX according to its needs, @@ -92,63 +92,48 @@ and data path. 
.PP
To successfully create a DEVX object and work on, a DEVX context must
be created, this is done by the mlx5dv_open_device() API with the
-\f[I]MLX5DV_CONTEXT_FLAGS_DEVX\f[] flag.
+\f[I]MLX5DV_CONTEXT_FLAGS_DEVX\f[R] flag.
.SH ARGUMENTS
.TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
RDMA device context to create the action on.
-.RS
-.RE
.TP
-.B \f[I]in\f[]
-A buffer which contains the command\[aq]s input data provided in a
+.B \f[I]in\f[R]
+A buffer which contains the command\[cq]s input data provided in a
device specification format.
-.RS
-.RE
.TP
-.B \f[I]inlen\f[]
-The size of \f[I]in\f[] buffer in bytes.
-.RS
-.RE
+.B \f[I]inlen\f[R]
+The size of \f[I]in\f[R] buffer in bytes.
.TP
-.B \f[I]out\f[]
-A buffer which contains the command\[aq]s output data according to the
+.B \f[I]out\f[R]
+A buffer which contains the command\[cq]s output data according to the
device specification format.
-.RS
-.RE
.TP
-.B \f[I]outlen\f[]
-The size of \f[I]out\f[] buffer in bytes.
-.RS
-.RE
+.B \f[I]outlen\f[R]
+The size of \f[I]out\f[R] buffer in bytes.
.TP
-.B \f[I]obj\f[]
+.B \f[I]obj\f[R]
For query, modify, destroy: the devx object to work on.
-.RS
-.RE
.TP
-.B \f[I]wr_id\f[]
+.B \f[I]wr_id\f[R]
The command identifier when working in asynchronous mode.
-.RS
-.RE
.TP
-.B \f[I]cmd_comp\f[]
+.B \f[I]cmd_comp\f[R]
The command completion object to read the response from in asynchronous
mode.
-.RS
-.RE
.SH RETURN VALUE
.PP
-Upon success \f[I]mlx5dv_devx_create_obj\f[] will return a new
-\f[I]struct mlx5dv_devx_obj\f[] on error NULL will be returned and errno
-will be set.
+Upon success \f[I]mlx5dv_devx_create_obj\f[R] will return a new
+\f[I]struct mlx5dv_devx_obj\f[R]; on error NULL will be returned and
+errno will be set.
.PP
Upon success query, modify, destroy, general commands, 0 is returned or
the value of errno on a failure.
.SH SEE ALSO
.PP
-\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_create_cmd_comp\f[],
-\f[B]mlx5dv_devx_get_async_cmd_comp\f[]
+\f[B]mlx5dv_open_device\f[R], \f[B]mlx5dv_devx_create_cmd_comp\f[R],
+\f[B]mlx5dv_devx_get_async_cmd_comp\f[R]
.SH AUTHOR
.PP
Yishai Hadas 
diff --git a/buildlib/pandoc-prebuilt/a8ffa02582b1604aa43cc72efd3bcde41e65b4cd b/buildlib/pandoc-prebuilt/a8ffa02582b1604aa43cc72efd3bcde41e65b4cd
index fadb209..87776e1 100644
--- a/buildlib/pandoc-prebuilt/a8ffa02582b1604aa43cc72efd3bcde41e65b4cd
+++ b/buildlib/pandoc-prebuilt/a8ffa02582b1604aa43cc72efd3bcde41e65b4cd
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
.\"
-.TH "IBV_FORK_INIT" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.TH "IBV_FORK_INIT" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
.hy
.SH NAME
.PP
@@ -9,52 +9,53 @@ ibv_fork_init \- initialize libibverbs to support fork()
.IP
.nf
\f[C]
-#include\ 
+#include 

-int\ ibv_fork_init(void);
-\f[]
+int ibv_fork_init(void);
+\f[R]
.fi
.SH DESCRIPTION
.PP
-\f[B]ibv_fork_init()\f[] initializes libibverbs\[aq]s data structures to
-handle \f[B]fork()\f[] function calls correctly and avoid data
-corruption, whether \f[B]fork()\f[] is called explicitly or implicitly
-(such as in \f[B]system()\f[]).
+\f[B]ibv_fork_init()\f[R] initializes libibverbs\[cq]s data structures
+to handle \f[B]fork()\f[R] function calls correctly and avoid data
+corruption, whether \f[B]fork()\f[R] is called explicitly or implicitly
+(such as in \f[B]system()\f[R]).
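+.PP
+As a short illustration (an addition to the original text), the call
+is made once, before any other libibverbs resources are created:
+.IP
+.nf
+\f[C]
+/* call once, before opening devices or registering memory */
+if (ibv_fork_init())
+    exit(1);
+
+pid_t pid = fork();
+if (pid == 0) {
+    /* child: typically replaces itself via exec() */
+    execl(\[dq]/bin/true\[dq], \[dq]true\[dq], (char *)NULL);
+    _exit(1);
+}
+\f[R]
+.fi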
.PP It is not necessary to use this function if all parent process threads are always blocked until all child processes end or change address -spaces via an \f[B]exec()\f[] operation. +spaces via an \f[B]exec()\f[R] operation. .SH RETURN VALUE .PP -\f[B]ibv_fork_init()\f[] returns 0 on success, or the value of errno on +\f[B]ibv_fork_init()\f[R] returns 0 on success, or the value of errno on failure (which indicates the failure reason). .SH NOTES .PP -\f[B]ibv_fork_init()\f[] works on Linux kernels supporting the -\f[B]MADV_DONTFORK\f[] flag for \f[B]madvise()\f[] (2.6.17 and higher). +\f[B]ibv_fork_init()\f[R] works on Linux kernels supporting the +\f[B]MADV_DONTFORK\f[R] flag for \f[B]madvise()\f[R] (2.6.17 and +higher). .PP -Setting the environment variable \f[B]RDMAV_FORK_SAFE\f[] or -\f[B]IBV_FORK_SAFE\f[] has the same effect as calling -\f[B]ibv_fork_init()\f[]. +Setting the environment variable \f[B]RDMAV_FORK_SAFE\f[R] or +\f[B]IBV_FORK_SAFE\f[R] has the same effect as calling +\f[B]ibv_fork_init()\f[R]. .PP -Setting the environment variable \f[B]RDMAV_HUGEPAGES_SAFE\f[] tells the -library to check the underlying page size used by the kernel for memory -regions. +Setting the environment variable \f[B]RDMAV_HUGEPAGES_SAFE\f[R] tells +the library to check the underlying page size used by the kernel for +memory regions. This is required if an application uses huge pages either directly or indirectly via a library such as libhugetlbfs. .PP -Calling \f[B]ibv_fork_init()\f[] will reduce performance due to an extra -system call for every memory registration, and the additional memory -allocated to track memory regions. +Calling \f[B]ibv_fork_init()\f[R] will reduce performance due to an +extra system call for every memory registration, and the additional +memory allocated to track memory regions. The precise performance impact depends on the workload and usually will not be significant. .PP -Setting \f[B]RDMAV_HUGEPAGES_SAFE\f[] adds further overhead to all +Setting \f[B]RDMAV_HUGEPAGES_SAFE\f[R] adds further overhead to all memory registrations. .SH SEE ALSO .PP -\f[B]exec\f[](3), \f[B]fork\f[](2), \f[B]ibv_get_device_list\f[](3), -\f[B]system\f[](3), \f[B]wait\f[](2) +\f[B]exec\f[R](3), \f[B]fork\f[R](2), \f[B]ibv_get_device_list\f[R](3), +\f[B]system\f[R](3), \f[B]wait\f[R](2) .SH AUTHOR .PP Dotan Barak diff --git a/buildlib/pandoc-prebuilt/a91b9346f932b9f38e4ce3ec5ee815fd39fa0a91 b/buildlib/pandoc-prebuilt/a91b9346f932b9f38e4ce3ec5ee815fd39fa0a91 index 9850df6..39871c7 100644 --- a/buildlib/pandoc-prebuilt/a91b9346f932b9f38e4ce3ec5ee815fd39fa0a91 +++ b/buildlib/pandoc-prebuilt/a91b9346f932b9f38e4ce3ec5ee815fd39fa0a91 @@ -1,4 +1,4 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" .TH "mlx5dv_devx_create_event_channel, mlx5dv_devx_destroy_event_channel" "3" "" "" "" .hy @@ -12,18 +12,18 @@ mlx5dv_devx_destroy_event_channel \- Destroy a DEVX event channel. 
.IP
.nf
\f[C]
-#include\ 
+#include 

-struct\ mlx5dv_devx_event_channel\ {
-\ \ \ \ int\ fd;
+struct mlx5dv_devx_event_channel {
+    int fd;
};

-struct\ mlx5dv_devx_event_channel\ *
-mlx5dv_devx_create_event_channel(struct\ ibv_context\ *context,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ enum\ mlx5dv_devx_create_event_channel_flags\ flags)
+struct mlx5dv_devx_event_channel *
+mlx5dv_devx_create_event_channel(struct ibv_context *context,
+                                 enum mlx5dv_devx_create_event_channel_flags flags)

-void\ mlx5dv_devx_destroy_event_channel(struct\ mlx5dv_devx_event_channel\ *event_channel)
-\f[]
+void mlx5dv_devx_destroy_event_channel(struct mlx5dv_devx_event_channel *event_channel)
+\f[R]
.fi
.SH DESCRIPTION
.PP
@@ -35,29 +35,26 @@ This lets an application to subscribe to get device events and once an
event occurred read it from this object.
.SH ARGUMENTS
.TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
.IP
.nf
\f[C]
-RDMA\ device\ context\ to\ create\ the\ channel\ on.
-\f[]
+RDMA device context to create the channel on.
+\f[R]
.fi
-.RS
-.RE
.TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
MLX5DV_DEVX_CREATE_EVENT_CHANNEL_FLAGS_OMIT_EV_DATA: omit the event data
on this channel.
-.RS
-.RE
.SH RETURN VALUE
.PP
-Upon success \f[I]mlx5dv_devx_create_event_channel\f[] will return a new
-\f[I]struct mlx5dv_devx_event_channel\f[] object, on error NULL will be
-returned and errno will be set.
+Upon success \f[I]mlx5dv_devx_create_event_channel\f[R] will return a
+new \f[I]struct mlx5dv_devx_event_channel\f[R] object, on error NULL
+will be returned and errno will be set.
.SH SEE ALSO
.PP
-\f[I]mlx5dv_open_device(3)\f[], \f[I]mlx5dv_devx_obj_create(3)\f[]
+\f[I]mlx5dv_open_device(3)\f[R], \f[I]mlx5dv_devx_obj_create(3)\f[R]
.SH AUTHOR
.PP
Yishai Hadas 
diff --git a/buildlib/pandoc-prebuilt/b4a6bc6bbb2f05ddc2593766851a6aaf9fd4d306 b/buildlib/pandoc-prebuilt/b4a6bc6bbb2f05ddc2593766851a6aaf9fd4d306
index 1055bb0..118c804 100644
--- a/buildlib/pandoc-prebuilt/b4a6bc6bbb2f05ddc2593766851a6aaf9fd4d306
+++ b/buildlib/pandoc-prebuilt/b4a6bc6bbb2f05ddc2593766851a6aaf9fd4d306
@@ -1,4 +1,4 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
.\"
.TH "mlx5dv_pp_alloc / mlx5dv_pp_free" "3" "" "" ""
.hy
.SH NAME
.PP
mlx5dv_pp_free \- Frees a packet pacing entry
.IP
.nf
\f[C]
-#include\ 
+#include 

-struct\ mlx5dv_pp\ *
-mlx5dv_pp_alloc(struct\ ibv_context\ *context,
-\ \ \ \ \ \ \ \ size_t\ pp_context_sz,
-\ \ \ \ \ \ \ \ const\ void\ *pp_context,
-\ \ \ \ \ \ \ \ uint32_t\ flags);
+struct mlx5dv_pp *
+mlx5dv_pp_alloc(struct ibv_context *context,
+        size_t pp_context_sz,
+        const void *pp_context,
+        uint32_t flags);

-void\ mlx5dv_pp_free(struct\ mlx5dv_pp\ *dv_pp);
-\f[]
+void mlx5dv_pp_free(struct mlx5dv_pp *dv_pp);
+\f[R]
.fi
.SH DESCRIPTION
.PP
@@ -32,48 +32,38 @@ device driver, the packet pacing information is needed for few commands
where a packet pacing index is needed.
.SH ARGUMENTS
.TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
RDMA device context to work on, need to be opened with DEVX support by
using mlx5dv_open_device().
-.RS
-.RE
.TP
-.B \f[I]pp_context_sz\f[]
-Length of \f[I]pp_context\f[] input buffer.
-.RS
-.RE
+.B \f[I]pp_context_sz\f[R]
+Length of \f[I]pp_context\f[R] input buffer.
.TP
-.B \f[I]pp_context\f[]
+.B \f[I]pp_context\f[R]
Packet pacing context according to the device specification.
-.RS
-.RE
.TP
-.B \f[I]flags\f[]
+.B \f[I]flags\f[R]
MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX: allocate a dedicated index.
-.RS
-.RE
.SS dv_pp
.IP
.nf
\f[C]
-struct\ mlx5dv_pp\ {
-\ \ \ \ uint16_t\ index;
+struct mlx5dv_pp {
+    uint16_t index;
};
-\f[]
+\f[R]
.fi
.TP
-.B \f[I]index\f[]
+.B \f[I]index\f[R]
The device index to be used.
-.RS
-.RE
.SH RETURN VALUE
.PP
-Upon success \f[I]mlx5dv_pp_alloc\f[] returns a pointer to the created
+Upon success \f[I]mlx5dv_pp_alloc\f[R] returns a pointer to the created
packet pacing object, on error NULL will be returned and errno will be
set.
.SH SEE ALSO
.PP
-\f[B]mlx5dv_open_device\f[], \f[B]mlx5dv_devx_obj_create\f[]
+\f[B]mlx5dv_open_device\f[R], \f[B]mlx5dv_devx_obj_create\f[R]
.SH AUTHOR
.PP
Yishai Hadas 
diff --git a/buildlib/pandoc-prebuilt/b680cb65b283c73a1e31eab15c2fc6c4f9d10da5 b/buildlib/pandoc-prebuilt/b680cb65b283c73a1e31eab15c2fc6c4f9d10da5
new file mode 100644
index 0000000..de7c25d
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/b680cb65b283c73a1e31eab15c2fc6c4f9d10da5
@@ -0,0 +1,120 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "IBV_ADVISE_MR" "3" "2018\-10\-19" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_advise_mr \- Gives advice or directions to the kernel about an
+address range belonging to a memory region (MR).
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include 
+
+int ibv_advise_mr(struct ibv_pd *pd,
+                  enum ibv_advise_mr_advice advice,
+                  uint32_t flags,
+                  struct ibv_sge *sg_list,
+                  uint32_t num_sge)
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_advise_mr()\f[R] gives advice or directions to the kernel
+about an address range belonging to a memory region (MR).
+Applications that are aware of future access patterns can use this verb
+in order to leverage this knowledge to improve system or application
+performance.
+.PP
+\f[B]Conventional advice values\f[R]
+.TP
+.B \f[I]IBV_ADVISE_MR_ADVICE_PREFETCH\f[R]
+Pre\-fetch a range of an on\-demand paging MR.
+Make pages present with read\-only permission before the actual IO is
+conducted.
+This would provide a way to reduce latency by overlapping paging\-in and
+either compute time or IO to other ranges.
+.TP
+.B \f[I]IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE\f[R]
+Like IBV_ADVISE_MR_ADVICE_PREFETCH but with read\-access and
+write\-access permission to the fetched memory.
+.TP
+.B \f[I]IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT\f[R]
+Pre\-fetch a range of an on\-demand paging MR without faulting.
+This allows pages that are already present in the CPU to become present
+to the device.
+.SH ARGUMENTS
+.TP
+.B \f[I]pd\f[R]
+The protection domain (PD) associated with the MR.
+.TP
+.B \f[I]advice\f[R]
+The requested advise value (as listed above).
+.TP
+.B \f[I]flags\f[R]
+Describes the properties of the advise operation.
+\f[I]IBV_ADVISE_MR_FLAG_FLUSH\f[R]: Request a synchronized operation;
+return to the caller only after the operation is completed.
+.TP
+.B \f[I]sg_list\f[R]
+Pointer to the s/g array.
+When using IBV_ADVISE_OP_PREFETCH advise value, all the lkeys of all
+the scatter gather elements (SGEs) must be associated with ODP MRs (MRs
+that were registered with IBV_ACCESS_ON_DEMAND).
+.TP
+.B \f[I]num_sge\f[R]
+Number of elements in the s/g array.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_advise_mr()\f[R] returns 0 when the call was successful, or the
+value of errno on failure (which indicates the failure reason).
+.TP
+.B \f[I]EOPNOTSUPP\f[R]
+libibverbs or provider driver doesn\[cq]t support the ibv_advise_mr()
+verb (ENOSYS may sometimes be returned by old versions of libibverbs).
+.TP
+.B \f[I]ENOTSUP\f[R]
+The advise operation isn\[cq]t supported.
+.TP
+.B \f[I]EFAULT\f[R]
+In one of the following cases: the requested range is out of the MR
+bounds, or parts of it are not part of the process address space, or
+one of the lkeys provided in the scatter/gather list is invalid or has
+the wrong write access.
+.TP
+.B \f[I]EINVAL\f[R]
+In one of the following cases: the PD is invalid, or the flags are
+invalid.
+.SH NOTES
+.PP
+An application may pre\-fetch any address range within an ODP MR when
+using the \f[B]IBV_ADVISE_MR_ADVICE_PREFETCH\f[R] or
+\f[B]IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE\f[R] advice.
+Semantically, this operation is best\-effort.
+That means the kernel does not guarantee that the underlying pages are
+updated in the HCA or that the pre\-fetched pages will remain resident.
+.PP
+When using \f[B]IBV_ADVISE_MR_ADVICE_PREFETCH\f[R] or
+\f[B]IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE\f[R] advice, the operation is
+done in the following stages: first, page in the user pages to memory
+(the pages aren\[cq]t pinned); then get the DMA mapping of these user
+pages; finally, post the underlying page translations to the HCA.
+.PP
+If \f[B]IBV_ADVISE_MR_FLAG_FLUSH\f[R] is specified then the underlying
+pages are guaranteed to be updated in the HCA before returning
+successfully.
+Otherwise the driver can choose to postpone the posting of the new
+translations to the HCA.
+When performing a local RDMA access operation it is recommended to use
+the IBV_ADVISE_MR_FLAG_FLUSH flag with one of the pre\-fetch advices to
+increase the probability that the page translations are valid in the
+HCA and to avoid future page faults.
+.SH SEE ALSO
+.PP
+\f[B]ibv_reg_mr\f[R](3), \f[B]ibv_rereg_mr\f[R](3),
+\f[B]ibv_dereg_mr\f[R](3)
+.SH AUTHOR
+.PP
+Aviad Yehezkel 
diff --git a/buildlib/pandoc-prebuilt/bc330f50986a4c202ab66bc12d82b6904bff909f b/buildlib/pandoc-prebuilt/bc330f50986a4c202ab66bc12d82b6904bff909f
index 7b980ff..a8f205e 100644
--- a/buildlib/pandoc-prebuilt/bc330f50986a4c202ab66bc12d82b6904bff909f
+++ b/buildlib/pandoc-prebuilt/bc330f50986a4c202ab66bc12d82b6904bff909f
@@ -1,4 +1,4 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
.\"
.TH "EFADV" "7" "2019\-01\-19" "efa" "EFA Direct Verbs Manual"
.hy
.SH NAME
.PP
@@ -25,7 +25,7 @@ The direct include of efadv.h together with linkage to efa library will
allow usage of this new interface.
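+.SH EXAMPLE
+.PP
+A hedged sketch (an addition; it assumes the
+\f[B]efadv_query_device()\f[R] entry point and its (context, attr,
+inlen) signature): usage amounts to including the header and linking
+with \f[I]\-lefa\f[R].
+.IP
+.nf
+\f[C]
+#include <infiniband/efadv.h>
+
+/* ctx is an ibv_context opened on an EFA device */
+struct efadv_device_attr attr = {0};
+
+if (efadv_query_device(ctx, &attr, sizeof(attr)))
+    fprintf(stderr, \[dq]not an EFA device or query failed\[rs]n\[dq]);
+\f[R]
+.fi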
.SH SEE ALSO .PP -\f[B]verbs\f[](7) +\f[B]verbs\f[R](7) .SH AUTHORS .PP Gal Pressman diff --git a/buildlib/pandoc-prebuilt/bde0f0fb11d80958e182842cb166935bb5be3347 b/buildlib/pandoc-prebuilt/bde0f0fb11d80958e182842cb166935bb5be3347 index 024e42c..0332bb3 100644 --- a/buildlib/pandoc-prebuilt/bde0f0fb11d80958e182842cb166935bb5be3347 +++ b/buildlib/pandoc-prebuilt/bde0f0fb11d80958e182842cb166935bb5be3347 @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_GET_PKEY_INDEX" "3" "2018\-07\-16" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_GET_PKEY_INDEX" "3" "2018\-07\-16" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -9,32 +9,32 @@ ibv_get_pkey_index \- obtain the index in the P_Key table of a P_Key .IP .nf \f[C] -#include\ +#include -int\ ibv_get_pkey_index(struct\ ibv_context\ *context, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint8_t\ port_num, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ __be16\ pkey); -\f[] +int ibv_get_pkey_index(struct ibv_context *context, + uint8_t port_num, + __be16 pkey); +\f[R] .fi .SH DESCRIPTION .PP Every InfiniBand HCA maintains a P_Key table for each of its ports that is indexed by an integer and with a P_Key in each element. Certain InfiniBand data structures that work with P_Keys expect a P_Key -index, e.g. -\f[B]struct ibv_qp_attr\f[] and \f[B]struct ib_mad_addr\f[]. -Hence the function \f[B]ibv_get_pkey_index()\f[] that accepts a P_Key in -network byte order and that returns an index in the P_Key table as +index, e.g.\ \f[B]struct ibv_qp_attr\f[R] and \f[B]struct +ib_mad_addr\f[R]. +Hence the function \f[B]ibv_get_pkey_index()\f[R] that accepts a P_Key +in network byte order and that returns an index in the P_Key table as result. .SH RETURN VALUE .PP -\f[B]ibv_get_pkey_index()\f[] returns the P_Key index on success, and +\f[B]ibv_get_pkey_index()\f[R] returns the P_Key index on success, and \-1 on error. .SH SEE ALSO .PP -\f[B]ibv_open_device\f[](3), \f[B]ibv_query_device\f[](3), -\f[B]ibv_query_gid\f[](3), \f[B]ibv_query_pkey\f[](3), -\f[B]ibv_query_port\f[](3) +\f[B]ibv_open_device\f[R](3), \f[B]ibv_query_device\f[R](3), +\f[B]ibv_query_gid\f[R](3), \f[B]ibv_query_pkey\f[R](3), +\f[B]ibv_query_port\f[R](3) .SH AUTHOR .PP Bart Van Assche diff --git a/buildlib/pandoc-prebuilt/c0c239f1fb706358d4ee439f21164f7fb0662c86 b/buildlib/pandoc-prebuilt/c0c239f1fb706358d4ee439f21164f7fb0662c86 index d03ec65..47d7c09 100644 --- a/buildlib/pandoc-prebuilt/c0c239f1fb706358d4ee439f21164f7fb0662c86 +++ b/buildlib/pandoc-prebuilt/c0c239f1fb706358d4ee439f21164f7fb0662c86 @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_RATE_TO_MBPS" "3" "2012\-03\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_RATE_TO_MBPS" "3" "2012\-03\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -11,33 +11,33 @@ mbps_to_ibv_rate \- convert Mbit/sec to an IB rate enumeration .IP .nf \f[C] -#include\ +#include -int\ ibv_rate_to_mbps(enum\ ibv_rate\ rate); +int ibv_rate_to_mbps(enum ibv_rate rate); -enum\ ibv_rate\ mbps_to_ibv_rate(int\ mbps); -\f[] +enum ibv_rate mbps_to_ibv_rate(int mbps); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_rate_to_mbps()\f[] converts the IB transmission rate -enumeration \f[I]rate\f[] to a number of Mbit/sec. -For example, if \f[I]rate\f[] is \f[B]IBV_RATE_5_GBPS\f[], the value -5000 will be returned (5 Gbit/sec = 5000 Mbit/sec). 
+\f[B]ibv_rate_to_mbps()\f[R] converts the IB transmission rate +enumeration \f[I]rate\f[R] to a number of Mbit/sec.\ For example, if +\f[I]rate\f[R] is \f[B]IBV_RATE_5_GBPS\f[R], the value 5000 will be +returned (5 Gbit/sec = 5000 Mbit/sec). .PP -\f[B]mbps_to_ibv_rate()\f[] converts the number of Mbit/sec -\f[I]mult\f[] to an IB transmission rate enumeration. -For example, if \f[I]mult\f[] is 5000, the rate enumeration -\f[B]IBV_RATE_5_GBPS\f[] will be returned. +\f[B]mbps_to_ibv_rate()\f[R] converts the number of Mbit/sec +\f[I]mult\f[R] to an IB transmission rate enumeration. +For example, if \f[I]mult\f[R] is 5000, the rate enumeration +\f[B]IBV_RATE_5_GBPS\f[R] will be returned. .SH RETURN VALUE .PP -\f[B]ibv_rate_to_mbps()\f[] returns the number of Mbit/sec. +\f[B]ibv_rate_to_mbps()\f[R] returns the number of Mbit/sec. .PP -\f[B]mbps_to_ibv_rate()\f[] returns the enumeration representing the IB +\f[B]mbps_to_ibv_rate()\f[R] returns the enumeration representing the IB transmission rate. .SH SEE ALSO .PP -\f[B]ibv_query_port\f[](3) +\f[B]ibv_query_port\f[R](3) .SH AUTHOR .PP Dotan Barak diff --git a/buildlib/pandoc-prebuilt/c10b498742b7bd02b349331d8ab6ed7a2951bbc5 b/buildlib/pandoc-prebuilt/c10b498742b7bd02b349331d8ab6ed7a2951bbc5 index 39c281a..b1b353f 100644 --- a/buildlib/pandoc-prebuilt/c10b498742b7bd02b349331d8ab6ed7a2951bbc5 +++ b/buildlib/pandoc-prebuilt/c10b498742b7bd02b349331d8ab6ed7a2951bbc5 @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "RDMA_ESTABLISH" "3" "2019\-01\-16" "librdmacm" "Librdmacm Programmer\[aq]s Manual" +.TH "RDMA_ESTABLISH" "3" "2019\-01\-16" "librdmacm" "Librdmacm Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -9,14 +9,14 @@ rdma_establish \- Complete an active connection request. .IP .nf \f[C] -#include\ +#include -int\ rdma_establish(struct\ rdma_cm_id\ *id); -\f[] +int rdma_establish(struct rdma_cm_id *id); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]rdma_establish()\f[] Acknowledge an incoming connection response +\f[B]rdma_establish()\f[R] Acknowledge an incoming connection response event and complete the connection establishment. .PP Notes: @@ -32,18 +32,16 @@ This function should not be used on an rdma_cm_id on which a QP has been created. .SH ARGUMENTS .TP -.B \f[I]id\f[] +.B \f[I]id\f[R] RDMA identifier. -.RS -.RE .SH RETURN VALUE .PP -\f[B]rdma_establish()\f[] returns 0 on success, or \-1 on error. +\f[B]rdma_establish()\f[R] returns 0 on success, or \-1 on error. If an error occurs, errno will be set to indicate the failure reason. 
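+.SH EXAMPLE
+.PP
+A minimal sketch (an illustrative addition, with error handling and
+the event channel setup elided; \f[I]channel\f[R] is assumed) of the
+active side after \f[B]rdma_connect\f[R](3) on an rdma_cm_id that has
+no QP attached:
+.IP
+.nf
+\f[C]
+struct rdma_cm_event *event;
+
+if (rdma_get_cm_event(channel, &event))
+    return;
+
+if (event\->event == RDMA_CM_EVENT_CONNECT_RESPONSE) {
+    /* transition the external QP to RTS here, then ... */
+    rdma_establish(event\->id);
+}
+rdma_ack_cm_event(event);
+\f[R]
+.fi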
.SH SEE ALSO
.PP
-\f[B]rdma_connect\f[](3), \f[B]rdma_disconnect\f[](3)
-\f[B]rdma_get_cm_event\f[](3)
+\f[B]rdma_connect\f[R](3), \f[B]rdma_disconnect\f[R](3)
+\f[B]rdma_get_cm_event\f[R](3)
.SH AUTHORS
.PP
Danit Goldberg 
diff --git a/buildlib/pandoc-prebuilt/c6c59b5def9ab3d0083324e4053a36a863365865 b/buildlib/pandoc-prebuilt/c6c59b5def9ab3d0083324e4053a36a863365865
new file mode 100644
index 0000000..2158cee
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/c6c59b5def9ab3d0083324e4053a36a863365865
@@ -0,0 +1,55 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "ibv_import_pd, ibv_unimport_pd" "3" "2020\-5\-3" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_import_pd \- import a PD from a given ibv_context
+.PP
+ibv_unimport_pd \- unimport a PD
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include 
+
+struct ibv_pd *ibv_import_pd(struct ibv_context *context, uint32_t pd_handle);
+void ibv_unimport_pd(struct ibv_pd *pd)
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_import_pd()\f[R] returns a protection domain (PD) that is
+associated with the given \f[I]pd_handle\f[R] in the given
+\f[I]context\f[R].
+.PP
+The input \f[I]pd_handle\f[R] value must be a valid kernel handle for a
+PD object in the given \f[I]context\f[R].
+It can be obtained from the original PD by reading its ibv_pd\->handle
+member value.
+.PP
+The returned \f[I]ibv_pd\f[R] can be used in all verbs that get a
+protection domain.
+.PP
+\f[B]ibv_unimport_pd()\f[R] unimports the PD.
+Once the PD is no longer in use, either ibv_dealloc_pd() or
+ibv_unimport_pd() should be called.
+The first one goes to the kernel to destroy the object, while the
+second one cleans up whatever is needed (the opposite of the import)
+without calling the kernel.
+.PP
+It is the responsibility of the application to coordinate between all
+ibv_context(s) that use this PD.
+Once the object has been destroyed, no other process can touch it
+except to unimport it.
+All users of the context must collaborate to ensure this.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_import_pd()\f[R] returns a pointer to the allocated PD, or NULL
+if the request fails.
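+.SH EXAMPLE
+.PP
+The sketch below (an illustrative addition; the exchange of the dup of
+the exporter cmd_fd and of the pd\->handle value between the processes,
+e.g. over a Unix\-domain socket, is elided) shows the importing side:
+.IP
+.nf
+\f[C]
+/* cmd_fd: dup of the exporter ibv_context command fd      */
+/* pd_handle: pd\->handle value received from the exporter */
+struct ibv_context *ctx = ibv_import_device(cmd_fd);
+if (!ctx)
+    return;
+
+struct ibv_pd *pd = ibv_import_pd(ctx, pd_handle);
+if (!pd)
+    return;
+
+/* ... use pd in any verb that takes a protection domain ... */
+
+ibv_unimport_pd(pd);
+\f[R]
+.fi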
+.SH SEE ALSO +.PP +\f[B]ibv_alloc_pd\f[R](3), \f[B]ibv_dealloc_pd\f[R](3), +.SH AUTHOR +.PP +Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/c6cf51c33703f96d23549f640ab1e80205143daf b/buildlib/pandoc-prebuilt/c6cf51c33703f96d23549f640ab1e80205143daf index 6199fb3..aa6271c 100644 --- a/buildlib/pandoc-prebuilt/c6cf51c33703f96d23549f640ab1e80205143daf +++ b/buildlib/pandoc-prebuilt/c6cf51c33703f96d23549f640ab1e80205143daf @@ -1,21 +1,21 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "ibv_attach_counters_point_flow" "3" "2018\-04\-02" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "ibv_attach_counters_point_flow" "3" "2018\-04\-02" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP -\f[B]ibv_attach_counters_point_flow\f[] \- attach individual counter +\f[B]ibv_attach_counters_point_flow\f[R] \- attach individual counter definition to a flow object .SH SYNOPSIS .IP .nf \f[C] -#include\ +#include -int\ ibv_attach_counters_point_flow(struct\ ibv_counters\ *counters, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_counter_attach_attr\ *counter_attach_attr, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_flow\ *flow); -\f[] +int ibv_attach_counters_point_flow(struct ibv_counters *counters, + struct ibv_counter_attach_attr *counter_attach_attr, + struct ibv_flow *flow); +\f[R] .fi .SH DESCRIPTION .PP @@ -27,11 +27,11 @@ verb object resource. .PP A static attach can be created when NULL is provided instead of the reference to the verbs object (e.g.: in case of flow providing NULL -instead of \f[I]flow\f[]). +instead of \f[I]flow\f[R]). In this case, this counters object will only start collecting values after it is bound to the verbs resource, for flow this is when referencing the counters handle when creating a flow with -\f[B]ibv_create_flow\f[](). +\f[B]ibv_create_flow\f[R](). .PP Once an ibv_counters is bound statically to a verbs resource, no additional attach is allowed till the counter object is not bound to any @@ -49,100 +49,82 @@ Attaching a counters handle to multiple objects of the same type will accumulate the values into a single index. e.g.: creating several ibv_flow(s) with the same ibv_counters handle will collect the values from all relevant flows into the relevant index -location when reading the values from \f[B]ibv_read_counters\f[](), +location when reading the values from \f[B]ibv_read_counters\f[R](), setting the index more than once with different or same counter_desc will aggregate the values from all relevant counters into the relevant index location. .PP The runtime values of counters can be read from the hardware by calling -\f[B]ibv_read_counters\f[](). +\f[B]ibv_read_counters\f[R](). .SH ARGUMENTS .TP -.B \f[I]counters\f[] +.B \f[I]counters\f[R] Existing counters to attach new counter point on. -.RS -.RE .TP -.B \f[I]counter_attach_attr\f[] +.B \f[I]counter_attach_attr\f[R] An ibv_counter_attach_attr struct, as defined in verbs.h. -.RS -.RE .TP -.B \f[I]flow\f[] +.B \f[I]flow\f[R] Existing flow to attach a new counters point on (in static mode it must be NULL). 
-.RS
-.RE
-.SS \f[I]counter_attach_attr\f[] Argument
+.SS \f[I]counter_attach_attr\f[R] Argument
.IP
.nf
\f[C]
-struct\ ibv_counter_attach_attr\ {
-\ \ \ \ enum\ ibv_counter_description\ counter_desc;
-\ \ \ \ uint32_t\ index;
-\ \ \ \ uint32_t\ comp_mask;
+struct ibv_counter_attach_attr {
+    enum ibv_counter_description counter_desc;
+    uint32_t index;
+    uint32_t comp_mask;
};
-\f[]
+\f[R]
.fi
-.SS \f[I]counter_desc\f[] Argument
+.SS \f[I]counter_desc\f[R] Argument
.IP
.nf
\f[C]
-enum\ ibv_counter_description\ {
-\ \ \ \ IBV_COUNTER_PACKETS,
-\ \ \ \ IBV_COUNTER_BYTES,
+enum ibv_counter_description {
+    IBV_COUNTER_PACKETS,
+    IBV_COUNTER_BYTES,
};
-\f[]
+\f[R]
.fi
.TP
-.B \f[I]index\f[]
+.B \f[I]index\f[R]
Desired location of the specific counter at the counters object.
-.RS
-.RE
.TP
-.B \f[I]comp_mask\f[]
+.B \f[I]comp_mask\f[R]
Bitmask specifying what fields in the structure are valid.
-.RS
-.RE
.SH RETURN VALUE
.PP
-\f[B]ibv_attach_counters_point_flow\f[]() returns 0 on success, or the
+\f[B]ibv_attach_counters_point_flow\f[R]() returns 0 on success, or the
value of errno on failure (which indicates the failure reason)
.SH ERRORS
.TP
.B EINVAL
invalid argument(s) passed
-.RS
-.RE
.TP
.B ENOTSUP
-\f[I]counter_desc\f[] is not supported on the requested object
-.RS
-.RE
+\f[I]counter_desc\f[R] is not supported on the requested object
.TP
.B EBUSY
the counter object is already bound to a flow, additional attach calls
is not allowed (valid for static attach only)
-.RS
-.RE
.TP
.B ENOMEM
not enough memory
-.RS
-.RE
.SH NOTES
.PP
Counter values in each index location are cleared upon creation when
-calling \f[B]ibv_create_counters\f[]().
+calling \f[B]ibv_create_counters\f[R]().
Attaching counters points will only increase these values accordingly.
.SH EXAMPLE
.PP
-An example of use of \f[B]ibv_attach_counters_point_flow\f[]() is shown
-in \f[B]ibv_read_counters\f[]
+An example of use of \f[B]ibv_attach_counters_point_flow\f[R]() is shown
+in \f[B]ibv_read_counters\f[R]
.SH SEE ALSO
.PP
-\f[B]ibv_create_counters\f[], \f[B]ibv_destroy_counters\f[],
-\f[B]ibv_read_counters\f[], \f[B]ibv_create_flow\f[]
+\f[B]ibv_create_counters\f[R], \f[B]ibv_destroy_counters\f[R],
+\f[B]ibv_read_counters\f[R], \f[B]ibv_create_flow\f[R]
.SH AUTHORS
.PP
Raed Salem 
diff --git a/buildlib/pandoc-prebuilt/ca22a60969c4c2b09f35bd74358cc9247766569b b/buildlib/pandoc-prebuilt/ca22a60969c4c2b09f35bd74358cc9247766569b
new file mode 100644
index 0000000..5a7d893
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/ca22a60969c4c2b09f35bd74358cc9247766569b
@@ -0,0 +1,67 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "IBV_QUERY_ECE" "3" "2020\-01\-22" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_query_ece \- query ECE options.
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include 
+
+int ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece);
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_query_ece()\f[R] queries ECE options.
+.PP
+It returns the current ECE state for the QP to the user.
+.SH ARGUMENTS
+.TP
+.B \f[I]qp\f[R]
+The queue pair (QP) associated with the ECE options.
+.SS \f[I]ece\f[R] Argument
+.PP
+The ECE values.
+.IP
+.nf
+\f[C]
+struct ibv_ece {
+    uint32_t vendor_id;
+    uint32_t options;
+    uint32_t comp_mask;
+};
+\f[R]
+.fi
+.TP
+.B \f[I]vendor_id\f[R]
+Unique identifier of the provider vendor on the network.
+Providers set their IEEE OUI here to distinguish themselves in a
+non\-homogeneous network.
+.TP
+.B \f[I]options\f[R]
+Provider specific attributes which are supported.
+.TP +.B \f[I]comp_mask\f[R] +Bitmask specifying what fields in the structure are valid. +.SH RETURN VALUE +.PP +\f[B]ibv_query_ece()\f[R] returns 0 when the call was successful, or the +errno value which indicates the failure reason. +.TP +.B \f[I]EOPNOTSUPP\f[R] +libibverbs or provider driver doesn\[cq]t support the ibv_set_ece() +verb. +.TP +.B \f[I]EINVAL\f[R] +In one of the following: o The QP is invalid. +o The ECE options are invalid. +.SH SEE ALSO +.PP +\f[B]ibv_set_ece\f[R](3), +.SH AUTHOR +.PP +Leon Romanovsky diff --git a/buildlib/pandoc-prebuilt/cf4e4cd11a7895e2b33c4b3e1625393ebf105452 b/buildlib/pandoc-prebuilt/cf4e4cd11a7895e2b33c4b3e1625393ebf105452 index 25b2a6c..df8891a 100644 --- a/buildlib/pandoc-prebuilt/cf4e4cd11a7895e2b33c4b3e1625393ebf105452 +++ b/buildlib/pandoc-prebuilt/cf4e4cd11a7895e2b33c4b3e1625393ebf105452 @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "mlx5dv_create_cq" "3" "2018\-9\-1" "mlx5" "mlx5 Programmer\[aq]s Manual" +.TH "mlx5dv_create_cq" "3" "2018\-9\-1" "mlx5" "mlx5 Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -9,48 +9,48 @@ mlx5dv_create_cq \- creates a completion queue (CQ) .IP .nf \f[C] -#include\ +#include -struct\ ibv_cq_ex\ *mlx5dv_create_cq(struct\ ibv_context\ *context, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_cq_init_attr_ex\ *cq_attr, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_cq_init_attr\ *mlx5_cq_attr); -\f[] +struct ibv_cq_ex *mlx5dv_create_cq(struct ibv_context *context, + struct ibv_cq_init_attr_ex *cq_attr, + struct mlx5dv_cq_init_attr *mlx5_cq_attr); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]mlx5dv_create_cq()\f[] creates a completion queue (CQ) with +\f[B]mlx5dv_create_cq()\f[R] creates a completion queue (CQ) with specific driver properties. .SH ARGUMENTS .PP -Please see \f[B]ibv_create_cq_ex(3)\f[] man page for \f[B]context\f[] -and \f[B]cq_attr\f[] +Please see \f[B]ibv_create_cq_ex(3)\f[R] man page for \f[B]context\f[R] +and \f[B]cq_attr\f[R] .SS mlx5_cq_attr .IP .nf \f[C] -struct\ mlx5dv_cq_init_attr\ { -\ \ \ \ uint64_t\ comp_mask; -\ \ \ \ uint8_t\ \ cqe_comp_res_format; -\ \ \ \ uint32_t\ flags; -\ \ \ \ uint16_t\ cqe_size; +struct mlx5dv_cq_init_attr { + uint64_t comp_mask; + uint8_t cqe_comp_res_format; + uint32_t flags; + uint16_t cqe_size; }; -\f[] +\f[R] .fi .TP -.B \f[I]comp_mask\f[] +.B \f[I]comp_mask\f[R] Bitmask specifying what fields in the structure are valid: .RS .PP MLX5DV_CQ_INIT_ATTR_MASK_COMPRESSED_CQE enables creating a CQ in a mode that few CQEs may be compressed into a single CQE, valid values in -\f[I]cqe_comp_res_format\f[] +\f[I]cqe_comp_res_format\f[R] .PP -MLX5DV_CQ_INIT_ATTR_MASK_FLAGS valid values in \f[I]flags\f[] +MLX5DV_CQ_INIT_ATTR_MASK_FLAGS valid values in \f[I]flags\f[R] .PP -MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE valid values in \f[I]cqe_size\f[] +MLX5DV_CQ_INIT_ATTR_MASK_CQE_SIZE valid values in \f[I]cqe_size\f[R] .RE .TP -.B \f[I]cqe_comp_res_format\f[] +.B \f[I]cqe_comp_res_format\f[R] A bitwise OR of the various CQE response formats of the responder side: .RS .PP @@ -61,25 +61,23 @@ MLX5DV_CQE_RES_FORMAT_CSUM CQE compression with RX checksum MLX5DV_CQE_RES_FORMAT_CSUM_STRIDX CQE compression with stride index .RE .TP -.B \f[I]flags\f[] +.B \f[I]flags\f[R] A bitwise OR of the various values described below: .RS .PP MLX5DV_CQ_INIT_ATTR_FLAGS_CQE_PAD create a padded 128B CQE .RE .TP -.B \f[I]cqe_size\f[] +.B \f[I]cqe_size\f[R] configure the CQE size to be 64 or 128 bytes other values will fail mlx5dv_create_cq. 
-.RS -.RE .SH RETURN VALUE .PP -\f[B]mlx5dv_create_cq()\f[] returns a pointer to the created CQ, or NULL -if the request fails and errno will be set. +\f[B]mlx5dv_create_cq()\f[R] returns a pointer to the created CQ, or +NULL if the request fails and errno will be set. .SH SEE ALSO .PP -\f[B]ibv_create_cq_ex\f[](3), +\f[B]ibv_create_cq_ex\f[R](3), .SH AUTHOR .PP Yonatan Cohen diff --git a/buildlib/pandoc-prebuilt/d5c7e7b0425b7c207ee41b58a93e749b88d7afee b/buildlib/pandoc-prebuilt/d5c7e7b0425b7c207ee41b58a93e749b88d7afee index 1a3a154..372fc03 100644 --- a/buildlib/pandoc-prebuilt/d5c7e7b0425b7c207ee41b58a93e749b88d7afee +++ b/buildlib/pandoc-prebuilt/d5c7e7b0425b7c207ee41b58a93e749b88d7afee @@ -1,95 +1,81 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "mlx5dv_create_flow_matcher" "3" "2018\-9\-19" "mlx5" "mlx5 Programmer\[aq]s Manual" +.TH "mlx5dv_create_flow_matcher" "3" "2018\-9\-19" "mlx5" "mlx5 Programmer\[cq]s Manual" .hy .SH NAME .PP mlx5dv_create_flow_matcher \- creates a matcher to be used with -\f[I]mlx5dv_create_flow(3)\f[] +\f[I]mlx5dv_create_flow(3)\f[R] .SH SYNOPSIS .IP .nf \f[C] -#include\ +#include -struct\ mlx5dv_flow_matcher\ * -mlx5dv_create_flow_matcher(struct\ ibv_context\ *context, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_flow_matcher_attr\ *attr) -\f[] +struct mlx5dv_flow_matcher * +mlx5dv_create_flow_matcher(struct ibv_context *context, + struct mlx5dv_flow_matcher_attr *attr) +\f[R] .fi .SH DESCRIPTION .PP -\f[B]mlx5dv_create_flow_matcher()\f[] creates a flow matcher (mask) to -be used with \f[I]mlx5dv_create_flow(3)\f[]. +\f[B]mlx5dv_create_flow_matcher()\f[R] creates a flow matcher (mask) to +be used with \f[I]mlx5dv_create_flow(3)\f[R]. .SH ARGUMENTS .PP -Please see \f[I]ibv_open_device(3)\f[] for \f[I]context\f[]. -.SS \f[I]attr\f[] +Please see \f[I]ibv_open_device(3)\f[R] for \f[I]context\f[R]. +.SS \f[I]attr\f[R] .IP .nf \f[C] -struct\ mlx5dv_flow_matcher_attr\ { -\ \ \ \ enum\ ibv_flow_attr_type\ type; -\ \ \ \ uint32_t\ flags;\ /*\ From\ enum\ ibv_flow_flags\ */ -\ \ \ \ uint16_t\ priority; -\ \ \ \ uint8_t\ match_criteria_enable;\ /*\ Device\ spec\ format\ */ -\ \ \ \ struct\ mlx5dv_flow_match_parameters\ *match_mask; -\ \ \ \ uint64_t\ comp_mask; -\ \ \ \ enum\ mlx5dv_flow_table_type\ ft_type; +struct mlx5dv_flow_matcher_attr { + enum ibv_flow_attr_type type; + uint32_t flags; /* From enum ibv_flow_flags */ + uint16_t priority; + uint8_t match_criteria_enable; /* Device spec format */ + struct mlx5dv_flow_match_parameters *match_mask; + uint64_t comp_mask; + enum mlx5dv_flow_table_type ft_type; }; -\f[] +\f[R] .fi .TP -.B \f[I]type\f[] +.B \f[I]type\f[R] Type of matcher to be created: IBV_FLOW_ATTR_NORMAL: Normal rule according to specification. -.RS -.RE .TP -.B \f[I]flags\f[] +.B \f[I]flags\f[R] special flags to control rule: 0: Nothing or zero value means matcher will store ingress flow rules. IBV_FLOW_ATTR_FLAGS_EGRESS: Specified this matcher will store egress flow rules. -.RS -.RE .TP -.B \f[I]priority\f[] -See \f[I]ibv_create_flow(3)\f[]. -.RS -.RE +.B \f[I]priority\f[R] +See \f[I]ibv_create_flow(3)\f[R]. .TP -.B \f[I]match_criteria_enable\f[] -What match criteria is configured in \f[I]match_mask\f[], passed in +.B \f[I]match_criteria_enable\f[R] +What match criteria is configured in \f[I]match_mask\f[R], passed in device spec format. 
-.RS -.RE -.SS \f[I]match_mask\f[] +.SS \f[I]match_mask\f[R] .IP .nf \f[C] -struct\ mlx5dv_flow_match_parameters\ { -\ \ \ \ size_t\ match_sz; -\ \ \ \ uint64_t\ match_buf[];\ /*\ Device\ spec\ format\ */ +struct mlx5dv_flow_match_parameters { + size_t match_sz; + uint64_t match_buf[]; /* Device spec format */ }; -\f[] +\f[R] .fi .TP -.B \f[I]match_sz\f[] -Size in bytes of \f[I]match_buf\f[]. -.RS -.RE +.B \f[I]match_sz\f[R] +Size in bytes of \f[I]match_buf\f[R]. .TP -.B \f[I]match_buf\f[] +.B \f[I]match_buf\f[R] Set which mask to be used, passed in device spec format. -.RS -.RE .TP -.B \f[I]comp_mask\f[] -MLX5DV_FLOW_MATCHER_MASK_FT_TYPE for \f[I]ft_type\f[] -.RS -.RE -.SS \f[I]ft_type\f[] +.B \f[I]comp_mask\f[R] +MLX5DV_FLOW_MATCHER_MASK_FT_TYPE for \f[I]ft_type\f[R] +.SS \f[I]ft_type\f[R] .PP Specified in which flow table type, the matcher will store the flow rules: MLX5DV_FLOW_TABLE_TYPE_NIC_RX: Specified this matcher will store @@ -104,12 +90,12 @@ MLX5DV_FLOW_TABLE_TYPE_RDMA_TX: Specified this matcher will store egress RDMA flow rules. .SH RETURN VALUE .PP -\f[B]mlx5dv_create_flow_matcher\f[] returns a pointer to -\f[I]mlx5dv_flow_matcher\f[], on error NULL will be returned and errno +\f[B]mlx5dv_create_flow_matcher\f[R] returns a pointer to +\f[I]mlx5dv_flow_matcher\f[R], on error NULL will be returned and errno will be set. .SH SEE ALSO .PP -\f[I]ibv_open_device(3)\f[], \f[I]ibv_create_flow(3)\f[] +\f[I]ibv_open_device(3)\f[R], \f[I]ibv_create_flow(3)\f[R] .SH AUTHOR .PP Mark Bloch diff --git a/buildlib/pandoc-prebuilt/e17edb66e91620850eb7da65f8e01f7fd1d1ddfd b/buildlib/pandoc-prebuilt/e17edb66e91620850eb7da65f8e01f7fd1d1ddfd new file mode 100644 index 0000000..4e0e038 --- /dev/null +++ b/buildlib/pandoc-prebuilt/e17edb66e91620850eb7da65f8e01f7fd1d1ddfd @@ -0,0 +1,43 @@ +.\" Automatically generated by Pandoc 2.5 +.\" +.TH "mlx5dv_modify_qp_lag_port" "3" "" "" "" +.hy +.SH NAME +.PP +mlx5dv_modify_qp_lag_port \- Modify the lag port information of a given +QP +.SH SYNOPSIS +.IP +.nf +\f[C] +#include + +int mlx5dv_modify_qp_lag_port(struct ibv_qp *qp, uint8_t port_num); +\f[R] +.fi +.SH DESCRIPTION +.PP +This API enables modifying the configured port num of a given QP. +.PP +If the QP state is modified later, the port num may be implicitly +re\-configured. +.PP +Use query mlx5dv_query_qp_lag_port to check the configured and active +port num values. +.SH ARGUMENTS +.TP +.B \f[I]qp\f[R] +The ibv_qp object to issue the action on. +.TP +.B \f[I]port_num\f[R] +The port_num to set for the QP. +.SH RETURN VALUE +.PP +0 on success; EOPNOTSUPP if not in LAG mode, or other errno value on +other failures. 
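+.SH EXAMPLE
+.PP
+A short sketch (an addition; it assumes the (qp, &port_num,
+&active_port_num) signature of \f[B]mlx5dv_query_qp_lag_port\f[R](3))
+that moves a QP to the other port of a two\-port LAG:
+.IP
+.nf
+\f[C]
+uint8_t port, active;
+
+if (mlx5dv_query_qp_lag_port(qp, &port, &active))
+    return;
+
+/* toggle between ports 1 and 2 */
+if (mlx5dv_modify_qp_lag_port(qp, port == 1 ? 2 : 1))
+    return;
+\f[R]
+.fi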
+.SH SEE ALSO +.PP +\f[I]mlx5dv_query_qp_lag_port(3)\f[R] +.SH AUTHOR +.PP +Aharon Landau diff --git a/buildlib/pandoc-prebuilt/e44e94a238c3c63d976a79adb52e34fb24140a85 b/buildlib/pandoc-prebuilt/e44e94a238c3c63d976a79adb52e34fb24140a85 index f1b5250..64245f1 100644 --- a/buildlib/pandoc-prebuilt/e44e94a238c3c63d976a79adb52e34fb24140a85 +++ b/buildlib/pandoc-prebuilt/e44e94a238c3c63d976a79adb52e34fb24140a85 @@ -1,6 +1,6 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "IBV_REREG_MR" "3" "2016\-03\-13" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "IBV_REREG_MR" "3" "2016\-03\-13" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP @@ -9,79 +9,74 @@ ibv_rereg_mr \- re\-register a memory region (MR) .IP .nf \f[C] -#include\ +#include -int\ ibv_rereg_mr(struct\ ibv_mr\ *mr, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ flags, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_pd\ *pd, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ void\ *addr, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ size_t\ length, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ int\ access); -\f[] +int ibv_rereg_mr(struct ibv_mr *mr, + int flags, + struct ibv_pd *pd, + void *addr, + size_t length, + int access); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_rereg_mr()\f[] Modifies the attributes of an existing memory -region (MR) \f[I]mr\f[]. +\f[B]ibv_rereg_mr()\f[R] Modifies the attributes of an existing memory +region (MR) \f[I]mr\f[R]. Conceptually, this call performs the functions deregister memory region followed by register memory region. Where possible, resources are reused instead of deallocated and reallocated. .PP -\f[I]flags\f[] is a bit\-mask used to indicate which of the following +\f[I]flags\f[R] is a bit\-mask used to indicate which of the following properties of the memory region are being modified. Flags should be a combination (bit field) of: .TP -.B \f[B]IBV_REREG_MR_CHANGE_TRANSLATION \f[] +.B \f[B]IBV_REREG_MR_CHANGE_TRANSLATION \f[R] Change translation (location and length) -.RS -.RE .TP -.B \f[B]IBV_REREG_MR_CHANGE_PD \f[] +.B \f[B]IBV_REREG_MR_CHANGE_PD \f[R] Change protection domain -.RS -.RE .TP -.B \f[B]IBV_REREG_MR_CHANGE_ACCESS \f[] +.B \f[B]IBV_REREG_MR_CHANGE_ACCESS \f[R] Change access flags -.RS -.RE .PP -When \f[B]IBV_REREG_MR_CHANGE_PD\f[] is used, \f[I]pd\f[] represents the -new PD this MR should be registered to. +When \f[B]IBV_REREG_MR_CHANGE_PD\f[R] is used, \f[I]pd\f[R] represents +the new PD this MR should be registered to. .PP -When \f[B]IBV_REREG_MR_CHANGE_TRANSLATION\f[] is used, \f[I]addr\f[]. +When \f[B]IBV_REREG_MR_CHANGE_TRANSLATION\f[R] is used, \f[I]addr\f[R]. represents the virtual address (user\-space pointer) of the new MR, -while \f[I]length\f[] represents its length. +while \f[I]length\f[R] represents its length. .PP -The access and other flags are represented in the field \f[I]access\f[]. +The access and other flags are represented in the field +\f[I]access\f[R]. This field describes the desired memory protection attributes; it is either 0 or the bitwise OR of one or more of ibv_access_flags. .SH RETURN VALUE .PP -\f[B]ibv_rereg_mr()\f[] returns 0 on success, otherwise an error has -occurred, \f[I]enum ibv_rereg_mr_err_code\f[] represents the error as of -below. +\f[B]ibv_rereg_mr()\f[R] returns 0 on success, otherwise an error has +occurred, \f[I]enum ibv_rereg_mr_err_code\f[R] represents the error as +of below. .PP IBV_REREG_MR_ERR_INPUT \- Old MR is valid, an input error was detected by libibverbs. 
.PP -IBV_REREG_MR_ERR_DONT_FORK_NEW \- Old MR is valid, failed via don\[aq]t +IBV_REREG_MR_ERR_DONT_FORK_NEW \- Old MR is valid, failed via don\[cq]t fork on new address range. .PP IBV_REREG_MR_ERR_DO_FORK_OLD \- New MR is valid, failed via do fork on old address range. .PP -IBV_REREG_MR_ERR_CMD \- MR shouldn\[aq]t be used, command error. +IBV_REREG_MR_ERR_CMD \- MR shouldn\[cq]t be used, command error. .PP -IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW \- MR shouldn\[aq]t be used, +IBV_REREG_MR_ERR_CMD_AND_DO_FORK_NEW \- MR shouldn\[cq]t be used, command error, invalid fork state on new address range. .SH NOTES .PP Even on a failure, the user still needs to call ibv_dereg_mr on this MR. .SH SEE ALSO .PP -\f[B]ibv_dereg_mr\f[](3), \f[B]ibv_reg_mr\f[](3) +\f[B]ibv_dereg_mr\f[R](3), \f[B]ibv_reg_mr\f[R](3) .SH AUTHORS .PP Matan Barak , Yishai Hadas diff --git a/buildlib/pandoc-prebuilt/e4d776d0b6f839435f0db61df3122af0280416e6 b/buildlib/pandoc-prebuilt/e4d776d0b6f839435f0db61df3122af0280416e6 index 0a0b430..0ff5b5c 100644 --- a/buildlib/pandoc-prebuilt/e4d776d0b6f839435f0db61df3122af0280416e6 +++ b/buildlib/pandoc-prebuilt/e4d776d0b6f839435f0db61df3122af0280416e6 @@ -1,61 +1,51 @@ -.\" Automatically generated by Pandoc 1.19.2.4 +.\" Automatically generated by Pandoc 2.5 .\" -.TH "ibv_read_counters" "3" "2018\-04\-02" "libibverbs" "Libibverbs Programmer\[aq]s Manual" +.TH "ibv_read_counters" "3" "2018\-04\-02" "libibverbs" "Libibverbs Programmer\[cq]s Manual" .hy .SH NAME .PP -\f[B]ibv_read_counters\f[] \- Read counter values +\f[B]ibv_read_counters\f[R] \- Read counter values .SH SYNOPSIS .IP .nf \f[C] -#include\ +#include -int\ ibv_read_counters(struct\ ibv_counters\ *counters, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ *counters_value, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ ncounters, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ flags); -\f[] +int ibv_read_counters(struct ibv_counters *counters, + uint64_t *counters_value, + uint32_t ncounters, + uint32_t flags); +\f[R] .fi .SH DESCRIPTION .PP -\f[B]ibv_read_counters\f[]() returns the values of the chosen counters -into \f[I]counters_value\f[] array of which can accumulate -\f[I]ncounters\f[]. +\f[B]ibv_read_counters\f[R]() returns the values of the chosen counters +into \f[I]counters_value\f[R] array of which can accumulate +\f[I]ncounters\f[R]. The values are filled according to the configuration defined by the user -in the \f[B]ibv_attach_counters_point_xxx\f[] functions. +in the \f[B]ibv_attach_counters_point_xxx\f[R] functions. .SH ARGUMENTS .TP -.B \f[I]counters\f[] +.B \f[I]counters\f[R] Counters object to read. -.RS -.RE .TP -.B \f[I]counters_value\f[] +.B \f[I]counters_value\f[R] Input buffer to hold read result. -.RS -.RE .TP -.B \f[I]ncounters\f[] +.B \f[I]ncounters\f[R] Number of counters to fill. -.RS -.RE .TP -.B \f[I]flags\f[] +.B \f[I]flags\f[R] Use enum ibv_read_counters_flags. -.RS -.RE -.SS \f[I]flags\f[] Argument +.SS \f[I]flags\f[R] Argument .TP .B IBV_READ_COUNTERS_ATTR_PREFER_CACHED Will prefer reading the values from driver cache, else it will do volatile hardware access which is the default. -.RS -.RE .SH RETURN VALUE .PP -\f[B]ibv_read_counters\f[]() returns 0 on success, or the value of errno -on failure (which indicates the failure reason) +\f[B]ibv_read_counters\f[R]() returns 0 on success, or the value of +errno on failure (which indicates the failure reason) .SH EXAMPLE .PP Example: Statically attach counters to a new flow @@ -67,116 +57,116 @@ resources are released. 
.IP .nf \f[C] -/*\ create\ counters\ object\ and\ define\ its\ counters\ points\ \ \ \ \ \ \ \ */ -/*\ create\ simple\ L2\ flow\ with\ hardcoded\ MAC,\ and\ a\ count\ action\ */ -/*\ read\ counters\ periodically,\ every\ 1sec,\ until\ loop\ ends\ \ \ \ \ \ */ -/*\ assumes\ user\ prepared\ a\ RAW_PACKET\ QP\ as\ input\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ */ -/*\ only\ limited\ error\ checking\ in\ run\ time\ for\ code\ simplicity\ \ */ - -#include\ -#include\ - -/*\ the\ below\ MAC\ should\ be\ replaced\ by\ user\ */ -#define\ FLOW_SPEC_ETH_MAC_VAL\ { -\ \ \ \ .dst_mac\ =\ {\ 0x00,\ 0x01,\ 0x02,\ 0x03,\ 0x04,0x05}, -\ \ \ \ .src_mac\ =\ {\ 0x00,\ 0x00,\ 0x00,\ 0x00,\ 0x00,\ 0x00}, -\ \ \ \ .ether_type\ =\ 0,\ .vlan_tag\ =\ 0,\ } -#define\ FLOW_SPEC_ETH_MAC_MASK\ { -\ \ \ \ .dst_mac\ =\ {\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF}, -\ \ \ \ .src_mac\ =\ {\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF,\ 0xFF}, -\ \ \ \ .ether_type\ =\ 0,\ .vlan_tag\ =\ 0,\ } - -void\ example_create_flow_with_counters_on_raw_qp(struct\ ibv_qp\ *qp)\ { -\ \ \ \ int\ idx\ =\ 0; -\ \ \ \ int\ loop\ =\ 10; -\ \ \ \ struct\ ibv_flow\ *flow\ =\ NULL; -\ \ \ \ struct\ ibv_counters\ *counters\ =\ NULL; -\ \ \ \ struct\ ibv_counters_init_attr\ init_attr\ =\ {0}; -\ \ \ \ struct\ ibv_counter_attach_attr\ attach_attr\ =\ {0}; - -\ \ \ \ /*\ create\ single\ counters\ handle\ */ -\ \ \ \ counters\ =\ ibv_create_counters(qp\->context,\ &init_attr); - -\ \ \ \ /*\ define\ counters\ points\ */ -\ \ \ \ attach_attr.counter_desc\ =\ IBV_COUNTER_PACKETS; -\ \ \ \ attach_attr.index\ =\ idx++; -\ \ \ \ ret\ =\ ibv_attach_counters_point_flow(counters,\ &attach_attr,\ NULL); -\ \ \ \ if\ (ret\ ==\ ENOTSUP)\ { -\ \ \ \ \ \ \ \ fprintf(stderr,\ "Attaching\ IBV_COUNTER_PACKETS\ to\ flow\ is\ not\ \\ -supported"); -\ \ \ \ \ \ \ \ exit(1); -\ \ \ \ } -\ \ \ \ attach_attr.counter_desc\ =\ IBV_COUNTER_BYTES; -\ \ \ \ attach_attr.index\ =\ idx++; -\ \ \ \ ibv_attach_counters_point_flow(counters,\ &attach_attr,\ NULL); -\ \ \ \ if\ (ret\ ==\ ENOTSUP)\ { -\ \ \ \ \ \ \ \ fprintf(stderr,\ "Attaching\ IBV_COUNTER_BYTES\ to\ flow\ is\ not\ \\ -supported"); -\ \ \ \ \ \ \ \ exit(1); -\ \ \ \ } - -\ \ \ \ /*\ define\ a\ new\ flow\ attr\ that\ includes\ the\ counters\ handle\ */ -\ \ \ \ struct\ raw_eth_flow_attr\ { -\ \ \ \ \ \ \ \ struct\ ibv_flow_attr\ \ \ \ \ \ \ \ \ \ \ \ \ \ attr; -\ \ \ \ \ \ \ \ struct\ ibv_flow_spec_eth\ \ \ \ \ \ \ \ \ \ spec_eth; -\ \ \ \ \ \ \ \ struct\ ibv_flow_spec_counter_action\ spec_count; -\ \ \ \ }\ flow_attr\ =\ { -\ \ \ \ \ \ \ \ .attr\ =\ { -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .comp_mask\ \ =\ 0, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .type\ \ \ \ \ \ \ =\ IBV_FLOW_ATTR_NORMAL, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .size\ \ \ \ \ \ \ =\ sizeof(flow_attr), -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .priority\ \ \ =\ 0, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .num_of_specs\ =\ 2,\ /*\ ETH\ +\ COUNT\ */ -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .port\ \ \ \ \ \ \ =\ 1, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .flags\ \ \ \ \ \ =\ 0, -\ \ \ \ \ \ \ \ \ \ \ \ }, -\ \ \ \ \ \ \ \ .spec_eth\ =\ { -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .type\ =\ IBV_EXP_FLOW_SPEC_ETH, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .size\ =\ sizeof(struct\ ibv_flow_spec_eth), -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .val\ \ =\ FLOW_SPEC_ETH_MAC_VAL, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .mask\ =\ FLOW_SPEC_ETH_MAC_MASK, -\ \ \ \ \ \ \ \ \ \ \ \ }, -\ \ \ \ \ \ \ \ .spec_count\ =\ { -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .type\ \ \ =\ IBV_FLOW_SPEC_ACTION_COUNT, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .size\ \ \ =\ sizeof(struct\ 
ibv_flow_spec_counter_action),
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ .counters\ =\ counters,\ /*\ attached\ this\ counters\ handle
-to\ the\ newly\ created\ ibv_flow\ */\ }\ };
-
-\ \ \ \ /*\ create\ the\ flow\ */
-\ \ \ \ flow\ =\ ibv_create_flow(qp,\ &flow_attr.attr);
-
-\ \ \ \ /*\ allocate\ array\ for\ counters\ value\ reading\ */
-\ \ \ \ uint64_t\ *counters_value\ =\ malloc(sizeof(uint64_t)\ *\ idx);
-
-\ \ \ \ /*\ periodical\ read\ and\ print\ of\ flow\ counters\ */
-\ \ \ \ while\ (\-\-loop)\ {
-\ \ \ \ \ \ \ \ sleep(1);
-
-\ \ \ \ \ \ \ \ /*\ read\ hardware\ counters\ values\ */
-\ \ \ \ \ \ \ \ ibv_read_counters(counters,\ counters_value,\ idx,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ IBV_READ_COUNTERS_ATTR_PREFER_CACHED);
-
-\ \ \ \ \ \ \ \ printf("PACKETS\ =\ %"PRIu64",\ BYTES\ =\ %"PRIu64\ \\n",
-\ \ \ \ \ \ \ \ \ \ \ \ counters_value[0],\ counters_value[1]\ );
-\ \ \ \ }
-
-\ \ \ \ /*\ all\ done,\ release\ all\ */
-\ \ \ \ free(counters_value);
-
-\ \ \ \ /*\ destroy\ flow\ and\ detach\ counters\ */
-\ \ \ \ ibv_destroy_flow(flow);
-
-\ \ \ \ /*\ destroy\ counters\ handle\ */
-\ \ \ \ ibv_destroy_counters(counters);
-
-\ \ \ \ return;
+/* create counters object and define its counters points */
+/* create simple L2 flow with hardcoded MAC, and a count action */
+/* read counters periodically, every 1sec, until loop ends */
+/* assumes user prepared a RAW_PACKET QP as input */
+/* only limited error checking in run time for code simplicity */
+
+#include 
+#include 
+
+/* the below MAC should be replaced by user */
+#define FLOW_SPEC_ETH_MAC_VAL {
+    .dst_mac = { 0x00, 0x01, 0x02, 0x03, 0x04,0x05},
+    .src_mac = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00},
+    .ether_type = 0, .vlan_tag = 0, }
+#define FLOW_SPEC_ETH_MAC_MASK {
+    .dst_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
+    .src_mac = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF},
+    .ether_type = 0, .vlan_tag = 0, }
+
+void example_create_flow_with_counters_on_raw_qp(struct ibv_qp *qp) {
+    int idx = 0;
+    int loop = 10;
+    int ret;
+    struct ibv_flow *flow = NULL;
+    struct ibv_counters *counters = NULL;
+    struct ibv_counters_init_attr init_attr = {0};
+    struct ibv_counter_attach_attr attach_attr = {0};
+
+    /* create single counters handle */
+    counters = ibv_create_counters(qp\->context, &init_attr);
+
+    /* define counters points */
+    attach_attr.counter_desc = IBV_COUNTER_PACKETS;
+    attach_attr.index = idx++;
+    ret = ibv_attach_counters_point_flow(counters, &attach_attr, NULL);
+    if (ret == ENOTSUP) {
+        fprintf(stderr, \[dq]Attaching IBV_COUNTER_PACKETS to flow is not \[rs]
+supported\[dq]);
+        exit(1);
+    }
+    attach_attr.counter_desc = IBV_COUNTER_BYTES;
+    attach_attr.index = idx++;
+    ret = ibv_attach_counters_point_flow(counters, &attach_attr, NULL);
+    if (ret == ENOTSUP) {
+        fprintf(stderr, \[dq]Attaching IBV_COUNTER_BYTES to flow is not \[rs]
+supported\[dq]);
+        exit(1);
+    }
+
+    /* define a new flow attr that includes the counters handle */
+    struct raw_eth_flow_attr {
+        struct ibv_flow_attr              attr;
+        struct ibv_flow_spec_eth          spec_eth;
+        struct ibv_flow_spec_counter_action spec_count;
+    } flow_attr = {
+        .attr = {
+                .comp_mask  = 0,
+                .type       = IBV_FLOW_ATTR_NORMAL,
+                .size       = sizeof(flow_attr),
+                .priority   = 0,
+                .num_of_specs = 2, /* ETH + COUNT */
+                .port       = 1,
+                .flags      = 0,
+        },
+        .spec_eth = {
+                .type = IBV_FLOW_SPEC_ETH,
+                .size = sizeof(struct ibv_flow_spec_eth),
+                .val  = FLOW_SPEC_ETH_MAC_VAL,
+                .mask = FLOW_SPEC_ETH_MAC_MASK,
+        },
+        .spec_count = {
+                .type   = IBV_FLOW_SPEC_ACTION_COUNT,
+                .size   = sizeof(struct ibv_flow_spec_counter_action),
+                .counters = 
counters, /* attached this counters handle
+to the newly created ibv_flow */ } };
+
+    /* create the flow */
+    flow = ibv_create_flow(qp, &flow_attr.attr);
+
+    /* allocate array for counters value reading */
+    uint64_t *counters_value = malloc(sizeof(uint64_t) * idx);
+
+    /* periodical read and print of flow counters */
+    while (\-\-loop) {
+        sleep(1);
+
+        /* read hardware counters values */
+        ibv_read_counters(counters, counters_value, idx,
+                  IBV_READ_COUNTERS_ATTR_PREFER_CACHED);
+
+        printf(\[dq]PACKETS = %\[dq]PRIu64\[dq], BYTES = %\[dq]PRIu64\[dq]\[rs]n\[dq],
+            counters_value[0], counters_value[1] );
+    }
+
+    /* all done, release all */
+    free(counters_value);
+
+    /* destroy flow and detach counters */
+    ibv_destroy_flow(flow);
+
+    /* destroy counters handle */
+    ibv_destroy_counters(counters);
+
+    return;
 }
-\f[]
+\f[R]
.fi
.SH SEE ALSO
.PP
-\f[B]ibv_create_counters\f[], \f[B]ibv_destroy_counters\f[],
-\f[B]ibv_attach_counters_point_flow\f[], \f[B]ibv_create_flow\f[]
+\f[B]ibv_create_counters\f[R], \f[B]ibv_destroy_counters\f[R],
+\f[B]ibv_attach_counters_point_flow\f[R], \f[B]ibv_create_flow\f[R]
.SH AUTHORS
.PP
Raed Salem 
diff --git a/buildlib/pandoc-prebuilt/f34fcba7aaa68b2aa752241370d19d79cfdc2cb4 b/buildlib/pandoc-prebuilt/f34fcba7aaa68b2aa752241370d19d79cfdc2cb4
index 4810a00..d4231cd 100644
--- a/buildlib/pandoc-prebuilt/f34fcba7aaa68b2aa752241370d19d79cfdc2cb4
+++ b/buildlib/pandoc-prebuilt/f34fcba7aaa68b2aa752241370d19d79cfdc2cb4
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
.\"
-.TH "MLX5DV_WR" "3" "2019\-02\-24" "mlx5" "mlx5 Programmer\[aq]s Manual"
+.TH "MLX5DV_WR" "3" "2019\-02\-24" "mlx5" "mlx5 Programmer\[cq]s Manual"
.hy
.SH NAME
.PP
@@ -9,33 +9,33 @@ mlx5dv_wr_set_dc_addr \- Attach a DC info to the last work request
.IP
.nf
\f[C]
-#include\ 
+#include 

-static\ inline\ void\ mlx5dv_wr_set_dc_addr(struct\ mlx5dv_qp_ex\ *mqp,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_ah\ *ah,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ remote_dctn,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint64_t\ remote_dc_key);
+static inline void mlx5dv_wr_set_dc_addr(struct mlx5dv_qp_ex *mqp,
+                                         struct ibv_ah *ah,
+                                         uint32_t remote_dctn,
+                                         uint64_t remote_dc_key);

-struct\ mlx5dv_mr_interleaved\ {
-\ \ \ \ uint64_t\ \ \ \ \ \ \ \ addr;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ bytes_count;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ bytes_skip;
-\ \ \ \ uint32_t\ \ \ \ \ \ \ \ lkey;
+struct mlx5dv_mr_interleaved {
+    uint64_t        addr;
+    uint32_t        bytes_count;
+    uint32_t        bytes_skip;
+    uint32_t        lkey;
};
-static\ inline\ void\ mlx5dv_wr_mr_interleaved(struct\ mlx5dv_qp_ex\ *mqp,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_mkey\ *mkey,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ access_flags,\ /*\ use\ enum\ ibv_access_flags\ */
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ repeat_count,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ num_interleaved,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ mlx5dv_mr_interleaved\ *data);
+static inline void mlx5dv_wr_mr_interleaved(struct mlx5dv_qp_ex *mqp,
+                        struct mlx5dv_mkey *mkey,
+                        uint32_t access_flags, /* use enum ibv_access_flags */
+                        uint32_t repeat_count,
+                        uint16_t num_interleaved,
+                        struct mlx5dv_mr_interleaved *data);

-static\ inline\ void\ mlx5dv_wr_mr_list(struct\ mlx5dv_qp_ex\ *mqp,
-\ \ \ \ \ \ \ \ \ \ \ \ \ \ 
\ \ \ \ \ \ \ \ struct\ mlx5dv_mkey\ *mkey, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ access_flags,\ /*\ use\ enum\ ibv_access_flags\ */ -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ uint16_t\ num_sges, -\ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ \ struct\ ibv_sge\ *sge); -\f[] +static inline void mlx5dv_wr_mr_list(struct mlx5dv_qp_ex *mqp, + struct mlx5dv_mkey *mkey, + uint32_t access_flags, /* use enum ibv_access_flags */ + uint16_t num_sges, + struct ibv_sge *sge); +\f[R] .fi .SH DESCRIPTION .PP @@ -46,7 +46,7 @@ This may be used together with or without ibv_wr_* calls. .SH USAGE .PP To use these APIs a QP must be created using mlx5dv_create_qp() with -\f[I]send_ops_flags\f[] of struct ibv_qp_init_attr_ex set. +\f[I]send_ops_flags\f[R] of struct ibv_qp_init_attr_ex set. .PP If the QP does not support all the requested work request types then QP creation will fail. @@ -61,117 +61,115 @@ the man for ibv_wr_post and mlx5dv_qp with its available builders and setters. .SS QP Specific builders .TP -.B \f[I]RC\f[] QPs -\f[I]mlx5dv_wr_mr_interleaved()\f[] +.B \f[I]RC\f[R] QPs +\f[I]mlx5dv_wr_mr_interleaved()\f[R] .RS .PP registers an interleaved memory layout by using an indirect mkey and some interleaved data. The layout of the memory pointed by the mkey after its registration will -be the \f[I]data\f[] representation for the \f[I]num_interleaved\f[] +be the \f[I]data\f[R] representation for the \f[I]num_interleaved\f[R] entries. -This single layout representation is repeated by \f[I]repeat_count\f[]. +This single layout representation is repeated by \f[I]repeat_count\f[R]. .PP -The \f[I]data\f[] as described by struct mlx5dv_mr_interleaved will hold -real data defined by \f[I]bytes_count\f[] and then a padding of -\f[I]bytes_skip\f[]. +The \f[I]data\f[R] as described by struct mlx5dv_mr_interleaved will +hold real data defined by \f[I]bytes_count\f[R] and then a padding of +\f[I]bytes_skip\f[R]. Post a successful registration, RDMA operations can use this -\f[I]mkey\f[]. +\f[I]mkey\f[R]. The hardware will scatter the data according to the pattern. -The \f[I]mkey\f[] should be used in a zero\-based mode. -The \f[I]addr\f[] field in its \f[I]ibv_sge\f[] is an offset in the +The \f[I]mkey\f[R] should be used in a zero\-based mode. +The \f[I]addr\f[R] field in its \f[I]ibv_sge\f[R] is an offset in the total data. -To create this \f[I]mkey\f[] mlx5dv_create_mkey() should be used. +To create this \f[I]mkey\f[R] mlx5dv_create_mkey() should be used. .PP Current implementation requires the IBV_SEND_INLINE option to be on in -\f[I]ibv_qp_ex\->wr_flags\f[] field. -To be able to have more than 3 \f[I]num_interleaved\f[] entries, the QP +\f[I]ibv_qp_ex\->wr_flags\f[R] field. +To be able to have more than 3 \f[I]num_interleaved\f[R] entries, the QP should be created with a larger WQE size that may fit it. -This should be done using the \f[I]max_inline_data\f[] attribute of -\f[I]struct ibv_qp_cap\f[] upon its creation. +This should be done using the \f[I]max_inline_data\f[R] attribute of +\f[I]struct ibv_qp_cap\f[R] upon its creation. .PP -As one entry will be consumed for strided header, the \f[I]mkey\f[] +As one entry will be consumed for strided header, the \f[I]mkey\f[R] should be created with one more entry than the required -\f[I]num_interleaved\f[]. +\f[I]num_interleaved\f[R]. .PP -In case \f[I]ibv_qp_ex\->wr_flags\f[] turns on IBV_SEND_SIGNALED, the +In case \f[I]ibv_qp_ex\->wr_flags\f[R] turns on IBV_SEND_SIGNALED, the reported WC opcode will be MLX5DV_WC_UMR. 
-Unregister the \f[I]mkey\f[] to enable another pattern registration +Unregister the \f[I]mkey\f[R] to enable another pattern registration should be done via ibv_post_send with IBV_WR_LOCAL_INV opcode. .RE -\f[I]mlx5dv_wr_mr_list()\f[] +\f[I]mlx5dv_wr_mr_list()\f[R] .RS .PP registers a memory layout based on list of ibv_sge. -The layout of the memory pointed by the \f[I]mkey\f[] after its -registration will be based on the list of \f[I]sge\f[] counted by -\f[I]num_sges\f[]. +The layout of the memory pointed by the \f[I]mkey\f[R] after its +registration will be based on the list of \f[I]sge\f[R] counted by +\f[I]num_sges\f[R]. Post a successful registration RDMA operations can use this -\f[I]mkey\f[], the hardware will scatter the data according to the +\f[I]mkey\f[R], the hardware will scatter the data according to the pattern. -The \f[I]mkey\f[] should be used in a zero\-based mode, the -\f[I]addr\f[] field in its \f[I]ibv_sge\f[] is an offset in the total +The \f[I]mkey\f[R] should be used in a zero\-based mode, the +\f[I]addr\f[R] field in its \f[I]ibv_sge\f[R] is an offset in the total data. .PP Current implementation requires the IBV_SEND_INLINE option to be on in -\f[I]ibv_qp_ex\->wr_flags\f[] field. -To be able to have more than 4 \f[I]num_sge\f[] entries, the QP should +\f[I]ibv_qp_ex\->wr_flags\f[R] field. +To be able to have more than 4 \f[I]num_sge\f[R] entries, the QP should be created with a larger WQE size that may fit it. -This should be done using the \f[I]max_inline_data\f[] attribute of -\f[I]struct ibv_qp_cap\f[] upon its creation. +This should be done using the \f[I]max_inline_data\f[R] attribute of +\f[I]struct ibv_qp_cap\f[R] upon its creation. .PP -In case \f[I]ibv_qp_ex\->wr_flags\f[] turns on IBV_SEND_SIGNALED, the +In case \f[I]ibv_qp_ex\->wr_flags\f[R] turns on IBV_SEND_SIGNALED, the reported WC opcode will be MLX5DV_WC_UMR. -Unregister the \f[I]mkey\f[] to enable other pattern registration should -be done via ibv_post_send with IBV_WR_LOCAL_INV opcode. +Unregister the \f[I]mkey\f[R] to enable other pattern registration +should be done via ibv_post_send with IBV_WR_LOCAL_INV opcode. .RE .SS QP Specific setters .TP -.B \f[I]DCI\f[] QPs -\f[I]mlx5dv_wr_set_dc_addr()\f[] must be called to set the DCI WR +.B \f[I]DCI\f[R] QPs +\f[I]mlx5dv_wr_set_dc_addr()\f[R] must be called to set the DCI WR properties. -The destination address of the work is specified by \f[I]ah\f[], the -remote DCT number is specified by \f[I]remote_dctn\f[] and the DC key is -specified by \f[I]remote_dc_key\f[]. +The destination address of the work is specified by \f[I]ah\f[R], the +remote DCT number is specified by \f[I]remote_dctn\f[R] and the DC key +is specified by \f[I]remote_dc_key\f[R]. This setter is available when the QP transport is DCI and send_ops_flags in struct ibv_qp_init_attr_ex is set. The available builders and setters for DCI QP are the same as RC QP. 
-.RS
-.RE
.SH EXAMPLE
.IP
.nf
\f[C]
-/*\ create\ DC\ QP\ type\ and\ specify\ the\ required\ send\ opcodes\ */
-attr_ex.qp_type\ =\ IBV_QPT_DRIVER;
-attr_ex.comp_mask\ |=\ IBV_QP_INIT_ATTR_SEND_OPS_FLAGS;
-attr_ex.send_ops_flags\ |=\ IBV_QP_EX_WITH_RDMA_WRITE;
+/* create DC QP type and specify the required send opcodes */
+attr_ex.qp_type = IBV_QPT_DRIVER;
+attr_ex.comp_mask |= IBV_QP_INIT_ATTR_SEND_OPS_FLAGS;
+attr_ex.send_ops_flags |= IBV_QP_EX_WITH_RDMA_WRITE;
-attr_dv.comp_mask\ |=\ MLX5DV_QP_INIT_ATTR_MASK_DC;
-attr_dv.dc_init_attr.dc_type\ =\ MLX5DV_DCTYPE_DCI;
+attr_dv.comp_mask |= MLX5DV_QP_INIT_ATTR_MASK_DC;
+attr_dv.dc_init_attr.dc_type = MLX5DV_DCTYPE_DCI;
-ibv_qp\ *qp\ =\ mlx5dv_create_qp(ctx,\ attr_ex,\ attr_dv);
-ibv_qp_ex\ *qpx\ =\ ibv_qp_to_qp_ex(qp);
-mlx5dv_qp_ex\ *mqpx\ =\ mlx5dv_qp_ex_from_ibv_qp_ex(qpx);
+ibv_qp *qp = mlx5dv_create_qp(ctx, attr_ex, attr_dv);
+ibv_qp_ex *qpx = ibv_qp_to_qp_ex(qp);
+mlx5dv_qp_ex *mqpx = mlx5dv_qp_ex_from_ibv_qp_ex(qpx);
ibv_wr_start(qpx);
-/*\ Use\ ibv_qp_ex\ object\ to\ set\ WR\ generic\ attributes\ */
-qpx\->wr_id\ =\ my_wr_id_1;
-qpx\->wr_flags\ =\ IBV_SEND_SIGNALED;
-ibv_wr_rdma_write(qpx,\ rkey,\ remote_addr_1);
-ibv_wr_set_sge(qpx,\ lkey,\ local_addr_1,\ length_1);
+/* Use ibv_qp_ex object to set WR generic attributes */
+qpx\->wr_id = my_wr_id_1;
+qpx\->wr_flags = IBV_SEND_SIGNALED;
+ibv_wr_rdma_write(qpx, rkey, remote_addr_1);
+ibv_wr_set_sge(qpx, lkey, local_addr_1, length_1);
-/*\ Use\ mlx5\ DC\ setter\ using\ mlx5dv_qp_ex\ object\ */
-mlx5dv_wr_set_wr_dc_addr(mqpx,\ ah,\ remote_dctn,\ remote_dc_key);
+/* Use mlx5 DC setter using mlx5dv_qp_ex object */
+mlx5dv_wr_set_wr_dc_addr(mqpx, ah, remote_dctn, remote_dc_key);
-ret\ =\ ibv_wr_complete(qpx);
-\f[]
+ret = ibv_wr_complete(qpx);
+\f[R]
.fi
.SH SEE ALSO
.PP
-\f[B]ibv_post_send\f[](3), \f[B]ibv_create_qp_ex(3)\f[],
-\f[B]ibv_wr_post(3)\f[], \f[B]mlx5dv_create_mkey(3)\f[].
+\f[B]ibv_post_send\f[R](3), \f[B]ibv_create_qp_ex(3)\f[R],
+\f[B]ibv_wr_post(3)\f[R], \f[B]mlx5dv_create_mkey(3)\f[R].
.SH AUTHOR
.PP
Guy Levi
diff --git a/buildlib/pandoc-prebuilt/f951af66c47de282e7c5fede594de5d30db0292a b/buildlib/pandoc-prebuilt/f951af66c47de282e7c5fede594de5d30db0292a
new file mode 100644
index 0000000..9eea2ac
--- /dev/null
+++ b/buildlib/pandoc-prebuilt/f951af66c47de282e7c5fede594de5d30db0292a
@@ -0,0 +1,58 @@
+.\" Automatically generated by Pandoc 2.5
+.\"
+.TH "ibv_import_mr ibv_unimport_mr" "3" "2020\-5\-3" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
+.hy
+.SH NAME
+.PP
+ibv_import_mr \- import an MR from a given ibv_pd
+.PP
+ibv_unimport_mr \- unimport an MR
+.SH SYNOPSIS
+.IP
+.nf
+\f[C]
+#include <infiniband/verbs.h>
+
+struct ibv_mr *ibv_import_mr(struct ibv_pd *pd, uint32_t mr_handle);
+void ibv_unimport_mr(struct ibv_mr *mr);
+\f[R]
+.fi
+.SH DESCRIPTION
+.PP
+\f[B]ibv_import_mr()\f[R] returns a memory region (MR) that is
+associated with the given \f[I]mr_handle\f[R] in the RDMA context
+associated with the given \f[I]pd\f[R].
+.PP
+The input \f[I]mr_handle\f[R] value must be a valid kernel handle for an
+MR object in the associated RDMA context.
+It can be obtained from the original MR by reading its ibv_mr\->handle
+member value.
+.PP
+\f[B]ibv_unimport_mr()\f[R] unimports the MR.
+Once use of the MR has ended, either ibv_dereg_mr() or ibv_unimport_mr()
+should be called.
+The first goes to the kernel and destroys the object, while the second
+only cleans up the local state created by the import, without calling
+the kernel.
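As a minimal sketch of the import flow this new page describes (illustrative only, not part of the patch): assuming the exporting process has already passed its context cmd_fd (for example via SCM_RIGHTS over a Unix socket) together with the PD and MR kernel handles, an importing process could proceed as below; the helper name and the IPC mechanism are hypothetical.

```c
#include <stddef.h>
#include <infiniband/verbs.h>

/* Import the exporter's device, PD and MR, in that order; each
 * handle is the kernel handle read from the exporter's objects. */
static struct ibv_mr *import_peer_mr(int cmd_fd, uint32_t pd_handle,
				     uint32_t mr_handle)
{
	struct ibv_context *ctx = ibv_import_device(cmd_fd);
	struct ibv_pd *pd;

	if (!ctx)
		return NULL;

	pd = ibv_import_pd(ctx, pd_handle);
	if (!pd)
		return NULL;

	return ibv_import_mr(pd, mr_handle);
}

/* When finished, release only the local import state; the exporting
 * process remains the owner and eventually calls ibv_dereg_mr():
 *
 *	ibv_unimport_mr(mr);
 *	ibv_unimport_pd(pd);
 */
```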
+.PP
+It is the responsibility of the application to coordinate between all
+ibv_context(s) that use this MR.
+Once the MR has been destroyed, no other process can touch the object
+except to unimport it.
+All users of the context must collaborate to ensure this.
+.SH RETURN VALUE
+.PP
+\f[B]ibv_import_mr()\f[R] returns a pointer to the allocated MR, or NULL
+if the request fails.
+.SH NOTES
+.PP
+The \f[I]addr\f[R] field in the imported MR is not applicable; a NULL
+value is expected.
+.SH SEE ALSO
+.PP
+\f[B]ibv_reg_mr\f[R](3), \f[B]ibv_reg_dm_mr\f[R](3),
+\f[B]ibv_reg_mr_iova\f[R](3), \f[B]ibv_reg_mr_iova2\f[R](3),
+\f[B]ibv_dereg_mr\f[R](3)
+.SH AUTHOR
+.PP
+Yishai Hadas
diff --git a/buildlib/pandoc-prebuilt/fa29d88a48409a61841ea15857c81feb01fd166d b/buildlib/pandoc-prebuilt/fa29d88a48409a61841ea15857c81feb01fd166d
index 6da35cf..b02acb0 100644
--- a/buildlib/pandoc-prebuilt/fa29d88a48409a61841ea15857c81feb01fd166d
+++ b/buildlib/pandoc-prebuilt/fa29d88a48409a61841ea15857c81feb01fd166d
@@ -1,4 +1,4 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
.\"
.TH "mlx5dv_devx_umem_reg, mlx5dv_devx_umem_dereg" "3" "" "" ""
.hy
@@ -12,18 +12,18 @@
mlx5dv_devx_umem_dereg \- Deregister a devx umem object
.IP
.nf
\f[C]
-#include\ 
+#include <infiniband/mlx5dv.h>
-struct\ mlx5dv_devx_umem\ {
-\ \ \ \ uint32_t\ umem_id;
+struct mlx5dv_devx_umem {
+    uint32_t umem_id;
};
-struct\ mlx5dv_devx_umem\ *
-mlx5dv_devx_umem_reg(struct\ ibv_context\ *context,\ void\ *addr,\ size_t\ size,
-\ \ \ \ \ \ \ \ \ \ \ \ \ uint32_t\ access)
+struct mlx5dv_devx_umem *
+mlx5dv_devx_umem_reg(struct ibv_context *context, void *addr, size_t size,
+             uint32_t access)
-int\ mlx5dv_devx_umem_dereg(struct\ mlx5dv_devx_umem\ *dv_devx_umem)
-\f[]
+int mlx5dv_devx_umem_dereg(struct mlx5dv_devx_umem *dv_devx_umem)
+\f[R]
.fi
.SH DESCRIPTION
.PP
@@ -38,51 +38,44 @@
memory.
.PP
The user will use that UMEM ID in device direct commands that use this
memory instead of the physical addresses list, for example upon
-\f[I]mlx5dv_devx_obj_create\f[] to create a QP.
+\f[I]mlx5dv_devx_obj_create\f[R] to create a QP.
.SH ARGUMENTS
.TP
-.B \f[I]context\f[]
+.B \f[I]context\f[R]
.IP
.nf
\f[C]
-RDMA\ device\ context\ to\ create\ the\ action\ on.
-\f[]
+RDMA device context to create the action on.
+\f[R]
.fi
-.RS
-.RE
.TP
-.B \f[I]addr\f[]
+.B \f[I]addr\f[R]
The memory start address to register.
-.RS
-.RE
.TP
-.B \f[I]size\f[]
+.B \f[I]size\f[R]
.IP
.nf
\f[C]
-The\ size\ of\ *addr*\ buffer.
-\f[]
+The size of *addr* buffer.
+\f[R]
.fi
-.RS
-.RE
.TP
-.B \f[I]access\f[]
+.B \f[I]access\f[R]
The desired memory protection attributes; it is either 0 or the bitwise
-OR of one or more of \f[I]enum ibv_access_flags\f[].
-.RS
-.RE
+OR of one or more of \f[I]enum ibv_access_flags\f[R].
.SH RETURN VALUE
.PP
-Upon success \f[I]mlx5dv_devx_umem_reg\f[] will return a new \f[I]struct
-mlx5dv_devx_umem\f[] object, on error NULL will be returned and errno
-will be set.
+Upon success \f[I]mlx5dv_devx_umem_reg\f[R] will return a new
+\f[I]struct mlx5dv_devx_umem\f[R] object, on error NULL will be returned
+and errno will be set.
.PP
-\f[I]mlx5dv_devx_umem_dereg\f[] returns 0 on success, or the value of
+\f[I]mlx5dv_devx_umem_dereg\f[R] returns 0 on success, or the value of
errno on failure (which indicates the failure reason).
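For reference, a minimal usage sketch of the two calls above (illustrative only, not part of the patch); the page-aligned 4 KiB buffer and the access flag are arbitrary choices for the example.

```c
#include <errno.h>
#include <stdlib.h>
#include <unistd.h>
#include <infiniband/mlx5dv.h>

static int umem_round_trip(struct ibv_context *ctx)
{
	size_t size = 4096;
	struct mlx5dv_devx_umem *umem;
	void *buf;
	int ret;

	/* The registered range must be addressable process memory */
	if (posix_memalign(&buf, sysconf(_SC_PAGESIZE), size))
		return ENOMEM;

	umem = mlx5dv_devx_umem_reg(ctx, buf, size, IBV_ACCESS_LOCAL_WRITE);
	if (!umem) {
		ret = errno;
		free(buf);
		return ret;
	}

	/* umem->umem_id would now be placed into DevX commands
	 * (e.g. mlx5dv_devx_obj_create()) instead of a physical
	 * address list. */

	ret = mlx5dv_devx_umem_dereg(umem);
	free(buf);
	return ret;
}
```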
.SH SEE ALSO
.PP
-\f[I]mlx5dv_open_device(3)\f[], \f[I]ibv_reg_mr(3)\f[],
-\f[I]mlx5dv_devx_obj_create(3)\f[]
-.SH AUTHOR
+\f[I]mlx5dv_open_device(3)\f[R], \f[I]ibv_reg_mr(3)\f[R],
+\f[I]mlx5dv_devx_obj_create(3)\f[R]
+.SH AUTHOR
.PP
Yishai Hadas
diff --git a/buildlib/pandoc-prebuilt/fe1de88695b9f8551b1f861987b4188fdd592002 b/buildlib/pandoc-prebuilt/fe1de88695b9f8551b1f861987b4188fdd592002
index 79185e0..4093607 100644
--- a/buildlib/pandoc-prebuilt/fe1de88695b9f8551b1f861987b4188fdd592002
+++ b/buildlib/pandoc-prebuilt/fe1de88695b9f8551b1f861987b4188fdd592002
@@ -1,6 +1,6 @@
-.\" Automatically generated by Pandoc 1.19.2.4
+.\" Automatically generated by Pandoc 2.5
.\"
-.TH "IBV_EVENT_TYPE_STR" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[aq]s Manual"
+.TH "IBV_EVENT_TYPE_STR" "3" "2006\-10\-31" "libibverbs" "Libibverbs Programmer\[cq]s Manual"
.hy
.SH NAME
.PP
@@ -13,25 +13,25 @@
ibv_port_state_str \- Return string describing port_state enum value
.IP
.nf
\f[C]
-#include\ 
+#include <infiniband/verbs.h>
-const\ char\ *ibv_event_type_str(enum\ ibv_event_type\ event_type);
+const char *ibv_event_type_str(enum ibv_event_type event_type);
-const\ char\ *ibv_node_type_str(enum\ ibv_node_type\ node_type);
+const char *ibv_node_type_str(enum ibv_node_type node_type);
-const\ char\ *ibv_port_state_str(enum\ ibv_port_state\ port_state);
-\f[]
+const char *ibv_port_state_str(enum ibv_port_state port_state);
+\f[R]
.fi
.SH DESCRIPTION
.PP
-\f[B]ibv_node_type_str()\f[] returns a string describing the node type
-enum value \f[I]node_type\f[].
+\f[B]ibv_node_type_str()\f[R] returns a string describing the node type
+enum value \f[I]node_type\f[R].
.PP
-\f[B]ibv_port_state_str()\f[] returns a string describing the port state
-enum value \f[I]port_state\f[].
+\f[B]ibv_port_state_str()\f[R] returns a string describing the port
+state enum value \f[I]port_state\f[R].
.PP
-\f[B]ibv_event_type_str()\f[] returns a string describing the event type
-enum value \f[I]event_type\f[].
+\f[B]ibv_event_type_str()\f[R] returns a string describing the event
+type enum value \f[I]event_type\f[R].
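A tiny usage sketch for these string helpers (illustrative only, not part of the patch), printing a readable name for an asynchronous event:

```c
#include <stdio.h>
#include <infiniband/verbs.h>

static void report_async_event(struct ibv_context *ctx)
{
	struct ibv_async_event ev;

	/* Blocks until the device reports an async event */
	if (ibv_get_async_event(ctx, &ev))
		return;
	printf("async event: %s\n", ibv_event_type_str(ev.event_type));
	ibv_ack_async_event(&ev);
}
```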
.SH RETURN VALUE .PP These functions return a constant string that describes the enum value diff --git a/buildlib/pyverbs_functions.cmake b/buildlib/pyverbs_functions.cmake index ca41fbb..953cec2 100644 --- a/buildlib/pyverbs_functions.cmake +++ b/buildlib/pyverbs_functions.cmake @@ -23,7 +23,7 @@ function(rdma_cython_module PY_MODULE LINKER_FLAGS) string(REGEX REPLACE "\\.so$" "" SONAME "${FILENAME}${CMAKE_PYTHON_SO_SUFFIX}") add_library(${SONAME} SHARED ${CFILE}) set_target_properties(${SONAME} PROPERTIES - COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -fno-strict-aliasing -Wno-unused-function -Wno-redundant-decls -Wno-shadow -Wno-cast-function-type -Wno-implicit-fallthrough -Wno-unknown-warning -Wno-unknown-warning-option ${NO_VAR_TRACKING_FLAGS}" + COMPILE_FLAGS "${CMAKE_C_FLAGS} -fPIC -fno-strict-aliasing -Wno-unused-function -Wno-redundant-decls -Wno-shadow -Wno-cast-function-type -Wno-implicit-fallthrough -Wno-unknown-warning -Wno-unknown-warning-option -Wno-deprecated-declarations ${NO_VAR_TRACKING_FLAGS}" LIBRARY_OUTPUT_DIRECTORY "${BUILD_PYTHON}/${PY_MODULE}" PREFIX "") target_link_libraries(${SONAME} LINK_PRIVATE ${PYTHON_LIBRARIES} ibverbs rdmacm ${LINKER_FLAGS}) diff --git a/buildlib/sparse-include/31/bits-sysmacros.h.diff b/buildlib/sparse-include/31/bits-sysmacros.h.diff new file mode 100644 index 0000000..f5ca822 --- /dev/null +++ b/buildlib/sparse-include/31/bits-sysmacros.h.diff @@ -0,0 +1,24 @@ +--- /usr/include/x86_64-linux-gnu/bits/sysmacros.h 2020-04-14 19:26:04.000000000 +0000 ++++ include/bits/sysmacros.h 2020-05-05 19:03:23.910980758 +0000 +@@ -40,8 +40,8 @@ + __SYSMACROS_DECLARE_MAJOR (DECL_TEMPL) \ + { \ + unsigned int __major; \ +- __major = ((__dev & (__dev_t) 0x00000000000fff00u) >> 8); \ +- __major |= ((__dev & (__dev_t) 0xfffff00000000000u) >> 32); \ ++ __major = ((__dev & (__dev_t) 0x00000000000fff00ul) >> 8); \ ++ __major |= ((__dev & (__dev_t) 0xfffff00000000000ul) >> 32); \ + return __major; \ + } + +@@ -52,8 +52,8 @@ + __SYSMACROS_DECLARE_MINOR (DECL_TEMPL) \ + { \ + unsigned int __minor; \ +- __minor = ((__dev & (__dev_t) 0x00000000000000ffu) >> 0); \ +- __minor |= ((__dev & (__dev_t) 0x00000ffffff00000u) >> 12); \ ++ __minor = ((__dev & (__dev_t) 0x00000000000000fful) >> 0); \ ++ __minor |= ((__dev & (__dev_t) 0x00000ffffff00000ul) >> 12); \ + return __minor; \ + } + diff --git a/buildlib/sparse-include/31/netinet-in.h.diff b/buildlib/sparse-include/31/netinet-in.h.diff new file mode 100644 index 0000000..9d1a13d --- /dev/null +++ b/buildlib/sparse-include/31/netinet-in.h.diff @@ -0,0 +1,123 @@ +--- /usr/include/netinet/in.h 2020-04-14 19:26:04.000000000 +0000 ++++ include/netinet/in.h 2020-05-05 19:11:08.250904392 +0000 +@@ -22,12 +22,12 @@ + #include + #include + #include +- ++#include + + __BEGIN_DECLS + + /* Internet address. */ +-typedef uint32_t in_addr_t; ++typedef __be32 in_addr_t; + struct in_addr + { + in_addr_t s_addr; +@@ -116,7 +116,7 @@ + #endif /* !__USE_KERNEL_IPV6_DEFS */ + + /* Type to represent a port. */ +-typedef uint16_t in_port_t; ++typedef __be16 in_port_t; + + /* Standard well-known ports. 
*/ + enum +@@ -175,37 +175,37 @@ + #define IN_CLASSB_HOST (0xffffffff & ~IN_CLASSB_NET) + #define IN_CLASSB_MAX 65536 + +-#define IN_CLASSC(a) ((((in_addr_t)(a)) & 0xe0000000) == 0xc0000000) ++#define IN_CLASSC(a) ((((uint32_t)(a)) & 0xe0000000) == 0xc0000000) + #define IN_CLASSC_NET 0xffffff00 + #define IN_CLASSC_NSHIFT 8 + #define IN_CLASSC_HOST (0xffffffff & ~IN_CLASSC_NET) + +-#define IN_CLASSD(a) ((((in_addr_t)(a)) & 0xf0000000) == 0xe0000000) ++#define IN_CLASSD(a) ((((uint32_t)(a)) & 0xf0000000) == 0xe0000000) + #define IN_MULTICAST(a) IN_CLASSD(a) + +-#define IN_EXPERIMENTAL(a) ((((in_addr_t)(a)) & 0xe0000000) == 0xe0000000) +-#define IN_BADCLASS(a) ((((in_addr_t)(a)) & 0xf0000000) == 0xf0000000) ++#define IN_EXPERIMENTAL(a) ((((uint32_t)(a)) & 0xe0000000) == 0xe0000000) ++#define IN_BADCLASS(a) ((((uint32_t)(a)) & 0xf0000000) == 0xf0000000) + + /* Address to accept any incoming messages. */ +-#define INADDR_ANY ((in_addr_t) 0x00000000) ++#define INADDR_ANY ((uint32_t) 0x00000000) + /* Address to send to all hosts. */ +-#define INADDR_BROADCAST ((in_addr_t) 0xffffffff) ++#define INADDR_BROADCAST ((uint32_t) 0xffffffff) + /* Address indicating an error return. */ +-#define INADDR_NONE ((in_addr_t) 0xffffffff) ++#define INADDR_NONE ((uint32_t) 0xffffffff) + + /* Network number for local host loopback. */ + #define IN_LOOPBACKNET 127 + /* Address to loopback in software to local host. */ + #ifndef INADDR_LOOPBACK +-# define INADDR_LOOPBACK ((in_addr_t) 0x7f000001) /* Inet 127.0.0.1. */ ++# define INADDR_LOOPBACK ((uint32_t) 0x7f000001) /* Inet 127.0.0.1. */ + #endif + + /* Defines for Multicast INADDR. */ +-#define INADDR_UNSPEC_GROUP ((in_addr_t) 0xe0000000) /* 224.0.0.0 */ +-#define INADDR_ALLHOSTS_GROUP ((in_addr_t) 0xe0000001) /* 224.0.0.1 */ +-#define INADDR_ALLRTRS_GROUP ((in_addr_t) 0xe0000002) /* 224.0.0.2 */ +-#define INADDR_ALLSNOOPERS_GROUP ((in_addr_t) 0xe000006a) /* 224.0.0.106 */ +-#define INADDR_MAX_LOCAL_GROUP ((in_addr_t) 0xe00000ff) /* 224.0.0.255 */ ++#define INADDR_UNSPEC_GROUP ((uint32_t) 0xe0000000) /* 224.0.0.0 */ ++#define INADDR_ALLHOSTS_GROUP ((uint32_t) 0xe0000001) /* 224.0.0.1 */ ++#define INADDR_ALLRTRS_GROUP ((uint32_t) 0xe0000002) /* 224.0.0.2 */ ++#define INADDR_ALLSNOOPERS_GROUP ((uint32_t) 0xe000006a) /* 224.0.0.106 */ ++#define INADDR_MAX_LOCAL_GROUP ((uint32_t) 0xe00000ff) /* 224.0.0.255 */ + + #if !__USE_KERNEL_IPV6_DEFS + /* IPv6 address */ +@@ -214,8 +214,8 @@ + union + { + uint8_t __u6_addr8[16]; +- uint16_t __u6_addr16[8]; +- uint32_t __u6_addr32[4]; ++ __be16 __u6_addr16[8]; ++ __be32 __u6_addr32[4]; + } __in6_u; + #define s6_addr __in6_u.__u6_addr8 + #ifdef __USE_MISC +@@ -254,7 +254,7 @@ + { + __SOCKADDR_COMMON (sin6_); + in_port_t sin6_port; /* Transport layer port # */ +- uint32_t sin6_flowinfo; /* IPv6 flow information */ ++ __be32 sin6_flowinfo; /* IPv6 flow information */ + struct in6_addr sin6_addr; /* IPv6 address */ + uint32_t sin6_scope_id; /* IPv6 scope-id */ + }; +@@ -372,12 +372,12 @@ + this was a short-sighted decision since on different systems the types + may have different representations but the values are always the same. 
*/ + +-extern uint32_t ntohl (uint32_t __netlong) __THROW __attribute__ ((__const__)); +-extern uint16_t ntohs (uint16_t __netshort) ++extern uint32_t ntohl (__be32 __netlong) __THROW __attribute__ ((__const__)); ++extern uint16_t ntohs (__be16 __netshort) + __THROW __attribute__ ((__const__)); +-extern uint32_t htonl (uint32_t __hostlong) ++extern __be32 htonl (uint32_t __hostlong) + __THROW __attribute__ ((__const__)); +-extern uint16_t htons (uint16_t __hostshort) ++extern __be16 htons (uint16_t __hostshort) + __THROW __attribute__ ((__const__)); + + #include +@@ -386,7 +386,7 @@ + #include + #include + +-#ifdef __OPTIMIZE__ ++#ifdef __disabled_OPTIMIZE__ + /* We can optimize calls to the conversion functions. Either nothing has + to be done or we are using directly the byte-swapping functions which + often can be inlined. */ diff --git a/buildlib/sparse-include/31/stdlib.h.diff b/buildlib/sparse-include/31/stdlib.h.diff new file mode 100644 index 0000000..6fa42ea --- /dev/null +++ b/buildlib/sparse-include/31/stdlib.h.diff @@ -0,0 +1,23 @@ +--- /usr/include/stdlib.h 2020-04-14 19:26:04.000000000 +0000 ++++ include/stdlib.h 2020-05-05 19:03:23.910980758 +0000 +@@ -130,6 +130,20 @@ + + /* Likewise for '_FloatN' and '_FloatNx'. */ + ++/* For whatever reason our sparse does not understand these new compiler types */ ++#undef __GLIBC_USE_IEC_60559_TYPES_EXT ++#define __GLIBC_USE_IEC_60559_TYPES_EXT 0 ++#undef __HAVE_FLOAT32 ++#define __HAVE_FLOAT32 0 ++#undef __HAVE_FLOAT32X ++#define __HAVE_FLOAT32X 0 ++#undef __HAVE_FLOAT64 ++#define __HAVE_FLOAT64 0 ++#undef __HAVE_FLOAT64X ++#define __HAVE_FLOAT64X 0 ++#undef __HAVE_FLOAT128 ++#define __HAVE_FLOAT128 0 ++ + #if __HAVE_FLOAT16 && __GLIBC_USE (IEC_60559_TYPES_EXT) + extern _Float16 strtof16 (const char *__restrict __nptr, + char **__restrict __endptr) diff --git a/buildlib/sparse-include/31/sys-socket.h.diff b/buildlib/sparse-include/31/sys-socket.h.diff new file mode 100644 index 0000000..ee9e0cf --- /dev/null +++ b/buildlib/sparse-include/31/sys-socket.h.diff @@ -0,0 +1,11 @@ +--- /usr/include/x86_64-linux-gnu/sys/socket.h 2020-04-14 19:26:04.000000000 +0000 ++++ include/sys/socket.h 2020-05-05 19:03:23.910980758 +0000 +@@ -54,7 +54,7 @@ + uses with any of the listed types to be allowed without complaint. + G++ 2.7 does not support transparent unions so there we want the + old-style declaration, too. */ +-#if defined __cplusplus || !__GNUC_PREREQ (2, 7) || !defined __USE_GNU ++#if 1 + # define __SOCKADDR_ARG struct sockaddr *__restrict + # define __CONST_SOCKADDR_ARG const struct sockaddr * + #else diff --git a/debian/changelog b/debian/changelog index 8120d7b..82e7116 100644 --- a/debian/changelog +++ b/debian/changelog @@ -1,4 +1,4 @@ -rdma-core (29.0-1) unstable; urgency=medium +rdma-core (32.0-1) unstable; urgency=medium * New upstream release. 
diff --git a/debian/control b/debian/control index a97b2f0..0837cfc 100644 --- a/debian/control +++ b/debian/control @@ -21,7 +21,7 @@ Build-Depends: cmake (>= 2.8.11), python3-docutils, valgrind [amd64 arm64 armhf i386 mips mips64el mipsel powerpc ppc64 ppc64el s390x] Rules-Requires-Root: no -Standards-Version: 4.4.1 +Standards-Version: 4.5.0 Vcs-Git: https://github.com/linux-rdma/rdma-core.git Vcs-Browser: https://github.com/linux-rdma/rdma-core Homepage: https://github.com/linux-rdma/rdma-core diff --git a/debian/ibverbs-providers.symbols b/debian/ibverbs-providers.symbols index 34decae..a60df56 100644 --- a/debian/ibverbs-providers.symbols +++ b/debian/ibverbs-providers.symbols @@ -21,6 +21,9 @@ libmlx5.so.1 ibverbs-providers #MINVER# MLX5_1.11@MLX5_1.11 25 MLX5_1.12@MLX5_1.12 28 MLX5_1.13@MLX5_1.13 29 + MLX5_1.14@MLX5_1.14 30 + MLX5_1.15@MLX5_1.15 31 + MLX5_1.16@MLX5_1.16 32 mlx5dv_init_obj@MLX5_1.0 13 mlx5dv_init_obj@MLX5_1.2 15 mlx5dv_query_device@MLX5_1.0 13 @@ -98,6 +101,13 @@ libmlx5.so.1 ibverbs-providers #MINVER# mlx5dv_free_var@MLX5_1.12 28 mlx5dv_pp_alloc@MLX5_1.13 29 mlx5dv_pp_free@MLX5_1.13 29 + mlx5dv_dr_action_create_default_miss@MLX5_1.14 30 + mlx5dv_dr_domain_set_reclaim_device_memory@MLX5_1.14 30 + mlx5dv_modify_qp_lag_port@MLX5_1.14 30 + mlx5dv_query_qp_lag_port@MLX5_1.14 30 + mlx5dv_dr_action_create_dest_devx_tir@MLX5_1.15 31 + mlx5dv_dr_action_create_flow_sampler@MLX5_1.16 32 + mlx5dv_dr_action_create_dest_array@MLX5_1.16 32 libefa.so.1 ibverbs-providers #MINVER# * Build-Depends-Package: libibverbs-dev EFA_1.0@EFA_1.0 24 diff --git a/debian/libibumad3.symbols b/debian/libibumad3.symbols index f399f6c..5f61e1c 100644 --- a/debian/libibumad3.symbols +++ b/debian/libibumad3.symbols @@ -2,6 +2,7 @@ libibumad.so.3 libibumad3 #MINVER# * Build-Depends-Package: libibumad-dev IBUMAD_1.0@IBUMAD_1.0 1.3.9 IBUMAD_1.1@IBUMAD_1.1 3.1.26 + IBUMAD_1.2@IBUMAD_1.2 3.2.30 umad_addr_dump@IBUMAD_1.0 1.3.9 umad_attribute_str@IBUMAD_1.0 1.3.10.2 umad_class_str@IBUMAD_1.0 1.3.10.2 @@ -38,5 +39,6 @@ libibumad.so.3 libibumad3 #MINVER# umad_set_grh@IBUMAD_1.0 1.3.9 umad_set_pkey@IBUMAD_1.0 1.3.9 umad_size@IBUMAD_1.0 1.3.9 + umad_sort_ca_device_list@IBUMAD_1.2 3.2.30 umad_status@IBUMAD_1.0 1.3.9 umad_unregister@IBUMAD_1.0 1.3.9 diff --git a/debian/libibverbs1.symbols b/debian/libibverbs1.symbols index ec40b29..99257de 100644 --- a/debian/libibverbs1.symbols +++ b/debian/libibverbs1.symbols @@ -6,7 +6,12 @@ libibverbs.so.1 libibverbs1 #MINVER# IBVERBS_1.6@IBVERBS_1.6 24 IBVERBS_1.7@IBVERBS_1.7 25 IBVERBS_1.8@IBVERBS_1.8 28 + IBVERBS_1.9@IBVERBS_1.9 30 + IBVERBS_1.10@IBVERBS_1.10 31 + IBVERBS_1.11@IBVERBS_1.11 32 (symver)IBVERBS_PRIVATE_25 25 + _ibv_query_gid_ex@IBVERBS_1.11 32 + _ibv_query_gid_table@IBVERBS_1.11 32 ibv_ack_async_event@IBVERBS_1.0 1.1.6 ibv_ack_async_event@IBVERBS_1.1 1.1.6 ibv_ack_cq_events@IBVERBS_1.0 1.1.6 @@ -58,12 +63,16 @@ libibverbs.so.1 libibverbs1 #MINVER# ibv_get_cq_event@IBVERBS_1.1 1.1.6 ibv_get_device_guid@IBVERBS_1.0 1.1.6 ibv_get_device_guid@IBVERBS_1.1 1.1.6 + ibv_get_device_index@IBVERBS_1.9 30 ibv_get_device_list@IBVERBS_1.0 1.1.6 ibv_get_device_list@IBVERBS_1.1 1.1.6 ibv_get_device_name@IBVERBS_1.0 1.1.6 ibv_get_device_name@IBVERBS_1.1 1.1.6 ibv_get_pkey_index@IBVERBS_1.5 20 ibv_get_sysfs_path@IBVERBS_1.0 1.1.6 + ibv_import_device@IBVERBS_1.10 31 + ibv_import_mr@IBVERBS_1.10 31 + ibv_import_pd@IBVERBS_1.10 31 ibv_init_ah_from_wc@IBVERBS_1.1 1.1.6 ibv_modify_qp@IBVERBS_1.0 1.1.6 ibv_modify_qp@IBVERBS_1.1 1.1.6 @@ -76,6 +85,7 @@ libibverbs.so.1 libibverbs1 
#MINVER# ibv_qp_to_qp_ex@IBVERBS_1.6 24 ibv_query_device@IBVERBS_1.0 1.1.6 ibv_query_device@IBVERBS_1.1 1.1.6 + ibv_query_ece@IBVERBS_1.10 31 ibv_query_gid@IBVERBS_1.0 1.1.6 ibv_query_gid@IBVERBS_1.1 1.1.6 ibv_query_pkey@IBVERBS_1.0 1.1.6 @@ -98,6 +108,9 @@ libibverbs.so.1 libibverbs1 #MINVER# ibv_resize_cq@IBVERBS_1.0 1.1.6 ibv_resize_cq@IBVERBS_1.1 1.1.6 ibv_resolve_eth_l2_from_gid@IBVERBS_1.1 1.2.0 + ibv_set_ece@IBVERBS_1.10 31 + ibv_unimport_mr@IBVERBS_1.10 31 + ibv_unimport_pd@IBVERBS_1.10 31 ibv_wc_status_str@IBVERBS_1.1 1.1.6 mbps_to_ibv_rate@IBVERBS_1.1 1.1.8 mult_to_ibv_rate@IBVERBS_1.0 1.1.6 diff --git a/debian/librdmacm-dev.install b/debian/librdmacm-dev.install index e12c300..68835cb 100644 --- a/debian/librdmacm-dev.install +++ b/debian/librdmacm-dev.install @@ -6,57 +6,6 @@ usr/include/rdma/rsocket.h usr/lib/*/librdmacm*.so usr/lib/*/librdmacm.a usr/lib/*/pkgconfig/librdmacm.pc -usr/share/man/man3/rdma_accept.3 -usr/share/man/man3/rdma_ack_cm_event.3 -usr/share/man/man3/rdma_bind_addr.3 -usr/share/man/man3/rdma_connect.3 -usr/share/man/man3/rdma_create_ep.3 -usr/share/man/man3/rdma_create_event_channel.3 -usr/share/man/man3/rdma_create_id.3 -usr/share/man/man3/rdma_create_qp.3 -usr/share/man/man3/rdma_create_srq.3 -usr/share/man/man3/rdma_dereg_mr.3 -usr/share/man/man3/rdma_destroy_ep.3 -usr/share/man/man3/rdma_destroy_event_channel.3 -usr/share/man/man3/rdma_destroy_id.3 -usr/share/man/man3/rdma_destroy_qp.3 -usr/share/man/man3/rdma_destroy_srq.3 -usr/share/man/man3/rdma_disconnect.3 -usr/share/man/man3/rdma_establish.3 -usr/share/man/man3/rdma_event_str.3 -usr/share/man/man3/rdma_free_devices.3 -usr/share/man/man3/rdma_get_cm_event.3 -usr/share/man/man3/rdma_get_devices.3 -usr/share/man/man3/rdma_get_dst_port.3 -usr/share/man/man3/rdma_get_local_addr.3 -usr/share/man/man3/rdma_get_peer_addr.3 -usr/share/man/man3/rdma_get_recv_comp.3 -usr/share/man/man3/rdma_get_request.3 -usr/share/man/man3/rdma_get_send_comp.3 -usr/share/man/man3/rdma_get_src_port.3 -usr/share/man/man3/rdma_getaddrinfo.3 -usr/share/man/man3/rdma_init_qp_attr.3 -usr/share/man/man3/rdma_join_multicast.3 -usr/share/man/man3/rdma_join_multicast_ex.3 -usr/share/man/man3/rdma_leave_multicast.3 -usr/share/man/man3/rdma_listen.3 -usr/share/man/man3/rdma_migrate_id.3 -usr/share/man/man3/rdma_notify.3 -usr/share/man/man3/rdma_post_read.3 -usr/share/man/man3/rdma_post_readv.3 -usr/share/man/man3/rdma_post_recv.3 -usr/share/man/man3/rdma_post_recvv.3 -usr/share/man/man3/rdma_post_send.3 -usr/share/man/man3/rdma_post_sendv.3 -usr/share/man/man3/rdma_post_ud_send.3 -usr/share/man/man3/rdma_post_write.3 -usr/share/man/man3/rdma_post_writev.3 -usr/share/man/man3/rdma_reg_msgs.3 -usr/share/man/man3/rdma_reg_read.3 -usr/share/man/man3/rdma_reg_write.3 -usr/share/man/man3/rdma_reject.3 -usr/share/man/man3/rdma_resolve_addr.3 -usr/share/man/man3/rdma_resolve_route.3 -usr/share/man/man3/rdma_set_option.3 +usr/share/man/man3/rdma_*.3 usr/share/man/man7/rdma_cm.7 usr/share/man/man7/rsocket.7 diff --git a/debian/librdmacm1.symbols b/debian/librdmacm1.symbols index 996122f..a2a2e82 100644 --- a/debian/librdmacm1.symbols +++ b/debian/librdmacm1.symbols @@ -3,6 +3,7 @@ librdmacm.so.1 librdmacm1 #MINVER# RDMACM_1.0@RDMACM_1.0 1.0.15 RDMACM_1.1@RDMACM_1.1 16 RDMACM_1.2@RDMACM_1.2 23 + RDMACM_1.3@RDMACM_1.3 31 raccept@RDMACM_1.0 1.0.16 rbind@RDMACM_1.0 1.0.16 rclose@RDMACM_1.0 1.0.16 @@ -31,6 +32,7 @@ librdmacm.so.1 librdmacm1 #MINVER# rdma_get_cm_event@RDMACM_1.0 1.0.15 rdma_get_devices@RDMACM_1.0 1.0.15 rdma_get_dst_port@RDMACM_1.0 
1.0.19 + rdma_get_remote_ece@RDMACM_1.3 31 rdma_get_request@RDMACM_1.0 1.0.15 rdma_get_src_port@RDMACM_1.0 1.0.19 rdma_getaddrinfo@RDMACM_1.0 1.0.15 @@ -42,8 +44,10 @@ librdmacm.so.1 librdmacm1 #MINVER# rdma_migrate_id@RDMACM_1.0 1.0.15 rdma_notify@RDMACM_1.0 1.0.15 rdma_reject@RDMACM_1.0 1.0.15 + rdma_reject_ece@RDMACM_1.3 31 rdma_resolve_addr@RDMACM_1.0 1.0.15 rdma_resolve_route@RDMACM_1.0 1.0.15 + rdma_set_local_ece@RDMACM_1.3 31 rdma_set_option@RDMACM_1.0 1.0.15 rfcntl@RDMACM_1.0 1.0.16 rgetpeername@RDMACM_1.0 1.0.16 diff --git a/ibacm/man/ibacm.8 b/ibacm/man/ibacm.8 index 3b94f1e..c401bae 100644 --- a/ibacm/man/ibacm.8 +++ b/ibacm/man/ibacm.8 @@ -1,5 +1,5 @@ .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md -.TH "ibacm" 1 "2014-06-16" "ibacm" "ibacm" ibacm +.TH "ibacm" 8 "2014-06-16" "ibacm" "ibacm" ibacm .SH NAME ibacm \- address and route resolution services for InfiniBand. .SH SYNOPSIS diff --git a/ibacm/prov/acmp/src/acmp.c b/ibacm/prov/acmp/src/acmp.c index 2fee103..5e981b8 100644 --- a/ibacm/prov/acmp/src/acmp.c +++ b/ibacm/prov/acmp/src/acmp.c @@ -1230,8 +1230,10 @@ acmp_sa_resp(struct acm_sa_mad *mad) req->msg.hdr.opcode |= ACM_OP_ACK; if (!mad->umad.status) { + struct acm_ep_addr_data *resolve_data = req->msg.resolve_data; + req->msg.hdr.status = (uint8_t) (be16toh(sa_mad->status) >> 8); - memcpy(&req->msg.resolve_data[0].info.path, sa_mad->data, + memcpy(&resolve_data->info.path, sa_mad->data, sizeof(struct ibv_path_record)); } else { req->msg.hdr.status = ACM_STATUS_ETIMEDOUT; diff --git a/ibacm/src/acm.c b/ibacm/src/acm.c index 1663c89..f1c8a2f 100644 --- a/ibacm/src/acm.c +++ b/ibacm/src/acm.c @@ -2116,7 +2116,9 @@ __acm_ep_insert_addr(struct acmc_ep *ep, const char *name, uint8_t *addr, } } ep->addr_info[i].addr.type = addr_type; - strncpy(ep->addr_info[i].string_buf, name, ACM_MAX_ADDRESS); + if (!check_snprintf(ep->addr_info[i].string_buf, + sizeof(ep->addr_info[i].string_buf), "%s", name)) + return EINVAL; memcpy(ep->addr_info[i].addr.info.addr, tmp, ACM_MAX_ADDRESS); ret = ep->port->prov->add_address(&ep->addr_info[i].addr, ep->prov_ep_context, diff --git a/ibacm/src/libacm.c b/ibacm/src/libacm.c index d09be36..74adbf4 100644 --- a/ibacm/src/libacm.c +++ b/ibacm/src/libacm.c @@ -40,6 +40,8 @@ #include #include +#include + static pthread_mutex_t acm_lock = PTHREAD_MUTEX_INITIALIZER; static int sock = -1; static short server_port = 6125; @@ -202,7 +204,9 @@ static int acm_format_ep_addr(struct acm_ep_addr_data *data, uint8_t *addr, switch (type) { case ACM_EP_INFO_NAME: - strncpy((char *) data->info.name, (char *) addr, ACM_MAX_ADDRESS); + if (!check_snprintf((char *)data->info.name, + sizeof(data->info.name), "%s", (char *)addr)) + return -1; break; case ACM_EP_INFO_ADDRESS_IP: memcpy(data->info.addr, &((struct sockaddr_in *) addr)->sin_addr, 4); diff --git a/infiniband-diags/ibstat.c b/infiniband-diags/ibstat.c index 47b918f..bdf0f90 100644 --- a/infiniband-diags/ibstat.c +++ b/infiniband-diags/ibstat.c @@ -309,6 +309,9 @@ int main(int argc, char *argv[]) if (!device_list && errno) IBPANIC("can't list IB device names"); + if (umad_sort_ca_device_list(&device_list, 0)) + IBWARN("can't sort list IB device names"); + if (argc) { for (node = device_list; node; node = node->next) if (!strcmp(node->ca_name, argv[0])) diff --git a/kernel-boot/rdma-hw-modules.rules b/kernel-boot/rdma-hw-modules.rules index bee416d..95eaf72 100644 --- a/kernel-boot/rdma-hw-modules.rules +++ b/kernel-boot/rdma-hw-modules.rules @@ -1,11 +1,12 @@ ACTION=="remove", 
GOTO="rdma_hw_modules_end" -SUBSYSTEM!="net", GOTO="rdma_hw_modules_end" +SUBSYSTEM!="net", GOTO="rdma_hw_modules_net_end" +# For Ethernet cards with RoCE support # Automatically load RDMA specific kernel modules when a multi-function device is installed - # These drivers autoload an ethernet driver based on hardware detection and # need userspace to load the module that has their RDMA component to turn on # RDMA. + ENV{ID_NET_DRIVER}=="be2net", RUN{builtin}+="kmod load ocrdma" ENV{ID_NET_DRIVER}=="bnxt_en", RUN{builtin}+="kmod load bnxt_re" ENV{ID_NET_DRIVER}=="cxgb4", RUN{builtin}+="kmod load iw_cxgb4" @@ -18,11 +19,6 @@ ENV{ID_NET_DRIVER}=="qede", RUN{builtin}+="kmod load qedr" # The user must explicitly load these modules via /etc/modules-load.d/ or otherwise # rxe -# When in IB mode the kernel PCI core module autoloads the protocol modules -# for these providers -# mlx4 -# mlx5 - # enic no longer has a userspace verbs driver, this rule should probably be # owned by libfabric ENV{ID_NET_DRIVER}=="enic", RUN{builtin}+="kmod load usnic_verbs" @@ -34,4 +30,17 @@ ENV{ID_NET_DRIVER}=="enic", RUN{builtin}+="kmod load usnic_verbs" # mthca # vmw_pvrdma +LABEL="rdma_hw_modules_net_end" + +SUBSYSTEM!="pci", GOTO="rdma_hw_modules_pci_end" +# For InfiniBand cards +# Normally the request_module inside the driver will trigger this, but in case that fails due to +# missing modules in the initrd, trigger it again. HW that doesn't create a netdevice will not +# trigger the net based rules above. + +ENV{DRIVER}=="mlx4_core", RUN{builtin}+="kmod load mlx4_ib" +ENV{DRIVER}=="mlx5_core", RUN{builtin}+="kmod load mlx5_ib" + +LABEL="rdma_hw_modules_pci_end" + LABEL="rdma_hw_modules_end" diff --git a/kernel-headers/rdma/efa-abi.h b/kernel-headers/rdma/efa-abi.h index 53b6e20..f89fbb5 100644 --- a/kernel-headers/rdma/efa-abi.h +++ b/kernel-headers/rdma/efa-abi.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */ /* - * Copyright 2018-2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2018-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef EFA_ABI_USER_H @@ -20,6 +20,16 @@ * hex bit offset of the field. */ +enum { + EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH = 1 << 0, + EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR = 1 << 1, +}; + +struct efa_ibv_alloc_ucontext_cmd { + __u32 comp_mask; + __u8 reserved_20[4]; +}; + enum efa_ibv_user_cmds_supp_udata { EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE = 1 << 0, EFA_USER_CMDS_SUPP_UDATA_CREATE_AH = 1 << 1, @@ -31,6 +41,9 @@ struct efa_ibv_alloc_ucontext_resp { __u16 sub_cqs_per_cq; __u16 inline_buf_size; __u32 max_llq_size; /* bytes */ + __u16 max_tx_batch; /* units of 64 bytes */ + __u16 min_sq_wr; + __u8 reserved_a0[4]; }; struct efa_ibv_alloc_pd_resp { @@ -92,6 +105,7 @@ struct efa_ibv_create_ah_resp { enum { EFA_QUERY_DEVICE_CAPS_RDMA_READ = 1 << 0, + EFA_QUERY_DEVICE_CAPS_RNR_RETRY = 1 << 1, }; struct efa_ibv_ex_query_device_resp { diff --git a/kernel-headers/rdma/hfi/hfi1_user.h b/kernel-headers/rdma/hfi/hfi1_user.h index 01ac585..d95ef9a 100644 --- a/kernel-headers/rdma/hfi/hfi1_user.h +++ b/kernel-headers/rdma/hfi/hfi1_user.h @@ -6,7 +6,7 @@ * * GPL LICENSE SUMMARY * - * Copyright(c) 2015 - 2018 Intel Corporation. + * Copyright(c) 2015 - 2020 Intel Corporation. 
* * This program is free software; you can redistribute it and/or modify * it under the terms of version 2 of the GNU General Public License as @@ -109,6 +109,7 @@ #define HFI1_CAP_OPFN (1UL << 16) /* Enable the OPFN protocol */ #define HFI1_CAP_SDMA_HEAD_CHECK (1UL << 17) /* SDMA head checking */ #define HFI1_CAP_EARLY_CREDIT_RETURN (1UL << 18) /* early credit return */ +#define HFI1_CAP_AIP (1UL << 19) /* Enable accelerated IP */ #define HFI1_RCVHDR_ENTSIZE_2 (1UL << 0) #define HFI1_RCVHDR_ENTSIZE_16 (1UL << 1) diff --git a/kernel-headers/rdma/hns-abi.h b/kernel-headers/rdma/hns-abi.h index eb76b38..9ec85f7 100644 --- a/kernel-headers/rdma/hns-abi.h +++ b/kernel-headers/rdma/hns-abi.h @@ -39,6 +39,8 @@ struct hns_roce_ib_create_cq { __aligned_u64 buf_addr; __aligned_u64 db_addr; + __u32 cqe_size; + __u32 reserved; }; struct hns_roce_ib_create_cq_resp { @@ -73,7 +75,7 @@ struct hns_roce_ib_create_qp_resp { struct hns_roce_ib_alloc_ucontext_resp { __u32 qp_tab_size; - __u32 reserved; + __u32 cqe_size; }; struct hns_roce_ib_alloc_pd_resp { diff --git a/kernel-headers/rdma/ib_user_ioctl_cmds.h b/kernel-headers/rdma/ib_user_ioctl_cmds.h index d4ddbe4..7968a18 100644 --- a/kernel-headers/rdma/ib_user_ioctl_cmds.h +++ b/kernel-headers/rdma/ib_user_ioctl_cmds.h @@ -69,6 +69,9 @@ enum uverbs_methods_device { UVERBS_METHOD_INFO_HANDLES, UVERBS_METHOD_QUERY_PORT, UVERBS_METHOD_GET_CONTEXT, + UVERBS_METHOD_QUERY_CONTEXT, + UVERBS_METHOD_QUERY_GID_TABLE, + UVERBS_METHOD_QUERY_GID_ENTRY, }; enum uverbs_attrs_invoke_write_cmd_attr_ids { @@ -87,6 +90,11 @@ enum uverbs_attrs_get_context_attr_ids { UVERBS_ATTR_GET_CONTEXT_CORE_SUPPORT, }; +enum uverbs_attrs_query_context_attr_ids { + UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, +}; + enum uverbs_attrs_create_cq_cmd_attr_ids { UVERBS_ATTR_CREATE_CQ_HANDLE, UVERBS_ATTR_CREATE_CQ_CQE, @@ -95,6 +103,7 @@ enum uverbs_attrs_create_cq_cmd_attr_ids { UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, UVERBS_ATTR_CREATE_CQ_FLAGS, UVERBS_ATTR_CREATE_CQ_RESP_CQE, + UVERBS_ATTR_CREATE_CQ_EVENT_FD, }; enum uverbs_attrs_destroy_cq_cmd_attr_ids { @@ -120,11 +129,91 @@ enum uverbs_attrs_destroy_flow_action_esp { UVERBS_ATTR_DESTROY_FLOW_ACTION_HANDLE, }; +enum uverbs_attrs_create_qp_cmd_attr_ids { + UVERBS_ATTR_CREATE_QP_HANDLE, + UVERBS_ATTR_CREATE_QP_XRCD_HANDLE, + UVERBS_ATTR_CREATE_QP_PD_HANDLE, + UVERBS_ATTR_CREATE_QP_SRQ_HANDLE, + UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE, + UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE, + UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE, + UVERBS_ATTR_CREATE_QP_USER_HANDLE, + UVERBS_ATTR_CREATE_QP_CAP, + UVERBS_ATTR_CREATE_QP_TYPE, + UVERBS_ATTR_CREATE_QP_FLAGS, + UVERBS_ATTR_CREATE_QP_SOURCE_QPN, + UVERBS_ATTR_CREATE_QP_EVENT_FD, + UVERBS_ATTR_CREATE_QP_RESP_CAP, + UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, +}; + +enum uverbs_attrs_destroy_qp_cmd_attr_ids { + UVERBS_ATTR_DESTROY_QP_HANDLE, + UVERBS_ATTR_DESTROY_QP_RESP, +}; + +enum uverbs_methods_qp { + UVERBS_METHOD_QP_CREATE, + UVERBS_METHOD_QP_DESTROY, +}; + +enum uverbs_attrs_create_srq_cmd_attr_ids { + UVERBS_ATTR_CREATE_SRQ_HANDLE, + UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, + UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE, + UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE, + UVERBS_ATTR_CREATE_SRQ_USER_HANDLE, + UVERBS_ATTR_CREATE_SRQ_MAX_WR, + UVERBS_ATTR_CREATE_SRQ_MAX_SGE, + UVERBS_ATTR_CREATE_SRQ_LIMIT, + UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS, + UVERBS_ATTR_CREATE_SRQ_TYPE, + UVERBS_ATTR_CREATE_SRQ_EVENT_FD, + UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, + UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, + 
UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, +}; + +enum uverbs_attrs_destroy_srq_cmd_attr_ids { + UVERBS_ATTR_DESTROY_SRQ_HANDLE, + UVERBS_ATTR_DESTROY_SRQ_RESP, +}; + +enum uverbs_methods_srq { + UVERBS_METHOD_SRQ_CREATE, + UVERBS_METHOD_SRQ_DESTROY, +}; + enum uverbs_methods_cq { UVERBS_METHOD_CQ_CREATE, UVERBS_METHOD_CQ_DESTROY, }; +enum uverbs_attrs_create_wq_cmd_attr_ids { + UVERBS_ATTR_CREATE_WQ_HANDLE, + UVERBS_ATTR_CREATE_WQ_PD_HANDLE, + UVERBS_ATTR_CREATE_WQ_CQ_HANDLE, + UVERBS_ATTR_CREATE_WQ_USER_HANDLE, + UVERBS_ATTR_CREATE_WQ_TYPE, + UVERBS_ATTR_CREATE_WQ_EVENT_FD, + UVERBS_ATTR_CREATE_WQ_MAX_WR, + UVERBS_ATTR_CREATE_WQ_MAX_SGE, + UVERBS_ATTR_CREATE_WQ_FLAGS, + UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, + UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, + UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, +}; + +enum uverbs_attrs_destroy_wq_cmd_attr_ids { + UVERBS_ATTR_DESTROY_WQ_HANDLE, + UVERBS_ATTR_DESTROY_WQ_RESP, +}; + +enum uverbs_methods_wq { + UVERBS_METHOD_WQ_CREATE, + UVERBS_METHOD_WQ_DESTROY, +}; + enum uverbs_methods_actions_flow_action_ops { UVERBS_METHOD_FLOW_ACTION_ESP_CREATE, UVERBS_METHOD_FLOW_ACTION_DESTROY, @@ -161,6 +250,7 @@ enum uverbs_methods_mr { UVERBS_METHOD_DM_MR_REG, UVERBS_METHOD_MR_DESTROY, UVERBS_METHOD_ADVISE_MR, + UVERBS_METHOD_QUERY_MR, }; enum uverbs_attrs_mr_destroy_ids { @@ -174,6 +264,14 @@ enum uverbs_attrs_advise_mr_cmd_attr_ids { UVERBS_ATTR_ADVISE_MR_SGE_LIST, }; +enum uverbs_attrs_query_mr_cmd_attr_ids { + UVERBS_ATTR_QUERY_MR_HANDLE, + UVERBS_ATTR_QUERY_MR_RESP_LKEY, + UVERBS_ATTR_QUERY_MR_RESP_RKEY, + UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + UVERBS_ATTR_QUERY_MR_RESP_IOVA, +}; + enum uverbs_attrs_create_counters_cmd_attr_ids { UVERBS_ATTR_CREATE_COUNTERS_HANDLE, }; @@ -256,4 +354,18 @@ enum uverbs_attrs_async_event_create { UVERBS_ATTR_ASYNC_EVENT_ALLOC_FD_HANDLE, }; +enum uverbs_attrs_query_gid_table_cmd_attr_ids { + UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, + UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, + UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES, +}; + +enum uverbs_attrs_query_gid_entry_cmd_attr_ids { + UVERBS_ATTR_QUERY_GID_ENTRY_PORT, + UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX, + UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, + UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, +}; + #endif diff --git a/kernel-headers/rdma/ib_user_ioctl_verbs.h b/kernel-headers/rdma/ib_user_ioctl_verbs.h index a640bb8..2248379 100644 --- a/kernel-headers/rdma/ib_user_ioctl_verbs.h +++ b/kernel-headers/rdma/ib_user_ioctl_verbs.h @@ -64,6 +64,41 @@ enum ib_uverbs_access_flags { ~(IB_UVERBS_ACCESS_OPTIONAL_FIRST - 1) }; +enum ib_uverbs_srq_type { + IB_UVERBS_SRQT_BASIC, + IB_UVERBS_SRQT_XRC, + IB_UVERBS_SRQT_TM, +}; + +enum ib_uverbs_wq_type { + IB_UVERBS_WQT_RQ, +}; + +enum ib_uverbs_wq_flags { + IB_UVERBS_WQ_FLAGS_CVLAN_STRIPPING = 1 << 0, + IB_UVERBS_WQ_FLAGS_SCATTER_FCS = 1 << 1, + IB_UVERBS_WQ_FLAGS_DELAY_DROP = 1 << 2, + IB_UVERBS_WQ_FLAGS_PCI_WRITE_END_PADDING = 1 << 3, +}; + +enum ib_uverbs_qp_type { + IB_UVERBS_QPT_RC = 2, + IB_UVERBS_QPT_UC, + IB_UVERBS_QPT_UD, + IB_UVERBS_QPT_RAW_PACKET = 8, + IB_UVERBS_QPT_XRC_INI, + IB_UVERBS_QPT_XRC_TGT, + IB_UVERBS_QPT_DRIVER = 0xFF, +}; + +enum ib_uverbs_qp_create_flags { + IB_UVERBS_QP_CREATE_BLOCK_MULTICAST_LOOPBACK = 1 << 1, + IB_UVERBS_QP_CREATE_SCATTER_FCS = 1 << 8, + IB_UVERBS_QP_CREATE_CVLAN_STRIPPING = 1 << 9, + IB_UVERBS_QP_CREATE_PCI_WRITE_END_PADDING = 1 << 11, + IB_UVERBS_QP_CREATE_SQ_SIG_ALL = 1 << 12, +}; + enum ib_uverbs_query_port_cap_flags { IB_UVERBS_PCF_SM = 1 << 1, IB_UVERBS_PCF_NOTICE_SUP = 1 << 2, @@ -173,6 +208,7 @@ 
enum ib_uverbs_read_counters_flags { enum ib_uverbs_advise_mr_advice { IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH, IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE, + IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT, }; enum ib_uverbs_advise_mr_flag { @@ -185,6 +221,14 @@ struct ib_uverbs_query_port_resp_ex { __u8 reserved[6]; }; +struct ib_uverbs_qp_cap { + __u32 max_send_wr; + __u32 max_recv_wr; + __u32 max_send_sge; + __u32 max_recv_sge; + __u32 max_inline_data; +}; + enum rdma_driver_id { RDMA_DRIVER_UNKNOWN, RDMA_DRIVER_MLX5, @@ -207,4 +251,18 @@ enum rdma_driver_id { RDMA_DRIVER_SIW, }; +enum ib_uverbs_gid_type { + IB_UVERBS_GID_TYPE_IB, + IB_UVERBS_GID_TYPE_ROCE_V1, + IB_UVERBS_GID_TYPE_ROCE_V2, +}; + +struct ib_uverbs_gid_entry { + __aligned_u64 gid[2]; + __u32 gid_index; + __u32 port_num; + __u32 gid_type; + __u32 netdev_ifindex; /* It is 0 if there is no netdev associated with it */ +}; + #endif diff --git a/kernel-headers/rdma/ib_user_verbs.h b/kernel-headers/rdma/ib_user_verbs.h index 0474c74..456438c 100644 --- a/kernel-headers/rdma/ib_user_verbs.h +++ b/kernel-headers/rdma/ib_user_verbs.h @@ -457,6 +457,17 @@ struct ib_uverbs_poll_cq { __u32 ne; }; +enum ib_uverbs_wc_opcode { + IB_UVERBS_WC_SEND = 0, + IB_UVERBS_WC_RDMA_WRITE = 1, + IB_UVERBS_WC_RDMA_READ = 2, + IB_UVERBS_WC_COMP_SWAP = 3, + IB_UVERBS_WC_FETCH_ADD = 4, + IB_UVERBS_WC_BIND_MW = 5, + IB_UVERBS_WC_LOCAL_INV = 6, + IB_UVERBS_WC_TSO = 7, +}; + struct ib_uverbs_wc { __aligned_u64 wr_id; __u32 status; diff --git a/kernel-headers/rdma/mlx5-abi.h b/kernel-headers/rdma/mlx5-abi.h index df1cc36..27905a0 100644 --- a/kernel-headers/rdma/mlx5-abi.h +++ b/kernel-headers/rdma/mlx5-abi.h @@ -100,6 +100,7 @@ struct mlx5_ib_alloc_ucontext_req_v2 { enum mlx5_ib_alloc_ucontext_resp_mask { MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET = 1UL << 0, MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY = 1UL << 1, + MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE = 1UL << 2, }; enum mlx5_user_cmds_supp_uhw { @@ -322,6 +323,8 @@ struct mlx5_ib_create_qp { __aligned_u64 sq_buf_addr; __aligned_u64 access_key; }; + __u32 ece_options; + __u32 reserved; }; /* RX Hash function flags */ @@ -371,7 +374,7 @@ enum mlx5_ib_create_qp_resp_mask { struct mlx5_ib_create_qp_resp { __u32 bfreg_index; - __u32 reserved; + __u32 ece_options; __u32 comp_mask; __u32 tirn; __u32 tisn; @@ -420,12 +423,14 @@ struct mlx5_ib_burst_info { struct mlx5_ib_modify_qp { __u32 comp_mask; struct mlx5_ib_burst_info burst_info; - __u32 reserved; + __u32 ece_options; }; struct mlx5_ib_modify_qp_resp { __u32 response_length; __u32 dctn; + __u32 ece_options; + __u32 reserved; }; struct mlx5_ib_create_wq_resp { diff --git a/kernel-headers/rdma/mlx5_user_ioctl_cmds.h b/kernel-headers/rdma/mlx5_user_ioctl_cmds.h index 24f3388..e24d66d 100644 --- a/kernel-headers/rdma/mlx5_user_ioctl_cmds.h +++ b/kernel-headers/rdma/mlx5_user_ioctl_cmds.h @@ -228,6 +228,10 @@ enum mlx5_ib_flow_matcher_methods { MLX5_IB_METHOD_FLOW_MATCHER_DESTROY, }; +enum mlx5_ib_device_query_context_attrs { + MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX = (1U << UVERBS_ID_NS_SHIFT), +}; + #define MLX5_IB_DW_MATCH_PARAM 0x80 struct mlx5_ib_match_params { @@ -241,6 +245,11 @@ enum mlx5_ib_flow_type { MLX5_IB_FLOW_TYPE_MC_DEFAULT, }; +enum mlx5_ib_create_flow_flags { + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS = 1 << 0, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP = 1 << 1, +}; + enum mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_HANDLE = (1U << UVERBS_ID_NS_SHIFT), MLX5_IB_ATTR_CREATE_FLOW_MATCH_VALUE, @@ -251,9 +260,10 @@ enum 
mlx5_ib_create_flow_attrs { MLX5_IB_ATTR_CREATE_FLOW_TAG, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX, MLX5_IB_ATTR_CREATE_FLOW_ARR_COUNTERS_DEVX_OFFSET, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS, }; -enum mlx5_ib_destoy_flow_attrs { +enum mlx5_ib_destroy_flow_attrs { MLX5_IB_ATTR_DESTROY_FLOW_HANDLE = (1U << UVERBS_ID_NS_SHIFT), }; @@ -280,4 +290,14 @@ enum mlx5_ib_create_flow_action_create_packet_reformat_attrs { MLX5_IB_ATTR_CREATE_PACKET_REFORMAT_DATA_BUF, }; +enum mlx5_ib_query_pd_attrs { + MLX5_IB_ATTR_QUERY_PD_HANDLE = (1U << UVERBS_ID_NS_SHIFT), + MLX5_IB_ATTR_QUERY_PD_RESP_PDN, +}; + +enum mlx5_ib_pd_methods { + MLX5_IB_METHOD_PD_QUERY = (1U << UVERBS_ID_NS_SHIFT), + +}; + #endif diff --git a/kernel-headers/rdma/qedr-abi.h b/kernel-headers/rdma/qedr-abi.h index a0b83c9..bf7333b 100644 --- a/kernel-headers/rdma/qedr-abi.h +++ b/kernel-headers/rdma/qedr-abi.h @@ -39,8 +39,9 @@ /* user kernel communication data structures. */ enum qedr_alloc_ucontext_flags { - QEDR_ALLOC_UCTX_RESERVED = 1 << 0, - QEDR_ALLOC_UCTX_DB_REC = 1 << 1 + QEDR_ALLOC_UCTX_EDPM_MODE = 1 << 0, + QEDR_ALLOC_UCTX_DB_REC = 1 << 1, + QEDR_SUPPORT_DPM_SIZES = 1 << 2, }; struct qedr_alloc_ucontext_req { @@ -50,13 +51,14 @@ struct qedr_alloc_ucontext_req { #define QEDR_LDPM_MAX_SIZE (8192) #define QEDR_EDPM_TRANS_SIZE (64) +#define QEDR_EDPM_MAX_SIZE (ROCE_REQ_MAX_INLINE_DATA_SIZE) enum qedr_rdma_dpm_type { QEDR_DPM_TYPE_NONE = 0, QEDR_DPM_TYPE_ROCE_ENHANCED = 1 << 0, QEDR_DPM_TYPE_ROCE_LEGACY = 1 << 1, QEDR_DPM_TYPE_IWARP_LEGACY = 1 << 2, - QEDR_DPM_TYPE_RESERVED = 1 << 3, + QEDR_DPM_TYPE_ROCE_EDPM_MODE = 1 << 3, QEDR_DPM_SIZES_SET = 1 << 4, }; @@ -77,6 +79,8 @@ struct qedr_alloc_ucontext_resp { __u16 ldpm_limit_size; __u8 edpm_trans_size; __u8 reserved; + __u16 edpm_limit_size; + __u8 padding[6]; }; struct qedr_alloc_pd_ureq { diff --git a/kernel-headers/rdma/rdma_netlink.h b/kernel-headers/rdma/rdma_netlink.h index 8e27778..d2f5b83 100644 --- a/kernel-headers/rdma/rdma_netlink.h +++ b/kernel-headers/rdma/rdma_netlink.h @@ -287,6 +287,12 @@ enum rdma_nldev_command { RDMA_NLDEV_CMD_STAT_DEL, + RDMA_NLDEV_CMD_RES_QP_GET_RAW, + + RDMA_NLDEV_CMD_RES_CQ_GET_RAW, + + RDMA_NLDEV_CMD_RES_MR_GET_RAW, + RDMA_NLDEV_NUM_OPS }; @@ -525,6 +531,8 @@ enum rdma_nldev_attr { */ RDMA_NLDEV_ATTR_DEV_DIM, /* u8 */ + RDMA_NLDEV_ATTR_RES_RAW, /* binary */ + /* * Always the end */ @@ -561,5 +569,6 @@ enum rdma_nl_counter_mode { */ enum rdma_nl_counter_mask { RDMA_COUNTER_MASK_QP_TYPE = 1, + RDMA_COUNTER_MASK_PID = 1 << 1, }; #endif /* _UAPI_RDMA_NETLINK_H */ diff --git a/kernel-headers/rdma/rdma_user_cm.h b/kernel-headers/rdma/rdma_user_cm.h index e42940a..ed5a514 100644 --- a/kernel-headers/rdma/rdma_user_cm.h +++ b/kernel-headers/rdma/rdma_user_cm.h @@ -164,6 +164,8 @@ struct rdma_ucm_query_route_resp { __u32 num_paths; __u8 port_num; __u8 reserved[3]; + __u32 ibdev_index; + __u32 reserved1; }; struct rdma_ucm_query_addr_resp { @@ -175,6 +177,8 @@ struct rdma_ucm_query_addr_resp { __u16 dst_size; struct __kernel_sockaddr_storage src_addr; struct __kernel_sockaddr_storage dst_addr; + __u32 ibdev_index; + __u32 reserved1; }; struct rdma_ucm_query_path_resp { @@ -206,10 +210,16 @@ struct rdma_ucm_ud_param { __u8 reserved[7]; }; +struct rdma_ucm_ece { + __u32 vendor_id; + __u32 attr_mod; +}; + struct rdma_ucm_connect { struct rdma_ucm_conn_param conn_param; __u32 id; __u32 reserved; + struct rdma_ucm_ece ece; }; struct rdma_ucm_listen { @@ -222,12 +232,14 @@ struct rdma_ucm_accept { struct rdma_ucm_conn_param conn_param; __u32 id; __u32 reserved; + 
struct rdma_ucm_ece ece; }; struct rdma_ucm_reject { __u32 id; __u8 private_data_len; - __u8 reserved[3]; + __u8 reason; + __u8 reserved[2]; __u8 private_data[RDMA_MAX_PRIVATE_DATA]; }; @@ -287,6 +299,7 @@ struct rdma_ucm_event_resp { struct rdma_ucm_ud_param ud; } param; __u32 reserved; + struct rdma_ucm_ece ece; }; /* Option levels */ diff --git a/kernel-headers/rdma/rdma_user_ioctl.h b/kernel-headers/rdma/rdma_user_ioctl.h index d92d272..53c5518 100644 --- a/kernel-headers/rdma/rdma_user_ioctl.h +++ b/kernel-headers/rdma/rdma_user_ioctl.h @@ -43,7 +43,7 @@ /* * General blocks assignments - * It is closed on purpose do not expose it it user space + * It is closed on purpose - do not expose it to user space * #define MAD_CMD_BASE 0x00 * #define HFI1_CMD_BAS 0xE0 */ diff --git a/kernel-headers/rdma/rdma_user_ioctl_cmds.h b/kernel-headers/rdma/rdma_user_ioctl_cmds.h index 7b1ec80..38ab7ac 100644 --- a/kernel-headers/rdma/rdma_user_ioctl_cmds.h +++ b/kernel-headers/rdma/rdma_user_ioctl_cmds.h @@ -36,7 +36,7 @@ #include #include -/* Documentation/ioctl/ioctl-number.rst */ +/* Documentation/userspace-api/ioctl/ioctl-number.rst */ #define RDMA_IOCTL_MAGIC 0x1b #define RDMA_VERBS_IOCTL \ _IOWR(RDMA_IOCTL_MAGIC, 1, struct ib_uverbs_ioctl_hdr) diff --git a/kernel-headers/rdma/rdma_user_rxe.h b/kernel-headers/rdma/rdma_user_rxe.h index aae2e69..d8f2e0e 100644 --- a/kernel-headers/rdma/rdma_user_rxe.h +++ b/kernel-headers/rdma/rdma_user_rxe.h @@ -99,8 +99,8 @@ struct rxe_send_wr { struct ib_mr *mr; __aligned_u64 reserved; }; - __u32 key; - __u32 access; + __u32 key; + __u32 access; } reg; } wr; }; @@ -112,7 +112,7 @@ struct rxe_sge { }; struct mminfo { - __aligned_u64 offset; + __aligned_u64 offset; __u32 size; __u32 pad; }; diff --git a/kernel-headers/update b/kernel-headers/update index 6bdf73b..48a716d 100755 --- a/kernel-headers/update +++ b/kernel-headers/update @@ -151,7 +151,7 @@ def make_commit(args,ntree,kernel_desc): # And now create the commit msg="Update kernel headers\n\n"; - p = textwrap.fill("To commit %s"%(kernel_desc.decode()), + p = textwrap.fill("To commit %s."%(kernel_desc.decode()), width=74) msg = msg + p; msg = msg + "\n\nSigned-off-by: %s\n"%(emaila); diff --git a/libibnetdisc/man/ibnd_discover_fabric.3 b/libibnetdisc/man/ibnd_discover_fabric.3 index 2c09da7..baa4c81 100644 --- a/libibnetdisc/man/ibnd_discover_fabric.3 +++ b/libibnetdisc/man/ibnd_discover_fabric.3 @@ -28,7 +28,7 @@ Indicate that the library should print debug output which shows it's progress through the fabric. .B ibnd_set_max_smps_on_wire() -Set the number of SMP\'s which will be issued on the wire simultaneously. +Set the number of SMP's which will be issued on the wire simultaneously. 
.SH "RETURN VALUE" .B ibnd_discover_fabric() diff --git a/libibumad/CMakeLists.txt b/libibumad/CMakeLists.txt index 9d0a425..074396c 100644 --- a/libibumad/CMakeLists.txt +++ b/libibumad/CMakeLists.txt @@ -10,7 +10,7 @@ publish_headers(infiniband rdma_library(ibumad libibumad.map # See Documentation/versioning.md - 3 3.1.${PACKAGE_VERSION} + 3 3.2.${PACKAGE_VERSION} sysfs.c umad.c umad_str.c diff --git a/libibumad/libibumad.map b/libibumad/libibumad.map index 9f08d29..1d53677 100644 --- a/libibumad/libibumad.map +++ b/libibumad/libibumad.map @@ -45,3 +45,8 @@ IBUMAD_1.1 { umad_free_ca_device_list; umad_get_ca_device_list; } IBUMAD_1.0; + +IBUMAD_1.2 { + global: + umad_sort_ca_device_list; +} IBUMAD_1.1; diff --git a/libibumad/man/umad_addr_dump.3 b/libibumad/man/umad_addr_dump.3 index d082c37..46fb52a 100644 --- a/libibumad/man/umad_addr_dump.3 +++ b/libibumad/man/umad_addr_dump.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_ADDR_DUMP 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_ADDR_DUMP 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_addr_dump \- dump addr structure to stderr .SH "SYNOPSIS" diff --git a/libibumad/man/umad_alloc.3 b/libibumad/man/umad_alloc.3 index b5ef752..5ac8c17 100644 --- a/libibumad/man/umad_alloc.3 +++ b/libibumad/man/umad_alloc.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_ALLOC 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_ALLOC 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_alloc \- allocate memory for umad buffers .SH "SYNOPSIS" diff --git a/libibumad/man/umad_class_str.3 b/libibumad/man/umad_class_str.3 index 9adb0fd..1e74470 100644 --- a/libibumad/man/umad_class_str.3 +++ b/libibumad/man/umad_class_str.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_CLASS_STR 3 "Feb 15, 2013" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_CLASS_STR 3 "Feb 15, 2013" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_*_str \- class of functions to return string representations of enums diff --git a/libibumad/man/umad_close_port.3 b/libibumad/man/umad_close_port.3 index 341c2d2..5bb0b50 100644 --- a/libibumad/man/umad_close_port.3 +++ b/libibumad/man/umad_close_port.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_OPEN_PORT 3 "May 11, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_OPEN_PORT 3 "May 11, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_close_port \- close InfiniBand device port for umad access .SH "SYNOPSIS" diff --git a/libibumad/man/umad_debug.3 b/libibumad/man/umad_debug.3 index 224d5c0..1816f68 100644 --- a/libibumad/man/umad_debug.3 +++ b/libibumad/man/umad_debug.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_DEBUG 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_DEBUG 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_debug \- set debug level .SH "SYNOPSIS" diff --git a/libibumad/man/umad_dump.3 b/libibumad/man/umad_dump.3 index c01d51b..5e3d743 100644 --- a/libibumad/man/umad_dump.3 +++ b/libibumad/man/umad_dump.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD 
Variant) - See COPYING.md .\" -.TH UMAD_DUMP 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_DUMP 3 "May 17, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_dump \- dump umad buffer to stderr .SH "SYNOPSIS" diff --git a/libibumad/man/umad_free.3 b/libibumad/man/umad_free.3 index e347317..fd7c176 100644 --- a/libibumad/man/umad_free.3 +++ b/libibumad/man/umad_free.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_FREE 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_FREE 3 "May 17, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_free \- frees memory of umad buffers .SH "SYNOPSIS" diff --git a/libibumad/man/umad_get_ca.3 b/libibumad/man/umad_get_ca.3 index 760b6b6..8bdfed9 100644 --- a/libibumad/man/umad_get_ca.3 +++ b/libibumad/man/umad_get_ca.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_GET_CA 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_GET_CA 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_get_ca, umad_release_ca \- get and release InfiniBand device port attributes .SH "SYNOPSIS" diff --git a/libibumad/man/umad_get_ca_portguids.3 b/libibumad/man/umad_get_ca_portguids.3 index c2a5592..160e988 100644 --- a/libibumad/man/umad_get_ca_portguids.3 +++ b/libibumad/man/umad_get_ca_portguids.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_GET_CA_PORTGUIDS 3 "August 8, 2016" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_GET_CA_PORTGUIDS 3 "August 8, 2016" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_get_ca_portguids \- get the InfiniBand device ports GUIDs .SH "SYNOPSIS" diff --git a/libibumad/man/umad_get_cas_names.3 b/libibumad/man/umad_get_cas_names.3 index 0366c16..2939620 100644 --- a/libibumad/man/umad_get_cas_names.3 +++ b/libibumad/man/umad_get_cas_names.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_GET_CAS_NAMES 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_GET_CAS_NAMES 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_get_cas_names \- get list of available InfiniBand device names .SH "SYNOPSIS" diff --git a/libibumad/man/umad_get_fd.3 b/libibumad/man/umad_get_fd.3 index 5fe5311..c4c11f3 100644 --- a/libibumad/man/umad_get_fd.3 +++ b/libibumad/man/umad_get_fd.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_GET_FD 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_GET_FD 3 "May 17, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_get_fd \- get the umad fd for the requested port .SH "SYNOPSIS" diff --git a/libibumad/man/umad_get_issm_path.3 b/libibumad/man/umad_get_issm_path.3 index 4abef18..054e45d 100644 --- a/libibumad/man/umad_get_issm_path.3 +++ b/libibumad/man/umad_get_issm_path.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_GET_ISSM_PATH 3 "Oct 18, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_GET_ISSM_PATH 3 "Oct 18, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_get_issm_path \- get path of issm device .SH "SYNOPSIS" @@ -27,7 +27,7 @@ Opening issm device sets PortInfo:CapMask IsSM bit and 
closing clears it. .SH "RETURN VALUE" .B umad_open_port() returns 0 on success and a negative value on error as follows: - -ENODEV IB device can\'t be resolved + -ENODEV IB device can't be resolved -EINVAL port is not valid (bad .I portnum\fR or no umad device) diff --git a/libibumad/man/umad_get_mad.3 b/libibumad/man/umad_get_mad.3 index ac56c48..6bb6102 100644 --- a/libibumad/man/umad_get_mad.3 +++ b/libibumad/man/umad_get_mad.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_GET_MAD 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_GET_MAD 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_get_mad \- get the MAD pointer of a umad buffer .SH "SYNOPSIS" diff --git a/libibumad/man/umad_get_mad_addr.3 b/libibumad/man/umad_get_mad_addr.3 index 4a92b7b..5a26394 100644 --- a/libibumad/man/umad_get_mad_addr.3 +++ b/libibumad/man/umad_get_mad_addr.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_GET_MAD_ADDR 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_GET_MAD_ADDR 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_get_mad_addr \- get the address of the ib_mad_addr from a umad buffer .SH "SYNOPSIS" diff --git a/libibumad/man/umad_get_pkey.3 b/libibumad/man/umad_get_pkey.3 index b9dd1be..eba3022 100644 --- a/libibumad/man/umad_get_pkey.3 +++ b/libibumad/man/umad_get_pkey.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_GET_PKEY 3 "Jan 15, 2008" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_GET_PKEY 3 "Jan 15, 2008" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_get_pkey \- get pkey index from umad buffer .SH "SYNOPSIS" diff --git a/libibumad/man/umad_get_port.3 b/libibumad/man/umad_get_port.3 index 44dbfb0..d4b54e8 100644 --- a/libibumad/man/umad_get_port.3 +++ b/libibumad/man/umad_get_port.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_GET_PORT 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_GET_PORT 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_get_port, umad_release_port \- open and close an InfiniBand port .SH "SYNOPSIS" diff --git a/libibumad/man/umad_open_port.3 b/libibumad/man/umad_open_port.3 index bd7026b..b9b6090 100644 --- a/libibumad/man/umad_open_port.3 +++ b/libibumad/man/umad_open_port.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_OPEN_PORT 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_OPEN_PORT 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_open_port \- open InfiniBand device port for umad access .SH "SYNOPSIS" @@ -24,12 +24,12 @@ for details). 
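The RETURN VALUE convention documented just below (zero or a positive descriptor on success, a negative errno-style code on failure) is easy to get wrong. A minimal sketch of checking it, assuming only the documented behavior that a NULL CA name and port 0 select the first available device and port:

```c
#include <stdio.h>
#include <infiniband/umad.h>

int main(void)
{
	/* NULL CA name and port 0 pick the first available device/port. */
	int portid = umad_open_port(NULL, 0);

	if (portid < 0) {
		/* portid is one of the negative codes listed below. */
		fprintf(stderr, "umad_open_port failed: %d\n", portid);
		return 1;
	}
	umad_close_port(portid);
	return 0;
}
```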
.SH "RETURN VALUE" .B umad_open_port() returns 0 or an unique positive value of umad device descriptor on success, and a negative value on error as follows: - -EOPNOTSUPP ABI version doesn\'t match - -ENODEV IB device can\'t be resolved + -EOPNOTSUPP ABI version doesn't match + -ENODEV IB device can't be resolved -EINVAL port is not valid (bad .I portnum\fR or no umad device) - -EIO umad device for this port can\'t be opened + -EIO umad device for this port can't be opened .SH "SEE ALSO" .BR umad_close_port (3), .BR umad_get_cas_names (3), diff --git a/libibumad/man/umad_poll.3 b/libibumad/man/umad_poll.3 index 57b7a65..43e71a3 100644 --- a/libibumad/man/umad_poll.3 +++ b/libibumad/man/umad_poll.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_POLL 3 "October 23, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_POLL 3 "October 23, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_poll \- poll umad .SH "SYNOPSIS" diff --git a/libibumad/man/umad_recv.3 b/libibumad/man/umad_recv.3 index 93eec99..feed890 100644 --- a/libibumad/man/umad_recv.3 +++ b/libibumad/man/umad_recv.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_RECV 3 "May 11, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_RECV 3 "May 11, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_recv \- receive umad .SH "SYNOPSIS" diff --git a/libibumad/man/umad_register.3 b/libibumad/man/umad_register.3 index 58b88f3..89fed06 100644 --- a/libibumad/man/umad_register.3 +++ b/libibumad/man/umad_register.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_REGISTER 3 "May 11, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_REGISTER 3 "May 11, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_register \- register the specified management class and version for port .SH "SYNOPSIS" diff --git a/libibumad/man/umad_register2.3 b/libibumad/man/umad_register2.3 index 74e8794..8a9c818 100644 --- a/libibumad/man/umad_register2.3 +++ b/libibumad/man/umad_register2.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_REGISTER2 3 "March 25, 2014" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_REGISTER2 3 "March 25, 2014" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_register2 \- register the specified management class and version for port .SH "SYNOPSIS" diff --git a/libibumad/man/umad_register_oui.3 b/libibumad/man/umad_register_oui.3 index 19430a9..06f682c 100644 --- a/libibumad/man/umad_register_oui.3 +++ b/libibumad/man/umad_register_oui.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_REGISTER_OUI 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_REGISTER_OUI 3 "May 17, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_register_oui \- register the specified class in vendor range 2 for port .SH "SYNOPSIS" diff --git a/libibumad/man/umad_send.3 b/libibumad/man/umad_send.3 index 59af2cb..ac4c33f 100644 --- a/libibumad/man/umad_send.3 +++ b/libibumad/man/umad_send.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_SEND 3 "May 11, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_SEND 3 "May 11, 
2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_send \- send umad .SH "SYNOPSIS" diff --git a/libibumad/man/umad_set_addr.3 b/libibumad/man/umad_set_addr.3 index 03ac862..7074e88 100644 --- a/libibumad/man/umad_set_addr.3 +++ b/libibumad/man/umad_set_addr.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_SET_ADDR 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_SET_ADDR 3 "May 17, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_set_addr \- set MAD address fields within umad buffer using host ordering .SH "SYNOPSIS" diff --git a/libibumad/man/umad_set_addr_net.3 b/libibumad/man/umad_set_addr_net.3 index b395252..d55d402 100644 --- a/libibumad/man/umad_set_addr_net.3 +++ b/libibumad/man/umad_set_addr_net.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_SET_ADDR_NET 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_SET_ADDR_NET 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_set_addr_net \- set MAD address fields within umad buffer using network ordering .SH "SYNOPSIS" diff --git a/libibumad/man/umad_set_grh.3 b/libibumad/man/umad_set_grh.3 index 4ff52ec..b995c84 100644 --- a/libibumad/man/umad_set_grh.3 +++ b/libibumad/man/umad_set_grh.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_SET_GRH 3 "May 24, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_SET_GRH 3 "May 24, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_set_grh \- set GRH fields within umad buffer using host ordering .SH "SYNOPSIS" diff --git a/libibumad/man/umad_set_grh_net.3 b/libibumad/man/umad_set_grh_net.3 index 802b575..d7eb1de 100644 --- a/libibumad/man/umad_set_grh_net.3 +++ b/libibumad/man/umad_set_grh_net.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_SET_GRH_NET 3 "May 24, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_SET_GRH_NET 3 "May 24, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_set_grh_net \- set GRH fields within umad buffer using network ordering .SH "SYNOPSIS" diff --git a/libibumad/man/umad_set_pkey.3 b/libibumad/man/umad_set_pkey.3 index 23e858e..217d678 100644 --- a/libibumad/man/umad_set_pkey.3 +++ b/libibumad/man/umad_set_pkey.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_SET_PKEY 3 "June 20, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_SET_PKEY 3 "June 20, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_set_pkey \- set pkey index within umad buffer .SH "SYNOPSIS" diff --git a/libibumad/man/umad_size.3 b/libibumad/man/umad_size.3 index 74737cc..4c1a46f 100644 --- a/libibumad/man/umad_size.3 +++ b/libibumad/man/umad_size.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_SIZE 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_SIZE 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_size \- get the size of umad buffer .SH "SYNOPSIS" diff --git a/libibumad/man/umad_sort_ca_device_list.3.md b/libibumad/man/umad_sort_ca_device_list.3.md new file mode 100644 index 0000000..fbad4c5 --- /dev/null +++ b/libibumad/man/umad_sort_ca_device_list.3.md @@ 
-0,0 +1,57 @@ + +--- +date: "April 23, 2020" +footer: "OpenIB" +header: "OpenIB Programmer's Manual" +layout: page +license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md' +section: 3 +title: UMAD_SORT_CA_DEVICE_LIST +--- + +# NAME + +umad_sort_ca_device_list - sort a list of InfiniBand device names in +alphabetical order. + +# SYNOPSIS + +```c +#include <infiniband/umad.h> + +int umad_sort_ca_device_list(struct umad_device_node **head, size_t size); +``` + +# DESCRIPTION + +**umad_sort_ca_device_list(struct umad_device_node **head, size_t size)** +sorts the list of *struct umad_device_node* entries by IB device (CA) name, +in alphabetical order. +If the *size* input parameter is zero, the function calculates the +size of the list itself. + +*struct umad_device_node* is defined as follows: + +```c +struct umad_device_node { + struct umad_device_node *next; + const char *ca_name; +}; +``` + +# RETURN VALUE + +**umad_sort_ca_device_list(struct umad_device_node **head, size_t size)** +returns zero if sorting succeeded. +The sorted list is returned through the *head* output parameter. +On error, a non-zero value is returned. +*errno* is not set. + +# SEE ALSO + +**umad_get_ca_device_list**, **umad_free_ca_device_list** + +# AUTHORS + +Haim Boozaglo diff --git a/libibumad/man/umad_status.3 b/libibumad/man/umad_status.3 index fd5430a..8bae757 100644 --- a/libibumad/man/umad_status.3 +++ b/libibumad/man/umad_status.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_STATUS 3 "May 17, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_STATUS 3 "May 17, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_status \- get the status of a umad buffer .SH "SYNOPSIS" diff --git a/libibumad/man/umad_unregister.3 b/libibumad/man/umad_unregister.3 index 785d22d..a6433f3 100644 --- a/libibumad/man/umad_unregister.3 +++ b/libibumad/man/umad_unregister.3 @@ -1,7 +1,7 @@ .\" -*- nroff -*- .\" Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md .\" -.TH UMAD_UNREGISTER 3 "May 21, 2007" "OpenIB" "OpenIB Programmer\'s Manual" +.TH UMAD_UNREGISTER 3 "May 21, 2007" "OpenIB" "OpenIB Programmer's Manual" .SH "NAME" umad_unregister \- unregister umad agent .SH "SYNOPSIS" diff --git a/libibumad/umad.c b/libibumad/umad.c index fc242a6..90b8c0c 100644 --- a/libibumad/umad.c +++ b/libibumad/umad.c @@ -551,6 +551,13 @@ static int dev_to_umad_id(const char *dev, unsigned port) return -1; /* not found */ } +static int umad_ca_device_list_compare_function(const void *node_a, + const void *node_b) +{ + return strcmp((*((const struct umad_device_node **)node_a))->ca_name, + (*((const struct umad_device_node **)node_b))->ca_name); +} + /******************************* * Public interface */ @@ -1207,6 +1214,53 @@ void umad_dump(void *umad) umad_addr_dump(&mad->addr); } +int umad_sort_ca_device_list(struct umad_device_node **head, size_t size) +{ + int errsv = 0; + size_t i; + struct umad_device_node *node; + struct umad_device_node **nodes_array = NULL; + + if (!size) + for (node = *head; node; node = node->next) + size++; + if (size < 2) + return 0; + + nodes_array = calloc(size, sizeof(struct umad_device_node *)); + if (!nodes_array) { + errsv = ENOMEM; + goto exit; + } + + node = *head; + for (i = 0; i < size; i++) { + if (!node) { + errsv = EINVAL; + goto exit; + } + nodes_array[i] = node; + node = node->next; + } + if (node) { + errsv
= EINVAL; + goto exit; + } + + qsort(nodes_array, size, sizeof(struct umad_device_node *), + umad_ca_device_list_compare_function); + + for (i = 0; i < size - 1; i++) + nodes_array[i]->next = nodes_array[i + 1]; + + *head = nodes_array[0]; + nodes_array[size - 1]->next = NULL; +exit: + free(nodes_array); + + return errsv; +} + struct umad_device_node *umad_get_ca_device_list(void) { DIR *dir; diff --git a/libibumad/umad.h b/libibumad/umad.h index ee2af2f..f1fb2d0 100644 --- a/libibumad/umad.h +++ b/libibumad/umad.h @@ -213,6 +213,7 @@ int umad_register(int portid, int mgmt_class, int mgmt_version, int umad_register_oui(int portid, int mgmt_class, uint8_t rmpp_version, uint8_t oui[3], long method_mask[16 / sizeof(long)]); int umad_unregister(int portid, int agentid); +int umad_sort_ca_device_list(struct umad_device_node **head, size_t size); struct umad_device_node *umad_get_ca_device_list(void); void umad_free_ca_device_list(struct umad_device_node *head); diff --git a/libibverbs/CMakeLists.txt b/libibverbs/CMakeLists.txt index 4328548..0fe4256 100644 --- a/libibverbs/CMakeLists.txt +++ b/libibverbs/CMakeLists.txt @@ -21,7 +21,7 @@ configure_file("libibverbs.map.in" rdma_library(ibverbs "${CMAKE_CURRENT_BINARY_DIR}/libibverbs.map" # See Documentation/versioning.md - 1 1.8.${PACKAGE_VERSION} + 1 1.11.${PACKAGE_VERSION} all_providers.c cmd.c cmd_ah.c @@ -36,7 +36,10 @@ rdma_library(ibverbs "${CMAKE_CURRENT_BINARY_DIR}/libibverbs.map" cmd_mr.c cmd_mw.c cmd_pd.c + cmd_qp.c cmd_rwq_ind.c + cmd_srq.c + cmd_wq.c cmd_xrcd.c compat-1_0.c device.c diff --git a/libibverbs/cmd.c b/libibverbs/cmd.c index 728d884..25c8a97 100644 --- a/libibverbs/cmd.c +++ b/libibverbs/cmd.c @@ -344,6 +344,7 @@ int ibv_cmd_reg_mr(struct ibv_pd *pd, void *addr, size_t length, vmr->ibv_mr.rkey = resp->rkey; vmr->ibv_mr.context = pd->context; vmr->mr_type = IBV_MR_TYPE_MR; + vmr->access = access; return 0; } @@ -480,127 +481,6 @@ int ibv_cmd_resize_cq(struct ibv_cq *cq, int cqe, return 0; } -int ibv_cmd_create_srq(struct ibv_pd *pd, - struct ibv_srq *srq, struct ibv_srq_init_attr *attr, - struct ibv_create_srq *cmd, size_t cmd_size, - struct ib_uverbs_create_srq_resp *resp, size_t resp_size) -{ - int ret; - - cmd->user_handle = (uintptr_t) srq; - cmd->pd_handle = pd->handle; - cmd->max_wr = attr->attr.max_wr; - cmd->max_sge = attr->attr.max_sge; - cmd->srq_limit = attr->attr.srq_limit; - - ret = execute_cmd_write(pd->context, IB_USER_VERBS_CMD_CREATE_SRQ, cmd, - cmd_size, resp, resp_size); - if (ret) - return ret; - - srq->handle = resp->srq_handle; - srq->context = pd->context; - - if (abi_ver > 5) { - attr->attr.max_wr = resp->max_wr; - attr->attr.max_sge = resp->max_sge; - } else { - struct ibv_create_srq_resp_v5 *resp_v5 = - (struct ibv_create_srq_resp_v5 *) resp; - - memmove((void *) resp + sizeof *resp, - (void *) resp_v5 + sizeof *resp_v5, - resp_size - sizeof *resp); - } - - return 0; -} - -int ibv_cmd_create_srq_ex(struct ibv_context *context, - struct verbs_srq *srq, int vsrq_sz, - struct ibv_srq_init_attr_ex *attr_ex, - struct ibv_create_xsrq *cmd, size_t cmd_size, - struct ib_uverbs_create_srq_resp *resp, size_t resp_size) -{ - struct verbs_xrcd *vxrcd = NULL; - int ret; - - if (attr_ex->comp_mask >= IBV_SRQ_INIT_ATTR_RESERVED) - return EOPNOTSUPP; - - if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_PD)) - return EINVAL; - - cmd->user_handle = (uintptr_t) srq; - cmd->pd_handle = attr_ex->pd->handle; - cmd->max_wr = attr_ex->attr.max_wr; - cmd->max_sge = attr_ex->attr.max_sge; - cmd->srq_limit = 
attr_ex->attr.srq_limit; - - cmd->srq_type = (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ? - attr_ex->srq_type : IBV_SRQT_BASIC; - if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) { - if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ)) - return EINVAL; - - vxrcd = container_of(attr_ex->xrcd, struct verbs_xrcd, xrcd); - cmd->xrcd_handle = vxrcd->handle; - cmd->cq_handle = attr_ex->cq->handle; - } else if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TM) { - if (cmd->srq_type != IBV_SRQT_TM) - return EINVAL; - if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || - !attr_ex->tm_cap.max_num_tags) - return EINVAL; - - cmd->cq_handle = attr_ex->cq->handle; - cmd->max_num_tags = attr_ex->tm_cap.max_num_tags; - } else if (cmd->srq_type != IBV_SRQT_BASIC) { - return EINVAL; - } - - ret = execute_cmd_write(context, IB_USER_VERBS_CMD_CREATE_XSRQ, cmd, - cmd_size, resp, resp_size); - if (ret) - return ret; - - srq->srq.handle = resp->srq_handle; - srq->srq.context = context; - srq->srq.srq_context = attr_ex->srq_context; - srq->srq.pd = attr_ex->pd; - srq->srq.events_completed = 0; - pthread_mutex_init(&srq->srq.mutex, NULL); - pthread_cond_init(&srq->srq.cond, NULL); - - /* - * check that the last field is available. - * If it is than all the others exist as well - */ - if (vext_field_avail(struct verbs_srq, srq_num, vsrq_sz)) { - srq->comp_mask = IBV_SRQ_INIT_ATTR_TYPE; - srq->srq_type = (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ? - attr_ex->srq_type : IBV_SRQT_BASIC; - if (srq->srq_type == IBV_SRQT_XRC) { - srq->comp_mask |= VERBS_SRQ_NUM; - srq->srq_num = resp->srqn; - } - if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) { - srq->comp_mask |= VERBS_SRQ_XRCD; - srq->xrcd = vxrcd; - } - if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ) { - srq->comp_mask |= VERBS_SRQ_CQ; - srq->cq = attr_ex->cq; - } - } - - attr_ex->attr.max_wr = resp->max_wr; - attr_ex->attr.max_sge = resp->max_sge; - - return 0; -} - - static int ibv_cmd_modify_srq_v3(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, int srq_attr_mask, @@ -664,113 +544,6 @@ int ibv_cmd_query_srq(struct ibv_srq *srq, struct ibv_srq_attr *srq_attr, return 0; } -int ibv_cmd_destroy_srq(struct ibv_srq *srq) -{ - struct ibv_destroy_srq req; - struct ib_uverbs_destroy_srq_resp resp; - int ret; - - req.core_payload = (struct ib_uverbs_destroy_srq){ - .srq_handle = srq->handle, - }; - - ret = execute_cmd_write(srq->context, IB_USER_VERBS_CMD_DESTROY_SRQ, - &req, sizeof(req), &resp, sizeof(resp)); - if (verbs_is_destroy_err(&ret)) - return ret; - - pthread_mutex_lock(&srq->mutex); - while (srq->events_completed != resp.events_reported) - pthread_cond_wait(&srq->cond, &srq->mutex); - pthread_mutex_unlock(&srq->mutex); - - return 0; -} - -static int create_qp_ex_common(struct verbs_qp *qp, - struct ibv_qp_init_attr_ex *qp_attr, - struct verbs_xrcd *vxrcd, - struct ib_uverbs_create_qp *cmd) -{ - cmd->user_handle = (uintptr_t)qp; - - if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_XRCD) { - vxrcd = container_of(qp_attr->xrcd, struct verbs_xrcd, xrcd); - cmd->pd_handle = vxrcd->handle; - } else { - if (!(qp_attr->comp_mask & IBV_QP_INIT_ATTR_PD)) - return EINVAL; - - cmd->pd_handle = qp_attr->pd->handle; - if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_IND_TABLE) { - if (cmd->max_recv_wr || cmd->max_recv_sge || - cmd->recv_cq_handle || qp_attr->srq) - return EINVAL; - - /* send_cq is optinal */ - if (qp_attr->cap.max_send_wr) - cmd->send_cq_handle = qp_attr->send_cq->handle; - } else { - cmd->send_cq_handle = qp_attr->send_cq->handle; - - if (qp_attr->qp_type != 
IBV_QPT_XRC_SEND) { - cmd->recv_cq_handle = qp_attr->recv_cq->handle; - cmd->srq_handle = qp_attr->srq ? qp_attr->srq->handle : - 0; - } - } - } - - cmd->max_send_wr = qp_attr->cap.max_send_wr; - cmd->max_recv_wr = qp_attr->cap.max_recv_wr; - cmd->max_send_sge = qp_attr->cap.max_send_sge; - cmd->max_recv_sge = qp_attr->cap.max_recv_sge; - cmd->max_inline_data = qp_attr->cap.max_inline_data; - cmd->sq_sig_all = qp_attr->sq_sig_all; - cmd->qp_type = qp_attr->qp_type; - cmd->is_srq = !!qp_attr->srq; - cmd->reserved = 0; - - return 0; -} - -static void create_qp_handle_resp_common(struct ibv_context *context, - struct verbs_qp *qp, - struct ibv_qp_init_attr_ex *qp_attr, - struct ib_uverbs_create_qp_resp *resp, - struct verbs_xrcd *vxrcd, - int vqp_sz) -{ - if (abi_ver > 3) { - qp_attr->cap.max_recv_sge = resp->max_recv_sge; - qp_attr->cap.max_send_sge = resp->max_send_sge; - qp_attr->cap.max_recv_wr = resp->max_recv_wr; - qp_attr->cap.max_send_wr = resp->max_send_wr; - qp_attr->cap.max_inline_data = resp->max_inline_data; - } - - qp->qp.handle = resp->qp_handle; - qp->qp.qp_num = resp->qpn; - qp->qp.context = context; - qp->qp.qp_context = qp_attr->qp_context; - qp->qp.pd = qp_attr->pd; - qp->qp.send_cq = qp_attr->send_cq; - qp->qp.recv_cq = qp_attr->recv_cq; - qp->qp.srq = qp_attr->srq; - qp->qp.qp_type = qp_attr->qp_type; - qp->qp.state = IBV_QPS_RESET; - qp->qp.events_completed = 0; - pthread_mutex_init(&qp->qp.mutex, NULL); - pthread_cond_init(&qp->qp.cond, NULL); - - qp->comp_mask = 0; - if (vext_field_avail(struct verbs_qp, xrcd, vqp_sz) && - (qp_attr->comp_mask & IBV_QP_INIT_ATTR_XRCD)) { - qp->comp_mask |= VERBS_QP_XRCD; - qp->xrcd = vxrcd; - } -} - enum { CREATE_QP_EX2_SUP_CREATE_FLAGS = IBV_QP_CREATE_BLOCK_SELF_MCAST_LB | IBV_QP_CREATE_SCATTER_FCS | @@ -779,163 +552,6 @@ enum { IBV_QP_CREATE_PCI_WRITE_END_PADDING, }; -int ibv_cmd_create_qp_ex2(struct ibv_context *context, - struct verbs_qp *qp, int vqp_sz, - struct ibv_qp_init_attr_ex *qp_attr, - struct ibv_create_qp_ex *cmd, - size_t cmd_size, - struct ib_uverbs_ex_create_qp_resp *resp, - size_t resp_size) -{ - struct verbs_xrcd *vxrcd = NULL; - int err; - - if (!check_comp_mask(qp_attr->comp_mask, - IBV_QP_INIT_ATTR_PD | - IBV_QP_INIT_ATTR_XRCD | - IBV_QP_INIT_ATTR_CREATE_FLAGS | - IBV_QP_INIT_ATTR_MAX_TSO_HEADER | - IBV_QP_INIT_ATTR_IND_TABLE | - IBV_QP_INIT_ATTR_RX_HASH | - IBV_QP_INIT_ATTR_SEND_OPS_FLAGS)) - return EINVAL; - - memset(&cmd->core_payload, 0, sizeof(cmd->core_payload)); - - err = create_qp_ex_common(qp, qp_attr, vxrcd, - ibv_create_qp_ex_to_reg(cmd)); - if (err) - return err; - - if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_CREATE_FLAGS) { - if (qp_attr->create_flags & ~CREATE_QP_EX2_SUP_CREATE_FLAGS) - return EINVAL; - cmd->create_flags = qp_attr->create_flags; - - if (qp_attr->create_flags & IBV_QP_CREATE_SOURCE_QPN) - cmd->source_qpn = qp_attr->source_qpn; - } - - if (qp_attr->comp_mask & IBV_QP_INIT_ATTR_IND_TABLE) { - cmd->rwq_ind_tbl_handle = qp_attr->rwq_ind_tbl->ind_tbl_handle; - cmd->comp_mask = IB_UVERBS_CREATE_QP_MASK_IND_TABLE; - } - - err = execute_cmd_write_ex(context, IB_USER_VERBS_EX_CMD_CREATE_QP, - cmd, cmd_size, resp, resp_size); - if (err) - return err; - - create_qp_handle_resp_common(context, qp, qp_attr, &resp->base, vxrcd, - vqp_sz); - - return 0; -} - -int ibv_cmd_create_qp_ex(struct ibv_context *context, - struct verbs_qp *qp, int vqp_sz, - struct ibv_qp_init_attr_ex *attr_ex, - struct ibv_create_qp *cmd, size_t cmd_size, - struct ib_uverbs_create_qp_resp *resp, size_t resp_size) -{ - struct 
verbs_xrcd *vxrcd = NULL; - int err; - - if (!check_comp_mask(attr_ex->comp_mask, - IBV_QP_INIT_ATTR_PD | - IBV_QP_INIT_ATTR_XRCD | - IBV_QP_INIT_ATTR_SEND_OPS_FLAGS)) - return EOPNOTSUPP; - - err = create_qp_ex_common(qp, attr_ex, vxrcd, - &cmd->core_payload); - if (err) - return err; - - err = execute_cmd_write(context, IB_USER_VERBS_CMD_CREATE_QP, cmd, - cmd_size, resp, resp_size); - if (err) - return err; - - if (abi_ver == 4) { - struct ibv_create_qp_resp_v4 *resp_v4 = - (struct ibv_create_qp_resp_v4 *)resp; - - memmove((void *)resp + sizeof *resp, - (void *)resp_v4 + sizeof *resp_v4, - resp_size - sizeof *resp); - } else if (abi_ver <= 3) { - struct ibv_create_qp_resp_v3 *resp_v3 = - (struct ibv_create_qp_resp_v3 *)resp; - - memmove((void *)resp + sizeof *resp, - (void *)resp_v3 + sizeof *resp_v3, - resp_size - sizeof *resp); - } - - create_qp_handle_resp_common(context, qp, attr_ex, resp, vxrcd, vqp_sz); - - return 0; -} - -int ibv_cmd_create_qp(struct ibv_pd *pd, - struct ibv_qp *qp, struct ibv_qp_init_attr *attr, - struct ibv_create_qp *cmd, size_t cmd_size, - struct ib_uverbs_create_qp_resp *resp, size_t resp_size) -{ - int ret; - - cmd->user_handle = (uintptr_t) qp; - cmd->pd_handle = pd->handle; - cmd->send_cq_handle = attr->send_cq->handle; - cmd->recv_cq_handle = attr->recv_cq->handle; - cmd->srq_handle = attr->srq ? attr->srq->handle : 0; - cmd->max_send_wr = attr->cap.max_send_wr; - cmd->max_recv_wr = attr->cap.max_recv_wr; - cmd->max_send_sge = attr->cap.max_send_sge; - cmd->max_recv_sge = attr->cap.max_recv_sge; - cmd->max_inline_data = attr->cap.max_inline_data; - cmd->sq_sig_all = attr->sq_sig_all; - cmd->qp_type = attr->qp_type; - cmd->is_srq = !!attr->srq; - cmd->reserved = 0; - - ret = execute_cmd_write(pd->context, IB_USER_VERBS_CMD_CREATE_QP, cmd, - cmd_size, resp, resp_size); - if (ret) - return ret; - - qp->handle = resp->qp_handle; - qp->qp_num = resp->qpn; - qp->context = pd->context; - - if (abi_ver > 3) { - attr->cap.max_recv_sge = resp->max_recv_sge; - attr->cap.max_send_sge = resp->max_send_sge; - attr->cap.max_recv_wr = resp->max_recv_wr; - attr->cap.max_send_wr = resp->max_send_wr; - attr->cap.max_inline_data = resp->max_inline_data; - } - - if (abi_ver == 4) { - struct ibv_create_qp_resp_v4 *resp_v4 = - (struct ibv_create_qp_resp_v4 *) resp; - - memmove((void *) resp + sizeof *resp, - (void *) resp_v4 + sizeof *resp_v4, - resp_size - sizeof *resp); - } else if (abi_ver <= 3) { - struct ibv_create_qp_resp_v3 *resp_v3 = - (struct ibv_create_qp_resp_v3 *) resp; - - memmove((void *) resp + sizeof *resp, - (void *) resp_v3 + sizeof *resp_v3, - resp_size - sizeof *resp); - } - - return 0; -} - int ibv_cmd_open_qp(struct ibv_context *context, struct verbs_qp *qp, int vqp_sz, struct ibv_qp_open_attr *attr, @@ -1438,29 +1054,6 @@ int ibv_cmd_create_ah(struct ibv_pd *pd, struct ibv_ah *ah, return 0; } -int ibv_cmd_destroy_qp(struct ibv_qp *qp) -{ - struct ibv_destroy_qp req; - struct ib_uverbs_destroy_qp_resp resp; - int ret; - - req.core_payload = (struct ib_uverbs_destroy_qp){ - .qp_handle = qp->handle, - }; - - ret = execute_cmd_write(qp->context, IB_USER_VERBS_CMD_DESTROY_QP, &req, - sizeof(req), &resp, sizeof(resp)); - if (verbs_is_destroy_err(&ret)) - return ret; - - pthread_mutex_lock(&qp->mutex); - while (qp->events_completed != resp.events_reported) - pthread_cond_wait(&qp->cond, &qp->mutex); - pthread_mutex_unlock(&qp->mutex); - - return 0; -} - int ibv_cmd_attach_mcast(struct ibv_qp *qp, const union ibv_gid *gid, uint16_t lid) { struct ibv_attach_mcast 
req; @@ -1744,53 +1337,6 @@ int ibv_cmd_create_flow(struct ibv_qp *qp, return 0; } -int ibv_cmd_create_wq(struct ibv_context *context, - struct ibv_wq_init_attr *wq_init_attr, - struct ibv_wq *wq, - struct ibv_create_wq *cmd, - size_t cmd_size, - struct ib_uverbs_ex_create_wq_resp *resp, - size_t resp_size) -{ - int err; - - if (wq_init_attr->comp_mask >= IBV_WQ_INIT_ATTR_RESERVED) - return EINVAL; - - cmd->user_handle = (uintptr_t)wq; - cmd->pd_handle = wq_init_attr->pd->handle; - cmd->cq_handle = wq_init_attr->cq->handle; - cmd->wq_type = wq_init_attr->wq_type; - cmd->max_sge = wq_init_attr->max_sge; - cmd->max_wr = wq_init_attr->max_wr; - cmd->comp_mask = 0; - - if (wq_init_attr->comp_mask & IBV_WQ_INIT_ATTR_FLAGS) { - if (wq_init_attr->create_flags & ~(IBV_WQ_FLAGS_RESERVED - 1)) - return EOPNOTSUPP; - cmd->create_flags = wq_init_attr->create_flags; - } - - err = execute_cmd_write_ex(context, IB_USER_VERBS_EX_CMD_CREATE_WQ, - cmd, cmd_size, resp, resp_size); - if (err) - return err; - - if (resp->response_length < sizeof(*resp)) - return EINVAL; - - wq->handle = resp->wq_handle; - wq_init_attr->max_wr = resp->max_wr; - wq_init_attr->max_sge = resp->max_sge; - wq->wq_num = resp->wqn; - wq->context = context; - wq->cq = wq_init_attr->cq; - wq->pd = wq_init_attr->pd; - wq->wq_type = wq_init_attr->wq_type; - - return 0; -} - int ibv_cmd_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr, struct ibv_modify_wq *cmd, size_t cmd_size) { @@ -1823,32 +1369,6 @@ int ibv_cmd_modify_wq(struct ibv_wq *wq, struct ibv_wq_attr *attr, return 0; } -int ibv_cmd_destroy_wq(struct ibv_wq *wq) -{ - struct ibv_destroy_wq req; - struct ib_uverbs_ex_destroy_wq_resp resp; - int ret; - - req.core_payload = (struct ib_uverbs_ex_destroy_wq){ - .wq_handle = wq->handle, - }; - - ret = execute_cmd_write_ex(wq->context, IB_USER_VERBS_EX_CMD_DESTROY_WQ, - &req, sizeof(req), &resp, sizeof(resp)); - if (verbs_is_destroy_err(&ret)) - return ret; - - if (resp.response_length < sizeof(resp)) - return EINVAL; - - pthread_mutex_lock(&wq->mutex); - while (wq->events_completed != resp.events_reported) - pthread_cond_wait(&wq->cond, &wq->mutex); - pthread_mutex_unlock(&wq->mutex); - - return 0; -} - int ibv_cmd_create_rwq_ind_table(struct ibv_context *context, struct ibv_rwq_ind_table_init_attr *init_attr, struct ibv_rwq_ind_table *rwq_ind_table, diff --git a/libibverbs/cmd_cq.c b/libibverbs/cmd_cq.c index 542daa7..e8113d4 100644 --- a/libibverbs/cmd_cq.c +++ b/libibverbs/cmd_cq.c @@ -31,14 +31,17 @@ */ #include +#include "ibverbs.h" static int ibv_icmd_create_cq(struct ibv_context *context, int cqe, struct ibv_comp_channel *channel, int comp_vector, uint32_t flags, struct ibv_cq *cq, struct ibv_command_buffer *link) { - DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_CQ, UVERBS_METHOD_CQ_CREATE, 7, link); + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_CQ, UVERBS_METHOD_CQ_CREATE, 8, link); + struct verbs_ex_private *priv = get_priv(context); struct ib_uverbs_attr *handle; + struct ib_uverbs_attr *async_fd_attr; uint32_t resp_cqe; int ret; @@ -52,6 +55,12 @@ static int ibv_icmd_create_cq(struct ibv_context *context, int cqe, if (channel) fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_CQ_COMP_CHANNEL, channel->fd); fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_CQ_COMP_VECTOR, comp_vector); + async_fd_attr = fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_CQ_EVENT_FD, context->async_fd); + if (priv->imported) + fallback_require_ioctl(cmdb); + else + /* Prevent fallback to the 'write' mode if kernel doesn't support it */ + attr_optional(async_fd_attr); 
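The lines above are the crux of the new CQ path: the CQ is bound to context->async_fd, an imported context is pinned to the ioctl flow, and for everyone else the attribute stays optional so older kernels can still take the legacy write path. A hedged sketch of the importing side as an application would see it, assuming the cmd_fd has already been received from the owning process (the fd transport itself is outside this patch):

```c
#include <infiniband/verbs.h>

/* fd: a verbs cmd_fd duplicated from the owning process, e.g. passed
 * over a unix-domain socket (assumed here, not shown). */
struct ibv_cq *import_and_create_cq(int fd)
{
	struct ibv_context *ctx = ibv_import_device(fd);

	if (!ctx)
		return NULL;

	/* ctx is imported, so this CQ creation goes through the ioctl
	 * path that fill_attr_in_fd()/fallback_require_ioctl() set up. */
	return ibv_create_cq(ctx, 16, NULL, NULL, 0);
}
```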
if (flags) { fallback_require_ex(cmdb); @@ -131,7 +140,7 @@ int ibv_cmd_create_cq(struct ibv_context *context, int cqe, int ibv_cmd_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr, - struct ibv_cq_ex *cq, + struct verbs_cq *cq, struct ibv_create_cq_ex *cmd, size_t cmd_size, struct ib_uverbs_ex_create_cq_resp *resp, @@ -153,7 +162,7 @@ int ibv_cmd_create_cq_ex(struct ibv_context *context, return ibv_icmd_create_cq(context, cq_attr->cqe, cq_attr->channel, cq_attr->comp_vector, flags, - ibv_cq_ex_to_cq(cq), cmdb); + &cq->cq, cmdb); } int ibv_cmd_destroy_cq(struct ibv_cq *cq) diff --git a/libibverbs/cmd_device.c b/libibverbs/cmd_device.c index 4de59c0..6c8e01e 100644 --- a/libibverbs/cmd_device.c +++ b/libibverbs/cmd_device.c @@ -30,8 +30,14 @@ * SOFTWARE. */ +#define _GNU_SOURCE +#include +#include +#include #include +#include + static void copy_query_port_resp_to_port_attr(struct ibv_port_attr *port_attr, struct ib_uverbs_query_port_resp *resp) { @@ -99,7 +105,7 @@ int ibv_cmd_query_port(struct ibv_context *context, uint8_t port_num, return 0; } -static int cmd_alloc_async_fd(struct ibv_context *context) +int ibv_cmd_alloc_async_fd(struct ibv_context *context) { DECLARE_COMMAND_BUFFER(cmdb, UVERBS_OBJECT_ASYNC_EVENT, UVERBS_METHOD_ASYNC_EVENT_ALLOC, 1); @@ -153,9 +159,6 @@ static int cmd_get_context(struct verbs_context *context_ex, return 0; } case SUCCESS: - ret = cmd_alloc_async_fd(context); - if (ret) - return ret; break; default: return ret; @@ -178,3 +181,338 @@ int ibv_cmd_get_context(struct verbs_context *context_ex, return cmd_get_context(context_ex, cmdb); } + +int ibv_cmd_query_context(struct ibv_context *context, + struct ibv_command_buffer *driver) +{ + DECLARE_COMMAND_BUFFER_LINK(cmd, UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_QUERY_CONTEXT, + 2, + driver); + + struct verbs_device *verbs_device; + uint64_t core_support; + int ret; + + fill_attr_out_ptr(cmd, UVERBS_ATTR_QUERY_CONTEXT_NUM_COMP_VECTORS, + &context->num_comp_vectors); + fill_attr_out_ptr(cmd, UVERBS_ATTR_QUERY_CONTEXT_CORE_SUPPORT, + &core_support); + + ret = execute_ioctl(context, cmd); + if (ret) + return ret; + + verbs_device = verbs_get_device(context->device); + verbs_device->core_support = core_support; + + return 0; +} + +static int is_zero_gid(union ibv_gid *gid) +{ + const union ibv_gid zgid = {}; + + return !memcmp(gid, &zgid, sizeof(*gid)); +} + +static int query_sysfs_gid_ndev_ifindex(struct ibv_context *context, + uint8_t port_num, uint32_t gid_index, + uint32_t *ndev_ifindex) +{ + struct verbs_device *verbs_device = verbs_get_device(context->device); + char buff[IF_NAMESIZE]; + + if (ibv_read_ibdev_sysfs_file(buff, sizeof(buff), verbs_device->sysfs, + "ports/%d/gid_attrs/ndevs/%d", port_num, + gid_index) <= 0) { + *ndev_ifindex = 0; + return 0; + } + + *ndev_ifindex = if_nametoindex(buff); + return *ndev_ifindex ? 0 : errno; +} + +static int query_sysfs_gid(struct ibv_context *context, uint8_t port_num, int index, + union ibv_gid *gid) +{ + struct verbs_device *verbs_device = verbs_get_device(context->device); + char attr[41]; + uint16_t val; + int i; + + if (ibv_read_ibdev_sysfs_file(attr, sizeof(attr), verbs_device->sysfs, + "ports/%d/gids/%d", port_num, index) < 0) + return -1; + + for (i = 0; i < 8; ++i) { + if (sscanf(attr + i * 5, "%hx", &val) != 1) + return -1; + gid->raw[i * 2] = val >> 8; + gid->raw[i * 2 + 1] = val & 0xff; + } + + return 0; +} + +/* GID types as appear in sysfs, no change is expected as of ABI + * compatibility. 
+ */ +#define V1_TYPE "IB/RoCE v1" +#define V2_TYPE "RoCE v2" +static int query_sysfs_gid_type(struct ibv_context *context, uint8_t port_num, + unsigned int index, enum ibv_gid_type_sysfs *type) +{ + struct verbs_device *verbs_device = verbs_get_device(context->device); + char buff[11]; + + /* Reset errno so that we can rely on its value upon any error flow in + * ibv_read_sysfs_file. + */ + errno = 0; + if (ibv_read_ibdev_sysfs_file(buff, sizeof(buff), verbs_device->sysfs, + "ports/%d/gid_attrs/types/%d", port_num, + index) <= 0) { + char *dir_path; + DIR *dir; + + if (errno == EINVAL) { + /* In IB, this file doesn't exist and the kernel sets + * errno to -EINVAL. + */ + *type = IBV_GID_TYPE_SYSFS_IB_ROCE_V1; + return 0; + } + if (asprintf(&dir_path, "%s/%s/%d/%s/", + verbs_device->sysfs->ibdev_path, "ports", port_num, + "gid_attrs") < 0) + return -1; + dir = opendir(dir_path); + free(dir_path); + if (!dir) { + if (errno == ENOENT) + /* Assuming that if gid_attrs doesn't exist, + * we have an old kernel and all GIDs are + * IB/RoCE v1 + */ + *type = IBV_GID_TYPE_SYSFS_IB_ROCE_V1; + else + return -1; + } else { + closedir(dir); + errno = EFAULT; + return -1; + } + } else { + if (!strcmp(buff, V1_TYPE)) { + *type = IBV_GID_TYPE_SYSFS_IB_ROCE_V1; + } else if (!strcmp(buff, V2_TYPE)) { + *type = IBV_GID_TYPE_SYSFS_ROCE_V2; + } else { + errno = ENOTSUP; + return -1; + } + } + + return 0; +} + +static int query_sysfs_gid_entry(struct ibv_context *context, uint32_t port_num, + uint32_t gid_index, + struct ibv_gid_entry *entry, + uint32_t attr_mask, int link_layer) +{ + enum ibv_gid_type_sysfs gid_type; + struct ibv_port_attr port_attr = {}; + int ret = 0; + + entry->gid_index = gid_index; + entry->port_num = port_num; + + if (attr_mask & VERBS_QUERY_GID_ATTR_GID) { + ret = query_sysfs_gid(context, port_num, gid_index, &entry->gid); + if (ret) + return EINVAL; + } + + if (attr_mask & VERBS_QUERY_GID_ATTR_TYPE) { + ret = query_sysfs_gid_type(context, port_num, gid_index, &gid_type); + if (ret) + return EINVAL; + + if (gid_type == IBV_GID_TYPE_SYSFS_IB_ROCE_V1) { + if (link_layer < 0) { + ret = ibv_query_port(context, port_num, + &port_attr); + if (ret) + goto out; + + link_layer = port_attr.link_layer; + } + + if (link_layer == IBV_LINK_LAYER_INFINIBAND) { + entry->gid_type = IBV_GID_TYPE_IB; + } else if (link_layer == IBV_LINK_LAYER_ETHERNET) { + entry->gid_type = IBV_GID_TYPE_ROCE_V1; + } else { + /* Unspecified link layer is IB by default */ + entry->gid_type = IBV_GID_TYPE_IB; + } + } else { + entry->gid_type = IBV_GID_TYPE_ROCE_V2; + } + } + + if (attr_mask & VERBS_QUERY_GID_ATTR_NDEV_IFINDEX) + ret = query_sysfs_gid_ndev_ifindex(context, port_num, gid_index, + &entry->ndev_ifindex); + +out: + return ret; +} + +static int query_gid_table_fb(struct ibv_context *context, + struct ibv_gid_entry *entries, size_t max_entries, + uint64_t *num_entries, size_t entry_size) +{ + struct ibv_device_attr dev_attr = {}; + struct ibv_port_attr port_attr = {}; + struct ibv_gid_entry entry = {}; + int attr_mask; + void *tmp; + int i, j; + int ret; + + ret = ibv_query_device(context, &dev_attr); + if (ret) + goto out; + + tmp = entries; + *num_entries = 0; + attr_mask = VERBS_QUERY_GID_ATTR_GID | VERBS_QUERY_GID_ATTR_TYPE | + VERBS_QUERY_GID_ATTR_NDEV_IFINDEX; + for (i = 0; i < dev_attr.phys_port_cnt; i++) { + ret = ibv_query_port(context, i + 1, &port_attr); + if (ret) + goto out; + + for (j = 0; j < port_attr.gid_tbl_len; j++) { + /* In case we already reached max_entries, query to some + * temp entry, in 
case all other entries are zeros the + * API should succeed. + */ + if (*num_entries == max_entries) + tmp = &entry; + ret = query_sysfs_gid_entry(context, i + 1, j, + tmp, + attr_mask, + port_attr.link_layer); + if (ret) + goto out; + if (is_zero_gid(&((struct ibv_gid_entry *)tmp)->gid)) + continue; + if (*num_entries == max_entries) { + ret = EINVAL; + goto out; + } + + (*num_entries)++; + tmp += entry_size; + } + } + +out: + return ret; +} + +/* Using async_event cmd_name because query_gid_ex and query_gid_table are not + * in verbs_context_ops while async_event is and doesn't use ioctl. + * If one of them is not supported, so is the other. Hence, we can use a single + * cmd_name for both of them. + */ +#define query_gid_kernel_cap async_event +int __ibv_query_gid_ex(struct ibv_context *context, uint32_t port_num, + uint32_t gid_index, struct ibv_gid_entry *entry, + uint32_t flags, size_t entry_size, + uint32_t fallback_attr_mask) +{ + DECLARE_COMMAND_BUFFER(cmdb, UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_QUERY_GID_ENTRY, 4); + int ret; + + fill_attr_const_in(cmdb, UVERBS_ATTR_QUERY_GID_ENTRY_PORT, port_num); + fill_attr_const_in(cmdb, UVERBS_ATTR_QUERY_GID_ENTRY_GID_INDEX, + gid_index); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_QUERY_GID_ENTRY_FLAGS, flags); + fill_attr_out(cmdb, UVERBS_ATTR_QUERY_GID_ENTRY_RESP_ENTRY, entry, + entry_size); + + switch (execute_ioctl_fallback(context, query_gid_kernel_cap, cmdb, + &ret)) { + case TRY_WRITE: + if (flags) + return EOPNOTSUPP; + + ret = query_sysfs_gid_entry(context, port_num, gid_index, + entry, fallback_attr_mask, -1); + if (ret) + return ret; + + if (fallback_attr_mask & VERBS_QUERY_GID_ATTR_GID && + is_zero_gid(&entry->gid)) + return ENODATA; + + return 0; + default: + return ret; + } +} + +int _ibv_query_gid_ex(struct ibv_context *context, uint32_t port_num, + uint32_t gid_index, struct ibv_gid_entry *entry, + uint32_t flags, size_t entry_size) +{ + return __ibv_query_gid_ex(context, port_num, gid_index, entry, + flags, entry_size, + VERBS_QUERY_GID_ATTR_GID | + VERBS_QUERY_GID_ATTR_TYPE | + VERBS_QUERY_GID_ATTR_NDEV_IFINDEX); +} + +ssize_t _ibv_query_gid_table(struct ibv_context *context, + struct ibv_gid_entry *entries, + size_t max_entries, uint32_t flags, + size_t entry_size) +{ + DECLARE_COMMAND_BUFFER(cmdb, UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_QUERY_GID_TABLE, 4); + uint64_t num_entries; + int ret; + + fill_attr_const_in(cmdb, UVERBS_ATTR_QUERY_GID_TABLE_ENTRY_SIZE, + entry_size); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_QUERY_GID_TABLE_FLAGS, flags); + fill_attr_out(cmdb, UVERBS_ATTR_QUERY_GID_TABLE_RESP_ENTRIES, entries, + _array_len(entry_size, max_entries)); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_QUERY_GID_TABLE_RESP_NUM_ENTRIES, + &num_entries); + + switch (execute_ioctl_fallback(context, query_gid_kernel_cap, cmdb, + &ret)) { + case TRY_WRITE: + if (flags) + return -EOPNOTSUPP; + + ret = query_gid_table_fb(context, entries, max_entries, + &num_entries, entry_size); + break; + default: + break; + } + + if (ret) + return -ret; + + return num_entries; +} diff --git a/libibverbs/cmd_fallback.c b/libibverbs/cmd_fallback.c index 46c09f3..ee18217 100644 --- a/libibverbs/cmd_fallback.c +++ b/libibverbs/cmd_fallback.c @@ -104,7 +104,7 @@ enum write_fallback _execute_ioctl_fallback(struct ibv_context *ctx, if (*ret == EPROTONOSUPPORT) { /* * EPROTONOSUPPORT means we have the ioctl framework but this - * specific method is not supported + * specific method or a mandatory attribute is not supported */
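query_gid_table_fb() and the TRY_WRITE branches above provide the sysfs fallback for kernels that lack the new query ioctls; applications only ever see the public wrappers. A short usage sketch, assuming ctx is an already opened ibv_context:

```c
#include <stdio.h>
#include <infiniband/verbs.h>

static void print_gid_table(struct ibv_context *ctx)
{
	struct ibv_gid_entry entries[32];
	ssize_t n = ibv_query_gid_table(ctx, entries, 32, 0);

	if (n < 0) {
		fprintf(stderr, "ibv_query_gid_table: %zd\n", n);
		return;
	}
	for (ssize_t i = 0; i < n; i++)
		printf("port %u index %u type %u ifindex %u\n",
		       entries[i].port_num, entries[i].gid_index,
		       entries[i].gid_type, entries[i].ndev_ifindex);
}
```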
bitmap_set_bit(priv->unsupported_ioctls, cmd_bit); return _check_legacy(cmdb, ret); diff --git a/libibverbs/cmd_mr.c b/libibverbs/cmd_mr.c index cb729b6..42dbe42 100644 --- a/libibverbs/cmd_mr.c +++ b/libibverbs/cmd_mr.c @@ -85,3 +85,34 @@ int ibv_cmd_dereg_mr(struct verbs_mr *vmr) return ret; return 0; } + +int ibv_cmd_query_mr(struct ibv_pd *pd, struct verbs_mr *vmr, + uint32_t mr_handle) +{ + DECLARE_FBCMD_BUFFER(cmd, UVERBS_OBJECT_MR, + UVERBS_METHOD_QUERY_MR, + 4, NULL); + struct ibv_mr *mr = &vmr->ibv_mr; + int ret; + + fill_attr_in_obj(cmd, UVERBS_ATTR_QUERY_MR_HANDLE, mr_handle); + fill_attr_out_ptr(cmd, UVERBS_ATTR_QUERY_MR_RESP_LKEY, + &mr->lkey); + fill_attr_out_ptr(cmd, UVERBS_ATTR_QUERY_MR_RESP_RKEY, + &mr->rkey); + fill_attr_out_ptr(cmd, UVERBS_ATTR_QUERY_MR_RESP_LENGTH, + &mr->length); + + ret = execute_ioctl(pd->context, cmd); + if (ret) + return ret; + + mr->handle = mr_handle; + mr->context = pd->context; + mr->pd = pd; + mr->addr = NULL; + + vmr->mr_type = IBV_MR_TYPE_IMPORTED_MR; + return 0; +} + diff --git a/libibverbs/cmd_qp.c b/libibverbs/cmd_qp.c new file mode 100644 index 0000000..056f397 --- /dev/null +++ b/libibverbs/cmd_qp.c @@ -0,0 +1,480 @@ +/* + * Copyright (c) 2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "ibverbs.h" + +enum { + CREATE_QP_EX_SUP_CREATE_FLAGS = IBV_QP_CREATE_BLOCK_SELF_MCAST_LB | + IBV_QP_CREATE_SCATTER_FCS | + IBV_QP_CREATE_CVLAN_STRIPPING | + IBV_QP_CREATE_SOURCE_QPN | + IBV_QP_CREATE_PCI_WRITE_END_PADDING +}; + + +static void set_qp(struct verbs_qp *vqp, + struct ibv_qp *qp_in, + struct ibv_qp_init_attr_ex *attr_ex, + struct verbs_xrcd *vxrcd) +{ + struct ibv_qp *qp = vqp ? 
&vqp->qp : qp_in; + + qp->qp_context = attr_ex->qp_context; + qp->pd = attr_ex->pd; + qp->send_cq = attr_ex->send_cq; + qp->recv_cq = attr_ex->recv_cq; + qp->srq = attr_ex->srq; + qp->qp_type = attr_ex->qp_type; + qp->state = IBV_QPS_RESET; + qp->events_completed = 0; + pthread_mutex_init(&qp->mutex, NULL); + pthread_cond_init(&qp->cond, NULL); + + if (vqp) { + vqp->comp_mask = 0; + if (attr_ex->comp_mask & IBV_QP_INIT_ATTR_XRCD) { + vqp->comp_mask |= VERBS_QP_XRCD; + vqp->xrcd = vxrcd; + } + } +} + +static int ibv_icmd_create_qp(struct ibv_context *context, + struct verbs_qp *vqp, + struct ibv_qp *qp_in, + struct ibv_qp_init_attr_ex *attr_ex, + struct ibv_command_buffer *link) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_QP, UVERBS_METHOD_QP_CREATE, 15, link); + struct verbs_ex_private *priv = get_priv(context); + struct ib_uverbs_attr *handle; + uint32_t qp_num; + uint32_t pd_handle; + uint32_t send_cq_handle = 0; + uint32_t recv_cq_handle = 0; + int ret; + struct ibv_qp *qp = vqp ? &vqp->qp : qp_in; + struct verbs_xrcd *vxrcd = NULL; + uint32_t create_flags = 0; + + qp->context = context; + + switch (attr_ex->qp_type) { + case IBV_QPT_XRC_RECV: + if (!(attr_ex->comp_mask & IBV_QP_INIT_ATTR_XRCD)) { + errno = EINVAL; + return errno; + } + + vxrcd = container_of(attr_ex->xrcd, struct verbs_xrcd, xrcd); + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_QP_XRCD_HANDLE, vxrcd->handle); + pd_handle = vxrcd->handle; + break; + case IBV_QPT_RC: + case IBV_QPT_UD: + case IBV_QPT_UC: + case IBV_QPT_RAW_PACKET: + case IBV_QPT_XRC_SEND: + case IBV_QPT_DRIVER: + if (!(attr_ex->comp_mask & IBV_QP_INIT_ATTR_PD)) { + errno = EINVAL; + return errno; + } + + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_QP_PD_HANDLE, attr_ex->pd->handle); + pd_handle = attr_ex->pd->handle; + + if (attr_ex->comp_mask & IBV_QP_INIT_ATTR_IND_TABLE) { + if (attr_ex->cap.max_recv_wr || attr_ex->cap.max_recv_sge || + attr_ex->recv_cq || attr_ex->srq) { + errno = EINVAL; + return errno; + } + + fallback_require_ex(cmdb); + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_QP_IND_TABLE_HANDLE, + attr_ex->rwq_ind_tbl->ind_tbl_handle); + + /* send_cq is optional */ + if (attr_ex->cap.max_send_wr) { + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE, + attr_ex->send_cq->handle); + send_cq_handle = attr_ex->send_cq->handle; + } + } else { + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_QP_SEND_CQ_HANDLE, + attr_ex->send_cq->handle); + send_cq_handle = attr_ex->send_cq->handle; + + if (attr_ex->qp_type != IBV_QPT_XRC_SEND) { + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_QP_RECV_CQ_HANDLE, + attr_ex->recv_cq->handle); + recv_cq_handle = attr_ex->recv_cq->handle; + } + } + + /* compatible with kernel code from the 'write' mode */ + if (attr_ex->qp_type == IBV_QPT_XRC_SEND) { + attr_ex->cap.max_recv_wr = 0; + attr_ex->cap.max_recv_sge = 0; + } + + break; + default: + errno = EINVAL; + return errno; + } + + handle = fill_attr_out_obj(cmdb, UVERBS_ATTR_CREATE_QP_HANDLE); + fill_attr_const_in(cmdb, UVERBS_ATTR_CREATE_QP_TYPE, attr_ex->qp_type); + fill_attr_in_uint64(cmdb, UVERBS_ATTR_CREATE_QP_USER_HANDLE, (uintptr_t)qp); + + static_assert(offsetof(struct ibv_qp_cap, max_send_wr) == + offsetof(struct ib_uverbs_qp_cap, max_send_wr), "Bad layout"); + static_assert(offsetof(struct ibv_qp_cap, max_recv_wr) == + offsetof(struct ib_uverbs_qp_cap, max_recv_wr), "Bad layout"); + static_assert(offsetof(struct ibv_qp_cap, max_send_sge) == + offsetof(struct ib_uverbs_qp_cap, max_send_sge), "Bad layout"); + static_assert(offsetof(struct ibv_qp_cap, max_recv_sge) 
== + offsetof(struct ib_uverbs_qp_cap, max_recv_sge), "Bad layout"); + static_assert(offsetof(struct ibv_qp_cap, max_inline_data) == + offsetof(struct ib_uverbs_qp_cap, max_inline_data), "Bad layout"); + + fill_attr_in_ptr(cmdb, UVERBS_ATTR_CREATE_QP_CAP, &attr_ex->cap); + fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_QP_EVENT_FD, context->async_fd); + + if (priv->imported) + fallback_require_ioctl(cmdb); + + if (attr_ex->sq_sig_all) + create_flags |= IB_UVERBS_QP_CREATE_SQ_SIG_ALL; + + if (attr_ex->comp_mask & IBV_QP_INIT_ATTR_CREATE_FLAGS) { + if (attr_ex->create_flags & ~CREATE_QP_EX_SUP_CREATE_FLAGS) { + errno = EINVAL; + return errno; + } + + fallback_require_ex(cmdb); + create_flags |= attr_ex->create_flags; + + if (attr_ex->create_flags & IBV_QP_CREATE_SOURCE_QPN) { + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_QP_SOURCE_QPN, + attr_ex->source_qpn); + /* source QPN is a self attribute once moving to ioctl, + * no extra bit is supported. + */ + create_flags &= ~IBV_QP_CREATE_SOURCE_QPN; + } + } + + if (create_flags) + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_QP_FLAGS, + create_flags); + + if (attr_ex->srq) + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_QP_SRQ_HANDLE, attr_ex->srq->handle); + + fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_QP_RESP_CAP, &attr_ex->cap); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_QP_RESP_QP_NUM, &qp_num); + + switch (execute_ioctl_fallback(context, create_qp, cmdb, &ret)) { + case TRY_WRITE: { + if (abi_ver > 4) { + DECLARE_LEGACY_UHW_BUFS(link, IB_USER_VERBS_CMD_CREATE_QP); + + *req = (struct ib_uverbs_create_qp){ + .pd_handle = pd_handle, + .user_handle = (uintptr_t)qp, + .max_send_wr = attr_ex->cap.max_send_wr, + .max_recv_wr = attr_ex->cap.max_recv_wr, + .max_send_sge = attr_ex->cap.max_send_sge, + .max_recv_sge = attr_ex->cap.max_recv_sge, + .max_inline_data = attr_ex->cap.max_inline_data, + .sq_sig_all = attr_ex->sq_sig_all, + .qp_type = attr_ex->qp_type, + .srq_handle = attr_ex->srq ? attr_ex->srq->handle : 0, + .is_srq = !!attr_ex->srq, + .recv_cq_handle = recv_cq_handle, + .send_cq_handle = send_cq_handle, + }; + + ret = execute_write_bufs( + context, IB_USER_VERBS_CMD_CREATE_QP, req, resp); + if (ret) + return ret; + + qp->handle = resp->qp_handle; + qp->qp_num = resp->qpn; + + attr_ex->cap.max_recv_sge = resp->max_recv_sge; + attr_ex->cap.max_send_sge = resp->max_send_sge; + attr_ex->cap.max_recv_wr = resp->max_recv_wr; + attr_ex->cap.max_send_wr = resp->max_send_wr; + attr_ex->cap.max_inline_data = resp->max_inline_data; + + } else if (abi_ver == 4) { + DECLARE_LEGACY_UHW_BUFS(link, IB_USER_VERBS_CMD_CREATE_QP_V4); + + *req = (struct ib_uverbs_create_qp){ + .pd_handle = pd_handle, + .user_handle = (uintptr_t)qp, + .max_send_wr = attr_ex->cap.max_send_wr, + .max_recv_wr = attr_ex->cap.max_recv_wr, + .max_send_sge = attr_ex->cap.max_send_sge, + .max_recv_sge = attr_ex->cap.max_recv_sge, + .max_inline_data = attr_ex->cap.max_inline_data, + .sq_sig_all = attr_ex->sq_sig_all, + .qp_type = attr_ex->qp_type, + .srq_handle = attr_ex->srq ? 
attr_ex->srq->handle : 0, + .is_srq = !!attr_ex->srq, + .recv_cq_handle = recv_cq_handle, + .send_cq_handle = send_cq_handle, + }; + + ret = execute_write_bufs( + context, IB_USER_VERBS_CMD_CREATE_QP_V4, req, resp); + if (ret) + return ret; + + qp->handle = resp->qp_handle; + qp->qp_num = resp->qpn; + + attr_ex->cap.max_recv_sge = resp->max_recv_sge; + attr_ex->cap.max_send_sge = resp->max_send_sge; + attr_ex->cap.max_recv_wr = resp->max_recv_wr; + attr_ex->cap.max_send_wr = resp->max_send_wr; + attr_ex->cap.max_inline_data = resp->max_inline_data; + } else { + DECLARE_LEGACY_UHW_BUFS(link, IB_USER_VERBS_CMD_CREATE_QP_V3); + + *req = (struct ib_uverbs_create_qp){ + .pd_handle = pd_handle, + .user_handle = (uintptr_t)qp, + .max_send_wr = attr_ex->cap.max_send_wr, + .max_recv_wr = attr_ex->cap.max_recv_wr, + .max_send_sge = attr_ex->cap.max_send_sge, + .max_recv_sge = attr_ex->cap.max_recv_sge, + .max_inline_data = attr_ex->cap.max_inline_data, + .sq_sig_all = attr_ex->sq_sig_all, + .qp_type = attr_ex->qp_type, + .srq_handle = attr_ex->srq ? attr_ex->srq->handle : 0, + .is_srq = !!attr_ex->srq, + .recv_cq_handle = recv_cq_handle, + .send_cq_handle = send_cq_handle, + }; + + ret = execute_write_bufs( + context, IB_USER_VERBS_CMD_CREATE_QP_V3, req, resp); + if (ret) + return ret; + + qp->handle = resp->qp_handle; + qp->qp_num = resp->qpn; + } + + set_qp(vqp, qp, attr_ex, vxrcd); + return 0; + } + + case TRY_WRITE_EX: { + DECLARE_LEGACY_UHW_BUFS_EX(link, + IB_USER_VERBS_EX_CMD_CREATE_QP); + + *req = (struct ib_uverbs_ex_create_qp){ + .pd_handle = pd_handle, + .user_handle = (uintptr_t)qp, + .max_send_wr = attr_ex->cap.max_send_wr, + .max_recv_wr = attr_ex->cap.max_recv_wr, + .max_send_sge = attr_ex->cap.max_send_sge, + .max_recv_sge = attr_ex->cap.max_recv_sge, + .max_inline_data = attr_ex->cap.max_inline_data, + .sq_sig_all = attr_ex->sq_sig_all, + .qp_type = attr_ex->qp_type, + .srq_handle = attr_ex->srq ? 
attr_ex->srq->handle : 0, + .is_srq = !!attr_ex->srq, + .recv_cq_handle = recv_cq_handle, + .send_cq_handle = send_cq_handle, + }; + + if (attr_ex->comp_mask & IBV_QP_INIT_ATTR_CREATE_FLAGS) { + req->create_flags = attr_ex->create_flags; + + if (attr_ex->create_flags & IBV_QP_CREATE_SOURCE_QPN) + req->source_qpn = attr_ex->source_qpn; + } + + if (attr_ex->comp_mask & IBV_QP_INIT_ATTR_IND_TABLE) { + req->rwq_ind_tbl_handle = attr_ex->rwq_ind_tbl->ind_tbl_handle; + req->comp_mask = IB_UVERBS_CREATE_QP_MASK_IND_TABLE; + } + + ret = execute_write_bufs_ex( + context, IB_USER_VERBS_EX_CMD_CREATE_QP, req, resp); + if (ret) + return ret; + + qp->handle = resp->base.qp_handle; + qp->qp_num = resp->base.qpn; + + attr_ex->cap.max_recv_sge = resp->base.max_recv_sge; + attr_ex->cap.max_send_sge = resp->base.max_send_sge; + attr_ex->cap.max_recv_wr = resp->base.max_recv_wr; + attr_ex->cap.max_send_wr = resp->base.max_send_wr; + attr_ex->cap.max_inline_data = resp->base.max_inline_data; + set_qp(vqp, qp, attr_ex, vxrcd); + return 0; + } + + case SUCCESS: + break; + + default: + return ret; + } + + qp->handle = read_attr_obj(UVERBS_ATTR_CREATE_QP_HANDLE, handle); + qp->qp_num = qp_num; + set_qp(vqp, qp, attr_ex, vxrcd); + + return 0; +} + +int ibv_cmd_create_qp(struct ibv_pd *pd, + struct ibv_qp *qp, struct ibv_qp_init_attr *attr, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ib_uverbs_create_qp_resp *resp, size_t resp_size) +{ + DECLARE_CMD_BUFFER_COMPAT(cmdb, UVERBS_OBJECT_QP, + UVERBS_METHOD_QP_CREATE, cmd, cmd_size, resp, + resp_size); + + struct ibv_qp_init_attr_ex attr_ex = {}; + int ret; + + memcpy(&attr_ex, attr, sizeof(*attr)); + attr_ex.comp_mask |= IBV_QP_INIT_ATTR_PD; + attr_ex.pd = pd; + ret = ibv_icmd_create_qp(pd->context, NULL, qp, &attr_ex, cmdb); + if (!ret) + memcpy(&attr->cap, &attr_ex.cap, sizeof(attr_ex.cap)); + + return ret; +} + +int ibv_cmd_create_qp_ex(struct ibv_context *context, + struct verbs_qp *qp, + struct ibv_qp_init_attr_ex *attr_ex, + struct ibv_create_qp *cmd, size_t cmd_size, + struct ib_uverbs_create_qp_resp *resp, size_t resp_size) +{ + DECLARE_CMD_BUFFER_COMPAT(cmdb, UVERBS_OBJECT_QP, + UVERBS_METHOD_QP_CREATE, cmd, cmd_size, resp, + resp_size); + + if (!check_comp_mask(attr_ex->comp_mask, + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_XRCD | + IBV_QP_INIT_ATTR_SEND_OPS_FLAGS)) { + errno = EINVAL; + return errno; + } + + return ibv_icmd_create_qp(context, qp, NULL, attr_ex, cmdb); +} + +int ibv_cmd_create_qp_ex2(struct ibv_context *context, + struct verbs_qp *qp, + struct ibv_qp_init_attr_ex *attr_ex, + struct ibv_create_qp_ex *cmd, + size_t cmd_size, + struct ib_uverbs_ex_create_qp_resp *resp, + size_t resp_size) +{ + DECLARE_CMD_BUFFER_COMPAT(cmdb, UVERBS_OBJECT_QP, + UVERBS_METHOD_QP_CREATE, cmd, cmd_size, resp, + resp_size); + + if (!check_comp_mask(attr_ex->comp_mask, + IBV_QP_INIT_ATTR_PD | + IBV_QP_INIT_ATTR_XRCD | + IBV_QP_INIT_ATTR_CREATE_FLAGS | + IBV_QP_INIT_ATTR_MAX_TSO_HEADER | + IBV_QP_INIT_ATTR_IND_TABLE | + IBV_QP_INIT_ATTR_RX_HASH | + IBV_QP_INIT_ATTR_SEND_OPS_FLAGS)) { + errno = EINVAL; + return errno; + } + + return ibv_icmd_create_qp(context, qp, NULL, attr_ex, cmdb); +} + +int ibv_cmd_destroy_qp(struct ibv_qp *qp) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_QP, UVERBS_METHOD_QP_DESTROY, 2, + NULL); + struct ib_uverbs_destroy_qp_resp resp; + int ret; + + fill_attr_out_ptr(cmdb, UVERBS_ATTR_DESTROY_QP_RESP, &resp); + fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_QP_HANDLE, qp->handle); + + switch (execute_ioctl_fallback(qp->context, 
destroy_qp, cmdb, &ret)) { + case TRY_WRITE: { + struct ibv_destroy_qp req; + + req.core_payload = (struct ib_uverbs_destroy_qp){ + .qp_handle = qp->handle, + }; + + ret = execute_cmd_write(qp->context, + IB_USER_VERBS_CMD_DESTROY_QP, &req, + sizeof(req), &resp, sizeof(resp)); + break; + } + + default: + break; + } + + if (verbs_is_destroy_err(&ret)) + return ret; + + pthread_mutex_lock(&qp->mutex); + while (qp->events_completed != resp.events_reported) + pthread_cond_wait(&qp->cond, &qp->mutex); + pthread_mutex_unlock(&qp->mutex); + + return 0; +} diff --git a/libibverbs/cmd_srq.c b/libibverbs/cmd_srq.c new file mode 100644 index 0000000..dfaaa6a --- /dev/null +++ b/libibverbs/cmd_srq.c @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "ibverbs.h" + +static void set_vsrq(struct verbs_srq *vsrq, + struct ibv_srq_init_attr_ex *attr_ex, + uint32_t srq_num) +{ + vsrq->srq_type = (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ? + attr_ex->srq_type : IBV_SRQT_BASIC; + if (vsrq->srq_type == IBV_SRQT_XRC) { + vsrq->srq_num = srq_num; + vsrq->xrcd = container_of(attr_ex->xrcd, struct verbs_xrcd, xrcd); + } + if (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ) + vsrq->cq = attr_ex->cq; +} + +static int ibv_icmd_create_srq(struct ibv_pd *pd, struct verbs_srq *vsrq, + struct ibv_srq *srq_in, + struct ibv_srq_init_attr_ex *attr_ex, + struct ibv_command_buffer *link) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_SRQ, UVERBS_METHOD_SRQ_CREATE, 13, link); + struct verbs_ex_private *priv = get_priv(pd->context); + struct ib_uverbs_attr *handle; + uint32_t max_wr; + uint32_t max_sge; + uint32_t srq_num; + int ret; + struct ibv_srq *srq = vsrq ? &vsrq->srq : srq_in; + struct verbs_xrcd *vxrcd = NULL; + enum ibv_srq_type srq_type; + + srq->context = pd->context; + pthread_mutex_init(&srq->mutex, NULL); + pthread_cond_init(&srq->cond, NULL); + + srq_type = (attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TYPE) ? 
+ attr_ex->srq_type : IBV_SRQT_BASIC; + switch (srq_type) { + case IBV_SRQT_XRC: + if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_XRCD) || + !(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ)) { + errno = EINVAL; + return errno; + } + + vxrcd = container_of(attr_ex->xrcd, struct verbs_xrcd, xrcd); + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_SRQ_XRCD_HANDLE, vxrcd->handle); + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE, attr_ex->cq->handle); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_SRQ_RESP_SRQ_NUM, &srq_num); + break; + case IBV_SRQT_TM: + if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_CQ) || + !(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_TM) || + !(attr_ex->tm_cap.max_num_tags)) { + errno = EINVAL; + return errno; + } + + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_SRQ_CQ_HANDLE, attr_ex->cq->handle); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_SRQ_MAX_NUM_TAGS, attr_ex->tm_cap.max_num_tags); + break; + default: + break; + } + + handle = fill_attr_out_obj(cmdb, UVERBS_ATTR_CREATE_SRQ_HANDLE); + fill_attr_const_in(cmdb, UVERBS_ATTR_CREATE_SRQ_TYPE, srq_type); + fill_attr_in_uint64(cmdb, UVERBS_ATTR_CREATE_SRQ_USER_HANDLE, (uintptr_t)srq); + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_SRQ_PD_HANDLE, pd->handle); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_SRQ_MAX_WR, attr_ex->attr.max_wr); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_SRQ_MAX_SGE, attr_ex->attr.max_sge); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_SRQ_LIMIT, attr_ex->attr.srq_limit); + fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_SRQ_EVENT_FD, pd->context->async_fd); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_WR, &max_wr); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_SRQ_RESP_MAX_SGE, &max_sge); + + if (priv->imported) + fallback_require_ioctl(cmdb); + + switch (execute_ioctl_fallback(srq->context, create_srq, cmdb, &ret)) { + case TRY_WRITE: { + if (attr_ex->srq_type == IBV_SRQT_BASIC && abi_ver > 5) { + DECLARE_LEGACY_UHW_BUFS(link, IB_USER_VERBS_CMD_CREATE_SRQ); + + *req = (struct ib_uverbs_create_srq){ + .pd_handle = pd->handle, + .user_handle = (uintptr_t)srq, + .max_wr = attr_ex->attr.max_wr, + .max_sge = attr_ex->attr.max_sge, + .srq_limit = attr_ex->attr.srq_limit, + }; + + ret = execute_write_bufs( + srq->context, IB_USER_VERBS_CMD_CREATE_SRQ, req, resp); + if (ret) + return ret; + + srq->handle = resp->srq_handle; + attr_ex->attr.max_wr = resp->max_wr; + attr_ex->attr.max_sge = resp->max_sge; + } else if (attr_ex->srq_type == IBV_SRQT_BASIC && abi_ver <= 5) { + DECLARE_LEGACY_UHW_BUFS(link, IB_USER_VERBS_CMD_CREATE_SRQ_V5); + + *req = (struct ib_uverbs_create_srq){ + .pd_handle = pd->handle, + .user_handle = (uintptr_t)srq, + .max_wr = attr_ex->attr.max_wr, + .max_sge = attr_ex->attr.max_sge, + .srq_limit = attr_ex->attr.srq_limit, + }; + + ret = execute_write_bufs( + srq->context, IB_USER_VERBS_CMD_CREATE_SRQ_V5, req, resp); + if (ret) + return ret; + + srq->handle = resp->srq_handle; + } else { + DECLARE_LEGACY_UHW_BUFS(link, IB_USER_VERBS_CMD_CREATE_XSRQ); + + *req = (struct ib_uverbs_create_xsrq){ + .pd_handle = pd->handle, + .user_handle = (uintptr_t)srq, + .max_wr = attr_ex->attr.max_wr, + .max_sge = attr_ex->attr.max_sge, + .srq_limit = attr_ex->attr.srq_limit, + .srq_type = attr_ex->srq_type, + .cq_handle = attr_ex->cq->handle, + }; + + if (attr_ex->srq_type == IBV_SRQT_TM) + req->max_num_tags = attr_ex->tm_cap.max_num_tags; + else + req->xrcd_handle = vxrcd->handle; + + ret = execute_write_bufs( + srq->context, IB_USER_VERBS_CMD_CREATE_XSRQ, req, resp); + if (ret) + return ret; + + 
srq->handle = resp->srq_handle; + attr_ex->attr.max_wr = resp->max_wr; + attr_ex->attr.max_sge = resp->max_sge; + set_vsrq(vsrq, attr_ex, resp->srqn); + } + + return 0; + } + + case SUCCESS: + break; + + default: + return ret; + } + + srq->handle = read_attr_obj(UVERBS_ATTR_CREATE_SRQ_HANDLE, handle); + attr_ex->attr.max_wr = max_wr; + attr_ex->attr.max_sge = max_sge; + if (vsrq) + set_vsrq(vsrq, attr_ex, srq_num); + + return 0; +} + +int ibv_cmd_create_srq(struct ibv_pd *pd, struct ibv_srq *srq, + struct ibv_srq_init_attr *attr, + struct ibv_create_srq *cmd, size_t cmd_size, + struct ib_uverbs_create_srq_resp *resp, size_t resp_size) +{ + DECLARE_CMD_BUFFER_COMPAT(cmdb, UVERBS_OBJECT_SRQ, + UVERBS_METHOD_SRQ_CREATE, cmd, cmd_size, resp, + resp_size); + + struct ibv_srq_init_attr_ex attr_ex = {}; + int ret; + + memcpy(&attr_ex, attr, sizeof(*attr)); + ret = ibv_icmd_create_srq(pd, NULL, srq, &attr_ex, cmdb); + if (!ret) { + attr->attr.max_wr = attr_ex.attr.max_wr; + attr->attr.max_sge = attr_ex.attr.max_sge; + } + + return ret; +} + +int ibv_cmd_create_srq_ex(struct ibv_context *context, + struct verbs_srq *srq, + struct ibv_srq_init_attr_ex *attr_ex, + struct ibv_create_xsrq *cmd, size_t cmd_size, + struct ib_uverbs_create_srq_resp *resp, size_t resp_size) +{ + DECLARE_CMD_BUFFER_COMPAT(cmdb, UVERBS_OBJECT_SRQ, + UVERBS_METHOD_SRQ_CREATE, cmd, cmd_size, resp, + resp_size); + + if (attr_ex->comp_mask >= IBV_SRQ_INIT_ATTR_RESERVED) { + errno = EOPNOTSUPP; + return errno; + } + + if (!(attr_ex->comp_mask & IBV_SRQ_INIT_ATTR_PD)) { + errno = EINVAL; + return errno; + } + + return ibv_icmd_create_srq(attr_ex->pd, srq, NULL, attr_ex, cmdb); +} + +int ibv_cmd_destroy_srq(struct ibv_srq *srq) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_SRQ, UVERBS_METHOD_SRQ_DESTROY, 2, + NULL); + struct ib_uverbs_destroy_srq_resp resp; + int ret; + + fill_attr_out_ptr(cmdb, UVERBS_ATTR_DESTROY_SRQ_RESP, &resp); + fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_SRQ_HANDLE, srq->handle); + + switch (execute_ioctl_fallback(srq->context, destroy_srq, cmdb, &ret)) { + case TRY_WRITE: { + struct ibv_destroy_srq req; + + req.core_payload = (struct ib_uverbs_destroy_srq){ + .srq_handle = srq->handle, + }; + + ret = execute_cmd_write(srq->context, + IB_USER_VERBS_CMD_DESTROY_SRQ, &req, + sizeof(req), &resp, sizeof(resp)); + break; + } + + default: + break; + } + + if (verbs_is_destroy_err(&ret)) + return ret; + + pthread_mutex_lock(&srq->mutex); + while (srq->events_completed != resp.events_reported) + pthread_cond_wait(&srq->cond, &srq->mutex); + pthread_mutex_unlock(&srq->mutex); + + return 0; +} + diff --git a/libibverbs/cmd_wq.c b/libibverbs/cmd_wq.c new file mode 100644 index 0000000..8ca4fbe --- /dev/null +++ b/libibverbs/cmd_wq.c @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include "ibverbs.h" + +static int ibv_icmd_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr, + struct ibv_wq *wq, + struct ibv_command_buffer *link) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_WQ, UVERBS_METHOD_WQ_CREATE, 13, link); + struct verbs_ex_private *priv = get_priv(context); + struct ib_uverbs_attr *handle; + uint32_t create_flags = 0; + uint32_t max_wr; + uint32_t max_sge; + uint32_t wq_num; + int ret; + + wq->context = context; + wq->cq = wq_init_attr->cq; + wq->pd = wq_init_attr->pd; + wq->wq_type = wq_init_attr->wq_type; + + handle = fill_attr_out_obj(cmdb, UVERBS_ATTR_CREATE_WQ_HANDLE); + fill_attr_in_uint64(cmdb, UVERBS_ATTR_CREATE_WQ_USER_HANDLE, (uintptr_t)wq); + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_WQ_PD_HANDLE, wq_init_attr->pd->handle); + fill_attr_in_obj(cmdb, UVERBS_ATTR_CREATE_WQ_CQ_HANDLE, wq_init_attr->cq->handle); + fill_attr_const_in(cmdb, UVERBS_ATTR_CREATE_WQ_TYPE, wq_init_attr->wq_type); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_WQ_MAX_WR, wq_init_attr->max_wr); + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_WQ_MAX_SGE, wq_init_attr->max_sge); + fill_attr_in_fd(cmdb, UVERBS_ATTR_CREATE_WQ_EVENT_FD, wq->context->async_fd); + if (wq_init_attr->comp_mask & IBV_WQ_INIT_ATTR_FLAGS) { + if (wq_init_attr->create_flags & ~(IBV_WQ_FLAGS_RESERVED - 1)) { + errno = EOPNOTSUPP; + return errno; + } + create_flags = wq_init_attr->create_flags; + } + fill_attr_in_uint32(cmdb, UVERBS_ATTR_CREATE_WQ_FLAGS, create_flags); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_WQ_RESP_MAX_WR, &max_wr); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_WQ_RESP_MAX_SGE, &max_sge); + fill_attr_out_ptr(cmdb, UVERBS_ATTR_CREATE_WQ_RESP_WQ_NUM, &wq_num); + + if (priv->imported) + fallback_require_ioctl(cmdb); + fallback_require_ex(cmdb); + + switch (execute_ioctl_fallback(context, create_wq, cmdb, &ret)) { + case TRY_WRITE_EX: { + DECLARE_LEGACY_UHW_BUFS_EX(link, + IB_USER_VERBS_EX_CMD_CREATE_WQ); + + *req = (struct ib_uverbs_ex_create_wq){ + .user_handle = (uintptr_t)wq, + .pd_handle = wq_init_attr->pd->handle, + .cq_handle = wq_init_attr->cq->handle, + .max_wr = wq_init_attr->max_wr, + .max_sge = wq_init_attr->max_sge, + .wq_type = wq_init_attr->wq_type, + .create_flags = wq_init_attr->create_flags, + }; + + ret = execute_write_bufs_ex( + context, IB_USER_VERBS_EX_CMD_CREATE_WQ, req, resp); + if (ret) + return ret; + + wq->handle = resp->wq_handle; + wq_init_attr->max_wr = resp->max_wr; + wq_init_attr->max_sge = resp->max_sge; + wq->wq_num = resp->wqn; + return 0; + } + + case SUCCESS: + break; + + default: + return ret; + } + + wq->handle = read_attr_obj(UVERBS_ATTR_CREATE_WQ_HANDLE, handle); + wq->wq_num = wq_num; + wq_init_attr->max_wr = max_wr; + wq_init_attr->max_sge = max_sge; + + return 0; +} + +int 
ibv_cmd_create_wq(struct ibv_context *context, + struct ibv_wq_init_attr *wq_init_attr, + struct ibv_wq *wq, + struct ibv_create_wq *cmd, + size_t cmd_size, + struct ib_uverbs_ex_create_wq_resp *resp, + size_t resp_size) +{ + DECLARE_CMD_BUFFER_COMPAT(cmdb, UVERBS_OBJECT_WQ, + UVERBS_METHOD_WQ_CREATE, cmd, cmd_size, resp, + resp_size); + + if (wq_init_attr->comp_mask >= IBV_WQ_INIT_ATTR_RESERVED) { + errno = EINVAL; + return errno; + } + + return ibv_icmd_create_wq(context, wq_init_attr, wq, cmdb); +} + +int ibv_cmd_destroy_wq(struct ibv_wq *wq) +{ + DECLARE_FBCMD_BUFFER(cmdb, UVERBS_OBJECT_WQ, UVERBS_METHOD_WQ_DESTROY, 2, + NULL); + struct ib_uverbs_ex_destroy_wq_resp resp; + int ret; + + fill_attr_out_ptr(cmdb, UVERBS_ATTR_DESTROY_WQ_RESP, &resp.events_reported); + fill_attr_in_obj(cmdb, UVERBS_ATTR_DESTROY_WQ_HANDLE, wq->handle); + + switch (execute_ioctl_fallback(wq->context, destroy_wq, cmdb, &ret)) { + case TRY_WRITE: { + struct ibv_destroy_wq req; + + req.core_payload = (struct ib_uverbs_ex_destroy_wq){ + .wq_handle = wq->handle, + }; + + + ret = execute_cmd_write_ex(wq->context, IB_USER_VERBS_EX_CMD_DESTROY_WQ, + &req, sizeof(req), &resp, sizeof(resp)); + break; + } + + default: + break; + } + + if (verbs_is_destroy_err(&ret)) + return ret; + + pthread_mutex_lock(&wq->mutex); + while (wq->events_completed != resp.events_reported) + pthread_cond_wait(&wq->cond, &wq->mutex); + pthread_mutex_unlock(&wq->mutex); + + return 0; +} diff --git a/libibverbs/device.c b/libibverbs/device.c index bc7df1b..e9c429a 100644 --- a/libibverbs/device.c +++ b/libibverbs/device.c @@ -150,6 +150,13 @@ LATEST_SYMVER_FUNC(ibv_get_device_guid, 1_1, "IBVERBS_1.1", return htobe64(guid); } +int ibv_get_device_index(struct ibv_device *device) +{ + struct verbs_sysfs_dev *sysfs_dev = verbs_get_device(device)->sysfs; + + return sysfs_dev->ibdev_idx; +} + int ibv_get_fw_ver(char *value, size_t len, struct verbs_sysfs_dev *sysfs_dev) { /* @@ -256,23 +263,6 @@ int verbs_init_context(struct verbs_context *context_ex, context_ex->context.abi_compat = __VERBS_ABI_IS_EXTENDED; context_ex->sz = sizeof(*context_ex); - /* - * In order to maintain backward/forward binary compatibility - * with apps compiled against libibverbs-1.1.8 that use the - * flow steering addition, we need to set the two - * ABI_placeholder entries to match the driver set flow - * entries. This is because apps compiled against - * libibverbs-1.1.8 use an inline ibv_create_flow and - * ibv_destroy_flow function that looks in the placeholder - * spots for the proper entry points. For apps compiled - * against libibverbs-1.1.9 and later, the inline functions - * will be looking in the right place. - */ - context_ex->ABI_placeholder1 = - (void (*)(void))context_ex->ibv_create_flow; - context_ex->ABI_placeholder2 = - (void (*)(void))context_ex->ibv_destroy_flow; - context_ex->priv = calloc(1, sizeof(*context_ex->priv)); if (!context_ex->priv) { errno = ENOMEM; @@ -330,6 +320,23 @@ static void set_lib_ops(struct verbs_context *vctx) #undef ibv_query_port vctx->context.ops._compat_query_port = ibv_query_port; vctx->query_port = __lib_query_port; + + /* + * In order to maintain backward/forward binary compatibility + * with apps compiled against libibverbs-1.1.8 that use the + * flow steering addition, we need to set the two + * ABI_placeholder entries to match the driver set flow + * entries. 
This is because apps compiled against + * libibverbs-1.1.8 use an inline ibv_create_flow and + * ibv_destroy_flow function that looks in the placeholder + * spots for the proper entry points. For apps compiled + * against libibverbs-1.1.9 and later, the inline functions + * will be looking in the right place. + */ + vctx->ABI_placeholder1 = + (void (*)(void))vctx->ibv_create_flow; + vctx->ABI_placeholder2 = + (void (*)(void))vctx->ibv_destroy_flow; } struct ibv_context *verbs_open_device(struct ibv_device *device, void *private_data) @@ -337,6 +344,7 @@ struct ibv_context *verbs_open_device(struct ibv_device *device, void *private_d struct verbs_device *verbs_device = verbs_get_device(device); int cmd_fd; struct verbs_context *context_ex; + int ret; /* * We'll only be doing writes, but we need O_RDWR in case the @@ -356,6 +364,13 @@ struct ibv_context *verbs_open_device(struct ibv_device *device, void *private_d return NULL; set_lib_ops(context_ex); + if (context_ex->context.async_fd == -1) { + ret = ibv_cmd_alloc_async_fd(&context_ex->context); + if (ret) { + ibv_close_device(&context_ex->context); + return NULL; + } + } return &context_ex->context; } @@ -367,11 +382,74 @@ LATEST_SYMVER_FUNC(ibv_open_device, 1_1, "IBVERBS_1.1", return verbs_open_device(device, NULL); } +struct ibv_context *ibv_import_device(int cmd_fd) +{ + struct verbs_device *verbs_device = NULL; + struct verbs_context *context_ex; + struct ibv_device **dev_list; + struct ibv_context *ctx = NULL; + struct stat st; + int ret; + int i; + + if (fstat(cmd_fd, &st) || !S_ISCHR(st.st_mode)) { + errno = EINVAL; + return NULL; + } + + dev_list = ibv_get_device_list(NULL); + if (!dev_list) { + errno = ENODEV; + return NULL; + } + + for (i = 0; dev_list[i]; ++i) { + if (verbs_get_device(dev_list[i])->sysfs->sysfs_cdev == + st.st_rdev) { + verbs_device = verbs_get_device(dev_list[i]); + break; + } + } + + if (!verbs_device) { + errno = ENODEV; + goto out; + } + + if (!verbs_device->ops->import_context) { + errno = EOPNOTSUPP; + goto out; + } + + /* In case the underlay cdev number was assigned in the meantime to + * other device as of some disassociate flow, the next call on the + * FD will end up with EIO (i.e. query_context command) and we should + * be safe from using the wrong device. 
+ */ + context_ex = verbs_device->ops->import_context(&verbs_device->device, cmd_fd); + if (!context_ex) + goto out; + + set_lib_ops(context_ex); + + context_ex->priv->imported = true; + ctx = &context_ex->context; + ret = ibv_cmd_alloc_async_fd(ctx); + if (ret) { + ibv_close_device(ctx); + ctx = NULL; + } +out: + ibv_free_device_list(dev_list); + return ctx; +} + void verbs_uninit_context(struct verbs_context *context_ex) { free(context_ex->priv); close(context_ex->context.cmd_fd); - close(context_ex->context.async_fd); + if (context_ex->context.async_fd != -1) + close(context_ex->context.async_fd); ibverbs_device_put(context_ex->context.device); } diff --git a/libibverbs/driver.h b/libibverbs/driver.h index a0e6f89..451924d 100644 --- a/libibverbs/driver.h +++ b/libibverbs/driver.h @@ -59,17 +59,8 @@ struct verbs_xrcd { uint32_t handle; }; -enum verbs_srq_mask { - VERBS_SRQ_TYPE = 1 << 0, - VERBS_SRQ_XRCD = 1 << 1, - VERBS_SRQ_CQ = 1 << 2, - VERBS_SRQ_NUM = 1 << 3, - VERBS_SRQ_RESERVED = 1 << 4 -}; - struct verbs_srq { struct ibv_srq srq; - uint32_t comp_mask; enum ibv_srq_type srq_type; struct verbs_xrcd *xrcd; struct ibv_cq *cq; @@ -81,19 +72,27 @@ enum verbs_qp_mask { VERBS_QP_EX = 1 << 1, }; -enum ibv_gid_type { - IBV_GID_TYPE_IB_ROCE_V1, - IBV_GID_TYPE_ROCE_V2, +enum ibv_gid_type_sysfs { + IBV_GID_TYPE_SYSFS_IB_ROCE_V1, + IBV_GID_TYPE_SYSFS_ROCE_V2, +}; + +enum verbs_query_gid_attr_mask { + VERBS_QUERY_GID_ATTR_GID = 1 << 0, + VERBS_QUERY_GID_ATTR_TYPE = 1 << 1, + VERBS_QUERY_GID_ATTR_NDEV_IFINDEX = 1 << 2, }; enum ibv_mr_type { IBV_MR_TYPE_MR, IBV_MR_TYPE_NULL_MR, + IBV_MR_TYPE_IMPORTED_MR, }; struct verbs_mr { struct ibv_mr ibv_mr; enum ibv_mr_type mr_type; + int access; }; static inline struct verbs_mr *verbs_get_mr(struct ibv_mr *mr) @@ -111,6 +110,13 @@ struct verbs_qp { }; static_assert(offsetof(struct ibv_qp_ex, qp_base) == 0, "Invalid qp layout"); +struct verbs_cq { + union { + struct ibv_cq cq; + struct ibv_cq_ex cq_ex; + }; +}; + enum ibv_flow_action_type { IBV_FLOW_ACTION_UNSPECIFIED, IBV_FLOW_ACTION_ESP = 1, @@ -218,6 +224,8 @@ struct verbs_device_ops { struct verbs_context *(*alloc_context)(struct ibv_device *device, int cmd_fd, void *private_data); + struct verbs_context *(*import_context)(struct ibv_device *device, + int cmd_fd); struct verbs_device *(*alloc_device)(struct verbs_sysfs_dev *sysfs_dev); void (*uninit_device)(struct verbs_device *device); @@ -318,6 +326,10 @@ struct verbs_context_ops { void (*free_context)(struct ibv_context *context); int (*free_dm)(struct ibv_dm *dm); int (*get_srq_num)(struct ibv_srq *srq, uint32_t *srq_num); + struct ibv_mr *(*import_mr)(struct ibv_pd *pd, + uint32_t mr_handle); + struct ibv_pd *(*import_pd)(struct ibv_context *context, + uint32_t pd_handle); int (*modify_cq)(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr); int (*modify_flow_action_esp)(struct ibv_flow_action *action, struct ibv_flow_action_esp_attr *attr); @@ -348,6 +360,7 @@ struct verbs_context_ops { const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, size_t attr_size); + int (*query_ece)(struct ibv_qp *qp, struct ibv_ece *ece); int (*query_port)(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr); int (*query_qp)(struct ibv_qp *qp, struct ibv_qp_attr *attr, @@ -368,6 +381,9 @@ struct verbs_context_ops { int (*rereg_mr)(struct verbs_mr *vmr, int flags, struct ibv_pd *pd, void *addr, size_t length, int access); int (*resize_cq)(struct ibv_cq *cq, int cqe); + int (*set_ece)(struct ibv_qp *qp, struct ibv_ece 
*ece); + void (*unimport_mr)(struct ibv_mr *mr); + void (*unimport_pd)(struct ibv_pd *pd); }; static inline struct verbs_device * @@ -431,6 +447,8 @@ struct ibv_context *verbs_open_device(struct ibv_device *device, int ibv_cmd_get_context(struct verbs_context *context, struct ibv_get_context *cmd, size_t cmd_size, struct ib_uverbs_get_context_resp *resp, size_t resp_size); +int ibv_cmd_query_context(struct ibv_context *ctx, + struct ibv_command_buffer *driver); int ibv_cmd_query_device(struct ibv_context *context, struct ibv_device_attr *device_attr, uint64_t *raw_fw_ver, @@ -453,6 +471,7 @@ int ibv_cmd_query_device_ex(struct ibv_context *context, int ibv_cmd_query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr, struct ibv_query_port *cmd, size_t cmd_size); +int ibv_cmd_alloc_async_fd(struct ibv_context *context); int ibv_cmd_alloc_pd(struct ibv_context *context, struct ibv_pd *pd, struct ibv_alloc_pd *cmd, size_t cmd_size, struct ib_uverbs_alloc_pd_resp *resp, size_t resp_size); @@ -474,6 +493,8 @@ int ibv_cmd_rereg_mr(struct verbs_mr *vmr, uint32_t flags, void *addr, size_t cmd_sz, struct ib_uverbs_rereg_mr_resp *resp, size_t resp_sz); int ibv_cmd_dereg_mr(struct verbs_mr *vmr); +int ibv_cmd_query_mr(struct ibv_pd *pd, struct verbs_mr *vmr, + uint32_t mr_handle); int ibv_cmd_advise_mr(struct ibv_pd *pd, enum ibv_advise_mr_advice advice, uint32_t flags, @@ -491,7 +512,7 @@ int ibv_cmd_create_cq(struct ibv_context *context, int cqe, struct ib_uverbs_create_cq_resp *resp, size_t resp_size); int ibv_cmd_create_cq_ex(struct ibv_context *context, struct ibv_cq_init_attr_ex *cq_attr, - struct ibv_cq_ex *cq, + struct verbs_cq *cq, struct ibv_create_cq_ex *cmd, size_t cmd_size, struct ib_uverbs_ex_create_cq_resp *resp, @@ -512,7 +533,7 @@ int ibv_cmd_create_srq(struct ibv_pd *pd, struct ibv_create_srq *cmd, size_t cmd_size, struct ib_uverbs_create_srq_resp *resp, size_t resp_size); int ibv_cmd_create_srq_ex(struct ibv_context *context, - struct verbs_srq *srq, int vsrq_sz, + struct verbs_srq *srq, struct ibv_srq_init_attr_ex *attr_ex, struct ibv_create_xsrq *cmd, size_t cmd_size, struct ib_uverbs_create_srq_resp *resp, size_t resp_size); @@ -530,12 +551,12 @@ int ibv_cmd_create_qp(struct ibv_pd *pd, struct ibv_create_qp *cmd, size_t cmd_size, struct ib_uverbs_create_qp_resp *resp, size_t resp_size); int ibv_cmd_create_qp_ex(struct ibv_context *context, - struct verbs_qp *qp, int vqp_sz, + struct verbs_qp *qp, struct ibv_qp_init_attr_ex *attr_ex, struct ibv_create_qp *cmd, size_t cmd_size, struct ib_uverbs_create_qp_resp *resp, size_t resp_size); int ibv_cmd_create_qp_ex2(struct ibv_context *context, - struct verbs_qp *qp, int vqp_sz, + struct verbs_qp *qp, struct ibv_qp_init_attr_ex *qp_attr, struct ibv_create_qp_ex *cmd, size_t cmd_size, @@ -619,6 +640,11 @@ int ibv_cmd_reg_dm_mr(struct ibv_pd *pd, struct verbs_dm *dm, unsigned int access, struct verbs_mr *vmr, struct ibv_command_buffer *link); +int __ibv_query_gid_ex(struct ibv_context *context, uint32_t port_num, + uint32_t gid_index, struct ibv_gid_entry *entry, + uint32_t flags, size_t entry_size, + uint32_t fallback_attr_mask); + /* * sysfs helper functions */ @@ -633,23 +659,13 @@ int ibv_read_ibdev_sysfs_file(char *buf, size_t size, __attribute__((format(printf, 4, 5))); int ibv_get_fw_ver(char *value, size_t len, struct verbs_sysfs_dev *sysfs_dev); -static inline int verbs_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) -{ - struct verbs_srq *vsrq = container_of(srq, struct verbs_srq, srq); - if 
(vsrq->comp_mask & VERBS_SRQ_NUM) { - *srq_num = vsrq->srq_num; - return 0; - } - return EOPNOTSUPP; -} - static inline bool check_comp_mask(uint64_t input, uint64_t supported) { return (input & ~supported) == 0; } int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num, - unsigned int index, enum ibv_gid_type *type); + unsigned int index, enum ibv_gid_type_sysfs *type); static inline int ibv_check_alloc_parent_domain(struct ibv_parent_domain_init_attr *attr) diff --git a/libibverbs/dummy_ops.c b/libibverbs/dummy_ops.c index 32fec71..e5af9e4 100644 --- a/libibverbs/dummy_ops.c +++ b/libibverbs/dummy_ops.c @@ -287,6 +287,20 @@ static int get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) return EOPNOTSUPP; } +static struct ibv_mr *import_mr(struct ibv_pd *pd, + uint32_t mr_handle) +{ + errno = EOPNOTSUPP; + return NULL; +} + +static struct ibv_pd *import_pd(struct ibv_context *context, + uint32_t pd_handle) +{ + errno = EOPNOTSUPP; + return NULL; +} + static int modify_cq(struct ibv_cq *cq, struct ibv_modify_cq_attr *attr) { return EOPNOTSUPP; @@ -382,11 +396,16 @@ static int query_device_ex(struct ibv_context *context, if (attr_size < sizeof(attr->orig_attr)) return EOPNOTSUPP; - memset(&attr->orig_attr, 0, sizeof(attr->orig_attr)); + memset(attr, 0, attr_size); return ibv_query_device(context, &attr->orig_attr); } +static int query_ece(struct ibv_qp *qp, struct ibv_ece *ece) +{ + return EOPNOTSUPP; +} + static int query_port(struct ibv_context *context, uint8_t port_num, struct ibv_port_attr *port_attr) { @@ -450,6 +469,19 @@ static int resize_cq(struct ibv_cq *cq, int cqe) return EOPNOTSUPP; } +static int set_ece(struct ibv_qp *qp, struct ibv_ece *ece) +{ + return EOPNOTSUPP; +} + +static void unimport_mr(struct ibv_mr *mr) +{ +} + +static void unimport_pd(struct ibv_pd *pd) +{ +} + /* * Ops in verbs_dummy_ops simply return an EOPNOTSUPP error code when called, or * do nothing. 
They are placed in the ops structures if the provider does not @@ -504,6 +536,8 @@ const struct verbs_context_ops verbs_dummy_ops = { free_context, free_dm, get_srq_num, + import_mr, + import_pd, modify_cq, modify_flow_action_esp, modify_qp, @@ -519,6 +553,7 @@ const struct verbs_context_ops verbs_dummy_ops = { post_srq_recv, query_device, query_device_ex, + query_ece, query_port, query_qp, query_rt_values, @@ -529,6 +564,9 @@ const struct verbs_context_ops verbs_dummy_ops = { req_notify_cq, rereg_mr, resize_cq, + set_ece, + unimport_mr, + unimport_pd, }; /* @@ -620,6 +658,8 @@ void verbs_set_ops(struct verbs_context *vctx, SET_PRIV_OP_IC(ctx, free_context); SET_OP(vctx, free_dm); SET_OP(vctx, get_srq_num); + SET_PRIV_OP_IC(vctx, import_mr); + SET_PRIV_OP_IC(vctx, import_pd); SET_OP(vctx, modify_cq); SET_OP(vctx, modify_flow_action_esp); SET_PRIV_OP(ctx, modify_qp); @@ -635,6 +675,7 @@ void verbs_set_ops(struct verbs_context *vctx, SET_OP(ctx, post_srq_recv); SET_PRIV_OP(ctx, query_device); SET_OP(vctx, query_device_ex); + SET_PRIV_OP_IC(vctx, query_ece); SET_PRIV_OP_IC(ctx, query_port); SET_PRIV_OP(ctx, query_qp); SET_OP(vctx, query_rt_values); @@ -645,6 +686,9 @@ void verbs_set_ops(struct verbs_context *vctx, SET_OP(ctx, req_notify_cq); SET_PRIV_OP(ctx, rereg_mr); SET_PRIV_OP(ctx, resize_cq); + SET_PRIV_OP_IC(vctx, set_ece); + SET_PRIV_OP_IC(vctx, unimport_mr); + SET_PRIV_OP_IC(vctx, unimport_pd); #undef SET_OP #undef SET_OP2 diff --git a/libibverbs/examples/devinfo.c b/libibverbs/examples/devinfo.c index f10eb2d..c245217 100644 --- a/libibverbs/examples/devinfo.c +++ b/libibverbs/examples/devinfo.c @@ -164,17 +164,17 @@ static const char *vl_str(uint8_t vl_num) } #define DEVINFO_INVALID_GID_TYPE 2 -static const char *gid_type_str(enum ibv_gid_type type) +static const char *gid_type_str(enum ibv_gid_type_sysfs type) { switch (type) { - case IBV_GID_TYPE_IB_ROCE_V1: return "RoCE v1"; - case IBV_GID_TYPE_ROCE_V2: return "RoCE v2"; + case IBV_GID_TYPE_SYSFS_IB_ROCE_V1: return "RoCE v1"; + case IBV_GID_TYPE_SYSFS_ROCE_V2: return "RoCE v2"; default: return "Invalid gid type"; } } static void print_formated_gid(union ibv_gid *gid, int i, - enum ibv_gid_type type, int ll) + enum ibv_gid_type_sysfs type, int ll) { char gid_str[INET6_ADDRSTRLEN] = {}; char str[20] = {}; @@ -182,7 +182,7 @@ static void print_formated_gid(union ibv_gid *gid, int i, if (ll == IBV_LINK_LAYER_ETHERNET) sprintf(str, ", %s", gid_type_str(type)); - if (type == IBV_GID_TYPE_IB_ROCE_V1) + if (type == IBV_GID_TYPE_SYSFS_IB_ROCE_V1) printf("\t\t\tGID[%3d]:\t\t%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x:%02x%02x%s\n", i, gid->raw[0], gid->raw[1], gid->raw[2], gid->raw[3], gid->raw[4], gid->raw[5], gid->raw[6], @@ -190,7 +190,7 @@ static void print_formated_gid(union ibv_gid *gid, int i, gid->raw[11], gid->raw[12], gid->raw[13], gid->raw[14], gid->raw[15], str); - if (type == IBV_GID_TYPE_ROCE_V2) { + if (type == IBV_GID_TYPE_SYSFS_ROCE_V2) { inet_ntop(AF_INET6, gid->raw, gid_str, sizeof(gid_str)); printf("\t\t\tGID[%3d]:\t\t%s%s\n", i, gid_str, str); } @@ -200,7 +200,7 @@ static int print_all_port_gids(struct ibv_context *ctx, struct ibv_port_attr *port_attr, uint8_t port_num) { - enum ibv_gid_type type; + enum ibv_gid_type_sysfs type; union ibv_gid gid; int tbl_len; int rc = 0; @@ -413,10 +413,10 @@ static void print_tso_caps(const struct ibv_tso_caps *caps) uint32_t unknown_general_caps = ~(1 << IBV_QPT_RAW_PACKET | 1 << IBV_QPT_UD); printf("\ttso_caps:\n"); - printf("\tmax_tso:\t\t\t%d\n", 
caps->max_tso); + printf("\t\tmax_tso:\t\t\t%d\n", caps->max_tso); if (caps->max_tso) { - printf("\tsupported_qp:\n"); + printf("\t\tsupported_qp:\n"); if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_RAW_PACKET)) printf("\t\t\t\t\tSUPPORT_RAW_PACKET\n"); if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_UD)) @@ -495,7 +495,7 @@ static void print_raw_packet_caps(uint32_t raw_packet_caps) static int print_hca_cap(struct ibv_device *ib_dev, uint8_t ib_port) { struct ibv_context *ctx; - struct ibv_device_attr_ex device_attr; + struct ibv_device_attr_ex device_attr = {}; struct ibv_port_attr port_attr; int rc = 0; uint8_t port; diff --git a/libibverbs/ibdev_nl.c b/libibverbs/ibdev_nl.c index b459f0b..d28facb 100644 --- a/libibverbs/ibdev_nl.c +++ b/libibverbs/ibdev_nl.c @@ -206,7 +206,8 @@ int find_sysfs_devs_nl(struct list_head *tmp_sysfs_dev_list) goto err; list_for_each_safe (tmp_sysfs_dev_list, dev, dev_tmp, entry) { - if (find_uverbs_nl(nl, dev) && find_uverbs_sysfs(dev)) { + if ((find_uverbs_nl(nl, dev) && find_uverbs_sysfs(dev)) || + try_access_device(dev)) { list_del(&dev->entry); free(dev); } diff --git a/libibverbs/ibverbs.h b/libibverbs/ibverbs.h index 4b9b88f..5e60ace 100644 --- a/libibverbs/ibverbs.h +++ b/libibverbs/ibverbs.h @@ -74,6 +74,7 @@ struct verbs_ex_private { uint32_t driver_id; bool use_ioctl_write; struct verbs_context_ops ops; + bool imported; }; static inline struct verbs_ex_private *get_priv(struct ibv_context *ctx) @@ -90,4 +91,6 @@ enum ibv_node_type decode_knode_type(unsigned int knode_type); int find_sysfs_devs_nl(struct list_head *tmp_sysfs_dev_list); +int try_access_device(const struct verbs_sysfs_dev *sysfs_dev); + #endif /* IB_VERBS_H */ diff --git a/libibverbs/init.c b/libibverbs/init.c index 7bac5af..f5340ea 100644 --- a/libibverbs/init.c +++ b/libibverbs/init.c @@ -64,7 +64,7 @@ struct ibv_driver { static LIST_HEAD(driver_list); -static int try_access_device(const struct verbs_sysfs_dev *sysfs_dev) +int try_access_device(const struct verbs_sysfs_dev *sysfs_dev) { struct stat cdev_stat; char *devpath; diff --git a/libibverbs/kern-abi.h b/libibverbs/kern-abi.h index dc2f33d..1238569 100644 --- a/libibverbs/kern-abi.h +++ b/libibverbs/kern-abi.h @@ -308,4 +308,15 @@ struct ibv_create_srq_resp_v5 { __u32 srq_handle; }; +#define _STRUCT_ib_uverbs_create_srq_v5 +enum { IB_USER_VERBS_CMD_CREATE_SRQ_V5 = IB_USER_VERBS_CMD_CREATE_SRQ }; +DECLARE_CMDX(IB_USER_VERBS_CMD_CREATE_SRQ_V5, ibv_create_srq_v5, ib_uverbs_create_srq, ibv_create_srq_resp_v5); + +#define _STRUCT_ib_uverbs_create_qp_v4 +enum { IB_USER_VERBS_CMD_CREATE_QP_V4 = IB_USER_VERBS_CMD_CREATE_QP }; +DECLARE_CMDX(IB_USER_VERBS_CMD_CREATE_QP_V4, ibv_create_qp_v4, ib_uverbs_create_qp, ibv_create_qp_resp_v4); + +#define _STRUCT_ib_uverbs_create_qp_v3 +enum { IB_USER_VERBS_CMD_CREATE_QP_V3 = IB_USER_VERBS_CMD_CREATE_QP }; +DECLARE_CMDX(IB_USER_VERBS_CMD_CREATE_QP_V3, ibv_create_qp_v3, ib_uverbs_create_qp, ibv_create_qp_resp_v3); #endif /* KERN_ABI_H */ diff --git a/libibverbs/libibverbs.map.in b/libibverbs/libibverbs.map.in index 5280cfe..7429016 100644 --- a/libibverbs/libibverbs.map.in +++ b/libibverbs/libibverbs.map.in @@ -126,6 +126,28 @@ IBVERBS_1.8 { ibv_reg_mr_iova2; } IBVERBS_1.7; +IBVERBS_1.9 { + global: + ibv_get_device_index; +} IBVERBS_1.8; + +IBVERBS_1.10 { + global: + ibv_import_device; + ibv_import_mr; + ibv_import_pd; + ibv_query_ece; + ibv_set_ece; + ibv_unimport_mr; + ibv_unimport_pd; +} IBVERBS_1.9; + +IBVERBS_1.11 { + global: + _ibv_query_gid_ex; + _ibv_query_gid_table; +} 
IBVERBS_1.10;
+
 /* If any symbols in this stanza change ABI then the entire stanza gets a
    new symbol version. See the top level CMakeLists.txt for this setting. */
@@ -180,8 +202,10 @@ IBVERBS_PRIVATE_@IBVERBS_PABI_VERSION@ {
 		ibv_cmd_post_recv;
 		ibv_cmd_post_send;
 		ibv_cmd_post_srq_recv;
+		ibv_cmd_query_context;
 		ibv_cmd_query_device;
 		ibv_cmd_query_device_ex;
+		ibv_cmd_query_mr;
 		ibv_cmd_query_port;
 		ibv_cmd_query_qp;
 		ibv_cmd_query_srq;
diff --git a/libibverbs/man/CMakeLists.txt b/libibverbs/man/CMakeLists.txt
index e1d5edf..1fb5ac1 100644
--- a/libibverbs/man/CMakeLists.txt
+++ b/libibverbs/man/CMakeLists.txt
@@ -32,10 +32,14 @@ rdma_man_pages(
   ibv_get_async_event.3
   ibv_get_cq_event.3
   ibv_get_device_guid.3.md
+  ibv_get_device_index.3.md
   ibv_get_device_list.3.md
   ibv_get_device_name.3.md
   ibv_get_pkey_index.3.md
   ibv_get_srq_num.3.md
+  ibv_import_device.3.md
+  ibv_import_mr.3.md
+  ibv_import_pd.3.md
   ibv_inc_rkey.3.md
   ibv_modify_qp.3
   ibv_modify_qp_rate_limit.3
@@ -51,7 +55,10 @@ rdma_man_pages(
   ibv_post_srq_recv.3
   ibv_query_device.3
   ibv_query_device_ex.3
+  ibv_query_ece.3.md
   ibv_query_gid.3.md
+  ibv_query_gid_ex.3.md
+  ibv_query_gid_table.3.md
   ibv_query_pkey.3.md
   ibv_query_port.3
   ibv_query_qp.3
@@ -65,6 +72,7 @@ rdma_man_pages(
   ibv_req_notify_cq.3.md
   ibv_rereg_mr.3.md
   ibv_resize_cq.3.md
+  ibv_set_ece.3.md
   ibv_srq_pingpong.1
   ibv_uc_pingpong.1
   ibv_ud_pingpong.1
@@ -97,6 +105,8 @@ rdma_alias_man_pages(
   ibv_get_async_event.3 ibv_ack_async_event.3
   ibv_get_cq_event.3 ibv_ack_cq_events.3
   ibv_get_device_list.3 ibv_free_device_list.3
+  ibv_import_pd.3 ibv_unimport_pd.3
+  ibv_import_mr.3 ibv_unimport_mr.3
   ibv_open_device.3 ibv_close_device.3
   ibv_open_xrcd.3 ibv_close_xrcd.3
   ibv_rate_to_mbps.3 mbps_to_ibv_rate.3
diff --git a/libibverbs/man/ibv_advise_mr.3.md b/libibverbs/man/ibv_advise_mr.3.md
index 5794b68..73c7f21 100644
--- a/libibverbs/man/ibv_advise_mr.3.md
+++ b/libibverbs/man/ibv_advise_mr.3.md
@@ -45,6 +45,10 @@ application performance.
 : Like IBV_ADVISE_MR_ADVICE_PREFETCH but with read-access and write-access
   permission to the fetched memory.
 
+*IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT*
+: Pre-fetch a range of an on-demand paging MR without faulting.
+  This allows pages that are already present in the CPU to become present to the device.
+
 # ARGUMENTS
 *pd*
 : The protection domain (PD) associated with the MR.
diff --git a/libibverbs/man/ibv_create_wq.3 b/libibverbs/man/ibv_create_wq.3
index 10fe965..4299ede 100644
--- a/libibverbs/man/ibv_create_wq.3
+++ b/libibverbs/man/ibv_create_wq.3
@@ -6,7 +6,7 @@
 ibv_create_wq, ibv_destroy_wq \- create or destroy a Work Queue (WQ).
 .SH "SYNOPSIS"
 .nf
-.B #include
+.B #include
 .sp
 .BI "struct ibv_wq *ibv_create_wq(struct ibv_context " "*context,"
 .BI " struct ibv_wq_init_attr " "*wq_init_attr" );
diff --git a/libibverbs/man/ibv_get_device_guid.3.md b/libibverbs/man/ibv_get_device_guid.3.md
index 683900f..376c787 100644
--- a/libibverbs/man/ibv_get_device_guid.3.md
+++ b/libibverbs/man/ibv_get_device_guid.3.md
@@ -22,7 +22,7 @@ uint64_t ibv_get_device_guid(struct ibv_device *device);
 
 # DESCRIPTION
 
-**ibv_get_device_name()** returns the Global Unique IDentifier (GUID) of the
+**ibv_get_device_guid()** returns the Global Unique IDentifier (GUID) of the
 RDMA device *device*.
 
 # RETURN VALUE
@@ -31,7 +31,7 @@ RDMA device *device*.
 order.
 
 # SEE ALSO
-
+**ibv_get_device_index**(3),
 **ibv_get_device_list**(3),
 **ibv_get_device_name**(3),
 **ibv_open_device**(3)
diff --git a/libibverbs/man/ibv_get_device_index.3.md b/libibverbs/man/ibv_get_device_index.3.md
new file mode 100644
index 0000000..69f00c4
--- /dev/null
+++ b/libibverbs/man/ibv_get_device_index.3.md
@@ -0,0 +1,40 @@
+---
+date: '2020-04-22'
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_GET_DEVICE_INDEX
+---
+
+# NAME
+
+ibv_get_device_index - get an RDMA device index
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+int ibv_get_device_index(struct ibv_device *device);
+```
+
+# DESCRIPTION
+
+**ibv_get_device_index()** returns the stable IB device index as assigned by the kernel.
+
+# RETURN VALUE
+
+**ibv_get_device_index()** returns an index, or -1 if the kernel doesn't support device indexes.
+
+# SEE ALSO
+
+**ibv_get_device_name**(3),
+**ibv_get_device_guid**(3),
+**ibv_get_device_list**(3),
+**ibv_open_device**(3)
+
+# AUTHOR
+
+Leon Romanovsky
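The following minimal caller is an illustrative sketch, not part of the patch; it assumes only the verb documented above together with **ibv_get_device_list**(3):

```c
#include <stdio.h>
#include <infiniband/verbs.h>

/* Illustrative only: print each RDMA device with its kernel-assigned index. */
int main(void)
{
	struct ibv_device **list = ibv_get_device_list(NULL);
	int i;

	if (!list)
		return 1;

	for (i = 0; list[i]; i++)
		/* -1 means the kernel does not report device indexes */
		printf("%s: index %d\n", ibv_get_device_name(list[i]),
		       ibv_get_device_index(list[i]));

	ibv_free_device_list(list);
	return 0;
}
```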
diff --git a/libibverbs/man/ibv_get_device_list.3.md b/libibverbs/man/ibv_get_device_list.3.md
index 3d222f6..8f3995e 100644
--- a/libibverbs/man/ibv_get_device_list.3.md
+++ b/libibverbs/man/ibv_get_device_list.3.md
@@ -88,6 +88,7 @@ recommended.
 **ibv_fork_init**(3),
 **ibv_get_device_guid**(3),
 **ibv_get_device_name**(3),
+**ibv_get_device_index**(3),
 **ibv_open_device**(3)
 
 # AUTHOR
diff --git a/libibverbs/man/ibv_get_srq_num.3.md b/libibverbs/man/ibv_get_srq_num.3.md
index f015b9e..9140a37 100644
--- a/libibverbs/man/ibv_get_srq_num.3.md
+++ b/libibverbs/man/ibv_get_srq_num.3.md
@@ -23,7 +23,7 @@ int ibv_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num);
 
 # DESCRIPTION
 
-**ibv_get_srq_num()** return srq number associated with the given shared
+**ibv_get_srq_num()** returns the SRQ number associated with the given XRC shared
 receive queue The argument *srq* is an ibv_srq struct, as defined in
 <infiniband/verbs.h>. *srq_num* is an output parameter that holds the returned
 srq number.
diff --git a/libibverbs/man/ibv_import_device.3.md b/libibverbs/man/ibv_import_device.3.md
new file mode 100644
index 0000000..df205dc
--- /dev/null
+++ b/libibverbs/man/ibv_import_device.3.md
@@ -0,0 +1,48 @@
+---
+date: 2020-5-3
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: ibv_import_device
+---
+
+# NAME
+
+ibv_import_device - import a device from a given command FD
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+struct ibv_context *ibv_import_device(int cmd_fd);
+
+```
+
+# DESCRIPTION
+
+**ibv_import_device()** returns an *ibv_context* pointer that is associated with the given
+*cmd_fd*.
+
+The *cmd_fd* is obtained from the ibv_context cmd_fd member, which must be dup'd
+(e.g. by dup(), SCM_RIGHTS, etc.) before being passed to ibv_import_device().
+
+Once use of the *ibv_context* has ended, *ibv_close_device()* should be called.
+This call undoes the import, cleaning up whatever is needed, including closing
+the command FD.
+
+# RETURN VALUE
+
+**ibv_import_device()** returns a pointer to the allocated RDMA context, or NULL if the request fails.
+
+# SEE ALSO
+
+**ibv_open_device**(3),
+**ibv_close_device**(3)
+
+# AUTHOR
+
+Yishai Hadas
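An illustrative sketch of the importing side (not part of the patch; how *cmd_fd* crosses the process boundary, e.g. via SCM_RIGHTS, is an assumption of the example):

```c
#include <unistd.h>
#include <infiniband/verbs.h>

/* Importing side: 'cmd_fd' is assumed to be a dup of the exporting
 * process's ibv_context cmd_fd, received e.g. over SCM_RIGHTS. */
static struct ibv_context *open_imported(int cmd_fd)
{
	struct ibv_context *ctx = ibv_import_device(cmd_fd);

	if (!ctx) {
		close(cmd_fd);	/* import failed; the FD is still ours */
		return NULL;
	}

	/* ctx now owns cmd_fd; ibv_close_device(ctx) will close it */
	return ctx;
}
```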
diff --git a/libibverbs/man/ibv_import_mr.3.md b/libibverbs/man/ibv_import_mr.3.md
new file mode 100644
index 0000000..f6346e4
--- /dev/null
+++ b/libibverbs/man/ibv_import_mr.3.md
@@ -0,0 +1,64 @@
+---
+date: 2020-5-3
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: ibv_import_mr ibv_unimport_mr
+---
+
+# NAME
+
+ibv_import_mr - import an MR from a given ibv_pd
+
+ibv_unimport_mr - unimport an MR
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+struct ibv_mr *ibv_import_mr(struct ibv_pd *pd, uint32_t mr_handle);
+void ibv_unimport_mr(struct ibv_mr *mr);
+
+```
+
+# DESCRIPTION
+
+**ibv_import_mr()** returns a memory region (MR) that is associated with the given
+*mr_handle* in the RDMA context associated with the given *pd*.
+
+The input *mr_handle* value must be a valid kernel handle for an MR object in the
+associated RDMA context. It can be obtained from the original MR via its
+ibv_mr->handle member.
+
+**ibv_unimport_mr()** unimports the MR.
+Once use of the MR has ended, either ibv_dereg_mr() or ibv_unimport_mr() should be called.
+The first goes to the kernel to destroy the object, while the second only cleans up
+whatever the import created, without calling the kernel.
+
+It is the responsibility of the application to coordinate between all ibv_context(s)
+that use this MR. Once the object has been destroyed, no other process can touch it
+except to unimport it. All users of the context must collaborate to ensure this.
+
+# RETURN VALUE
+
+**ibv_import_mr()** returns a pointer to the allocated MR, or NULL if the request fails.
+
+# NOTES
+
+The *addr* field in the imported MR is not applicable; a NULL value is expected.
+
+# SEE ALSO
+
+**ibv_reg_mr**(3),
+**ibv_reg_dm_mr**(3),
+**ibv_reg_mr_iova**(3),
+**ibv_reg_mr_iova2**(3),
+**ibv_dereg_mr**(3)
+
+# AUTHOR
+
+Yishai Hadas
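An illustrative sketch of the shared-MR flow (not part of the patch; the helper name and the IPC that delivers the handles are assumptions, and **ibv_import_pd**(3) is documented next):

```c
#include <errno.h>
#include <infiniband/verbs.h>

/* 'pd_handle' and 'mr_handle' are the exporter's ibv_pd->handle and
 * ibv_mr->handle values, passed over some IPC channel together with
 * the dup'd cmd_fd already used for ibv_import_device(). */
static int use_imported_mr(struct ibv_context *ctx, uint32_t pd_handle,
			   uint32_t mr_handle)
{
	struct ibv_pd *pd = ibv_import_pd(ctx, pd_handle);
	struct ibv_mr *mr;

	if (!pd)
		return errno;

	mr = ibv_import_mr(pd, mr_handle);
	if (!mr) {
		int err = errno;

		ibv_unimport_pd(pd);
		return err;
	}

	/* mr->lkey and mr->rkey are usable; mr->addr is NULL when imported */

	ibv_unimport_mr(mr);	/* local cleanup only; no kernel call */
	ibv_unimport_pd(pd);
	return 0;
}
```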
diff --git a/libibverbs/man/ibv_import_pd.3.md b/libibverbs/man/ibv_import_pd.3.md
new file mode 100644
index 0000000..084bbfe
--- /dev/null
+++ b/libibverbs/man/ibv_import_pd.3.md
@@ -0,0 +1,59 @@
+---
+date: 2020-5-3
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: ibv_import_pd, ibv_unimport_pd
+---
+
+# NAME
+
+ibv_import_pd - import a PD from a given ibv_context
+
+ibv_unimport_pd - unimport a PD
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+struct ibv_pd *ibv_import_pd(struct ibv_context *context, uint32_t pd_handle);
+void ibv_unimport_pd(struct ibv_pd *pd);
+
+```
+
+# DESCRIPTION
+
+**ibv_import_pd()** returns a protection domain (PD) that is associated with the given
+*pd_handle* in the given *context*.
+
+The input *pd_handle* value must be a valid kernel handle for a PD object in the
+given *context*. It can be obtained from the original PD via its ibv_pd->handle
+member.
+
+The returned *ibv_pd* can be used in all verbs that take a protection domain.
+
+**ibv_unimport_pd()** unimports the PD.
+Once use of the PD has ended, either ibv_dealloc_pd() or ibv_unimport_pd() should be called.
+The first goes to the kernel to destroy the object, while the second only cleans up
+whatever the import created, without calling the kernel.
+
+It is the responsibility of the application to coordinate between all ibv_context(s)
+that use this PD. Once the object has been destroyed, no other process can touch it
+except to unimport it. All users of the context must collaborate to ensure this.
+
+# RETURN VALUE
+
+**ibv_import_pd()** returns a pointer to the allocated PD, or NULL if the request fails.
+
+# SEE ALSO
+
+**ibv_alloc_pd**(3),
+**ibv_dealloc_pd**(3)
+
+# AUTHOR
+
+Yishai Hadas
diff --git a/libibverbs/man/ibv_query_ece.3.md b/libibverbs/man/ibv_query_ece.3.md
new file mode 100644
index 0000000..9b6f19d
--- /dev/null
+++ b/libibverbs/man/ibv_query_ece.3.md
@@ -0,0 +1,75 @@
+---
+date: 2020-01-22
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_QUERY_ECE
+---
+
+# NAME
+
+ibv_query_ece - query ECE options.
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+int ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece);
+```
+
+# DESCRIPTION
+
+**ibv_query_ece()** queries ECE options.
+
+It returns the current ECE state for the QP to the user.
+
+# ARGUMENTS
+*qp*
+: The queue pair (QP) associated with the ECE options.
+
+## *ece* Argument
+: The ECE values.
+
+```c
+struct ibv_ece {
+	uint32_t vendor_id;
+	uint32_t options;
+	uint32_t comp_mask;
+};
+```
+
+*vendor_id*
+: Unique identifier of the provider vendor on the network.
+  Providers set their IEEE OUI here to distinguish themselves
+  in non-homogeneous networks.
+
+*options*
+: Provider-specific attributes which are supported.
+
+*comp_mask*
+: Bitmask specifying what fields in the structure are valid.
+
+# RETURN VALUE
+
+**ibv_query_ece()** returns 0 when the call was successful, or the errno value
+ which indicates the failure reason.
+
+*EOPNOTSUPP*
+: libibverbs or provider driver doesn't support the ibv_query_ece() verb.
+
+*EINVAL*
+: In one of the following:
+  o The QP is invalid.
+  o The ECE options are invalid.
+
+# SEE ALSO
+
+**ibv_set_ece**(3)
+
+# AUTHOR
+
+Leon Romanovsky
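An illustrative sketch of one negotiation step built on this verb and **ibv_set_ece**(3), documented below (not part of the patch; intersecting the option bits of a matching vendor is one plausible policy, not something the API mandates):

```c
#include <errno.h>
#include <infiniband/verbs.h>

/* Intersect the peer's accepted ECE options with the local ones and
 * apply the result before the next modify-QP call. */
static int apply_peer_ece(struct ibv_qp *qp, const struct ibv_ece *peer)
{
	struct ibv_ece local = {};
	int ret;

	ret = ibv_query_ece(qp, &local);
	if (ret == EOPNOTSUPP)
		return 0;	/* no ECE support; nothing to apply */
	if (ret)
		return ret;

	if (peer->vendor_id != local.vendor_id)
		return 0;	/* options from another vendor do not apply */

	local.options &= peer->options;
	return ibv_set_ece(qp, &local);	/* accepted options come back in local */
}
```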
diff --git a/libibverbs/man/ibv_query_gid_ex.3.md b/libibverbs/man/ibv_query_gid_ex.3.md
new file mode 100644
index 0000000..9e14f01
--- /dev/null
+++ b/libibverbs/man/ibv_query_gid_ex.3.md
@@ -0,0 +1,93 @@
+---
+date: 2020-04-24
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_QUERY_GID_EX
+---
+
+# NAME
+
+ibv_query_gid_ex - Query an InfiniBand port's GID table entry
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+int ibv_query_gid_ex(struct ibv_context *context,
+                     uint32_t port_num,
+                     uint32_t gid_index,
+                     struct ibv_gid_entry *entry,
+                     uint32_t flags);
+```
+
+# DESCRIPTION
+
+**ibv_query_gid_ex()** returns the GID entry at *entry* for
+*gid_index* of port *port_num* for device context *context*.
+
+# ARGUMENTS
+
+*context*
+: The context of the device to query.
+
+*port_num*
+: The number of the port whose GID table is queried.
+
+*gid_index*
+: The index of the GID table entry to query.
+
+## *entry* Argument
+: An ibv_gid_entry struct, as defined in <infiniband/verbs.h>.
+```c
+struct ibv_gid_entry {
+	union ibv_gid gid;
+	uint32_t gid_index;
+	uint32_t port_num;
+	uint32_t gid_type;
+	uint32_t ndev_ifindex;
+};
+```
+
+ *gid*
+: The GID entry.
+
+ *gid_index*
+: The GID table index of this entry.
+
+ *port_num*
+: The port number that this GID belongs to.
+
+ *gid_type*
+: enum ibv_gid_type, can be one of IBV_GID_TYPE_IB, IBV_GID_TYPE_ROCE_V1 or IBV_GID_TYPE_ROCE_V2.
+
+ *ndev_ifindex*
+: The interface index of the net device associated with this GID.
+  It is 0 if there is no net device associated with it.
+
+*flags*
+: Extra fields to query beyond *ndev_ifindex*; for now, must be 0.
+
+# RETURN VALUE
+
+**ibv_query_gid_ex()** returns 0 on success or an errno value on error.
+
+# ERRORS
+
+ENODATA
+: *gid_index* is within the GID table size of port *port_num* but there is no data at this index.
+
+# SEE ALSO
+
+**ibv_open_device**(3),
+**ibv_query_device**(3),
+**ibv_query_pkey**(3),
+**ibv_query_port**(3),
+**ibv_query_gid_table**(3)
+
+# AUTHOR
+
+Parav Pandit
diff --git a/libibverbs/man/ibv_query_gid_table.3.md b/libibverbs/man/ibv_query_gid_table.3.md
new file mode 100644
index 0000000..e10f51c
--- /dev/null
+++ b/libibverbs/man/ibv_query_gid_table.3.md
@@ -0,0 +1,73 @@
+---
+date: 2020-04-24
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_QUERY_GID_TABLE
+---
+
+# NAME
+
+ibv_query_gid_table - query an InfiniBand device's GID table
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+ssize_t ibv_query_gid_table(struct ibv_context *context,
+                            struct ibv_gid_entry *entries,
+                            size_t max_entries,
+                            uint32_t flags);
+```
+
+# DESCRIPTION
+
+**ibv_query_gid_table()** returns the valid GID table entries of the RDMA
+device context *context* at the pointer *entries*.
+
+A caller must allocate an *entries* array for the GID table entries it
+desires to query. This API returns only valid GID table entries.
+
+A caller must pass a non-zero number of entries in *max_entries* that
+corresponds to the size of the *entries* array.
+
+The *entries* array must be allocated such that it can contain all the valid
+GID table entries of the device. If there are more valid GID entries than
+the provided value of *max_entries* and *entries* array, the call will fail.
+For example, if an RDMA device *context* has a total of 10 valid
+GID entries, *entries* should be allocated for at least 10 entries, and
+*max_entries* should be set appropriately.
+
+# ARGUMENTS
+
+*context*
+: The context of the device to query.
+
+*entries*
+: Array of ibv_gid_entry structs where the GID entries are returned.
+  Please see **ibv_query_gid_ex**(3) man page for *ibv_gid_entry*.
+
+*max_entries*
+: Maximum number of entries that can be returned.
+
+*flags*
+: Extra fields to query beyond *entries->ndev_ifindex*; for now, must be 0.
+
+# RETURN VALUE
+
+**ibv_query_gid_table()** returns the number of entries that were read on
+success, or a negative errno value on error. The number of entries returned
+is at most *max_entries*.
+
+# SEE ALSO
+
+**ibv_open_device**(3),
+**ibv_query_device**(3),
+**ibv_query_port**(3),
+**ibv_query_gid_ex**(3)
+
+# AUTHOR
+
+Parav Pandit
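An illustrative sketch of a caller (not part of the patch; *MAX_GID_ENTRIES* is an assumed bound, which a real application could derive from *ibv_port_attr.gid_tbl_len* summed over all ports):

```c
#include <stdio.h>
#include <infiniband/verbs.h>

#define MAX_GID_ENTRIES 256	/* assumed upper bound; see note above */

/* Dump every valid GID entry of a device. */
static int dump_gid_table(struct ibv_context *ctx)
{
	struct ibv_gid_entry entries[MAX_GID_ENTRIES];
	ssize_t n, i;

	n = ibv_query_gid_table(ctx, entries, MAX_GID_ENTRIES, 0);
	if (n < 0)
		return (int)-n;	/* negative errno value */

	for (i = 0; i < n; i++)
		printf("port %u gid_index %u type %u ndev_ifindex %u\n",
		       entries[i].port_num, entries[i].gid_index,
		       entries[i].gid_type, entries[i].ndev_ifindex);
	return 0;
}
```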
diff --git a/libibverbs/man/ibv_set_ece.3.md b/libibverbs/man/ibv_set_ece.3.md
new file mode 100644
index 0000000..d7910c8
--- /dev/null
+++ b/libibverbs/man/ibv_set_ece.3.md
@@ -0,0 +1,78 @@
+---
+date: 2020-01-22
+footer: libibverbs
+header: "Libibverbs Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: IBV_SET_ECE
+---
+
+# NAME
+
+ibv_set_ece - set ECE options and use them for QP configuration stage.
+
+# SYNOPSIS
+
+```c
+#include <infiniband/verbs.h>
+
+int ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece);
+```
+
+# DESCRIPTION
+
+**ibv_set_ece()** sets ECE options to be used during the QP configuration stage.
+
+The desired ECE options will be used during the various modify QP stages,
+based on the options supported in the relevant QP state.
+
+# ARGUMENTS
+*qp*
+: The queue pair (QP) associated with the ECE options.
+
+## *ece* Argument
+: The requested ECE values. This is an IN/OUT field; the accepted options
+  will be returned in it.
+
+```c
+struct ibv_ece {
+	uint32_t vendor_id;
+	uint32_t options;
+	uint32_t comp_mask;
+};
+```
+
+*vendor_id*
+: Unique identifier of the provider vendor on the network.
+  Providers set their IEEE OUI here to distinguish themselves
+  in non-homogeneous networks.
+
+*options*
+: Provider-specific attributes which are supported, or which need
+  to be enabled, by ECE users.
+
+*comp_mask*
+: Bitmask specifying what fields in the structure are valid.
+
+# RETURN VALUE
+
+**ibv_set_ece()** returns 0 when the call was successful, or the errno value
+ which indicates the failure reason.
+
+*EOPNOTSUPP*
+: libibverbs or provider driver doesn't support the ibv_set_ece() verb.
+
+*EINVAL*
+: In one of the following:
+  o The QP is invalid.
+  o The ECE options are invalid.
+
+# SEE ALSO
+
+**ibv_query_ece**(3)
+
+# AUTHOR
+
+Leon Romanovsky
diff --git a/libibverbs/memory.c b/libibverbs/memory.c
index 2b1c1ae..5fa138e 100644
--- a/libibverbs/memory.c
+++ b/libibverbs/memory.c
@@ -587,6 +587,28 @@ static struct ibv_mem_node *undo_node(struct ibv_mem_node *node,
 	return node;
 }
 
+static int do_madvise(void *addr, size_t length, int advice,
+		      unsigned long range_page_size)
+{
+	int ret;
+	void *p;
+
+	ret = madvise(addr, length, advice);
+
+	if (!ret || advice == MADV_DONTFORK)
+		return ret;
+
+	if (length > range_page_size) {
+		/* if MADV_DOFORK failed we will try to remove VM_DONTCOPY
+		 * flag from each page
+		 */
+		for (p = addr; p < addr + length; p += range_page_size)
+			madvise(p, range_page_size, MADV_DOFORK);
+	}
+
+	return 0;
+}
+
 static int ibv_madvise_range(void *base, size_t size, int advice)
 {
 	uintptr_t start, end;
@@ -640,12 +662,13 @@ again:
 			 * and that may lead to a spurious failure.
 			 */
 			if (start > node->start)
-				ret = madvise((void *) start, node->end - start + 1,
-					      advice);
+				ret = do_madvise((void *) start,
+						 node->end - start + 1,
+						 advice, range_page_size);
 			else
-				ret = madvise((void *) node->start,
-					      node->end - node->start + 1,
-					      advice);
+				ret = do_madvise((void *) node->start,
+						 node->end - node->start + 1,
+						 advice, range_page_size);
 
 			if (ret) {
 				node = undo_node(node, start, inc);
@@ -656,10 +679,7 @@ again:
 				rolling_back = 1;
 				advice = advice == MADV_DONTFORK ?
MADV_DOFORK : MADV_DONTFORK; - tmp = __mm_prev(node); - if (!tmp || start > tmp->end) - goto out; - end = tmp->end; + end = node->end; goto again; } } diff --git a/libibverbs/verbs.c b/libibverbs/verbs.c index 629f24c..7fc1024 100644 --- a/libibverbs/verbs.c +++ b/libibverbs/verbs.c @@ -221,21 +221,20 @@ LATEST_SYMVER_FUNC(ibv_query_gid, 1_1, "IBVERBS_1.1", struct ibv_context *context, uint8_t port_num, int index, union ibv_gid *gid) { - struct verbs_device *verbs_device = verbs_get_device(context->device); - char attr[41]; - uint16_t val; - int i; + struct ibv_gid_entry entry = {}; + int ret; - if (ibv_read_ibdev_sysfs_file(attr, sizeof(attr), verbs_device->sysfs, - "ports/%d/gids/%d", port_num, index) < 0) + ret = __ibv_query_gid_ex(context, port_num, index, &entry, 0, + sizeof(entry), VERBS_QUERY_GID_ATTR_GID); + /* Preserve API behavior for empty GID */ + if (ret == ENODATA) { + memset(gid, 0, sizeof(*gid)); + return 0; + } + if (ret) return -1; - for (i = 0; i < 8; ++i) { - if (sscanf(attr + i * 5, "%hx", &val) != 1) - return -1; - gid->raw[i * 2 ] = val >> 8; - gid->raw[i * 2 + 1] = val & 0xff; - } + memcpy(gid, &entry.gid, sizeof(entry.gid)); return 0; } @@ -296,60 +295,76 @@ LATEST_SYMVER_FUNC(ibv_dealloc_pd, 1_1, "IBVERBS_1.1", return get_ops(pd->context)->dealloc_pd(pd); } -#undef ibv_reg_mr -LATEST_SYMVER_FUNC(ibv_reg_mr, 1_1, "IBVERBS_1.1", - struct ibv_mr *, - struct ibv_pd *pd, void *addr, - size_t length, int access) +struct ibv_mr *ibv_reg_mr_iova2(struct ibv_pd *pd, void *addr, size_t length, + uint64_t iova, unsigned int access) { + struct verbs_device *device = verbs_get_device(pd->context->device); + bool odp_mr = access & IBV_ACCESS_ON_DEMAND; struct ibv_mr *mr; - if (ibv_dontfork_range(addr, length)) + if (!(device->core_support & IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS)) + access &= ~IBV_ACCESS_OPTIONAL_RANGE; + + if (!odp_mr && ibv_dontfork_range(addr, length)) return NULL; - mr = get_ops(pd->context)->reg_mr(pd, addr, length, (uintptr_t) addr, - access); + mr = get_ops(pd->context)->reg_mr(pd, addr, length, iova, access); if (mr) { mr->context = pd->context; mr->pd = pd; mr->addr = addr; mr->length = length; - } else - ibv_dofork_range(addr, length); + } else { + if (!odp_mr) + ibv_dofork_range(addr, length); + } return mr; } +#undef ibv_reg_mr +LATEST_SYMVER_FUNC(ibv_reg_mr, 1_1, "IBVERBS_1.1", + struct ibv_mr *, + struct ibv_pd *pd, void *addr, + size_t length, int access) +{ + return ibv_reg_mr_iova2(pd, addr, length, (uintptr_t)addr, access); +} + #undef ibv_reg_mr_iova struct ibv_mr *ibv_reg_mr_iova(struct ibv_pd *pd, void *addr, size_t length, uint64_t iova, int access) { - struct ibv_mr *mr; + return ibv_reg_mr_iova2(pd, addr, length, iova, access); +} - if (ibv_dontfork_range(addr, length)) - return NULL; +struct ibv_pd *ibv_import_pd(struct ibv_context *context, + uint32_t pd_handle) +{ + return get_ops(context)->import_pd(context, pd_handle); +} - mr = get_ops(pd->context)->reg_mr(pd, addr, length, iova, access); - if (mr) { - mr->context = pd->context; - mr->pd = pd; - mr->addr = addr; - mr->length = length; - } else - ibv_dofork_range(addr, length); - return mr; +void ibv_unimport_pd(struct ibv_pd *pd) +{ + get_ops(pd->context)->unimport_pd(pd); } -struct ibv_mr *ibv_reg_mr_iova2(struct ibv_pd *pd, void *addr, size_t length, - uint64_t iova, unsigned int access) -{ - struct verbs_device *device = verbs_get_device(pd->context->device); - if (!(device->core_support & IB_UVERBS_CORE_SUPPORT_OPTIONAL_MR_ACCESS)) - access &= ~IBV_ACCESS_OPTIONAL_RANGE; 
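+/*
+ * Usage sketch for the import verbs below (illustrative only; assumes
+ * cmd_fd, pd_handle and mr_handle were communicated by the owning
+ * process, e.g. over a Unix socket):
+ *
+ *	struct ibv_context *ctx = ibv_import_device(cmd_fd);
+ *	struct ibv_pd *pd = ibv_import_pd(ctx, pd_handle);
+ *	struct ibv_mr *mr = ibv_import_mr(pd, mr_handle);
+ *	...
+ *	ibv_unimport_mr(mr);
+ *	ibv_unimport_pd(pd);
+ */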
+/** + * ibv_import_mr - Import a memory region + */ +struct ibv_mr *ibv_import_mr(struct ibv_pd *pd, uint32_t mr_handle) +{ + return get_ops(pd->context)->import_mr(pd, mr_handle); +} - return ibv_reg_mr_iova(pd, addr, length, iova, access); +/** + * ibv_unimport_mr - Unimport a memory region + */ +void ibv_unimport_mr(struct ibv_mr *mr) +{ + get_ops(mr->context)->unimport_mr(mr); } LATEST_SYMVER_FUNC(ibv_rereg_mr, 1_1, "IBVERBS_1.1", @@ -425,9 +440,10 @@ LATEST_SYMVER_FUNC(ibv_dereg_mr, 1_1, "IBVERBS_1.1", void *addr = mr->addr; size_t length = mr->length; enum ibv_mr_type type = verbs_get_mr(mr)->mr_type; + int access = verbs_get_mr(mr)->access; ret = get_ops(mr->context)->dereg_mr(verbs_get_mr(mr)); - if (!ret && type == IBV_MR_TYPE_MR) + if (!ret && type == IBV_MR_TYPE_MR && !(access & IBV_ACCESS_ON_DEMAND)) ibv_dofork_range(addr, length); return ret; @@ -600,20 +616,6 @@ LATEST_SYMVER_FUNC(ibv_create_qp, 1_1, "IBVERBS_1.1", { struct ibv_qp *qp = get_ops(pd->context)->create_qp(pd, qp_init_attr); - if (qp) { - qp->context = pd->context; - qp->qp_context = qp_init_attr->qp_context; - qp->pd = pd; - qp->send_cq = qp_init_attr->send_cq; - qp->recv_cq = qp_init_attr->recv_cq; - qp->srq = qp_init_attr->srq; - qp->qp_type = qp_init_attr->qp_type; - qp->state = IBV_QPS_RESET; - qp->events_completed = 0; - pthread_mutex_init(&qp->mutex, NULL); - pthread_cond_init(&qp->cond, NULL); - } - return qp; } @@ -682,72 +684,36 @@ LATEST_SYMVER_FUNC(ibv_create_ah, 1_1, "IBVERBS_1.1", return ah; } -/* GID types as appear in sysfs, no change is expected as of ABI - * compatibility. - */ -#define V1_TYPE "IB/RoCE v1" -#define V2_TYPE "RoCE v2" int ibv_query_gid_type(struct ibv_context *context, uint8_t port_num, - unsigned int index, enum ibv_gid_type *type) + unsigned int index, enum ibv_gid_type_sysfs *type) { - struct verbs_device *verbs_device = verbs_get_device(context->device); - char buff[11]; + struct ibv_gid_entry entry = {}; + int ret; - /* Reset errno so that we can rely on its value upon any error flow in - * ibv_read_sysfs_file. - */ - errno = 0; - if (ibv_read_ibdev_sysfs_file(buff, sizeof(buff), verbs_device->sysfs, - "ports/%d/gid_attrs/types/%d", port_num, - index) <= 0) { - char *dir_path; - DIR *dir; - - if (errno == EINVAL) { - /* In IB, this file doesn't exist and the kernel sets - * errno to -EINVAL. 
- */ - *type = IBV_GID_TYPE_IB_ROCE_V1; - return 0; - } - if (asprintf(&dir_path, "%s/%s/%d/%s/", - verbs_device->sysfs->ibdev_path, "ports", port_num, - "gid_attrs") < 0) - return -1; - dir = opendir(dir_path); - free(dir_path); - if (!dir) { - if (errno == ENOENT) - /* Assuming that if gid_attrs doesn't exist, - * we have an old kernel and all GIDs are - * IB/RoCE v1 - */ - *type = IBV_GID_TYPE_IB_ROCE_V1; - else - return -1; - } else { - closedir(dir); - errno = EFAULT; - return -1; - } - } else { - if (!strcmp(buff, V1_TYPE)) { - *type = IBV_GID_TYPE_IB_ROCE_V1; - } else if (!strcmp(buff, V2_TYPE)) { - *type = IBV_GID_TYPE_ROCE_V2; - } else { - errno = ENOTSUP; - return -1; - } + ret = __ibv_query_gid_ex(context, port_num, index, &entry, 0, + sizeof(entry), VERBS_QUERY_GID_ATTR_TYPE); + /* Preserve API behavior for empty GID */ + if (ret == ENODATA) { + *type = IBV_GID_TYPE_SYSFS_IB_ROCE_V1; + return 0; } + if (ret) + return -1; + + if (entry.gid_type == IBV_GID_TYPE_IB || + entry.gid_type == IBV_GID_TYPE_ROCE_V1) + *type = IBV_GID_TYPE_SYSFS_IB_ROCE_V1; + else + *type = IBV_GID_TYPE_SYSFS_ROCE_V2; return 0; } static int ibv_find_gid_index(struct ibv_context *context, uint8_t port_num, - union ibv_gid *gid, enum ibv_gid_type gid_type) + union ibv_gid *gid, + enum ibv_gid_type_sysfs gid_type) { - enum ibv_gid_type sgid_type = 0; + enum ibv_gid_type_sysfs sgid_type = 0; union ibv_gid sgid; int i = 0, ret; @@ -847,7 +813,7 @@ static inline int set_ah_attr_by_ipv4(struct ibv_context *context, map_ipv4_addr_to_ipv6(ip4h->daddr, (struct in6_addr *)&sgid); ret = ibv_find_gid_index(context, port_num, &sgid, - IBV_GID_TYPE_ROCE_V2); + IBV_GID_TYPE_SYSFS_ROCE_V2); if (ret < 0) return ret; @@ -877,9 +843,9 @@ static inline int set_ah_attr_by_ipv6(struct ibv_context *context, ah_attr->grh.dgid = grh->sgid; if (grh->next_hdr == IPPROTO_UDP) { - sgid_type = IBV_GID_TYPE_ROCE_V2; + sgid_type = IBV_GID_TYPE_SYSFS_ROCE_V2; } else if (grh->next_hdr == IB_NEXT_HDR) { - sgid_type = IBV_GID_TYPE_IB_ROCE_V1; + sgid_type = IBV_GID_TYPE_SYSFS_IB_ROCE_V1; } else { errno = EPROTONOSUPPORT; return -1; @@ -1080,3 +1046,18 @@ free_resources: return ret; } + +int ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece) +{ + if (!ece->vendor_id) { + errno = EOPNOTSUPP; + return errno; + } + + return get_ops(qp->context)->set_ece(qp, ece); +} + +int ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece) +{ + return get_ops(qp->context)->query_ece(qp, ece); +} diff --git a/libibverbs/verbs.h b/libibverbs/verbs.h index 288985d..ee57e05 100644 --- a/libibverbs/verbs.h +++ b/libibverbs/verbs.h @@ -43,6 +43,7 @@ #include #include #include +#include #include #ifdef __cplusplus @@ -68,6 +69,20 @@ union ibv_gid { } global; }; +enum ibv_gid_type { + IBV_GID_TYPE_IB, + IBV_GID_TYPE_ROCE_V1, + IBV_GID_TYPE_ROCE_V2, +}; + +struct ibv_gid_entry { + union ibv_gid gid; + uint32_t gid_index; + uint32_t port_num; + uint32_t gid_type; /* enum ibv_gid_type */ + uint32_t ndev_ifindex; +}; + #define vext_field_avail(type, fld, sz) (offsetof(type, fld) < (sz)) #ifdef __cplusplus @@ -624,8 +639,7 @@ enum ibv_rereg_mr_flags { IBV_REREG_MR_CHANGE_TRANSLATION = (1 << 0), IBV_REREG_MR_CHANGE_PD = (1 << 1), IBV_REREG_MR_CHANGE_ACCESS = (1 << 2), - IBV_REREG_MR_KEEP_VALID = (1 << 3), - IBV_REREG_MR_FLAGS_SUPPORTED = ((IBV_REREG_MR_KEEP_VALID << 1) - 1) + IBV_REREG_MR_FLAGS_SUPPORTED = ((IBV_REREG_MR_CHANGE_ACCESS << 1) - 1) }; struct ibv_mr { @@ -1401,6 +1415,21 @@ static inline void ibv_wr_abort(struct ibv_qp_ex *qp) qp->wr_abort(qp); } +struct 
ibv_ece {
+	/*
+	 * Unique identifier of the provider vendor on the network.
+	 * Providers set their IEEE OUI here to distinguish
+	 * themselves in a non-homogeneous network.
+	 */
+	uint32_t vendor_id;
+	/*
+	 * Provider-specific attributes which are supported or
+	 * need to be enabled by ECE users.
+	 */
+	uint32_t options;
+	uint32_t comp_mask;
+};
+
 struct ibv_comp_channel {
 	struct ibv_context *context;
 	int fd;
@@ -2203,6 +2232,15 @@ void ibv_free_device_list(struct ibv_device **list);
 const char *ibv_get_device_name(struct ibv_device *device);
 
 /**
+ * ibv_get_device_index - Return kernel device index
+ *
+ * Available when the kernel supports IB device queries over the
+ * netlink interface; for unsupported kernels, -1 is returned.
+ */
+int ibv_get_device_index(struct ibv_device *device);
+
+/**
  * ibv_get_device_guid - Return device's node GUID
  */
 __be64 ibv_get_device_guid(struct ibv_device *device);
@@ -2218,6 +2256,32 @@ struct ibv_context *ibv_open_device(struct ibv_device *device);
 int ibv_close_device(struct ibv_context *context);
 
 /**
+ * ibv_import_device - Import device
+ */
+struct ibv_context *ibv_import_device(int cmd_fd);
+
+/**
+ * ibv_import_pd - Import a protection domain
+ */
+struct ibv_pd *ibv_import_pd(struct ibv_context *context,
+			     uint32_t pd_handle);
+
+/**
+ * ibv_unimport_pd - Unimport a protection domain
+ */
+void ibv_unimport_pd(struct ibv_pd *pd);
+
+/**
+ * ibv_import_mr - Import a memory region
+ */
+struct ibv_mr *ibv_import_mr(struct ibv_pd *pd, uint32_t mr_handle);
+
+/**
+ * ibv_unimport_mr - Unimport a memory region
+ */
+void ibv_unimport_mr(struct ibv_mr *mr);
+
+/**
  * ibv_get_async_event - Get next async event
  * @event: Pointer to use to return async event
  *
@@ -2280,6 +2344,36 @@ static inline int ___ibv_query_port(struct ibv_context *context,
 int ibv_query_gid(struct ibv_context *context, uint8_t port_num,
 		  int index, union ibv_gid *gid);
 
+int _ibv_query_gid_ex(struct ibv_context *context, uint32_t port_num,
+		      uint32_t gid_index, struct ibv_gid_entry *entry,
+		      uint32_t flags, size_t entry_size);
+
+/**
+ * ibv_query_gid_ex - Read a GID table entry
+ */
+static inline int ibv_query_gid_ex(struct ibv_context *context,
+				   uint32_t port_num, uint32_t gid_index,
+				   struct ibv_gid_entry *entry, uint32_t flags)
+{
+	return _ibv_query_gid_ex(context, port_num, gid_index, entry, flags,
+				 sizeof(*entry));
+}
+
+ssize_t _ibv_query_gid_table(struct ibv_context *context,
+			     struct ibv_gid_entry *entries, size_t max_entries,
+			     uint32_t flags, size_t entry_size);
+
+/*
+ * ibv_query_gid_table - Get all valid GID table entries
+ */
+static inline ssize_t ibv_query_gid_table(struct ibv_context *context,
+					  struct ibv_gid_entry *entries,
+					  size_t max_entries, uint32_t flags)
+{
+	return _ibv_query_gid_table(context, entries, max_entries, flags,
+				    sizeof(*entries));
+}
+
 /**
  * ibv_query_pkey - Get a P_Key table entry
  */
@@ -2320,7 +2414,7 @@ static inline int ibv_destroy_flow(struct ibv_flow *flow_id)
 	struct verbs_context *vctx = verbs_get_ctx_op(flow_id->context,
 						      ibv_destroy_flow);
 	if (!vctx)
-		return -EOPNOTSUPP;
+		return EOPNOTSUPP;
 	return vctx->ibv_destroy_flow(flow_id);
 }
 
@@ -3073,6 +3167,7 @@ static inline struct ibv_wq *ibv_create_wq(struct ibv_context *context,
 
 	wq = vctx->create_wq(context, wq_init_attr);
 	if (wq) {
+		wq->wq_context = wq_init_attr->wq_context;
 		wq->events_completed = 0;
 		pthread_mutex_init(&wq->mutex, NULL);
 		pthread_cond_init(&wq->cond, NULL);
@@ -3342,6 +3437,15 @@ static inline uint16_t ibv_flow_label_to_udp_sport(uint32_t fl)
	return (uint16_t)(fl_low | IB_ROCE_UDP_ENCAP_VALID_PORT_MIN);
 }
 
+/**
+ * ibv_set_ece - Set ECE options
+ */
+int ibv_set_ece(struct ibv_qp *qp, struct ibv_ece *ece);
+
+/**
+ * ibv_query_ece - Get accepted ECE options
+ */
+int ibv_query_ece(struct ibv_qp *qp, struct ibv_ece *ece);
 #ifdef __cplusplus
 }
 #endif
diff --git a/libibverbs/verbs_api.h b/libibverbs/verbs_api.h
index ded6fa4..309f6fb 100644
--- a/libibverbs/verbs_api.h
+++ b/libibverbs/verbs_api.h
@@ -88,6 +88,7 @@
 #define ibv_advise_mr_advice ib_uverbs_advise_mr_advice
 #define IBV_ADVISE_MR_ADVICE_PREFETCH IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH
 #define IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_WRITE
+#define IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT IB_UVERBS_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT
 
 #define IBV_ADVISE_MR_FLAG_FLUSH IB_UVERBS_ADVISE_MR_FLAG_FLUSH
 
diff --git a/librdmacm/CMakeLists.txt b/librdmacm/CMakeLists.txt
index f0767cf..b01fef4 100644
--- a/librdmacm/CMakeLists.txt
+++ b/librdmacm/CMakeLists.txt
@@ -11,7 +11,7 @@ publish_headers(infiniband
 
 rdma_library(rdmacm librdmacm.map
   # See Documentation/versioning.md
-  1 1.2.${PACKAGE_VERSION}
+  1 1.3.${PACKAGE_VERSION}
   acm.c
   addrinfo.c
   cma.c
diff --git a/librdmacm/acm.h b/librdmacm/acm.h
index 7397b35..69c6053 100644
--- a/librdmacm/acm.h
+++ b/librdmacm/acm.h
@@ -111,7 +111,7 @@ struct acm_ep_addr_data {
  */
 struct acm_resolve_msg {
 	struct acm_hdr hdr;
-	struct acm_ep_addr_data data[0];
+	struct acm_ep_addr_data data[];
 };
 
 enum {
@@ -130,7 +130,7 @@ enum {
  */
 struct acm_perf_msg {
 	struct acm_hdr hdr;
-	uint64_t data[0];
+	uint64_t data[];
 };
 
 /*
@@ -144,12 +144,12 @@ struct acm_ep_config_data {
 	uint16_t pkey;
 	uint16_t addr_cnt;
 	uint8_t prov_name[ACM_MAX_PROV_NAME];
-	union acm_ep_info addrs[0];
+	union acm_ep_info addrs[];
 };
 
 struct acm_ep_query_msg {
 	struct acm_hdr hdr;
-	struct acm_ep_config_data data[0];
+	struct acm_ep_config_data data[];
 };
 
 struct acm_msg {
diff --git a/librdmacm/cma.c b/librdmacm/cma.c
index 9855d0a..6e39565 100644
--- a/librdmacm/cma.c
+++ b/librdmacm/cma.c
@@ -59,6 +59,7 @@
 #include
 #include
 #include
+#include
 
 #define CMA_INIT_CMD(req, req_size, op) \
 do {					\
@@ -74,11 +75,15 @@ do {					\
 	(req)->response = (uintptr_t) (resp); \
 } while (0)
 
+#define UCMA_INVALID_IB_INDEX -1
+
 struct cma_port {
 	uint8_t link_layer;
 };
 
 struct cma_device {
+	struct ibv_device *dev;
+	struct list_node entry;
 	struct ibv_context *verbs;
 	struct ibv_pd *pd;
 	struct ibv_xrcd *xrcd;
@@ -89,6 +94,8 @@ struct cma_device {
 	int max_qpsize;
 	uint8_t max_initiator_depth;
 	uint8_t max_responder_resources;
+	int ibv_idx;
+	uint8_t is_device_dead : 1;
 };
 
 struct cma_id_private {
@@ -106,6 +113,8 @@ struct cma_id_private {
 	struct ibv_qp_init_attr *qp_init_attr;
 	uint8_t initiator_depth;
 	uint8_t responder_resources;
+	struct ibv_ece local_ece;
+	struct ibv_ece remote_ece;
 };
 
 struct cma_multicast {
@@ -128,9 +137,9 @@ struct cma_event {
 	struct cma_multicast *mc;
 };
 
-static struct cma_device *cma_dev_array;
-static int cma_dev_cnt;
-static int cma_init_cnt;
+static LIST_HEAD(cma_dev_list);
+/* sorted by index or guid, depending on kernel support */
+static struct ibv_device **dev_list;
 static pthread_mutex_t mut = PTHREAD_MUTEX_INITIALIZER;
 static int abi_ver = -1;
 static char dev_name[64] = "rdma_cm";
@@ -229,7 +238,7 @@ static int check_abi_version(void)
 
 /*
  * This function is called holding the mutex lock
- * cma_dev_cnt must be set before calling this function to
+ * cma_dev_list must not be empty before calling this function to
 * ensure that the lock is not acquired
recursively. */ static void ucma_set_af_ib_support(void) @@ -253,17 +262,136 @@ static void ucma_set_af_ib_support(void) rdma_destroy_id(id); } +static struct cma_device *insert_cma_dev(struct ibv_device *dev) +{ + struct cma_device *cma_dev, *p; + + cma_dev = calloc(1, sizeof(struct cma_device)); + if (!cma_dev) + return NULL; + + cma_dev->guid = ibv_get_device_guid(dev); + cma_dev->ibv_idx = ibv_get_device_index(dev); + cma_dev->dev = dev; + + /* reverse iteration, optimized to ibv_idx which is growing */ + list_for_each_rev(&cma_dev_list, p, entry) { + if (cma_dev->ibv_idx == UCMA_INVALID_IB_INDEX) { + /* index not available, sort by guid */ + if (be64toh(p->guid) < be64toh(cma_dev->guid)) + break; + } else { + if (p->ibv_idx < cma_dev->ibv_idx) + break; + } + } + list_add_after(&cma_dev_list, &p->entry, &cma_dev->entry); + + return cma_dev; +} + +static void remove_cma_dev(struct cma_device *cma_dev) +{ + if (cma_dev->refcnt) { + /* we were asked to be deleted by sync_devices_list() */ + cma_dev->is_device_dead = true; + return; + } + + if (cma_dev->pd) + ibv_dealloc_pd(cma_dev->pd); + if (cma_dev->xrcd) + ibv_close_xrcd(cma_dev->xrcd); + if (cma_dev->verbs) + ibv_close_device(cma_dev->verbs); + list_del_from(&cma_dev_list, &cma_dev->entry); + free(cma_dev); +} + +static int dev_cmp(const void *a, const void *b) +{ + return (int)(*(char *const *)a - *(char *const *)b); +} + +static int sync_devices_list(void) +{ + struct ibv_device **new_list; + int i, j, numb_dev; + + new_list = ibv_get_device_list(&numb_dev); + if (!new_list) + return ERR(ENODEV); + + if (!numb_dev) { + ibv_free_device_list(new_list); + return ERR(ENODEV); + } + + qsort(new_list, numb_dev, sizeof(struct ibv_device *), dev_cmp); + if (unlikely(!dev_list)) { + /* first sync */ + for (j = 0; new_list[j]; j++) + insert_cma_dev(new_list[j]); + goto out; + } + + for (i = 0, j = 0; dev_list[i] || new_list[j];) { + if (dev_list[i] == new_list[j]) { + i++; + j++; + continue; + } + /* + * The device list is sorted by pointer address, + * so we need to compare the new list with old one. + * + * 1. If the device exists in new list, but doesn't exist in + * old list, we will add that device to the list. + * 2. If the device exists in old list, but doesn't exist in + * new list, we should delete it. + */ + if ((dev_list[i] > new_list[j] && new_list[j]) || + (!dev_list[i] && new_list[j])) { + insert_cma_dev(new_list[j++]); + continue; + } + if ((dev_list[i] < new_list[j] && dev_list[i]) || + (!new_list[j] && dev_list[i])) { + /* + * We will try our best to remove the entry, + * but if some process holds it, we will remove it + * later, when rdma-cm will put this resource back. + */ + struct cma_device *c, *t; + + list_for_each_safe(&cma_dev_list, c, t, entry) { + if (c->dev == dev_list[i]) + remove_cma_dev(c); + } + i++; + } + } + + ibv_free_device_list(dev_list); +out: + dev_list = new_list; + return 0; +} + int ucma_init(void) { - struct ibv_device **dev_list = NULL; - int i, ret, dev_cnt; + int ret; - /* Quick check without lock to see if we're already initialized */ - if (cma_dev_cnt) + /* + * ucma_set_af_ib_support() below recursively calls to this function + * again under the &mut lock, so do this fast check and return + * immediately. 
+ */ + if (!list_empty(&cma_dev_list)) return 0; pthread_mutex_lock(&mut); - if (cma_dev_cnt) { + if (!list_empty(&cma_dev_list)) { pthread_mutex_unlock(&mut); return 0; } @@ -275,60 +403,26 @@ int ucma_init(void) goto err1; } - dev_list = ibv_get_device_list(&dev_cnt); - if (!dev_list) { - ret = ERR(ENODEV); + ret = sync_devices_list(); + if (ret) goto err1; - } - - if (!dev_cnt) { - ret = ERR(ENODEV); - goto err2; - } - cma_dev_array = calloc(dev_cnt, sizeof(*cma_dev_array)); - if (!cma_dev_array) { - ret = ERR(ENOMEM); - goto err2; - } - - for (i = 0; dev_list[i]; i++) - cma_dev_array[i].guid = ibv_get_device_guid(dev_list[i]); - - cma_dev_cnt = dev_cnt; ucma_set_af_ib_support(); pthread_mutex_unlock(&mut); - ibv_free_device_list(dev_list); return 0; -err2: - ibv_free_device_list(dev_list); err1: fastlock_destroy(&idm_lock); pthread_mutex_unlock(&mut); return ret; } -static struct ibv_context *ucma_open_device(__be64 guid) +static bool match(struct cma_device *cma_dev, __be64 guid, uint32_t idx) { - struct ibv_device **dev_list; - struct ibv_context *verbs = NULL; - int i; + if (idx == UCMA_INVALID_IB_INDEX) + return cma_dev->guid == guid; - dev_list = ibv_get_device_list(NULL); - if (!dev_list) { - return NULL; - } - - for (i = 0; dev_list[i]; i++) { - if (ibv_get_device_guid(dev_list[i]) == guid) { - verbs = ibv_open_device(dev_list[i]); - break; - } - } - - ibv_free_device_list(dev_list); - return verbs; + return cma_dev->ibv_idx == idx && cma_dev->guid == guid; } static int ucma_init_device(struct cma_device *cma_dev) @@ -340,7 +434,7 @@ static int ucma_init_device(struct cma_device *cma_dev) if (cma_dev->verbs) return 0; - cma_dev->verbs = ucma_open_device(cma_dev->guid); + cma_dev->verbs = ibv_open_device(cma_dev->dev); if (!cma_dev->verbs) return ERR(ENODEV); @@ -367,7 +461,6 @@ static int ucma_init_device(struct cma_device *cma_dev) cma_dev->max_qpsize = attr.max_qp_wr; cma_dev->max_initiator_depth = (uint8_t) attr.max_qp_init_rd_atom; cma_dev->max_responder_resources = (uint8_t) attr.max_qp_rd_atom; - cma_init_cnt++; return 0; err: @@ -378,20 +471,19 @@ err: static int ucma_init_all(void) { - int i, ret = 0; - - if (!cma_dev_cnt) { - ret = ucma_init(); - if (ret) - return ret; - } + struct cma_device *dev; + int ret = 0; - if (cma_init_cnt == cma_dev_cnt) - return 0; + ret = ucma_init(); + if (ret) + return ret; pthread_mutex_lock(&mut); - for (i = 0; i < cma_dev_cnt; i++) { - ret = ucma_init_device(&cma_dev_array[i]); + list_for_each(&cma_dev_list, dev, entry) { + if (dev->is_device_dead) + continue; + + ret = ucma_init_device(dev); if (ret) break; } @@ -402,19 +494,47 @@ static int ucma_init_all(void) struct ibv_context **rdma_get_devices(int *num_devices) { struct ibv_context **devs = NULL; - int i; + struct cma_device *dev; + int cma_dev_cnt = 0; + int i = 0; - if (ucma_init_all()) + if (ucma_init()) + goto err_init; + + pthread_mutex_lock(&mut); + if (sync_devices_list()) goto out; + list_for_each(&cma_dev_list, dev, entry) { + if (dev->is_device_dead) + continue; + + /* reinit newly added devices */ + if (ucma_init_device(dev)) { + cma_dev_cnt = 0; + /* + * There is no need to uninit already + * initialized devices, due to an error to other device. 
+ */ + goto out; + } + cma_dev_cnt++; + } + devs = malloc(sizeof(*devs) * (cma_dev_cnt + 1)); if (!devs) goto out; - for (i = 0; i < cma_dev_cnt; i++) - devs[i] = cma_dev_array[i].verbs; + list_for_each(&cma_dev_list, dev, entry) { + if (dev->is_device_dead) + continue; + devs[i++] = dev->verbs; + dev->refcnt++; + } devs[i] = NULL; out: + pthread_mutex_unlock(&mut); +err_init: if (num_devices) *num_devices = devs ? cma_dev_cnt : 0; return devs; @@ -422,6 +542,25 @@ out: void rdma_free_devices(struct ibv_context **list) { + struct cma_device *c, *tmp; + int i; + + pthread_mutex_lock(&mut); + list_for_each_safe(&cma_dev_list, c, tmp, entry) { + for (i = 0; list[i]; i++) { + if (list[i] != c->verbs) + /* + * Skip devices that were added after + * user received the list. + */ + continue; + c->refcnt--; + if (c->is_device_dead) + /* try to remove */ + remove_cma_dev(c); + } + } + pthread_mutex_unlock(&mut); free(list); } @@ -452,35 +591,60 @@ void rdma_destroy_event_channel(struct rdma_event_channel *channel) free(channel); } -static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid) +static struct cma_device *ucma_get_cma_device(__be64 guid, uint32_t idx) { struct cma_device *cma_dev; - int i, ret; - for (i = 0; i < cma_dev_cnt; i++) { - cma_dev = &cma_dev_array[i]; - if (cma_dev->guid == guid) + list_for_each(&cma_dev_list, cma_dev, entry) + if (!cma_dev->is_device_dead && match(cma_dev, guid, idx)) goto match; - } - return ERR(ENODEV); + if (sync_devices_list()) + return NULL; + /* + * Kernel informed us that we have new device and it must + * be in global dev_list[], let's find the right one. + */ + list_for_each(&cma_dev_list, cma_dev, entry) + if (!cma_dev->is_device_dead && match(cma_dev, guid, idx)) + goto match; + cma_dev = NULL; match: + if (cma_dev) + cma_dev->refcnt++; + return cma_dev; +} + +static int ucma_get_device(struct cma_id_private *id_priv, __be64 guid, + uint32_t idx) +{ + struct cma_device *cma_dev; + int ret; + pthread_mutex_lock(&mut); - if ((ret = ucma_init_device(cma_dev))) + cma_dev = ucma_get_cma_device(guid, idx); + if (!cma_dev) { + pthread_mutex_unlock(&mut); + return ERR(ENODEV); + } + + ret = ucma_init_device(cma_dev); + if (ret) goto out; - if (!cma_dev->refcnt++) { + if (!cma_dev->pd) cma_dev->pd = ibv_alloc_pd(cma_dev->verbs); - if (!cma_dev->pd) { - cma_dev->refcnt--; - ret = ERR(ENOMEM); - goto out; - } + if (!cma_dev->pd) { + ret = ERR(ENOMEM); + goto out; } + id_priv->cma_dev = cma_dev; id_priv->id.verbs = cma_dev->verbs; id_priv->id.pd = cma_dev->pd; out: + if (ret) + cma_dev->refcnt--; pthread_mutex_unlock(&mut); return ret; } @@ -492,6 +656,10 @@ static void ucma_put_device(struct cma_device *cma_dev) ibv_dealloc_pd(cma_dev->pd); if (cma_dev->xrcd) ibv_close_xrcd(cma_dev->xrcd); + cma_dev->pd = NULL; + cma_dev->xrcd = NULL; + if (cma_dev->is_device_dead) + remove_cma_dev(cma_dev); } pthread_mutex_unlock(&mut); } @@ -701,6 +869,12 @@ static int ucma_query_addr(struct rdma_cm_id *id) cmd.id = id_priv->handle; cmd.option = UCMA_QUERY_ADDR; + /* + * If kernel doesn't support ibdev_index, this field will + * be left as is by the kernel. + */ + resp.ibdev_index = UCMA_INVALID_IB_INDEX; + ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? 
ERR(ENODATA) : -1; @@ -711,7 +885,8 @@ static int ucma_query_addr(struct rdma_cm_id *id) memcpy(&id->route.addr.dst_addr, &resp.dst_addr, resp.dst_size); if (!id_priv->cma_dev && resp.node_guid) { - ret = ucma_get_device(id_priv, resp.node_guid); + ret = ucma_get_device(id_priv, resp.node_guid, + resp.ibdev_index); if (ret) return ret; id->port_num = resp.port_num; @@ -826,6 +1001,12 @@ static int ucma_query_route(struct rdma_cm_id *id) id_priv = container_of(id, struct cma_id_private, id); cmd.id = id_priv->handle; + /* + * If kernel doesn't support ibdev_index, this field will + * be left as is by the kernel. + */ + resp.ibdev_index = UCMA_INVALID_IB_INDEX; + ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; @@ -855,7 +1036,8 @@ static int ucma_query_route(struct rdma_cm_id *id) sizeof resp.dst_addr); if (!id_priv->cma_dev && resp.node_guid) { - ret = ucma_get_device(id_priv, resp.node_guid); + ret = ucma_get_device(id_priv, resp.node_guid, + resp.ibdev_index); if (ret) return ret; id_priv->id.port_num = resp.port_num; @@ -1382,6 +1564,61 @@ void rdma_destroy_srq(struct rdma_cm_id *id) ucma_destroy_cqs(id); } +static int init_ece(struct rdma_cm_id *id, struct ibv_qp *qp) +{ + struct cma_id_private *id_priv = + container_of(id, struct cma_id_private, id); + struct ibv_ece ece = {}; + int ret; + + ret = ibv_query_ece(qp, &ece); + if (ret && ret != EOPNOTSUPP) + return ERR(ret); + + id_priv->local_ece.vendor_id = ece.vendor_id; + id_priv->local_ece.options = ece.options; + + if (!id_priv->remote_ece.vendor_id) + /* + * This QP was created explicitly and we don't need + * to do anything additional to the setting local_ece values. + */ + return 0; + + /* This QP was created due to REQ event */ + if (id_priv->remote_ece.vendor_id != id_priv->local_ece.vendor_id) { + /* + * Signal to the provider that other ECE node is different + * vendor and clear ECE options. + */ + ece.vendor_id = id_priv->local_ece.vendor_id; + ece.options = 0; + } else { + ece.vendor_id = id_priv->remote_ece.vendor_id; + ece.options = id_priv->remote_ece.options; + } + ret = ibv_set_ece(qp, &ece); + return (ret && ret != EOPNOTSUPP) ? ERR(ret) : 0; +} + +static int set_local_ece(struct rdma_cm_id *id, struct ibv_qp *qp) +{ + struct cma_id_private *id_priv = + container_of(id, struct cma_id_private, id); + struct ibv_ece ece = {}; + int ret; + + if (!id_priv->remote_ece.vendor_id) + return 0; + + ret = ibv_query_ece(qp, &ece); + if (ret && ret != EOPNOTSUPP) + return ERR(ret); + + id_priv->local_ece.options = ece.options; + return 0; +} + int rdma_create_qp_ex(struct rdma_cm_id *id, struct ibv_qp_init_attr_ex *attr) { @@ -1429,12 +1666,19 @@ int rdma_create_qp_ex(struct rdma_cm_id *id, goto err1; } + ret = init_ece(id, qp); + if (ret) + goto err2; + if (ucma_is_ud_qp(id->qp_type)) ret = ucma_init_ud_qp(id_priv, qp); else ret = ucma_init_conn_qp(id_priv, qp); if (ret) goto err2; + ret = set_local_ece(id, qp); + if (ret) + goto err2; id->pd = qp->pd; id->qp = qp; @@ -1524,6 +1768,13 @@ static void ucma_copy_conn_param_to_kern(struct cma_id_private *id_priv, } } +static void ucma_copy_ece_param_to_kern_req(struct cma_id_private *id_priv, + struct ucma_abi_ece *dst) +{ + dst->vendor_id = id_priv->local_ece.vendor_id; + dst->attr_mod = id_priv->local_ece.options; +} + int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { uint32_t qp_num = conn_param ? 
conn_param->qp_num : 0; @@ -1556,6 +1807,8 @@ int rdma_connect(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, qp_num, srq); + ucma_copy_ece_param_to_kern_req(id_priv, &cmd.ece); + ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) return (ret >= 0) ? ERR(ENODATA) : -1; @@ -1636,6 +1889,14 @@ err: return ret; } +static void ucma_copy_ece_param_to_kern_rep(struct cma_id_private *id_priv, + struct ucma_abi_ece *dst) +{ + /* Return result with same ID as received. */ + dst->vendor_id = id_priv->remote_ece.vendor_id; + dst->attr_mod = id_priv->local_ece.options; +} + int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) { uint32_t qp_num = id->qp ? id->qp->qp_num : conn_param->qp_num; @@ -1677,6 +1938,7 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) cmd.uid = (uintptr_t) id_priv; ucma_copy_conn_param_to_kern(id_priv, &cmd.conn_param, conn_param, qp_num, srq); + ucma_copy_ece_param_to_kern_rep(id_priv, &cmd.ece); ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) { @@ -1690,8 +1952,8 @@ int rdma_accept(struct rdma_cm_id *id, struct rdma_conn_param *conn_param) return ucma_complete(id); } -int rdma_reject(struct rdma_cm_id *id, const void *private_data, - uint8_t private_data_len) +static int reject_with_reason(struct rdma_cm_id *id, const void *private_data, + uint8_t private_data_len, uint8_t reason) { struct ucma_abi_reject cmd; struct cma_id_private *id_priv; @@ -1705,6 +1967,7 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, memcpy(cmd.private_data, private_data, private_data_len); cmd.private_data_len = private_data_len; } + cmd.reason = reason; ret = write(id->channel->fd, &cmd, sizeof cmd); if (ret != sizeof cmd) @@ -1713,6 +1976,19 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data, return 0; } +int rdma_reject(struct rdma_cm_id *id, const void *private_data, + uint8_t private_data_len) +{ + return reject_with_reason(id, private_data, private_data_len, 0); +} + +int rdma_reject_ece(struct rdma_cm_id *id, const void *private_data, + uint8_t private_data_len) +{ + /* IBTA defines CM_REJ_VENDOR_OPTION_NOT_SUPPORTED as 35 */ + return reject_with_reason(id, private_data, private_data_len, 35); +} + int rdma_notify(struct rdma_cm_id *id, enum ibv_event_type event) { struct ucma_abi_notify cmd; @@ -2014,8 +2290,8 @@ static int ucma_query_req_info(struct rdma_cm_id *id) return 0; } -static int ucma_process_conn_req(struct cma_event *evt, - uint32_t handle) +static int ucma_process_conn_req(struct cma_event *evt, uint32_t handle, + struct ucma_abi_ece *ece) { struct cma_id_private *id_priv; int ret; @@ -2035,6 +2311,8 @@ static int ucma_process_conn_req(struct cma_event *evt, ucma_insert_id(id_priv); id_priv->initiator_depth = evt->event.param.conn.initiator_depth; id_priv->responder_resources = evt->event.param.conn.responder_resources; + id_priv->remote_ece.vendor_id = ece->vendor_id; + id_priv->remote_ece.options = ece->attr_mod; if (evt->id_priv->sync) { ret = rdma_migrate_id(&id_priv->id, NULL); @@ -2083,6 +2361,50 @@ err: return ret; } +static int ucma_process_conn_resp_ece(struct cma_id_private *id_priv, + struct ucma_abi_ece *ece) +{ + struct ibv_ece ibv_ece = { .vendor_id = ece->vendor_id, + .options = ece->attr_mod }; + int ret; + + /* This is response handler */ + if (!ece->vendor_id) { + /* + * Kernel or user-space doesn't support ECE transfer, + * clear everything. 
+ */ + ibv_ece.vendor_id = id_priv->local_ece.vendor_id; + ibv_ece.options = 0; + } else if (ece->vendor_id != id_priv->local_ece.vendor_id) { + /* + * At this point remote vendor_id should be the same + * as the local one, or something bad happened in + * ECE handshake implementation. + */ + ucma_modify_qp_err(&id_priv->id); + return ERR(EINVAL); + } + + id_priv->remote_ece.vendor_id = ece->vendor_id; + ret = ibv_set_ece(id_priv->id.qp, &ibv_ece); + if (ret && ret != EOPNOTSUPP) + return ret; + + ret = ucma_process_conn_resp(id_priv); + if (ret) + return ret; + + ret = ibv_query_ece(id_priv->id.qp, &ibv_ece); + if (ret && ret != EOPNOTSUPP) { + ucma_modify_qp_err(&id_priv->id); + return ret; + } + + id_priv->local_ece.options = (ret == EOPNOTSUPP) ? 0 : ibv_ece.options; + return 0; +} + static int ucma_process_join(struct cma_event *evt) { evt->mc->mgid = evt->event.param.ud.ah_attr.grh.dgid; @@ -2152,7 +2474,7 @@ int rdma_establish(struct rdma_cm_id *id) int rdma_get_cm_event(struct rdma_event_channel *channel, struct rdma_cm_event **event) { - struct ucma_abi_event_resp resp; + struct ucma_abi_event_resp resp = {}; struct ucma_abi_get_event cmd; struct cma_event *evt; int ret; @@ -2219,7 +2541,7 @@ retry: else ucma_copy_conn_event(evt, &resp.param.conn); - ret = ucma_process_conn_req(evt, resp.id); + ret = ucma_process_conn_req(evt, resp.id, &resp.ece); if (ret) goto retry; break; @@ -2227,9 +2549,11 @@ retry: ucma_copy_conn_event(evt, &resp.param.conn); if (!evt->id_priv->id.qp) { evt->event.event = RDMA_CM_EVENT_CONNECT_RESPONSE; + evt->id_priv->remote_ece.vendor_id = resp.ece.vendor_id; + evt->id_priv->remote_ece.options = resp.ece.attr_mod; } else { - evt->event.status = - ucma_process_conn_resp(evt->id_priv); + evt->event.status = ucma_process_conn_resp_ece( + evt->id_priv, &resp.ece); if (!evt->event.status) evt->event.event = RDMA_CM_EVENT_ESTABLISHED; else { @@ -2522,17 +2846,20 @@ void rdma_destroy_ep(struct rdma_cm_id *id) int ucma_max_qpsize(struct rdma_cm_id *id) { struct cma_id_private *id_priv; - int i, max_size = 0; + struct cma_device *dev; + int max_size = 0; id_priv = container_of(id, struct cma_id_private, id); if (id && id_priv->cma_dev) { max_size = id_priv->cma_dev->max_qpsize; } else { ucma_init_all(); - for (i = 0; i < cma_dev_cnt; i++) { - if (!max_size || max_size > cma_dev_array[i].max_qpsize) - max_size = cma_dev_array[i].max_qpsize; - } + pthread_mutex_lock(&mut); + list_for_each(&cma_dev_list, dev, entry) + if (!dev->is_device_dead && + (!max_size || max_size > dev->max_qpsize)) + max_size = dev->max_qpsize; + pthread_mutex_unlock(&mut); } return max_size; } @@ -2561,3 +2888,31 @@ __be16 rdma_get_dst_port(struct rdma_cm_id *id) return ucma_get_port(&id->route.addr.dst_addr); } +int rdma_set_local_ece(struct rdma_cm_id *id, struct ibv_ece *ece) +{ + struct cma_id_private *id_priv; + + if (!id || id->qp || !ece || !ece->vendor_id || ece->comp_mask) + return ERR(EINVAL); + + id_priv = container_of(id, struct cma_id_private, id); + id_priv->local_ece.vendor_id = ece->vendor_id; + id_priv->local_ece.options = ece->options; + + return 0; +} + +int rdma_get_remote_ece(struct rdma_cm_id *id, struct ibv_ece *ece) +{ + struct cma_id_private *id_priv; + + if (!id || id->qp || !ece) + return ERR(EINVAL); + + id_priv = container_of(id, struct cma_id_private, id); + ece->vendor_id = id_priv->remote_ece.vendor_id; + ece->options = id_priv->remote_ece.options; + ece->comp_mask = 0; + + return 0; +} diff --git a/librdmacm/librdmacm.map b/librdmacm/librdmacm.map index 
7f55e84..d162ef0 100644
--- a/librdmacm/librdmacm.map
+++ b/librdmacm/librdmacm.map
@@ -82,3 +82,10 @@ RDMACM_1.2 {
 		rdma_establish;
 		rdma_init_qp_attr;
 } RDMACM_1.1;
+
+RDMACM_1.3 {
+	global:
+		rdma_get_remote_ece;
+		rdma_reject_ece;
+		rdma_set_local_ece;
+} RDMACM_1.2;
diff --git a/librdmacm/man/CMakeLists.txt b/librdmacm/man/CMakeLists.txt
index 2d1efbf..6159c3e 100644
--- a/librdmacm/man/CMakeLists.txt
+++ b/librdmacm/man/CMakeLists.txt
@@ -29,6 +29,7 @@ rdma_man_pages(
   rdma_get_local_addr.3
   rdma_get_peer_addr.3
   rdma_get_recv_comp.3
+  rdma_get_remote_ece.3.md
   rdma_get_request.3
   rdma_get_send_comp.3
   rdma_get_src_port.3
@@ -56,6 +57,7 @@ rdma_man_pages(
   rdma_resolve_addr.3
   rdma_resolve_route.3
   rdma_server.1
+  rdma_set_local_ece.3.md
   rdma_set_option.3
   rdma_xclient.1
   rdma_xserver.1
diff --git a/librdmacm/man/rdma_cm.7 b/librdmacm/man/rdma_cm.7
index 8e5ad99..122c96f 100644
--- a/librdmacm/man/rdma_cm.7
+++ b/librdmacm/man/rdma_cm.7
@@ -26,6 +26,10 @@ parameter in specific calls.  If an event channel is provided, an rdma_cm identi
 will report its event data (results of connecting, for example), on that channel.
 If a channel is not provided, then all rdma_cm operations for the selected
 rdma_cm identifier will block until they complete.
+.P
+The RDMA CM allows the different libibverbs providers to advertise and use
+provider-specific QP configuration options. This functionality
+is called ECE (enhanced connection establishment).
 .SH "RDMA VERBS"
 The rdma_cm supports the full range of verbs available through the libibverbs
 library and interfaces.  However, it also provides wrapper functions for some
@@ -111,6 +115,8 @@ destroy the QP
 release the rdma_cm_id
 .IP rdma_destroy_event_channel
 release the event channel
+.IP rdma_set_local_ece
+set desired ECE options
 .P
 An almost identical process is used to setup unreliable datagram (UD)
 communication between nodes.  No actual connection is formed between QPs
@@ -157,6 +163,10 @@ release the connected rdma_cm_id
 release the listening rdma_cm_id
 .IP rdma_destroy_event_channel
 release the event channel
+.IP rdma_get_remote_ece
+get ECE options sent by the client
+.IP rdma_set_local_ece
+set desired ECE options
 .SH "RETURN CODES"
 .IP "= 0"
 success
@@ -198,6 +208,7 @@ rdma_get_dst_port(3),
 rdma_get_local_addr(3),
 rdma_get_peer_addr(3),
 rdma_get_recv_comp(3),
+rdma_get_remote_ece(3),
 rdma_get_request(3),
 rdma_get_send_comp(3),
 rdma_get_src_port(3),
@@ -221,7 +232,7 @@ rdma_reg_write(3),
 rdma_reject(3),
 rdma_resolve_addr(3),
 rdma_resolve_route(3),
-rdma_set_option(3)
+rdma_set_option(3),
 mckey(1),
 rdma_client(1),
 rdma_server(1),
diff --git a/librdmacm/man/rdma_get_remote_ece.3.md b/librdmacm/man/rdma_get_remote_ece.3.md
new file mode 100644
index 0000000..1db1f8e
--- /dev/null
+++ b/librdmacm/man/rdma_get_remote_ece.3.md
@@ -0,0 +1,61 @@
+---
+date: 2020-02-02
+footer: librdmacm
+header: "Librdmacm Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: RDMA_GET_REMOTE_ECE
+---
+
+# NAME
+
+rdma_get_remote_ece - Get remote ECE parameters as received from the peer
+
+# SYNOPSIS
+
+```c
+#include <rdma/rdma_cma.h>
+
+int rdma_get_remote_ece(struct rdma_cm_id *id, struct ibv_ece *ece);
+```
+# DESCRIPTION
+
+**rdma_get_remote_ece()** gets the ECE parameters as received from the
+communication peer.
+
+This function is supposed to be used by users of external QPs. The call needs
+to be performed before replying to the peer, so that the passive side knows
+the ECE options of the other side.
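+
+For example, on the passive side the call might be used as in the following
+sketch (error handling abbreviated; *id* carries the pending request and
+*qp* is the externally created data QP):
+
+```c
+struct ibv_ece ece;
+int ret;
+
+if (rdma_get_remote_ece(id, &ece))
+	return -1;	/* errno indicates the failure reason */
+
+/* hand the peer's values to the provider before enabling the QP */
+ret = ibv_set_ece(qp, &ece);
+if (ret && ret != EOPNOTSUPP)
+	return -1;
+```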
+Because the QP is external and RDMA_CM does not manage it, the user needs
+to call the libibverbs APIs directly.
+
+Usual flow for the passive side will be:
+
+ * ibv_create_qp() <- create data QP.
+ * ece = rdma_get_remote_ece() <- get ECE options from remote peer
+ * ibv_set_ece(ece) <- set local ECE options with data received from the peer.
+ * ibv_modify_qp() <- enable data QP.
+ * rdma_set_local_ece(ece) <- set desired ECE options after the respective
+   libibverbs provider masked unsupported options.
+ * rdma_accept()/rdma_establish()/rdma_reject_ece()
+
+# ARGUMENTS
+
+*id*
+: RDMA communication identifier.
+
+*ece*
+: ECE struct to be filled.
+
+# RETURN VALUE
+
+**rdma_get_remote_ece()** returns 0 on success, or -1 on error. If an error
+occurs, errno will be set to indicate the failure reason.
+
+# SEE ALSO
+
+**rdma_cm**(7), rdma_set_local_ece(3)
+
+# AUTHOR
+
+Leon Romanovsky
diff --git a/librdmacm/man/rdma_set_local_ece.3.md b/librdmacm/man/rdma_set_local_ece.3.md
new file mode 100644
index 0000000..253e60d
--- /dev/null
+++ b/librdmacm/man/rdma_set_local_ece.3.md
@@ -0,0 +1,62 @@
+---
+date: 2020-02-02
+footer: librdmacm
+header: "Librdmacm Programmer's Manual"
+layout: page
+license: 'Licensed under the OpenIB.org BSD license (FreeBSD Variant) - See COPYING.md'
+section: 3
+title: RDMA_SET_LOCAL_ECE
+---
+
+# NAME
+
+rdma_set_local_ece - Set local ECE parameters to be used for REQ/REP communication
+
+# SYNOPSIS
+
+```c
+#include <rdma/rdma_cma.h>
+
+int rdma_set_local_ece(struct rdma_cm_id *id, struct ibv_ece *ece);
+```
+# DESCRIPTION
+
+**rdma_set_local_ece()** sets the local ECE parameters.
+
+This function is supposed to be used by users of external QPs. The call needs
+to be performed before replying to the peer, in order to configure RDMA_CM
+with the desired ECE options.
+
+Because the QP is external and RDMA_CM does not manage it, the user needs
+to call the libibverbs APIs directly.
+
+Usual flow for the active side will be:
+
+ * ibv_create_qp() <- create data QP.
+ * ece = ibv_query_ece() <- get ECE from the libibverbs provider.
+ * rdma_set_local_ece(ece) <- set desired ECE options.
+ * rdma_connect() <- send connection request
+ * ece = rdma_get_remote_ece() <- get ECE options from remote peer
+ * ibv_set_ece(ece) <- set local ECE options with data received from the peer.
+ * ibv_modify_qp() <- enable data QP.
+ * rdma_accept()/rdma_establish()/rdma_reject_ece()
+
+# ARGUMENTS
+
+*id*
+: RDMA communication identifier.
+
+*ece*
+: ECE parameters.
+
+# RETURN VALUE
+
+**rdma_set_local_ece()** returns 0 on success, or -1 on error. If an error
+occurs, errno will be set to indicate the failure reason.
+
+# SEE ALSO
+
+**rdma_cm**(7), rdma_get_remote_ece(3)
+
+# AUTHOR
+
+Leon Romanovsky
diff --git a/librdmacm/rdma_cma.h b/librdmacm/rdma_cma.h
index 1905033..e1f4e23 100644
--- a/librdmacm/rdma_cma.h
+++ b/librdmacm/rdma_cma.h
@@ -525,6 +525,14 @@ int rdma_reject(struct rdma_cm_id *id, const void *private_data,
 		uint8_t private_data_len);
 
 /**
+ * rdma_reject_ece - Reject a connection request with the ECE-rejected
+ * reason; otherwise the same as rdma_reject().
+ */
+int rdma_reject_ece(struct rdma_cm_id *id, const void *private_data,
+		    uint8_t private_data_len);
+
+/**
 * rdma_notify - Notifies the librdmacm of an asynchronous event.
 * @id: RDMA identifier.
 * @event: Asynchronous event.
@@ -753,6 +761,22 @@ void rdma_freeaddrinfo(struct rdma_addrinfo *res); */ int rdma_init_qp_attr(struct rdma_cm_id *id, struct ibv_qp_attr *qp_attr, int *qp_attr_mask); + +/** + * rdma_set_local_ece - Set local ECE options to be used for REQ/REP + * communication. In use to implement ECE handshake in external QP. + * @id: Communication identifier to establish connection + * @ece: ECE parameters + */ +int rdma_set_local_ece(struct rdma_cm_id *id, struct ibv_ece *ece); + +/** + * rdma_get_remote_ece - Provide remote ECE parameters as received + * in REQ/REP events. In use to implement ECE handshake in external QP. + * @id: Communication identifier to establish connection + * @ece: ECE parameters + */ +int rdma_get_remote_ece(struct rdma_cm_id *id, struct ibv_ece *ece); #ifdef __cplusplus } #endif diff --git a/librdmacm/rdma_cma_abi.h b/librdmacm/rdma_cma_abi.h index ab4adb0..9177282 100644 --- a/librdmacm/rdma_cma_abi.h +++ b/librdmacm/rdma_cma_abi.h @@ -180,6 +180,8 @@ struct ucma_abi_query_route_resp { __u32 num_paths; __u8 port_num; __u8 reserved[3]; + __u32 ibdev_index; + __u32 reserved1; }; struct ucma_abi_query_addr_resp { @@ -191,6 +193,8 @@ struct ucma_abi_query_addr_resp { __u16 dst_size; struct sockaddr_storage src_addr; struct sockaddr_storage dst_addr; + __u32 ibdev_index; + __u32 reserved1; }; struct ucma_abi_query_path_resp { @@ -223,6 +227,11 @@ struct ucma_abi_ud_param { __u8 reserved2[4]; /* Round to 8-byte boundary to support 32/64 */ }; +struct ucma_abi_ece { + __u32 vendor_id; + __u32 attr_mod; +}; + struct ucma_abi_connect { __u32 cmd; __u16 in; @@ -230,6 +239,7 @@ struct ucma_abi_connect { struct ucma_abi_conn_param conn_param; __u32 id; __u32 reserved; + struct ucma_abi_ece ece; }; struct ucma_abi_listen { @@ -248,6 +258,7 @@ struct ucma_abi_accept { struct ucma_abi_conn_param conn_param; __u32 id; __u32 reserved; + struct ucma_abi_ece ece; }; struct ucma_abi_reject { @@ -256,7 +267,8 @@ struct ucma_abi_reject { __u16 out; __u32 id; __u8 private_data_len; - __u8 reserved[3]; + __u8 reason; + __u8 reserved[2]; __u8 private_data[RDMA_MAX_PRIVATE_DATA]; }; @@ -322,6 +334,7 @@ struct ucma_abi_event_resp { struct ucma_abi_conn_param conn; struct ucma_abi_ud_param ud; } param; + struct ucma_abi_ece ece; }; struct ucma_abi_set_option { diff --git a/providers/bnxt_re/main.c b/providers/bnxt_re/main.c index 8893673..baeee73 100644 --- a/providers/bnxt_re/main.c +++ b/providers/bnxt_re/main.c @@ -100,8 +100,6 @@ static const struct verbs_context_ops bnxt_re_cntx_ops = { .create_cq = bnxt_re_create_cq, .poll_cq = bnxt_re_poll_cq, .req_notify_cq = bnxt_re_arm_cq, - .cq_event = bnxt_re_cq_event, - .resize_cq = bnxt_re_resize_cq, .destroy_cq = bnxt_re_destroy_cq, .create_srq = bnxt_re_create_srq, .modify_srq = bnxt_re_modify_srq, diff --git a/providers/bnxt_re/verbs.c b/providers/bnxt_re/verbs.c index 2218e3a..03237e7 100644 --- a/providers/bnxt_re/verbs.c +++ b/providers/bnxt_re/verbs.c @@ -215,11 +215,6 @@ fail: return NULL; } -int bnxt_re_resize_cq(struct ibv_cq *ibvcq, int ncqe) -{ - return -ENOSYS; -} - int bnxt_re_destroy_cq(struct ibv_cq *ibvcq) { int status; @@ -754,11 +749,6 @@ static void bnxt_re_cleanup_cq(struct bnxt_re_qp *qp, struct bnxt_re_cq *cq) pthread_spin_unlock(&cntx->fqlock); } -void bnxt_re_cq_event(struct ibv_cq *ibvcq) -{ - -} - int bnxt_re_arm_cq(struct ibv_cq *ibvcq, int flags) { struct bnxt_re_cq *cq = to_bnxt_re_cq(ibvcq); diff --git a/providers/bnxt_re/verbs.h b/providers/bnxt_re/verbs.h index 2e99488..b9fd84b 100644 --- a/providers/bnxt_re/verbs.h +++ 
b/providers/bnxt_re/verbs.h @@ -66,10 +66,8 @@ int bnxt_re_dereg_mr(struct verbs_mr *vmr); struct ibv_cq *bnxt_re_create_cq(struct ibv_context *uctx, int ncqe, struct ibv_comp_channel *ch, int vec); -int bnxt_re_resize_cq(struct ibv_cq *ibvcq, int ncqe); int bnxt_re_destroy_cq(struct ibv_cq *ibvcq); int bnxt_re_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc); -void bnxt_re_cq_event(struct ibv_cq *ibvcq); int bnxt_re_arm_cq(struct ibv_cq *ibvcq, int flags); struct ibv_qp *bnxt_re_create_qp(struct ibv_pd *ibvpd, diff --git a/providers/efa/efa-abi.h b/providers/efa/efa-abi.h index 7e1a878..7cd18b8 100644 --- a/providers/efa/efa-abi.h +++ b/providers/efa/efa-abi.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef __EFA_ABI_H__ @@ -12,12 +12,12 @@ #define EFA_ABI_VERSION 1 -DECLARE_DRV_CMD(efa_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, empty, - efa_ibv_alloc_ucontext_resp); +DECLARE_DRV_CMD(efa_alloc_ucontext, IB_USER_VERBS_CMD_GET_CONTEXT, + efa_ibv_alloc_ucontext_cmd, efa_ibv_alloc_ucontext_resp); DECLARE_DRV_CMD(efa_alloc_pd, IB_USER_VERBS_CMD_ALLOC_PD, empty, efa_ibv_alloc_pd_resp); -DECLARE_DRV_CMD(efa_create_cq, IB_USER_VERBS_CMD_CREATE_CQ, efa_ibv_create_cq, - efa_ibv_create_cq_resp); +DECLARE_DRV_CMD(efa_create_cq, IB_USER_VERBS_EX_CMD_CREATE_CQ, + efa_ibv_create_cq, efa_ibv_create_cq_resp); DECLARE_DRV_CMD(efa_create_qp, IB_USER_VERBS_CMD_CREATE_QP, efa_ibv_create_qp, efa_ibv_create_qp_resp); DECLARE_DRV_CMD(efa_create_ah, IB_USER_VERBS_CMD_CREATE_AH, empty, diff --git a/providers/efa/efa.c b/providers/efa/efa.c index 41955e5..35f9b24 100644 --- a/providers/efa/efa.c +++ b/providers/efa/efa.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include @@ -20,6 +20,7 @@ static void efa_free_context(struct ibv_context *ibvctx); static const struct verbs_match_ent efa_table[] = { VERBS_DRIVER_ID(RDMA_DRIVER_EFA), VERBS_PCI_MATCH(PCI_VENDOR_ID_AMAZON, 0xefa0, NULL), + VERBS_PCI_MATCH(PCI_VENDOR_ID_AMAZON, 0xefa1, NULL), {} }; @@ -27,6 +28,7 @@ static const struct verbs_context_ops efa_ctx_ops = { .alloc_pd = efa_alloc_pd, .create_ah = efa_create_ah, .create_cq = efa_create_cq, + .create_cq_ex = efa_create_cq_ex, .create_qp = efa_create_qp, .create_qp_ex = efa_create_qp_ex, .dealloc_pd = efa_dealloc_pd, @@ -51,18 +53,21 @@ static struct verbs_context *efa_alloc_context(struct ibv_device *vdev, void *private_data) { struct efa_alloc_ucontext_resp resp = {}; + struct efa_alloc_ucontext cmd = {}; struct ibv_device_attr_ex attr; - struct ibv_get_context cmd; unsigned int qp_table_sz; struct efa_context *ctx; int err; + cmd.comp_mask |= EFA_ALLOC_UCONTEXT_CMD_COMP_TX_BATCH; + cmd.comp_mask |= EFA_ALLOC_UCONTEXT_CMD_COMP_MIN_SQ_WR; + ctx = verbs_init_and_alloc_context(vdev, cmd_fd, ctx, ibvctx, RDMA_DRIVER_EFA); if (!ctx) return NULL; - if (ibv_cmd_get_context(&ctx->ibvctx, &cmd, sizeof(cmd), + if (ibv_cmd_get_context(&ctx->ibvctx, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp))) goto err_free_ctx; @@ -71,6 +76,8 @@ static struct verbs_context *efa_alloc_context(struct ibv_device *vdev, ctx->cqe_size = sizeof(struct efa_io_rx_cdesc); ctx->inline_buf_size = resp.inline_buf_size; ctx->max_llq_size = resp.max_llq_size; + ctx->max_tx_batch = resp.max_tx_batch; + ctx->min_sq_wr = resp.min_sq_wr; pthread_spin_init(&ctx->qp_table_lock, PTHREAD_PROCESS_PRIVATE); /* ah udata is mandatory for ah number retrieval */ diff --git a/providers/efa/efa.h b/providers/efa/efa.h index 5be7d71..f4488a4 100644 --- a/providers/efa/efa.h +++ b/providers/efa/efa.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef __EFA_H__ @@ -22,6 +22,15 @@ struct efa_context { uint16_t sub_cqs_per_cq; uint16_t inline_buf_size; uint32_t max_llq_size; + uint32_t device_caps; + uint32_t max_sq_wr; + uint32_t max_rq_wr; + uint16_t max_sq_sge; + uint16_t max_rq_sge; + uint32_t max_rdma_size; + uint16_t max_wr_rdma_sge; + uint16_t max_tx_batch; + uint16_t min_sq_wr; size_t cqe_size; struct efa_qp **qp_table; unsigned int qp_table_sz_m1; @@ -43,7 +52,7 @@ struct efa_sub_cq { }; struct efa_cq { - struct ibv_cq ibvcq; + struct verbs_cq verbs_cq; uint32_t cqn; size_t cqe_size; uint8_t *buf; @@ -52,6 +61,8 @@ struct efa_cq { /* Index of next sub cq idx to poll. This is used to guarantee fairness for sub cqs */ uint16_t next_poll_idx; pthread_spinlock_t lock; + struct efa_wq *cur_wq; + struct efa_io_cdesc_common *cur_cqe; struct efa_sub_cq sub_cq_arr[]; }; @@ -66,32 +77,32 @@ struct efa_wq { uint32_t wqe_cnt; uint32_t wqe_posted; uint32_t wqe_completed; - uint16_t desc_idx; + uint16_t pc; /* Producer counter */ uint16_t desc_mask; /* wrid_idx_pool_next: Index of the next entry to use in wrid_idx_pool. 
*/ uint16_t wrid_idx_pool_next; int max_sge; int phase; pthread_spinlock_t wqlock; + + uint32_t *db; + uint16_t sub_cq_idx; }; struct efa_rq { struct efa_wq wq; - uint32_t *db; uint8_t *buf; size_t buf_size; - uint16_t sub_cq_idx; }; struct efa_sq { struct efa_wq wq; - uint32_t *db; uint8_t *desc; uint32_t desc_offset; size_t desc_ring_mmap_size; size_t max_inline_data; size_t max_wr_rdma_sge; - uint16_t sub_cq_idx; + uint16_t max_batch_wr; /* Buffer for pending WR entries in the current session */ uint8_t *local_queue; @@ -126,20 +137,8 @@ struct efa_ah { struct efa_dev { struct verbs_device vdev; uint32_t pg_sz; - uint32_t device_caps; - uint32_t max_sq_wr; - uint32_t max_rq_wr; - uint16_t max_sq_sge; - uint16_t max_rq_sge; - uint32_t max_rdma_size; - uint16_t max_wr_rdma_sge; }; -static inline bool is_rdma_read_cap(struct efa_dev *dev) -{ - return dev->device_caps & EFA_QUERY_DEVICE_CAPS_RDMA_READ; -} - static inline struct efa_dev *to_efa_dev(struct ibv_device *ibvdev) { return container_of(ibvdev, struct efa_dev, vdev.device); @@ -157,7 +156,12 @@ static inline struct efa_pd *to_efa_pd(struct ibv_pd *ibvpd) static inline struct efa_cq *to_efa_cq(struct ibv_cq *ibvcq) { - return container_of(ibvcq, struct efa_cq, ibvcq); + return container_of(ibvcq, struct efa_cq, verbs_cq.cq); +} + +static inline struct efa_cq *to_efa_cq_ex(struct ibv_cq_ex *ibvcqx) +{ + return container_of(ibvcqx, struct efa_cq, verbs_cq.cq_ex); } static inline struct efa_qp *to_efa_qp(struct ibv_qp *ibvqp) diff --git a/providers/efa/efadv.h b/providers/efa/efadv.h index 91458f3..2a2222c 100644 --- a/providers/efa/efadv.h +++ b/providers/efa/efadv.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */ /* - * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. */ #ifndef __EFADV_H__ @@ -37,6 +37,7 @@ struct ibv_qp *efadv_create_qp_ex(struct ibv_context *ibvctx, enum { EFADV_DEVICE_ATTR_CAPS_RDMA_READ = 1 << 0, + EFADV_DEVICE_ATTR_CAPS_RNR_RETRY = 1 << 1, }; struct efadv_device_attr { diff --git a/providers/efa/man/efadv_query_device.3.md b/providers/efa/man/efadv_query_device.3.md index 863090e..bb3af00 100644 --- a/providers/efa/man/efadv_query_device.3.md +++ b/providers/efa/man/efadv_query_device.3.md @@ -69,6 +69,9 @@ struct efadv_device_attr { EFADV_DEVICE_ATTR_CAPS_RDMA_READ: RDMA read is supported. + EFADV_DEVICE_ATTR_CAPS_RNR_RETRY: + RNR retry is supported for SRD QPs. + *max_rdma_size* : Maximum RDMA transfer size in bytes. diff --git a/providers/efa/verbs.c b/providers/efa/verbs.c index 03b8cf9..54d208a 100644 --- a/providers/efa/verbs.c +++ b/providers/efa/verbs.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause /* - * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved. + * Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved. 
*/ #include @@ -22,6 +22,9 @@ #include "efadv.h" #include "verbs.h" +#define EFA_DEV_CAP(ctx, cap) \ + ((ctx)->device_caps & EFA_QUERY_DEVICE_CAPS_##cap) + static bool is_buf_cleared(void *buf, size_t len) { int i; @@ -34,11 +37,25 @@ static bool is_buf_cleared(void *buf, size_t len) return true; } +#define min3(a, b, c) \ + ({ \ + typeof(a) _tmpmin = min(a, b); \ + min(_tmpmin, c); \ + }) + #define is_ext_cleared(ptr, inlen) \ is_buf_cleared(ptr + sizeof(*ptr), inlen - sizeof(*ptr)) #define is_reserved_cleared(reserved) is_buf_cleared(reserved, sizeof(reserved)) +struct efa_wq_init_attr { + uint64_t db_mmap_key; + uint32_t db_off; + int cmd_fd; + int pgsz; + uint16_t sub_cq_idx; +}; + int efa_query_device(struct ibv_context *ibvctx, struct ibv_device_attr *dev_attr) { @@ -74,7 +91,6 @@ int efa_query_device_ex(struct ibv_context *context, size_t attr_size) { struct efa_context *ctx = to_efa_context(context); - struct efa_dev *dev = to_efa_dev(context->device); int cmd_supp_uhw = ctx->cmds_supp_udata_mask & EFA_USER_CMDS_SUPP_UDATA_QUERY_DEVICE; struct ibv_device_attr *a = &attr->orig_attr; @@ -90,13 +106,13 @@ int efa_query_device_ex(struct ibv_context *context, if (err) return err; - dev->device_caps = resp.device_caps; - dev->max_sq_wr = resp.max_sq_wr; - dev->max_rq_wr = resp.max_rq_wr; - dev->max_sq_sge = resp.max_sq_sge; - dev->max_rq_sge = resp.max_rq_sge; - dev->max_rdma_size = resp.max_rdma_size; - dev->max_wr_rdma_sge = a->max_sge_rd; + ctx->device_caps = resp.device_caps; + ctx->max_sq_wr = resp.max_sq_wr; + ctx->max_rq_wr = resp.max_rq_wr; + ctx->max_sq_sge = resp.max_sq_sge; + ctx->max_rq_sge = resp.max_rq_sge; + ctx->max_rdma_size = resp.max_rdma_size; + ctx->max_wr_rdma_sge = a->max_sge_rd; a->max_qp_wr = min_t(int, a->max_qp_wr, ctx->max_llq_size / sizeof(struct efa_io_tx_wqe)); @@ -111,7 +127,6 @@ int efadv_query_device(struct ibv_context *ibvctx, uint32_t inlen) { struct efa_context *ctx = to_efa_context(ibvctx); - struct efa_dev *dev = to_efa_dev(ibvctx->device); uint64_t comp_mask_out = 0; if (!is_efa_dev(ibvctx->device)) @@ -121,17 +136,20 @@ int efadv_query_device(struct ibv_context *ibvctx, return EINVAL; memset(attr, 0, inlen); - attr->max_sq_wr = dev->max_sq_wr; - attr->max_rq_wr = dev->max_rq_wr; - attr->max_sq_sge = dev->max_sq_sge; - attr->max_rq_sge = dev->max_rq_sge; + attr->max_sq_wr = ctx->max_sq_wr; + attr->max_rq_wr = ctx->max_rq_wr; + attr->max_sq_sge = ctx->max_sq_sge; + attr->max_rq_sge = ctx->max_rq_sge; attr->inline_buf_size = ctx->inline_buf_size; if (vext_field_avail(typeof(*attr), max_rdma_size, inlen)) { - attr->max_rdma_size = dev->max_rdma_size; + attr->max_rdma_size = ctx->max_rdma_size; - if (is_rdma_read_cap(dev)) + if (EFA_DEV_CAP(ctx, RDMA_READ)) attr->device_caps |= EFADV_DEVICE_ATTR_CAPS_RDMA_READ; + + if (EFA_DEV_CAP(ctx, RNR_RETRY)) + attr->device_caps |= EFADV_DEVICE_ATTR_CAPS_RNR_RETRY; } attr->comp_mask = comp_mask_out; @@ -214,13 +232,38 @@ int efa_dereg_mr(struct verbs_mr *vmr) return 0; } +static uint32_t efa_wq_get_next_wrid_idx_locked(struct efa_wq *wq, + uint64_t wr_id) +{ + uint32_t wrid_idx; + + /* Get the next wrid to be used from the index pool */ + wrid_idx = wq->wrid_idx_pool[wq->wrid_idx_pool_next]; + wq->wrid[wrid_idx] = wr_id; + + /* Will never overlap, as validate function succeeded */ + wq->wrid_idx_pool_next++; + assert(wq->wrid_idx_pool_next <= wq->wqe_cnt); + + return wrid_idx; +} + +static void efa_wq_put_wrid_idx_unlocked(struct efa_wq *wq, uint32_t wrid_idx) +{ + pthread_spin_lock(&wq->wqlock); + 
+	wq->wrid_idx_pool_next--;
+	wq->wrid_idx_pool[wq->wrid_idx_pool_next] = wrid_idx;
+	wq->wqe_completed++;
+	pthread_spin_unlock(&wq->wqlock);
+}
+
 static uint32_t efa_sub_cq_get_current_index(struct efa_sub_cq *sub_cq)
 {
 	return sub_cq->consumed_cnt & sub_cq->qmask;
 }
 
 static int efa_cqe_is_pending(struct efa_io_cdesc_common *cqe_common,
-			      int phase)
+			       int phase)
 {
 	return EFA_GET(&cqe_common->flags, EFA_IO_CDESC_COMMON_PHASE) == phase;
 }
@@ -232,97 +275,6 @@ efa_sub_cq_get_cqe(struct efa_sub_cq *sub_cq, int entry)
 	       (entry * sub_cq->cqe_size));
 }
 
-static void efa_sub_cq_initialize(struct efa_sub_cq *sub_cq, uint8_t *buf,
-				  int sub_cq_size, int cqe_size)
-{
-	sub_cq->consumed_cnt = 0;
-	sub_cq->phase = 1;
-	sub_cq->buf = buf;
-	sub_cq->qmask = sub_cq_size - 1;
-	sub_cq->cqe_size = cqe_size;
-	sub_cq->ref_cnt = 0;
-}
-
-struct ibv_cq *efa_create_cq(struct ibv_context *ibvctx, int ncqe,
-			     struct ibv_comp_channel *channel, int vec)
-{
-	struct efa_context *ctx = to_efa_context(ibvctx);
-	struct efa_create_cq_resp resp = {};
-	struct efa_create_cq cmd = {};
-	uint16_t num_sub_cqs;
-	struct efa_cq *cq;
-	int sub_buf_size;
-	int sub_cq_size;
-	uint8_t *buf;
-	int err;
-	int i;
-
-	cq = calloc(1, sizeof(*cq) +
-		    sizeof(*cq->sub_cq_arr) * ctx->sub_cqs_per_cq);
-	if (!cq)
-		return NULL;
-
-	num_sub_cqs = ctx->sub_cqs_per_cq;
-	cmd.num_sub_cqs = num_sub_cqs;
-	cmd.cq_entry_size = ctx->cqe_size;
-
-	ncqe = roundup_pow_of_two(ncqe);
-	err = ibv_cmd_create_cq(ibvctx, ncqe, channel, vec,
-				&cq->ibvcq, &cmd.ibv_cmd, sizeof(cmd),
-				&resp.ibv_resp, sizeof(resp));
-	if (err) {
-		errno = err;
-		goto err_free_cq;
-	}
-
-	sub_cq_size = cq->ibvcq.cqe;
-	cq->cqn = resp.cq_idx;
-	cq->buf_size = resp.q_mmap_size;
-	cq->num_sub_cqs = num_sub_cqs;
-	cq->cqe_size = ctx->cqe_size;
-
-	cq->buf = mmap(NULL, cq->buf_size, PROT_READ, MAP_SHARED,
-		       ibvctx->cmd_fd, resp.q_mmap_key);
-	if (cq->buf == MAP_FAILED)
-		goto err_destroy_cq;
-
-	buf = cq->buf;
-	sub_buf_size = cq->cqe_size * sub_cq_size;
-	for (i = 0; i < num_sub_cqs; i++) {
-		efa_sub_cq_initialize(&cq->sub_cq_arr[i], buf, sub_cq_size,
-				      cq->cqe_size);
-		buf += sub_buf_size;
-	}
-
-	pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE);
-
-	return &cq->ibvcq;
-
-err_destroy_cq:
-	ibv_cmd_destroy_cq(&cq->ibvcq);
-err_free_cq:
-	free(cq);
-	return NULL;
-}
-
-int efa_destroy_cq(struct ibv_cq *ibvcq)
-{
-	struct efa_cq *cq = to_efa_cq(ibvcq);
-	int err;
-
-	munmap(cq->buf, cq->buf_size);
-
-	pthread_spin_destroy(&cq->lock);
-
-	err = ibv_cmd_destroy_cq(ibvcq);
-	if (err)
-		return err;
-
-	free(cq);
-
-	return 0;
-}
-
 static struct efa_io_cdesc_common *
 cq_next_sub_cqe_get(struct efa_sub_cq *sub_cq)
 {
@@ -376,44 +328,26 @@ static enum ibv_wc_status to_ibv_status(enum efa_io_comp_status status)
 	}
 }
 
-static int efa_poll_sub_cq(struct efa_cq *cq, struct efa_sub_cq *sub_cq,
-			   struct efa_qp **cur_qp, struct ibv_wc *wc)
+static void efa_process_cqe(struct efa_cq *cq, struct ibv_wc *wc,
+			    struct efa_qp *qp)
 {
-	struct efa_context *ctx = to_efa_context(cq->ibvcq.context);
-	struct efa_io_cdesc_common *cqe;
-	uint32_t qpn, wrid_idx;
-	struct efa_wq *wq;
-
-	cqe = cq_next_sub_cqe_get(sub_cq);
-	if (!cqe)
-		return ENOMEM;
-
-	qpn = cqe->qp_num;
-	if (!*cur_qp || qpn != (*cur_qp)->verbs_qp.qp.qp_num) {
-		/* We do not have to take the QP table lock here,
-		 * because CQs will be locked while QPs are removed
-		 * from the table.
-		 */
-		*cur_qp = ctx->qp_table[qpn & ctx->qp_table_sz_m1];
-		if (!*cur_qp)
-			return EINVAL;
-	}
+	struct efa_io_cdesc_common *cqe = cq->cur_cqe;
+	uint32_t wrid_idx;
 
-	wrid_idx = cqe->req_id;
 	wc->status = to_ibv_status(cqe->status);
 	wc->vendor_err = cqe->status;
 	wc->wc_flags = 0;
-	wc->qp_num = qpn;
+	wc->qp_num = cqe->qp_num;
 
 	if (EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_Q_TYPE) ==
 	    EFA_IO_SEND_QUEUE) {
-		wq = &(*cur_qp)->sq.wq;
+		cq->cur_wq = &qp->sq.wq;
 		wc->opcode = IBV_WC_SEND;
 	} else {
 		struct efa_io_rx_cdesc *rcqe =
 			container_of(cqe, struct efa_io_rx_cdesc, common);
 
-		wq = &(*cur_qp)->rq.wq;
+		cq->cur_wq = &qp->rq.wq;
 
 		wc->byte_len = cqe->length;
 		wc->opcode = IBV_WC_RECV;
@@ -427,23 +361,78 @@ static int efa_poll_sub_cq(struct efa_cq *cq, struct efa_sub_cq *sub_cq,
 		}
 	}
 
-	pthread_spin_lock(&wq->wqlock);
-	wq->wrid_idx_pool_next--;
-	wq->wrid_idx_pool[wq->wrid_idx_pool_next] = wrid_idx;
-	wc->wr_id = wq->wrid[wrid_idx];
-	wq->wqe_completed++;
-	pthread_spin_unlock(&wq->wqlock);
+	wrid_idx = cqe->req_id;
+	/* We do not have to take the WQ lock here,
+	 * because this wrid index has not been freed yet,
+	 * so there is no contention on this index.
+	 */
+	wc->wr_id = cq->cur_wq->wrid[wrid_idx];
+}
+
+static void efa_process_ex_cqe(struct efa_cq *cq, struct efa_qp *qp)
+{
+	struct ibv_cq_ex *ibvcqx = &cq->verbs_cq.cq_ex;
+	struct efa_io_cdesc_common *cqe = cq->cur_cqe;
+	uint32_t wrid_idx;
+
+	wrid_idx = cqe->req_id;
+
+	if (EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_Q_TYPE) ==
+	    EFA_IO_SEND_QUEUE) {
+		cq->cur_wq = &qp->sq.wq;
+	} else {
+		cq->cur_wq = &qp->rq.wq;
+	}
+
+	ibvcqx->wr_id = cq->cur_wq->wrid[wrid_idx];
+	ibvcqx->status = to_ibv_status(cqe->status);
+}
+
+static inline int efa_poll_sub_cq(struct efa_cq *cq, struct efa_sub_cq *sub_cq,
+				  struct efa_qp **cur_qp, struct ibv_wc *wc,
+				  bool extended) ALWAYS_INLINE;
+static inline int efa_poll_sub_cq(struct efa_cq *cq, struct efa_sub_cq *sub_cq,
+				  struct efa_qp **cur_qp, struct ibv_wc *wc,
+				  bool extended)
+{
+	struct efa_context *ctx = to_efa_context(cq->verbs_cq.cq.context);
+	uint32_t qpn;
+
+	cq->cur_cqe = cq_next_sub_cqe_get(sub_cq);
+	if (!cq->cur_cqe)
+		return ENOENT;
+
+	qpn = cq->cur_cqe->qp_num;
+	if (!*cur_qp || qpn != (*cur_qp)->verbs_qp.qp.qp_num) {
+		/* We do not have to take the QP table lock here,
+		 * because CQs will be locked while QPs are removed
+		 * from the table.
+		 */
+		*cur_qp = ctx->qp_table[qpn & ctx->qp_table_sz_m1];
+		if (!*cur_qp)
+			return EINVAL;
+	}
+
+	if (extended) {
+		efa_process_ex_cqe(cq, *cur_qp);
+	} else {
+		efa_process_cqe(cq, wc, *cur_qp);
+		efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id);
+	}
 
 	return 0;
 }
 
-static int efa_poll_sub_cqs(struct efa_cq *cq, struct ibv_wc *wc)
+static inline int efa_poll_sub_cqs(struct efa_cq *cq, struct ibv_wc *wc,
+				   bool extended) ALWAYS_INLINE;
+static inline int efa_poll_sub_cqs(struct efa_cq *cq, struct ibv_wc *wc,
+				   bool extended)
 {
 	uint16_t num_sub_cqs = cq->num_sub_cqs;
 	struct efa_sub_cq *sub_cq;
 	struct efa_qp *qp = NULL;
 	uint16_t sub_cq_idx;
-	int err = ENOMEM;
+	int err = ENOENT;
 
 	for (sub_cq_idx = 0; sub_cq_idx < num_sub_cqs; sub_cq_idx++) {
 		sub_cq = &cq->sub_cq_arr[cq->next_poll_idx++];
@@ -452,8 +441,8 @@ static int efa_poll_sub_cqs(struct efa_cq *cq, struct ibv_wc *wc)
 		if (!sub_cq->ref_cnt)
 			continue;
 
-		err = efa_poll_sub_cq(cq, sub_cq, &qp, wc);
-		if (err != ENOMEM)
+		err = efa_poll_sub_cq(cq, sub_cq, &qp, wc, extended);
+		if (err != ENOENT)
 			break;
 	}
 
@@ -468,9 +457,9 @@ int efa_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc)
 	pthread_spin_lock(&cq->lock);
 	for (i = 0; i < nwc; i++) {
-		ret = efa_poll_sub_cqs(cq, &wc[i]);
+		ret = efa_poll_sub_cqs(cq, &wc[i], false);
 		if (ret) {
-			if (ret == ENOMEM)
+			if (ret == ENOENT)
 				ret = 0;
 			break;
 		}
@@ -480,6 +469,275 @@ int efa_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc)
 	return i ?: -ret;
 }
 
+static int efa_start_poll(struct ibv_cq_ex *ibvcqx,
+			  struct ibv_poll_cq_attr *attr)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+	int ret;
+
+	if (unlikely(attr->comp_mask))
+		return EINVAL;
+
+	pthread_spin_lock(&cq->lock);
+
+	ret = efa_poll_sub_cqs(cq, NULL, true);
+	if (ret)
+		pthread_spin_unlock(&cq->lock);
+
+	return ret;
+}
+
+static int efa_next_poll(struct ibv_cq_ex *ibvcqx)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+	int ret;
+
+	efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id);
+	ret = efa_poll_sub_cqs(cq, NULL, true);
+
+	return ret;
+}
+
+static void efa_end_poll(struct ibv_cq_ex *ibvcqx)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+
+	if (cq->cur_cqe)
+		efa_wq_put_wrid_idx_unlocked(cq->cur_wq, cq->cur_cqe->req_id);
+
+	pthread_spin_unlock(&cq->lock);
+}
+
+static enum ibv_wc_opcode efa_wc_read_opcode(struct ibv_cq_ex *ibvcqx)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+	struct efa_io_cdesc_common *cqe = cq->cur_cqe;
+
+	if (EFA_GET(&cqe->flags, EFA_IO_CDESC_COMMON_Q_TYPE) ==
+	    EFA_IO_SEND_QUEUE)
+		return IBV_WC_SEND;
+
+	return IBV_WC_RECV;
+}
+
+static uint32_t efa_wc_read_vendor_err(struct ibv_cq_ex *ibvcqx)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+
+	return cq->cur_cqe->status;
+}
+
+static unsigned int efa_wc_read_wc_flags(struct ibv_cq_ex *ibvcqx)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+	unsigned int wc_flags = 0;
+
+	if (EFA_GET(&cq->cur_cqe->flags, EFA_IO_CDESC_COMMON_HAS_IMM))
+		wc_flags |= IBV_WC_WITH_IMM;
+
+	return wc_flags;
+}
+
+static uint32_t efa_wc_read_byte_len(struct ibv_cq_ex *ibvcqx)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+
+	return cq->cur_cqe->length;
+}
+
+static __be32 efa_wc_read_imm_data(struct ibv_cq_ex *ibvcqx)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+	struct efa_io_rx_cdesc *rcqe;
+
+	rcqe = container_of(cq->cur_cqe, struct efa_io_rx_cdesc, common);
+
+	return htobe32(rcqe->imm);
+}
+
+static uint32_t efa_wc_read_qp_num(struct ibv_cq_ex *ibvcqx)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+
+	return cq->cur_cqe->qp_num;
+}
+
+static uint32_t efa_wc_read_src_qp(struct ibv_cq_ex *ibvcqx)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+	struct efa_io_rx_cdesc *rcqe;
+
+	rcqe = container_of(cq->cur_cqe, struct efa_io_rx_cdesc, common);
+
+	return rcqe->src_qp_num;
+}
+
+static uint32_t efa_wc_read_slid(struct ibv_cq_ex *ibvcqx)
+{
+	struct efa_cq *cq = to_efa_cq_ex(ibvcqx);
+	struct efa_io_rx_cdesc *rcqe;
+
+	rcqe = container_of(cq->cur_cqe, struct efa_io_rx_cdesc, common);
+
+	return rcqe->ah;
+}
+
+static uint8_t efa_wc_read_sl(struct ibv_cq_ex *ibvcqx)
+{
+	return 0;
+}
+
+static uint8_t efa_wc_read_dlid_path_bits(struct ibv_cq_ex *ibvcqx)
+{
+	return 0;
+}
+
+static void efa_cq_fill_pfns(struct ibv_cq_ex *ibvcqx,
+			     struct ibv_cq_init_attr_ex *attr)
+{
+	ibvcqx->start_poll = efa_start_poll;
+	ibvcqx->end_poll = efa_end_poll;
+	ibvcqx->next_poll = efa_next_poll;
+
+	ibvcqx->read_opcode = efa_wc_read_opcode;
+	ibvcqx->read_vendor_err = efa_wc_read_vendor_err;
+	ibvcqx->read_wc_flags = efa_wc_read_wc_flags;
+
+	if (attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN)
+		ibvcqx->read_byte_len = efa_wc_read_byte_len;
+	if (attr->wc_flags & IBV_WC_EX_WITH_IMM)
+		ibvcqx->read_imm_data = efa_wc_read_imm_data;
+	if (attr->wc_flags & IBV_WC_EX_WITH_QP_NUM)
+		ibvcqx->read_qp_num = efa_wc_read_qp_num;
+	if (attr->wc_flags & IBV_WC_EX_WITH_SRC_QP)
+		ibvcqx->read_src_qp = efa_wc_read_src_qp;
+	if (attr->wc_flags & IBV_WC_EX_WITH_SLID)
+		ibvcqx->read_slid = efa_wc_read_slid;
+	if (attr->wc_flags & IBV_WC_EX_WITH_SL)
+		ibvcqx->read_sl = efa_wc_read_sl;
+	if (attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS)
+		ibvcqx->read_dlid_path_bits = efa_wc_read_dlid_path_bits;
+}
+
+static void efa_sub_cq_initialize(struct efa_sub_cq *sub_cq, uint8_t *buf,
+				  int sub_cq_size, int cqe_size)
+{
+	sub_cq->consumed_cnt = 0;
+	sub_cq->phase = 1;
+	sub_cq->buf = buf;
+	sub_cq->qmask = sub_cq_size - 1;
+	sub_cq->cqe_size = cqe_size;
+	sub_cq->ref_cnt = 0;
+}
+
+static struct ibv_cq_ex *create_cq(struct ibv_context *ibvctx,
+				   struct ibv_cq_init_attr_ex *attr)
+{
+	struct efa_context *ctx = to_efa_context(ibvctx);
+	struct efa_create_cq_resp resp = {};
+	struct efa_create_cq cmd = {};
+	uint16_t num_sub_cqs;
+	struct efa_cq *cq;
+	int sub_buf_size;
+	int sub_cq_size;
+	uint8_t *buf;
+	int err;
+	int i;
+
+	cq = calloc(1, sizeof(*cq) +
+		    sizeof(*cq->sub_cq_arr) * ctx->sub_cqs_per_cq);
+	if (!cq)
+		return NULL;
+
+	num_sub_cqs = ctx->sub_cqs_per_cq;
+	cmd.num_sub_cqs = num_sub_cqs;
+	cmd.cq_entry_size = ctx->cqe_size;
+
+	attr->cqe = roundup_pow_of_two(attr->cqe);
+	err = ibv_cmd_create_cq_ex(ibvctx, attr, &cq->verbs_cq,
+				   &cmd.ibv_cmd, sizeof(cmd),
+				   &resp.ibv_resp, sizeof(resp));
+	if (err) {
+		errno = err;
+		goto err_free_cq;
+	}
+
+	sub_cq_size = cq->verbs_cq.cq.cqe;
+	cq->cqn = resp.cq_idx;
+	cq->buf_size = resp.q_mmap_size;
+	cq->num_sub_cqs = num_sub_cqs;
+	cq->cqe_size = ctx->cqe_size;
+
+	cq->buf = mmap(NULL, cq->buf_size, PROT_READ, MAP_SHARED,
+		       ibvctx->cmd_fd, resp.q_mmap_key);
+	if (cq->buf == MAP_FAILED)
+		goto err_destroy_cq;
+
+	buf = cq->buf;
+	sub_buf_size = cq->cqe_size * sub_cq_size;
+	for (i = 0; i < num_sub_cqs; i++) {
+		efa_sub_cq_initialize(&cq->sub_cq_arr[i], buf, sub_cq_size,
+				      cq->cqe_size);
+		buf += sub_buf_size;
+	}
+
+	efa_cq_fill_pfns(&cq->verbs_cq.cq_ex, attr);
+	pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE);
+
+	return &cq->verbs_cq.cq_ex;
+
+err_destroy_cq:
+	ibv_cmd_destroy_cq(&cq->verbs_cq.cq);
+err_free_cq:
+	free(cq);
+	return NULL;
+}
+
+struct ibv_cq *efa_create_cq(struct ibv_context *ibvctx, int ncqe,
+			     struct ibv_comp_channel *channel, int vec)
+{
+	struct ibv_cq_init_attr_ex attr_ex = {
+		.cqe = ncqe,
+		.channel = channel,
+		.comp_vector = vec
+	};
+	struct ibv_cq_ex *ibvcqx;
+
+	ibvcqx = create_cq(ibvctx, &attr_ex);
+
+	return ibvcqx ? ibv_cq_ex_to_cq(ibvcqx) : NULL;
+}
+
+struct ibv_cq_ex *efa_create_cq_ex(struct ibv_context *ibvctx,
+				   struct ibv_cq_init_attr_ex *attr_ex)
+{
+	if (!check_comp_mask(attr_ex->comp_mask, 0) ||
+	    !check_comp_mask(attr_ex->wc_flags, IBV_WC_STANDARD_FLAGS)) {
+		errno = EOPNOTSUPP;
+		return NULL;
+	}
+
+	return create_cq(ibvctx, attr_ex);
+}
+
+int efa_destroy_cq(struct ibv_cq *ibvcq)
+{
+	struct efa_cq *cq = to_efa_cq(ibvcq);
+	int err;
+
+	munmap(cq->buf, cq->buf_size);
+
+	pthread_spin_destroy(&cq->lock);
+
+	err = ibv_cmd_destroy_cq(ibvcq);
+	if (err)
+		return err;
+
+	free(cq);
+
+	return 0;
+}
+
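As an illustrative aside (not part of the patch): once `efa_create_cq_ex()` is wired up, applications consume it through the standard extended-CQ verbs. A minimal sketch of the start/next/end polling loop that the new pfns serve, assuming an already-opened `ctx`:

```c
#include <errno.h>
#include <stdio.h>
#include <infiniband/verbs.h>

static int drain_cq(struct ibv_cq_ex *cq)
{
	struct ibv_poll_cq_attr attr = {};
	int ret;

	ret = ibv_start_poll(cq, &attr);
	if (ret)
		return ret == ENOENT ? 0 : ret; /* empty CQ is not an error */

	do {
		/* Fields are valid only between start_poll and end_poll */
		if (cq->status == IBV_WC_SUCCESS)
			printf("wr_id=%llu opcode=%d\n",
			       (unsigned long long)cq->wr_id,
			       ibv_wc_read_opcode(cq));
	} while ((ret = ibv_next_poll(cq)) == 0);

	ibv_end_poll(cq);
	return ret == ENOENT ? 0 : ret;
}

static struct ibv_cq_ex *make_cq(struct ibv_context *ctx)
{
	struct ibv_cq_init_attr_ex attr_ex = {
		.cqe = 128,
		.wc_flags = IBV_WC_EX_WITH_BYTE_LEN | IBV_WC_EX_WITH_QP_NUM,
	};

	return ibv_create_cq_ex(ctx, &attr_ex);
}
```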
 static void efa_cq_inc_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx)
 {
 	cq->sub_cq_arr[sub_cq_idx].ref_cnt++;
@@ -490,15 +748,22 @@ static void efa_cq_dec_ref_cnt(struct efa_cq *cq, uint8_t sub_cq_idx)
 	cq->sub_cq_arr[sub_cq_idx].ref_cnt--;
 }
 
-static void efa_wq_terminate(struct efa_wq *wq)
+static void efa_wq_terminate(struct efa_wq *wq, int pgsz)
 {
+	void *db_aligned;
+
 	pthread_spin_destroy(&wq->wqlock);
+
+	db_aligned = (void *)((uintptr_t)wq->db & ~(pgsz - 1));
+	munmap(db_aligned, pgsz);
+
 	free(wq->wrid_idx_pool);
 	free(wq->wrid);
 }
 
-static int efa_wq_initialize(struct efa_wq *wq)
+static int efa_wq_initialize(struct efa_wq *wq, struct efa_wq_init_attr *attr)
 {
+	uint8_t *db_base;
 	int err;
 	int i;
 
@@ -512,144 +777,158 @@ static int efa_wq_initialize(struct efa_wq *wq)
 		goto err_free_wrid;
 	}
 
+	db_base = mmap(NULL, attr->pgsz, PROT_WRITE, MAP_SHARED, attr->cmd_fd,
+		       attr->db_mmap_key);
+	if (db_base == MAP_FAILED) {
+		err = errno;
+		goto err_free_wrid_idx_pool;
+	}
+
+	wq->db = (uint32_t *)(db_base + attr->db_off);
+
 	/* Initialize the wrid free indexes pool.
 	 */
 	for (i = 0; i < wq->wqe_cnt; i++)
 		wq->wrid_idx_pool[i] = i;
 
 	pthread_spin_init(&wq->wqlock, PTHREAD_PROCESS_PRIVATE);
 
+	wq->sub_cq_idx = attr->sub_cq_idx;
+
 	return 0;
 
+err_free_wrid_idx_pool:
+	free(wq->wrid_idx_pool);
 err_free_wrid:
 	free(wq->wrid);
-
 	return err;
 }
 
 static void efa_sq_terminate(struct efa_qp *qp)
 {
-	void *db_aligned;
+	struct efa_sq *sq = &qp->sq;
 
-	if (!qp->sq.wq.wrid)
+	if (!sq->wq.wqe_cnt)
 		return;
 
-	db_aligned = (void *)((uintptr_t)qp->sq.db & ~(qp->page_size - 1));
-	munmap(db_aligned, qp->page_size);
-	munmap(qp->sq.desc - qp->sq.desc_offset, qp->sq.desc_ring_mmap_size);
-	free(qp->sq.local_queue);
+	munmap(sq->desc - sq->desc_offset, sq->desc_ring_mmap_size);
+	free(sq->local_queue);
 
-	efa_wq_terminate(&qp->sq.wq);
+	efa_wq_terminate(&sq->wq, qp->page_size);
 }
 
-static int efa_sq_initialize(struct efa_qp *qp, struct efa_create_qp_resp *resp)
+static int efa_sq_initialize(struct efa_qp *qp,
+			     const struct ibv_qp_init_attr_ex *attr,
+			     struct efa_create_qp_resp *resp)
 {
-	struct efa_dev *dev = to_efa_dev(qp->verbs_qp.qp.context->device);
+	struct efa_context *ctx = to_efa_context(qp->verbs_qp.qp.context);
+	struct efa_wq_init_attr wq_attr;
+	struct efa_sq *sq = &qp->sq;
 	size_t desc_ring_size;
-	uint8_t *db_base;
 	int err;
 
-	if (!qp->sq.wq.wqe_cnt)
+	if (!sq->wq.wqe_cnt)
 		return 0;
 
-	err = efa_wq_initialize(&qp->sq.wq);
+	wq_attr = (struct efa_wq_init_attr) {
+		.db_mmap_key = resp->sq_db_mmap_key,
+		.db_off = resp->sq_db_offset,
+		.cmd_fd = qp->verbs_qp.qp.context->cmd_fd,
+		.pgsz = qp->page_size,
+		.sub_cq_idx = resp->send_sub_cq_idx,
+	};
+
+	err = efa_wq_initialize(&qp->sq.wq, &wq_attr);
 	if (err)
 		return err;
 
-	qp->sq.desc_offset = resp->llq_desc_offset;
-	desc_ring_size = qp->sq.wq.wqe_cnt * sizeof(struct efa_io_tx_wqe);
-	qp->sq.desc_ring_mmap_size = align(desc_ring_size + qp->sq.desc_offset,
-					   qp->page_size);
-	qp->sq.max_inline_data = resp->ibv_resp.max_inline_data;
+	sq->desc_offset = resp->llq_desc_offset;
+	desc_ring_size = sq->wq.wqe_cnt * sizeof(struct efa_io_tx_wqe);
+	sq->desc_ring_mmap_size = align(desc_ring_size + sq->desc_offset,
+					qp->page_size);
+	sq->max_inline_data = attr->cap.max_inline_data;
 
-	qp->sq.local_queue = malloc(desc_ring_size);
-	if (!qp->sq.local_queue) {
+	sq->local_queue = malloc(desc_ring_size);
+	if (!sq->local_queue) {
 		err = ENOMEM;
 		goto err_terminate_wq;
 	}
 
-	qp->sq.desc = mmap(NULL, qp->sq.desc_ring_mmap_size, PROT_WRITE,
-			   MAP_SHARED, qp->verbs_qp.qp.context->cmd_fd,
-			   resp->llq_desc_mmap_key);
-	if (qp->sq.desc == MAP_FAILED) {
+	sq->desc = mmap(NULL, sq->desc_ring_mmap_size, PROT_WRITE,
+			MAP_SHARED, qp->verbs_qp.qp.context->cmd_fd,
+			resp->llq_desc_mmap_key);
+	if (sq->desc == MAP_FAILED) {
 		err = errno;
 		goto err_free_local_queue;
 	}
 
-	qp->sq.desc += qp->sq.desc_offset;
-
-	db_base = mmap(NULL, qp->page_size, PROT_WRITE, MAP_SHARED,
-		       qp->verbs_qp.qp.context->cmd_fd, resp->sq_db_mmap_key);
-	if (db_base == MAP_FAILED) {
-		err = errno;
-		goto err_unmap_desc_ring;
+	sq->desc += sq->desc_offset;
+	sq->max_wr_rdma_sge = min_t(uint16_t, ctx->max_wr_rdma_sge,
+				    EFA_IO_TX_DESC_NUM_RDMA_BUFS);
+	sq->max_batch_wr = ctx->max_tx_batch ?
+		(ctx->max_tx_batch * 64) / sizeof(struct efa_io_tx_wqe) :
+		UINT16_MAX;
+	if (ctx->min_sq_wr) {
+		/* The device can't accept a doorbell for the whole SQ at once,
+		 * set the max batch to at most (SQ size - 1).
+		 */
+		sq->max_batch_wr = min_t(uint32_t, sq->max_batch_wr,
+					 sq->wq.wqe_cnt - 1);
 	}
 
-	qp->sq.db = (uint32_t *)(db_base + resp->sq_db_offset);
-	qp->sq.sub_cq_idx = resp->send_sub_cq_idx;
-	qp->sq.max_wr_rdma_sge = min_t(uint16_t, dev->max_wr_rdma_sge,
-				       EFA_IO_TX_DESC_NUM_RDMA_BUFS);
-
 	return 0;
 
-err_unmap_desc_ring:
-	munmap(qp->sq.desc - qp->sq.desc_offset, qp->sq.desc_ring_mmap_size);
 err_free_local_queue:
-	free(qp->sq.local_queue);
+	free(sq->local_queue);
 err_terminate_wq:
-	efa_wq_terminate(&qp->sq.wq);
+	efa_wq_terminate(&sq->wq, qp->page_size);
 
 	return err;
 }
 
 static void efa_rq_terminate(struct efa_qp *qp)
 {
-	void *db_aligned;
+	struct efa_rq *rq = &qp->rq;
 
-	if (!qp->rq.wq.wrid)
+	if (!rq->wq.wqe_cnt)
 		return;
 
-	db_aligned = (void *)((uintptr_t)qp->rq.db & ~(qp->page_size - 1));
-	munmap(db_aligned, qp->page_size);
-	munmap(qp->rq.buf, qp->rq.buf_size);
+	munmap(rq->buf, rq->buf_size);
 
-	efa_wq_terminate(&qp->rq.wq);
+	efa_wq_terminate(&rq->wq, qp->page_size);
 }
 
 static int efa_rq_initialize(struct efa_qp *qp, struct efa_create_qp_resp *resp)
 {
-	uint8_t *db_base;
+	struct efa_wq_init_attr wq_attr;
+	struct efa_rq *rq = &qp->rq;
 	int err;
 
-	if (!qp->rq.wq.wqe_cnt)
+	if (!rq->wq.wqe_cnt)
 		return 0;
 
-	err = efa_wq_initialize(&qp->rq.wq);
+	wq_attr = (struct efa_wq_init_attr) {
+		.db_mmap_key = resp->rq_db_mmap_key,
+		.db_off = resp->rq_db_offset,
+		.cmd_fd = qp->verbs_qp.qp.context->cmd_fd,
+		.pgsz = qp->page_size,
+		.sub_cq_idx = resp->recv_sub_cq_idx,
+	};
+
+	err = efa_wq_initialize(&qp->rq.wq, &wq_attr);
 	if (err)
 		return err;
 
-	qp->rq.buf_size = resp->rq_mmap_size;
-	qp->rq.buf = mmap(NULL, qp->rq.buf_size, PROT_WRITE, MAP_SHARED,
-			  qp->verbs_qp.qp.context->cmd_fd, resp->rq_mmap_key);
-	if (qp->rq.buf == MAP_FAILED) {
+	rq->buf_size = resp->rq_mmap_size;
+	rq->buf = mmap(NULL, rq->buf_size, PROT_WRITE, MAP_SHARED,
+		       qp->verbs_qp.qp.context->cmd_fd, resp->rq_mmap_key);
+	if (rq->buf == MAP_FAILED) {
 		err = errno;
 		goto err_terminate_wq;
 	}
 
-	db_base = mmap(NULL, qp->page_size, PROT_WRITE, MAP_SHARED,
-		       qp->verbs_qp.qp.context->cmd_fd, resp->rq_db_mmap_key);
-	if (db_base == MAP_FAILED) {
-		err = errno;
-		goto err_unmap_rq_buf;
-	}
-
-	qp->rq.db = (uint32_t *)(db_base + resp->rq_db_offset);
-	qp->rq.sub_cq_idx = resp->recv_sub_cq_idx;
-
 	return 0;
 
-err_unmap_rq_buf:
-	munmap(qp->rq.buf, qp->rq.buf_size);
 err_terminate_wq:
-	efa_wq_terminate(&qp->rq.wq);
+	efa_wq_terminate(&rq->wq, qp->page_size);
 
 	return err;
 }
 
@@ -657,16 +936,17 @@ static void efa_qp_init_indices(struct efa_qp *qp)
 {
 	qp->sq.wq.wqe_posted = 0;
 	qp->sq.wq.wqe_completed = 0;
-	qp->sq.wq.desc_idx = 0;
+	qp->sq.wq.pc = 0;
 	qp->sq.wq.wrid_idx_pool_next = 0;
 
 	qp->rq.wq.wqe_posted = 0;
 	qp->rq.wq.wqe_completed = 0;
-	qp->rq.wq.desc_idx = 0;
+	qp->rq.wq.pc = 0;
 	qp->rq.wq.wrid_idx_pool_next = 0;
 }
 
-static void efa_setup_qp(struct efa_qp *qp,
+static void efa_setup_qp(struct efa_context *ctx,
+			 struct efa_qp *qp,
 			 struct ibv_qp_cap *cap,
 			 size_t page_size)
 {
@@ -674,7 +954,8 @@ static void efa_setup_qp(struct efa_qp *qp,
 
 	efa_qp_init_indices(qp);
 
-	qp->sq.wq.wqe_cnt = roundup_pow_of_two(cap->max_send_wr);
+	qp->sq.wq.wqe_cnt = roundup_pow_of_two(max_t(uint32_t, cap->max_send_wr,
+						     ctx->min_sq_wr));
 	qp->sq.wq.max_sge = cap->max_send_sge;
 	qp->sq.wq.desc_mask = qp->sq.wq.wqe_cnt - 1;
 
@@ -719,7 +1000,7 @@ static void efa_unlock_cqs(struct ibv_qp *ibvqp)
 static void efa_qp_fill_wr_pfns(struct ibv_qp_ex *ibvqpx,
 				struct ibv_qp_init_attr_ex *attr_ex);
 
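As an illustrative aside (not part of the patch): `max_tx_batch` appears to be expressed in 64-byte units, which is why the formula above multiplies by 64 before dividing by the TX WQE size. A worked example with hypothetical values, assuming a 64-byte `struct efa_io_tx_wqe`:

```c
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint16_t max_tx_batch = 8;  /* hypothetical device limit, 64-byte units */
	size_t tx_wqe_size = 64;    /* assumed sizeof(struct efa_io_tx_wqe) */

	/* 8 * 64 = 512 bytes per doorbell => 8 WQEs of 64 bytes each */
	uint16_t max_batch_wr = (max_tx_batch * 64) / tx_wqe_size;

	printf("max_batch_wr = %u\n", max_batch_wr); /* prints 8 */
	return 0;
}
```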
-static int efa_check_qp_attr(struct efa_dev *dev,
+static int efa_check_qp_attr(struct efa_context *ctx,
 			     struct ibv_qp_init_attr_ex *attr,
 			     struct efadv_qp_init_attr *efa_attr)
 {
@@ -728,7 +1009,7 @@ static int efa_check_qp_attr(struct efa_dev *dev,
 		IBV_QP_EX_WITH_SEND_WITH_IMM;
 	uint64_t supp_srd_send_ops_mask = IBV_QP_EX_WITH_SEND |
 		IBV_QP_EX_WITH_SEND_WITH_IMM |
-		(is_rdma_read_cap(dev) ? IBV_QP_EX_WITH_RDMA_READ : 0);
+		(EFA_DEV_CAP(ctx, RDMA_READ) ? IBV_QP_EX_WITH_RDMA_READ : 0);
 
 #define EFA_CREATE_QP_SUPP_ATTR_MASK \
 	(IBV_QP_INIT_ATTR_PD | IBV_QP_INIT_ATTR_SEND_OPS_FLAGS)
@@ -768,19 +1049,19 @@ static int efa_check_qp_attr(struct efa_dev *dev,
 	return 0;
 }
 
-static int efa_check_qp_limits(struct efa_dev *dev,
+static int efa_check_qp_limits(struct efa_context *ctx,
 			       struct ibv_qp_init_attr_ex *attr)
 {
-	if (attr->cap.max_send_sge > dev->max_sq_sge)
+	if (attr->cap.max_send_sge > ctx->max_sq_sge)
 		return EINVAL;
 
-	if (attr->cap.max_recv_sge > dev->max_rq_sge)
+	if (attr->cap.max_recv_sge > ctx->max_rq_sge)
 		return EINVAL;
 
-	if (attr->cap.max_send_wr > dev->max_sq_wr)
+	if (attr->cap.max_send_wr > ctx->max_sq_wr)
 		return EINVAL;
 
-	if (attr->cap.max_recv_wr > dev->max_rq_wr)
+	if (attr->cap.max_recv_wr > ctx->max_rq_wr)
 		return EINVAL;
 
 	return 0;
@@ -800,11 +1081,11 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx,
 	struct efa_qp *qp;
 	int err;
 
-	err = efa_check_qp_attr(dev, attr, efa_attr);
+	err = efa_check_qp_attr(ctx, attr, efa_attr);
 	if (err)
 		goto err_out;
 
-	err = efa_check_qp_limits(dev, attr);
+	err = efa_check_qp_limits(ctx, attr);
 	if (err)
 		goto err_out;
 
@@ -814,7 +1095,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx,
 		goto err_out;
 	}
 
-	efa_setup_qp(qp, &attr->cap, dev->pg_sz);
+	efa_setup_qp(ctx, qp, &attr->cap, dev->pg_sz);
 
 	attr->cap.max_send_wr = qp->sq.wq.wqe_cnt;
 	attr->cap.max_recv_wr = qp->rq.wq.wqe_cnt;
@@ -826,7 +1107,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx,
 	if (attr->qp_type == IBV_QPT_DRIVER)
 		req.driver_qp_type = efa_attr->driver_qp_type;
 
-	err = ibv_cmd_create_qp_ex(ibvctx, &qp->verbs_qp, sizeof(qp->verbs_qp),
+	err = ibv_cmd_create_qp_ex(ibvctx, &qp->verbs_qp,
 				   attr, &req.ibv_cmd, sizeof(req),
 				   &resp.ibv_resp, sizeof(resp));
 	if (err)
@@ -840,7 +1121,7 @@ static struct ibv_qp *create_qp(struct ibv_context *ibvctx,
 	if (err)
 		goto err_destroy_qp;
 
-	err = efa_sq_initialize(qp, &resp);
+	err = efa_sq_initialize(qp, attr, &resp);
 	if (err)
 		goto err_terminate_rq;
 
@@ -908,7 +1189,7 @@ struct ibv_qp *efa_create_qp_ex(struct ibv_context *ibvctx,
 			       struct ibv_qp_init_attr_ex *attr_ex)
 {
 	if (attr_ex->qp_type != IBV_QPT_UD) {
-		errno = EINVAL;
+		errno = EOPNOTSUPP;
 		return NULL;
 	}
 
@@ -1009,11 +1290,11 @@ int efa_destroy_qp(struct ibv_qp *ibvqp)
 
 	if (ibvqp->send_cq)
 		efa_cq_dec_ref_cnt(to_efa_cq(ibvqp->send_cq),
-				   qp->sq.sub_cq_idx);
+				   qp->sq.wq.sub_cq_idx);
 
 	if (ibvqp->recv_cq)
 		efa_cq_dec_ref_cnt(to_efa_cq(ibvqp->recv_cq),
-				   qp->rq.sub_cq_idx);
+				   qp->rq.wq.sub_cq_idx);
 
 	ctx->qp_table[ibvqp->qp_num & ctx->qp_table_sz_m1] = NULL;
 
@@ -1100,25 +1381,22 @@ static size_t efa_buf_list_total_bytes(const struct ibv_data_buf *buf_list,
 static void efa_sq_advance_post_idx(struct efa_qp *qp)
 {
 	qp->sq.wq.wqe_posted++;
-	qp->sq.wq.desc_idx++;
+	qp->sq.wq.pc++;
 
-	if (!(qp->sq.wq.desc_idx & qp->sq.wq.desc_mask))
+	if (!(qp->sq.wq.pc & qp->sq.wq.desc_mask))
 		qp->sq.wq.phase++;
 }
 
-static uint32_t efa_wq_get_next_wrid_idx(struct efa_wq *wq, uint64_t wr_id)
+static inline void efa_rq_ring_doorbell(struct efa_rq *rq, uint16_t pc)
 {
-	uint32_t wrid_idx;
-
-	/* Get the next wrid to be used from the index pool */
-	wrid_idx = wq->wrid_idx_pool[wq->wrid_idx_pool_next];
-	wq->wrid[wrid_idx] = wr_id;
-
-	/* Will never overlap, as validate function succeeded */
-	wq->wrid_idx_pool_next++;
-	assert(wq->wrid_idx_pool_next <= wq->wqe_cnt);
+	udma_to_device_barrier();
+	mmio_write32(rq->wq.db, pc);
+}
 
-	return wrid_idx;
+static inline void efa_sq_ring_doorbell(struct efa_sq *sq, uint16_t pc)
+{
+	mmio_flush_writes();
+	mmio_write32(sq->wq.db, pc);
 }
 
 static void efa_set_common_ctrl_flags(struct efa_io_tx_meta_desc *desc,
@@ -1188,6 +1466,7 @@ int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
 	struct efa_qp *qp = to_efa_qp(ibvqp);
 	struct efa_io_tx_wqe tx_wqe;
 	uint32_t sq_desc_offset;
+	uint32_t curbatch = 0;
 	struct efa_ah *ah;
 	int err = 0;
 
@@ -1219,25 +1498,33 @@ int efa_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
 
 		/* Set rest of the descriptor fields */
 		efa_set_common_ctrl_flags(meta_desc, qp, EFA_IO_SEND);
-		meta_desc->req_id = efa_wq_get_next_wrid_idx(&qp->sq.wq, wr->wr_id);
+		meta_desc->req_id = efa_wq_get_next_wrid_idx_locked(&qp->sq.wq,
+								    wr->wr_id);
 		meta_desc->dest_qp_num = wr->wr.ud.remote_qpn;
 		meta_desc->ah = ah->efa_ah;
 		meta_desc->qkey = wr->wr.ud.remote_qkey;
 
 		/* Copy descriptor */
-		sq_desc_offset = (qp->sq.wq.desc_idx & qp->sq.wq.desc_mask) *
+		sq_desc_offset = (qp->sq.wq.pc & qp->sq.wq.desc_mask) *
 				 sizeof(tx_wqe);
-		memcpy(qp->sq.desc + sq_desc_offset, &tx_wqe, sizeof(tx_wqe));
+		mmio_memcpy_x64(qp->sq.desc + sq_desc_offset, &tx_wqe,
+				sizeof(tx_wqe));
 
 		/* advance index and change phase */
 		efa_sq_advance_post_idx(qp);
+		curbatch++;
+
+		if (curbatch == qp->sq.max_batch_wr) {
+			curbatch = 0;
+			efa_sq_ring_doorbell(&qp->sq, qp->sq.wq.pc);
+		}
 
 		wr = wr->next;
 	}
 
 ring_db:
-	mmio_flush_writes();
-	mmio_write32(qp->sq.db, qp->sq.wq.desc_idx);
+	if (curbatch)
+		efa_sq_ring_doorbell(&qp->sq, qp->sq.wq.pc);
 
 	/*
 	 * Not using mmio_wc_spinunlock as the doorbell write should be done
@@ -1269,7 +1556,8 @@ static int efa_send_wr_common(struct ibv_qp_ex *ibvqpx,
 
 	meta_desc = &qp->sq.curr_tx_wqe->meta;
 	efa_set_common_ctrl_flags(meta_desc, qp, op_type);
-	meta_desc->req_id = efa_wq_get_next_wrid_idx(&qp->sq.wq, ibvqpx->wr_id);
+	meta_desc->req_id = efa_wq_get_next_wrid_idx_locked(&qp->sq.wq,
+							    ibvqpx->wr_id);
 
 	/* advance index and change phase */
 	efa_sq_advance_post_idx(qp);
@@ -1465,7 +1753,7 @@ static void efa_send_wr_start(struct ibv_qp_ex *ibvqpx)
 static inline void efa_sq_roll_back(struct efa_qp *qp)
 {
 	qp->sq.wq.wqe_posted -= qp->sq.num_wqe_pending;
-	qp->sq.wq.desc_idx -= qp->sq.num_wqe_pending;
+	qp->sq.wq.pc -= qp->sq.num_wqe_pending;
 	qp->sq.wq.wrid_idx_pool_next -= qp->sq.num_wqe_pending;
 	qp->sq.wq.phase = qp->sq.phase_rb;
 }
@@ -1473,9 +1761,12 @@ static inline void efa_sq_roll_back(struct efa_qp *qp)
 static int efa_send_wr_complete(struct ibv_qp_ex *ibvqpx)
 {
 	struct efa_qp *qp = to_efa_qp_ex(ibvqpx);
+	uint32_t max_txbatch = qp->sq.max_batch_wr;
 	uint32_t num_wqe_to_copy;
 	uint16_t local_idx = 0;
+	uint16_t curbatch = 0;
 	uint16_t sq_desc_idx;
+	uint16_t pc;
 
 	if (unlikely(qp->wr_session_err)) {
 		efa_sq_roll_back(qp);
@@ -1483,27 +1774,37 @@ static int efa_send_wr_complete(struct ibv_qp_ex *ibvqpx)
 	}
 
 	/*
-	 * Copy local queue to device in chunks, as the descriptor index
-	 * might have wrapped around the submission queue.
+	 * Copy local queue to device in chunks, handling wraparound and max
+	 * doorbell batch.
 	 */
-	sq_desc_idx = (qp->sq.wq.desc_idx - qp->sq.num_wqe_pending) &
-		      qp->sq.wq.desc_mask;
+	pc = qp->sq.wq.pc - qp->sq.num_wqe_pending;
+	sq_desc_idx = pc & qp->sq.wq.desc_mask;
 
 	while (qp->sq.num_wqe_pending) {
-		num_wqe_to_copy = min(qp->sq.num_wqe_pending,
-				      qp->sq.wq.wqe_cnt - sq_desc_idx);
-		memcpy((struct efa_io_tx_wqe *)qp->sq.desc + sq_desc_idx,
-		       (struct efa_io_tx_wqe *)qp->sq.local_queue + local_idx,
-		       num_wqe_to_copy * sizeof(struct efa_io_tx_wqe));
+		num_wqe_to_copy = min3(qp->sq.num_wqe_pending,
+				       qp->sq.wq.wqe_cnt - sq_desc_idx,
+				       max_txbatch - curbatch);
+		mmio_memcpy_x64((struct efa_io_tx_wqe *)qp->sq.desc +
+							sq_desc_idx,
+				(struct efa_io_tx_wqe *)qp->sq.local_queue +
+							local_idx,
+				num_wqe_to_copy * sizeof(struct efa_io_tx_wqe));
 
 		qp->sq.num_wqe_pending -= num_wqe_to_copy;
 		local_idx += num_wqe_to_copy;
+		curbatch += num_wqe_to_copy;
+		pc += num_wqe_to_copy;
 		sq_desc_idx = (sq_desc_idx + num_wqe_to_copy) &
 			      qp->sq.wq.desc_mask;
+
+		if (curbatch == max_txbatch) {
+			efa_sq_ring_doorbell(&qp->sq, pc);
+			curbatch = 0;
+		}
 	}
 
-	mmio_flush_writes();
-	mmio_write32(qp->sq.db, qp->sq.wq.desc_idx);
+	if (curbatch)
+		efa_sq_ring_doorbell(&qp->sq, qp->sq.wq.pc);
 out:
 	/*
 	 * Not using mmio_wc_spinunlock as the doorbell write should be done
@@ -1584,7 +1885,8 @@ int efa_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr,
 
 		memset(&rx_buf, 0, sizeof(rx_buf));
 
-		rx_buf.req_id = efa_wq_get_next_wrid_idx(&qp->rq.wq, wr->wr_id);
+		rx_buf.req_id = efa_wq_get_next_wrid_idx_locked(&qp->rq.wq,
+								wr->wr_id);
 		qp->rq.wq.wqe_posted++;
 
 		/* Default init of the rx buffer */
@@ -1607,12 +1909,13 @@ int efa_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr,
 		rx_buf.buf_addr_hi = (uint64_t)addr >> 32;
 
 		/* Copy descriptor to RX ring */
-		rq_desc_offset = (qp->rq.wq.desc_idx & qp->rq.wq.desc_mask) * sizeof(rx_buf);
+		rq_desc_offset = (qp->rq.wq.pc & qp->rq.wq.desc_mask) *
+				 sizeof(rx_buf);
 		memcpy(qp->rq.buf + rq_desc_offset, &rx_buf, sizeof(rx_buf));
 
 		/* Wrap rx descriptor index */
-		qp->rq.wq.desc_idx++;
-		if (!(qp->rq.wq.desc_idx & qp->rq.wq.desc_mask))
+		qp->rq.wq.pc++;
+		if (!(qp->rq.wq.pc & qp->rq.wq.desc_mask))
 			qp->rq.wq.phase++;
 
 		/* reset descriptor for next iov */
@@ -1622,8 +1925,7 @@ int efa_post_recv(struct ibv_qp *ibvqp, struct ibv_recv_wr *wr,
 	}
 
 ring_db:
-	udma_to_device_barrier();
-	mmio_write32(qp->rq.db, qp->rq.wq.desc_idx);
+	efa_rq_ring_doorbell(&qp->rq, qp->rq.wq.pc);
 
 	pthread_spin_unlock(&qp->rq.wq.wqlock);
 
 	return err;
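As an illustrative aside (not part of the patch): all doorbell writes now funnel through `efa_sq_ring_doorbell()`/`efa_rq_ring_doorbell()`. The two differ only in the barrier: the SQ path flushes write-combining WQE writes, while the RQ path orders plain DMA-visible memory writes. A minimal sketch of the shared pattern, using rdma-core's internal `util/` barrier helpers:

```c
/* Sketch only; util/mmio.h and util/udma_barrier.h are rdma-core-internal
 * headers, available when building inside the rdma-core tree. */
#include <stdbool.h>
#include <stdint.h>
#include <util/udma_barrier.h>
#include <util/mmio.h>

static void ring_doorbell(uint32_t *db, uint16_t pc, bool wc_mapped)
{
	if (wc_mapped)
		mmio_flush_writes();      /* SQ: flush WC-mapped WQE copies */
	else
		udma_to_device_barrier(); /* RQ: order writes to DMA memory */

	/* Device observes the new producer counter and fetches descriptors */
	mmio_write32(db, pc);
}
```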
diff --git a/providers/efa/verbs.h b/providers/efa/verbs.h
index 8bf468d..da022e6 100644
--- a/providers/efa/verbs.h
+++ b/providers/efa/verbs.h
@@ -1,6 +1,6 @@
 /* SPDX-License-Identifier: GPL-2.0 OR BSD-2-Clause */
 /*
- * Copyright 2019 Amazon.com, Inc. or its affiliates. All rights reserved.
+ * Copyright 2019-2020 Amazon.com, Inc. or its affiliates. All rights reserved.
  */
 
 #ifndef __EFA_VERBS_H__
@@ -23,6 +23,8 @@ int efa_dereg_mr(struct verbs_mr *vmr);
 
 struct ibv_cq *efa_create_cq(struct ibv_context *uctx, int ncqe,
 			     struct ibv_comp_channel *ch, int vec);
+struct ibv_cq_ex *efa_create_cq_ex(struct ibv_context *uctx,
+				   struct ibv_cq_init_attr_ex *attr_ex);
 int efa_destroy_cq(struct ibv_cq *ibvcq);
 int efa_poll_cq(struct ibv_cq *ibvcq, int nwc, struct ibv_wc *wc);
 
diff --git a/providers/hns/hns_roce_u.c b/providers/hns/hns_roce_u.c
index e5b9488..c437041 100644
--- a/providers/hns/hns_roce_u.c
+++ b/providers/hns/hns_roce_u.c
@@ -90,12 +90,13 @@ static struct verbs_context *hns_roce_alloc_context(struct ibv_device *ibdev,
 						    int cmd_fd,
 						    void *private_data)
 {
-	int i;
-	struct ibv_get_context cmd;
+	struct hns_roce_device *hr_dev = to_hr_dev(ibdev);
+	struct hns_roce_alloc_ucontext_resp resp = {};
 	struct ibv_device_attr dev_attrs;
 	struct hns_roce_context *context;
-	struct hns_roce_alloc_ucontext_resp resp = {};
-	struct hns_roce_device *hr_dev = to_hr_dev(ibdev);
+	struct ibv_get_context cmd;
+	int offset = 0;
+	int i;
 
 	context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx,
 					       RDMA_DRIVER_HNS);
@@ -115,12 +116,12 @@ static struct verbs_context *hns_roce_alloc_context(struct ibv_device *ibdev,
 	for (i = 0; i < HNS_ROCE_QP_TABLE_SIZE; ++i)
 		context->qp_table[i].refcnt = 0;
 
-	context->uar = mmap(NULL, hr_dev->page_size,
-			    PROT_READ | PROT_WRITE, MAP_SHARED, cmd_fd, 0);
-	if (context->uar == MAP_FAILED) {
-		fprintf(stderr, PFX "Warning: failed to mmap() uar page.\n");
+	context->uar = mmap(NULL, hr_dev->page_size, PROT_READ | PROT_WRITE,
+			    MAP_SHARED, cmd_fd, offset);
+	if (context->uar == MAP_FAILED)
 		goto err_free;
-	}
+
+	offset += hr_dev->page_size;
 
 	if (hr_dev->hw_version == HNS_ROCE_HW_VER1) {
 		/*
@@ -129,14 +130,18 @@ static struct verbs_context *hns_roce_alloc_context(struct ibv_device *ibdev,
 		 */
 		context->cq_tptr_base = mmap(NULL, HNS_ROCE_CQ_DB_BUF_SIZE,
 					     PROT_READ | PROT_WRITE, MAP_SHARED,
-					     cmd_fd, HNS_ROCE_TPTR_OFFSET);
-		if (context->cq_tptr_base == MAP_FAILED) {
-			fprintf(stderr,
-				PFX "Warning: Failed to mmap cq_tptr page.\n");
+					     cmd_fd, offset);
+		if (context->cq_tptr_base == MAP_FAILED)
 			goto db_free;
-		}
 	}
 
+	if (!resp.cqe_size)
+		context->cqe_size = HNS_ROCE_CQE_SIZE;
+	else if (resp.cqe_size <= HNS_ROCE_V3_CQE_SIZE)
+		context->cqe_size = resp.cqe_size;
+	else
+		context->cqe_size = HNS_ROCE_V3_CQE_SIZE;
+
 	pthread_spin_init(&context->uar_lock, PTHREAD_PROCESS_PRIVATE);
 
 	verbs_set_ops(&context->ibv_ctx, &hns_common_ops);
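As an illustrative aside (not part of the patch): the CQE-size negotiation above defaults to the legacy 32-byte entry when the kernel reports nothing, trusts any value up to the 64-byte V3 size, and caps anything larger. The same clamp as a standalone helper:

```c
#include <stdint.h>

#define HNS_ROCE_CQE_SIZE	0x20
#define HNS_ROCE_V3_CQE_SIZE	0x40

static uint32_t pick_cqe_size(uint32_t kernel_cqe_size)
{
	if (!kernel_cqe_size)
		return HNS_ROCE_CQE_SIZE;	/* old kernel: legacy CQEs */
	if (kernel_cqe_size <= HNS_ROCE_V3_CQE_SIZE)
		return kernel_cqe_size;		/* value is within range */
	return HNS_ROCE_V3_CQE_SIZE;		/* cap at largest known size */
}
```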
diff --git a/providers/hns/hns_roce_u.h b/providers/hns/hns_roce_u.h
index 3579070..b0308d1 100644
--- a/providers/hns/hns_roce_u.h
+++ b/providers/hns/hns_roce_u.h
@@ -39,7 +39,9 @@
 #include
 #include
+#include
 #include
+#include
 #include
 #include
@@ -49,7 +51,11 @@
 
 #define PFX "hns: "
 
-#define HNS_ROCE_MAX_INLINE_DATA_LEN 32
+/* The minimum page size is 4K for hardware */
+#define HNS_HW_PAGE_SHIFT 12
+#define HNS_HW_PAGE_SIZE (1 << HNS_HW_PAGE_SHIFT)
+
+#define HNS_ROCE_MAX_RC_INL_INN_SZ 32
 #define HNS_ROCE_MAX_CQ_NUM 0x10000
 #define HNS_ROCE_MAX_SRQWQE_NUM 0x8000
 #define HNS_ROCE_MAX_SRQSGE_NUM 0x100
@@ -57,7 +63,9 @@
 #define HNS_ROCE_V1_MIN_WQE_NUM 0x20
 #define HNS_ROCE_V2_MIN_WQE_NUM 0x40
 
-#define HNS_ROCE_CQE_ENTRY_SIZE 0x20
+#define HNS_ROCE_CQE_SIZE 0x20
+#define HNS_ROCE_V3_CQE_SIZE 0x40
+
 #define HNS_ROCE_SQWQE_SHIFT 6
 #define HNS_ROCE_SGE_IN_WQE 2
 #define HNS_ROCE_SGE_SIZE 16
@@ -66,7 +74,6 @@
 #define HNS_ROCE_GID_SIZE 16
 
 #define HNS_ROCE_CQ_DB_BUF_SIZE ((HNS_ROCE_MAX_CQ_NUM >> 11) << 12)
-#define HNS_ROCE_TPTR_OFFSET 0x1000
 
 #define HNS_ROCE_STATIC_RATE 3 /* Gbps */
 #define HNS_ROCE_ADDRESS_MASK 0xFFFFFFFF
@@ -114,7 +121,7 @@ struct hns_roce_buf {
 };
 
 #define BIT_CNT_PER_BYTE 8
-#define BIT_CNT_PER_U64 64
+#define BIT_CNT_PER_LONG (BIT_CNT_PER_BYTE * sizeof(unsigned long))
 
 /* the sw doorbell type; */
 enum hns_roce_db_type {
@@ -155,6 +162,7 @@ struct hns_roce_context {
 	unsigned int max_qp_wr;
 	unsigned int max_sge;
 	int max_cqe;
+	unsigned int cqe_size;
 };
 
 struct hns_roce_pd {
@@ -173,13 +181,14 @@ struct hns_roce_cq {
 	unsigned int *arm_db;
 	int arm_sn;
 	unsigned long flags;
+	unsigned int cqe_size;
 };
 
 struct hns_roce_idx_que {
 	struct hns_roce_buf buf;
-	int buf_size;
-	int entry_sz;
+	int entry_shift;
 	unsigned long *bitmap;
+	int bitmap_cnt;
 };
 
 struct hns_roce_srq {
@@ -188,7 +197,7 @@ struct hns_roce_srq {
 	pthread_spinlock_t lock;
 	unsigned long *wrid;
 	unsigned int srqn;
-	unsigned int max_wqe;
+	unsigned int wqe_cnt;
 	unsigned int max_gs;
 	unsigned int wqe_shift;
 	int head;
@@ -253,6 +262,7 @@ struct hns_roce_qp {
 	unsigned int next_sge;
 	int port_num;
 	int sl;
+	enum ibv_mtu path_mtu;
 
 	struct hns_roce_rinl_buf rq_rinl_buf;
 	unsigned long flags;
@@ -263,6 +273,17 @@ struct hns_roce_u_hw {
 	struct verbs_context_ops hw_ops;
 };
 
+/*
+ * The entries' buffer should be aligned to a multiple of the hardware's
+ * minimum page size.
+ */
+#define hr_hw_page_align(x) align(x, HNS_HW_PAGE_SIZE)
+
+static inline unsigned int to_hr_hem_entries_size(int count, int buf_shift)
+{
+	return hr_hw_page_align(count << buf_shift);
+}
+
 static inline struct hns_roce_device *to_hr_dev(struct ibv_device *ibv_dev)
 {
 	return container_of(ibv_dev, struct hns_roce_device, ibv_dev.device);
diff --git a/providers/hns/hns_roce_u_hw_v1.c b/providers/hns/hns_roce_u_hw_v1.c
index 247e797..8fe512b 100644
--- a/providers/hns/hns_roce_u_hw_v1.c
+++ b/providers/hns/hns_roce_u_hw_v1.c
@@ -157,7 +157,7 @@ static void hns_roce_handle_error_cqe(struct hns_roce_cqe *cqe,
 
 static struct hns_roce_cqe *get_cqe(struct hns_roce_cq *cq, int entry)
 {
-	return cq->buf.buf + entry * HNS_ROCE_CQE_ENTRY_SIZE;
+	return cq->buf.buf + entry * HNS_ROCE_CQE_SIZE;
 }
 
 static void *get_sw_cqe(struct hns_roce_cq *cq, int n)
diff --git a/providers/hns/hns_roce_u_hw_v2.c b/providers/hns/hns_roce_u_hw_v2.c
index be3490f..7500155 100644
--- a/providers/hns/hns_roce_u_hw_v2.c
+++ b/providers/hns/hns_roce_u_hw_v2.c
@@ -37,6 +37,43 @@
 #include "hns_roce_u_db.h"
 #include "hns_roce_u_hw_v2.h"
 
+#define HR_IBV_OPC_MAP(ib_key, hr_key) \
+	[IBV_WR_ ## ib_key] = HNS_ROCE_WQE_OP_ ## hr_key
+
+static const uint32_t hns_roce_opcode[] = {
+	HR_IBV_OPC_MAP(RDMA_WRITE, RDMA_WRITE),
+	HR_IBV_OPC_MAP(RDMA_WRITE_WITH_IMM, RDMA_WRITE_WITH_IMM),
+	HR_IBV_OPC_MAP(SEND, SEND),
+	HR_IBV_OPC_MAP(SEND_WITH_IMM, SEND_WITH_IMM),
+	HR_IBV_OPC_MAP(RDMA_READ, RDMA_READ),
+	HR_IBV_OPC_MAP(ATOMIC_CMP_AND_SWP, ATOMIC_COM_AND_SWAP),
+	HR_IBV_OPC_MAP(ATOMIC_FETCH_AND_ADD, ATOMIC_FETCH_AND_ADD),
+	HR_IBV_OPC_MAP(LOCAL_INV, LOCAL_INV),
+	HR_IBV_OPC_MAP(BIND_MW, BIND_MW_TYPE),
+	HR_IBV_OPC_MAP(SEND_WITH_INV, SEND_WITH_INV),
+};
+
+static inline uint32_t to_hr_opcode(enum ibv_wr_opcode ibv_opcode)
+{
+	if (ibv_opcode >= ARRAY_SIZE(hns_roce_opcode))
+		return HNS_ROCE_WQE_OP_MASK;
+
+	return hns_roce_opcode[ibv_opcode];
+}
+
+static const unsigned int hns_roce_mtu[] = {
+	[IBV_MTU_256] = 256,
+	[IBV_MTU_512] = 512,
+	[IBV_MTU_1024] = 1024,
+	[IBV_MTU_2048] = 2048,
+	[IBV_MTU_4096] = 4096,
+};
+
+static inline unsigned int mtu_enum_to_int(enum ibv_mtu mtu)
+{
+	return hns_roce_mtu[mtu];
+}
+
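As an illustrative aside (not part of the patch): the patch replaces a long switch with designated-initializer lookup tables and a range-checked accessor that maps out-of-range input to a sentinel. The same pattern, reduced to a standalone example:

```c
#include <stdio.h>

enum color { RED, GREEN, BLUE, COLOR_MAX };

#define ARRAY_SIZE(a) (sizeof(a) / sizeof((a)[0]))

static const char *const color_name[] = {
	[RED] = "red",
	[GREEN] = "green",
	[BLUE] = "blue",
};

static const char *to_name(enum color c)
{
	/* Out-of-range input maps to a sentinel, like HNS_ROCE_WQE_OP_MASK */
	if (c >= ARRAY_SIZE(color_name))
		return "unknown";
	return color_name[c];
}

int main(void)
{
	printf("%s\n", to_name(GREEN)); /* prints "green" */
	return 0;
}
```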
 static void *get_send_sge_ex(struct hns_roce_qp *qp, int n);
 
 static void set_data_seg_v2(struct hns_roce_v2_wqe_data_seg *dseg,
@@ -61,14 +98,12 @@ static void set_extend_atomic_seg(struct hns_roce_qp *qp,
 }
 
 static int set_atomic_seg(struct hns_roce_qp *qp, struct ibv_send_wr *wr,
-			  unsigned int msg_len, void *dseg,
-			  struct hns_roce_sge_info *sge_info)
+			  void *dseg, struct hns_roce_sge_info *sge_info)
 {
-	struct hns_roce_wqe_atomic_seg *aseg;
+	struct hns_roce_wqe_atomic_seg *aseg = dseg;
+	unsigned int msg_len = sge_info->total_len;
 	unsigned int ext_sg_num;
 
-	aseg = dseg;
-
 	if (msg_len == STANDARD_ATOMIC_U_BYTE_8) {
 		if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
 			aseg->fetchadd_swap_data = htole64(wr->wr.atomic.swap);
@@ -84,6 +119,10 @@ static int set_atomic_seg(struct hns_roce_qp *qp, struct ibv_send_wr *wr,
 		ext_sg_num = msg_len * DATA_TYPE_NUM >> HNS_ROCE_SGE_SHIFT;
 		aseg->fetchadd_swap_data = 0;
 		aseg->cmp_data = 0;
+
+		if (ext_sg_num + HNS_ROCE_SGE_IN_WQE > qp->sq.max_gs)
+			return EINVAL;
+
 		if (wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
 			if (!wr->wr.atomic.swap || !wr->wr.atomic.compare_add)
 				return EINVAL;
@@ -167,7 +206,7 @@ static void hns_roce_v2_handle_error_cqe(struct hns_roce_v2_cqe *cqe,
 
 static struct hns_roce_v2_cqe *get_cqe_v2(struct hns_roce_cq *cq, int entry)
 {
-	return cq->buf.buf + entry * HNS_ROCE_CQE_ENTRY_SIZE;
+	return cq->buf.buf + entry * cq->cqe_size;
 }
 
 static void *get_sw_cqe_v2(struct hns_roce_cq *cq, int n)
@@ -208,6 +247,11 @@ static void *get_srq_wqe(struct hns_roce_srq *srq, int n)
 	return srq->buf.buf + (n << srq->wqe_shift);
 }
 
+static void *get_idx_buf(struct hns_roce_idx_que *idx_que, int n)
+{
+	return idx_que->buf.buf + (n << idx_que->entry_shift);
+}
+
 static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, uint16_t ind)
 {
 	uint32_t bitmap_num;
@@ -215,8 +259,8 @@ static void hns_roce_free_srq_wqe(struct hns_roce_srq *srq, uint16_t ind)
 
 	pthread_spin_lock(&srq->lock);
 
-	bitmap_num = ind / BIT_CNT_PER_U64;
-	bit_num = ind % BIT_CNT_PER_U64;
+	bitmap_num = ind / BIT_CNT_PER_LONG;
+	bit_num = ind % BIT_CNT_PER_LONG;
 	srq->idx_que.bitmap[bitmap_num] |= (1ULL << bit_num);
 	srq->tail++;
 
@@ -250,8 +294,6 @@ static void hns_roce_update_rq_db(struct hns_roce_context *ctx,
 	roce_set_field(rq_db.parameter, DB_PARAM_RQ_PRODUCER_IDX_M,
 		       DB_PARAM_RQ_PRODUCER_IDX_S, rq_head);
 
-	udma_to_device_barrier();
-
 	hns_roce_write64((uint32_t *)&rq_db, ctx, ROCEE_VF_DB_CFG0_OFFSET);
 }
 
@@ -270,8 +312,6 @@ static void hns_roce_update_sq_db(struct hns_roce_context *ctx,
 		       DB_PARAM_SQ_PRODUCER_IDX_S, sq_head);
 	roce_set_field(sq_db.parameter, DB_PARAM_SL_M, DB_PARAM_SL_S, sl);
 
-	udma_to_device_barrier();
-
 	hns_roce_write64((uint32_t *)&sq_db, ctx, ROCEE_VF_DB_CFG0_OFFSET);
 }
 
@@ -285,11 +325,9 @@ static void hns_roce_v2_update_cq_cons_index(struct hns_roce_context *ctx,
 		       HNS_ROCE_V2_CQ_DB_PTR);
 
 	roce_set_field(cq_db.parameter, DB_PARAM_CQ_CONSUMER_IDX_M,
-		       DB_PARAM_CQ_CONSUMER_IDX_S,
-		       cq->cons_index & ((cq->cq_depth << 1) - 1));
+		       DB_PARAM_CQ_CONSUMER_IDX_S, cq->cons_index);
 	roce_set_field(cq_db.parameter, DB_PARAM_CQ_CMD_SN_M,
 		       DB_PARAM_CQ_CMD_SN_S, 1);
-	roce_set_bit(cq_db.parameter, DB_PARAM_CQ_NOTIFY_S, 0);
 
 	hns_roce_write64((uint32_t *)&cq_db, ctx, ROCEE_VF_DB_CFG0_OFFSET);
 }
@@ -537,7 +575,7 @@ static int hns_roce_v2_poll_one(struct hns_roce_cq *cq,
 			wqe_ctr = (uint16_t)(roce_get_field(cqe->byte_4,
 						CQE_BYTE_4_WQE_IDX_M,
 						CQE_BYTE_4_WQE_IDX_S));
-			wc->wr_id = srq->wrid[wqe_ctr & (srq->max_wqe - 1)];
+			wc->wr_id = srq->wrid[wqe_ctr & (srq->wqe_cnt - 1)];
 			hns_roce_free_srq_wqe(srq, wqe_ctr);
 		} else {
 			wq = &(*cur_qp)->rq;
@@ -611,8 +649,8 @@ static int hns_roce_u_v2_poll_cq(struct ibv_cq *ibvcq, int ne,
 		mmio_ordered_writes_hack();
 
 		if (cq->flags & HNS_ROCE_SUPPORT_CQ_RECORD_DB)
-			*cq->set_ci_db = (unsigned int)(cq->cons_index &
-						((cq->cq_depth << 1) - 1));
+			*cq->set_ci_db =
+				cq->cons_index & DB_PARAM_CQ_CONSUMER_IDX_M;
 		else
 			hns_roce_v2_update_cq_cons_index(ctx, cq);
 	}
@@ -667,6 +705,11 @@ static void set_sge(struct hns_roce_v2_wqe_data_seg *dseg,
 		sge_info->total_len += wr->sg_list[i].length;
 		sge_info->valid_num++;
 
+		if (wr->send_flags & IBV_SEND_INLINE &&
+		    wr->opcode != IBV_WR_ATOMIC_FETCH_AND_ADD &&
+		    wr->opcode != IBV_WR_ATOMIC_CMP_AND_SWP)
+			continue;
+
 		/* No inner sge in UD wqe */
 		if (sge_info->valid_num <= HNS_ROCE_SGE_IN_WQE &&
 		    qp->ibv_qp.qp_type != IBV_QPT_UD) {
@@ -681,93 +724,171 @@ static void set_sge(struct hns_roce_v2_wqe_data_seg *dseg,
 	}
 }
 
-static int set_rc_wqe(void *wqe, struct hns_roce_qp *qp, struct ibv_send_wr *wr,
-		      int nreq, struct hns_roce_sge_info *sge_info)
+static int fill_ext_sge_inl_data(struct hns_roce_qp *qp,
+				 const struct ibv_send_wr *wr,
+				 struct hns_roce_sge_info *sge_info)
 {
-	struct hns_roce_rc_sq_wqe *rc_sq_wqe = wqe;
-	struct hns_roce_v2_wqe_data_seg *dseg;
-	int hr_op;
+	unsigned int sge_sz = sizeof(struct hns_roce_v2_wqe_data_seg);
+	void *dseg;
 	int i;
 
-	memset(rc_sq_wqe, 0, sizeof(struct hns_roce_rc_sq_wqe));
+	if (sge_info->total_len > qp->sq.max_gs * sge_sz)
+		return EINVAL;
+
+	dseg = get_send_sge_ex(qp, sge_info->start_idx);
+
+	for (i = 0; i < wr->num_sge; i++) {
+		memcpy(dseg, (void *)(uintptr_t)wr->sg_list[i].addr,
+		       wr->sg_list[i].length);
+		dseg += wr->sg_list[i].length;
+	}
+
+	sge_info->start_idx += DIV_ROUND_UP(sge_info->total_len, sge_sz);
+
+	return 0;
+}
+
+static bool check_inl_data_len(struct hns_roce_qp *qp, unsigned int len)
+{
+	int mtu = mtu_enum_to_int(qp->path_mtu);
+
+	return (len <= qp->max_inline_data && len <= mtu);
+}
+
+static __le32 get_immtdata(enum ibv_wr_opcode opcode, const struct ibv_send_wr *wr)
+{
+	switch (opcode) {
+	case IBV_WR_SEND_WITH_IMM:
+	case IBV_WR_RDMA_WRITE_WITH_IMM:
+		return htole32(be32toh(wr->imm_data));
+	default:
+		return 0;
+	}
+}
+
+static int set_rc_inl(struct hns_roce_qp *qp, const struct ibv_send_wr *wr,
+		      struct hns_roce_rc_sq_wqe *rc_sq_wqe,
+		      struct hns_roce_sge_info *sge_info)
+{
+	unsigned int sge_idx = sge_info->start_idx;
+	void *dseg = rc_sq_wqe;
+	int ret;
+	int i;
+
+	if (wr->opcode == IBV_WR_RDMA_READ)
+		return EINVAL;
+
+	if (!check_inl_data_len(qp, sge_info->total_len))
+		return EINVAL;
+
+	dseg += sizeof(struct hns_roce_rc_sq_wqe);
+
+	roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_INLINE_S, 1);
+
+	if (sge_info->total_len <= HNS_ROCE_MAX_RC_INL_INN_SZ) {
+		roce_set_bit(rc_sq_wqe->byte_20, RC_SQ_WQE_BYTE_20_INL_TYPE_S,
+			     0);
+
+		for (i = 0; i < wr->num_sge; i++) {
+			memcpy(dseg, (void *)(uintptr_t)(wr->sg_list[i].addr),
+			       wr->sg_list[i].length);
+			dseg += wr->sg_list[i].length;
+		}
+	} else {
+		roce_set_bit(rc_sq_wqe->byte_20, RC_SQ_WQE_BYTE_20_INL_TYPE_S,
+			     1);
+
+		ret = fill_ext_sge_inl_data(qp, wr, sge_info);
+		if (ret)
+			return ret;
+
+		sge_info->valid_num = sge_info->start_idx - sge_idx;
+
+		roce_set_field(rc_sq_wqe->byte_16, RC_SQ_WQE_BYTE_16_SGE_NUM_M,
+			       RC_SQ_WQE_BYTE_16_SGE_NUM_S,
+			       sge_info->valid_num);
+	}
+
+	return 0;
+}
+
+static void set_bind_mw_seg(struct hns_roce_rc_sq_wqe *wqe,
+			    const struct ibv_send_wr *wr)
+{
+	roce_set_bit(wqe->byte_4, RC_SQ_WQE_BYTE_4_MW_TYPE_S,
+		     wr->bind_mw.mw->type - 1);
+	roce_set_bit(wqe->byte_4, RC_SQ_WQE_BYTE_4_ATOMIC_S,
+		     (wr->bind_mw.bind_info.mw_access_flags &
+		      IBV_ACCESS_REMOTE_ATOMIC) ? 1 : 0);
+	roce_set_bit(wqe->byte_4, RC_SQ_WQE_BYTE_4_RDMA_READ_S,
+		     (wr->bind_mw.bind_info.mw_access_flags &
+		      IBV_ACCESS_REMOTE_READ) ? 1 : 0);
+	roce_set_bit(wqe->byte_4, RC_SQ_WQE_BYTE_4_RDMA_WRITE_S,
+		     (wr->bind_mw.bind_info.mw_access_flags &
+		      IBV_ACCESS_REMOTE_WRITE) ? 1 : 0);
+	wqe->new_rkey = htole32(wr->bind_mw.rkey);
+	wqe->byte_16 = htole32(wr->bind_mw.bind_info.length &
+			       HNS_ROCE_ADDRESS_MASK);
+	wqe->byte_20 = htole32(wr->bind_mw.bind_info.length >>
+			       HNS_ROCE_ADDRESS_SHIFT);
+	wqe->rkey = htole32(wr->bind_mw.bind_info.mr->rkey);
+	wqe->va = htole64(wr->bind_mw.bind_info.addr);
+}
+
+static int check_rc_opcode(struct hns_roce_rc_sq_wqe *wqe,
+			   const struct ibv_send_wr *wr)
+{
+	int ret = 0;
+
+	wqe->immtdata = get_immtdata(wr->opcode, wr);
 
 	switch (wr->opcode) {
 	case IBV_WR_RDMA_READ:
-		hr_op = HNS_ROCE_WQE_OP_RDMA_READ;
-		rc_sq_wqe->va = htole64(wr->wr.rdma.remote_addr);
-		rc_sq_wqe->rkey = htole32(wr->wr.rdma.rkey);
-		break;
 	case IBV_WR_RDMA_WRITE:
-		hr_op = HNS_ROCE_WQE_OP_RDMA_WRITE;
-		rc_sq_wqe->va = htole64(wr->wr.rdma.remote_addr);
-		rc_sq_wqe->rkey = htole32(wr->wr.rdma.rkey);
-		break;
 	case IBV_WR_RDMA_WRITE_WITH_IMM:
-		hr_op = HNS_ROCE_WQE_OP_RDMA_WRITE_WITH_IMM;
-		rc_sq_wqe->va = htole64(wr->wr.rdma.remote_addr);
-		rc_sq_wqe->rkey = htole32(wr->wr.rdma.rkey);
-		rc_sq_wqe->immtdata = htole32(be32toh(wr->imm_data));
+		wqe->va = htole64(wr->wr.rdma.remote_addr);
+		wqe->rkey = htole32(wr->wr.rdma.rkey);
 		break;
 	case IBV_WR_SEND:
-		hr_op = HNS_ROCE_WQE_OP_SEND;
-		break;
-	case IBV_WR_SEND_WITH_INV:
-		hr_op = HNS_ROCE_WQE_OP_SEND_WITH_INV;
-		rc_sq_wqe->inv_key = htole32(wr->invalidate_rkey);
-		break;
 	case IBV_WR_SEND_WITH_IMM:
-		hr_op = HNS_ROCE_WQE_OP_SEND_WITH_IMM;
-		rc_sq_wqe->immtdata = htole32(be32toh(wr->imm_data));
+		break;
+	case IBV_WR_ATOMIC_CMP_AND_SWP:
+	case IBV_WR_ATOMIC_FETCH_AND_ADD:
+		wqe->rkey = htole32(wr->wr.atomic.rkey);
+		wqe->va = htole64(wr->wr.atomic.remote_addr);
 		break;
 	case IBV_WR_LOCAL_INV:
-		hr_op = HNS_ROCE_WQE_OP_LOCAL_INV;
-		roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_SO_S, 1);
-		rc_sq_wqe->inv_key = htole32(wr->invalidate_rkey);
+		roce_set_bit(wqe->byte_4, RC_SQ_WQE_BYTE_4_SO_S, 1);
+		/* fallthrough */
+	case IBV_WR_SEND_WITH_INV:
+		wqe->inv_key = htole32(wr->invalidate_rkey);
 		break;
 	case IBV_WR_BIND_MW:
-		hr_op = HNS_ROCE_WQE_OP_BIND_MW_TYPE;
-		roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_MW_TYPE_S,
-			     wr->bind_mw.mw->type - 1);
-		roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_ATOMIC_S,
-			     (wr->bind_mw.bind_info.mw_access_flags &
-			      IBV_ACCESS_REMOTE_ATOMIC) ? 1 : 0);
-		roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_RDMA_READ_S,
-			     (wr->bind_mw.bind_info.mw_access_flags &
-			      IBV_ACCESS_REMOTE_READ) ? 1 : 0);
-		roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_RDMA_WRITE_S,
-			     (wr->bind_mw.bind_info.mw_access_flags &
-			      IBV_ACCESS_REMOTE_WRITE) ? 1 : 0);
-		rc_sq_wqe->new_rkey = htole32(wr->bind_mw.rkey);
-		rc_sq_wqe->byte_16 = htole32(wr->bind_mw.bind_info.length &
-					     HNS_ROCE_ADDRESS_MASK);
-		rc_sq_wqe->byte_20 = htole32(wr->bind_mw.bind_info.length >>
-					     HNS_ROCE_ADDRESS_SHIFT);
-		rc_sq_wqe->rkey = htole32(wr->bind_mw.bind_info.mr->rkey);
-		rc_sq_wqe->va = htole64(wr->bind_mw.bind_info.addr);
-		break;
-	case IBV_WR_ATOMIC_CMP_AND_SWP:
-		hr_op = HNS_ROCE_WQE_OP_ATOMIC_COM_AND_SWAP;
-		rc_sq_wqe->rkey = htole32(wr->wr.atomic.rkey);
-		rc_sq_wqe->va = htole64(wr->wr.atomic.remote_addr);
-		roce_set_field(rc_sq_wqe->byte_16, RC_SQ_WQE_BYTE_16_SGE_NUM_M,
-			       RC_SQ_WQE_BYTE_16_SGE_NUM_S,
-			       sge_info->valid_num);
-		break;
-	case IBV_WR_ATOMIC_FETCH_AND_ADD:
-		hr_op = HNS_ROCE_WQE_OP_ATOMIC_FETCH_AND_ADD;
-		rc_sq_wqe->rkey = htole32(wr->wr.atomic.rkey);
-		rc_sq_wqe->va = htole64(wr->wr.atomic.remote_addr);
-		roce_set_field(rc_sq_wqe->byte_16, RC_SQ_WQE_BYTE_16_SGE_NUM_M,
-			       RC_SQ_WQE_BYTE_16_SGE_NUM_S,
-			       sge_info->valid_num);
+		set_bind_mw_seg(wqe, wr);
 		break;
 	default:
-		hr_op = HNS_ROCE_WQE_OP_MASK;
-		return EINVAL;
+		ret = EINVAL;
+		break;
 	}
 
-	roce_set_field(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_OPCODE_M,
-		       RC_SQ_WQE_BYTE_4_OPCODE_S, hr_op);
+	roce_set_field(wqe->byte_4, RC_SQ_WQE_BYTE_4_OPCODE_M,
+		       RC_SQ_WQE_BYTE_4_OPCODE_S, to_hr_opcode(wr->opcode));
+
+	return ret;
+}
+
+static int set_rc_wqe(void *wqe, struct hns_roce_qp *qp, struct ibv_send_wr *wr,
+		      int nreq, struct hns_roce_sge_info *sge_info)
+{
+	struct hns_roce_rc_sq_wqe *rc_sq_wqe = wqe;
+	struct hns_roce_v2_wqe_data_seg *dseg;
+	int ret;
+
+	memset(rc_sq_wqe, 0, sizeof(struct hns_roce_rc_sq_wqe));
+
+	ret = check_rc_opcode(rc_sq_wqe, wr);
+	if (ret)
+		return ret;
 
 	roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_CQE_S,
 		     (wr->send_flags & IBV_SEND_SIGNALED) ? 1 : 0);
@@ -802,26 +923,12 @@ static int set_rc_wqe(void *wqe, struct hns_roce_qp *qp, struct ibv_send_wr *wr,
 	if (wr->opcode == IBV_WR_ATOMIC_FETCH_AND_ADD ||
 	    wr->opcode == IBV_WR_ATOMIC_CMP_AND_SWP) {
 		dseg++;
-		return set_atomic_seg(qp, wr, le32toh(rc_sq_wqe->msg_len),
-				      dseg, sge_info);
+		ret = set_atomic_seg(qp, wr, dseg, sge_info);
+	} else if (wr->send_flags & IBV_SEND_INLINE) {
+		ret = set_rc_inl(qp, wr, rc_sq_wqe, sge_info);
 	}
 
-	if (wr->send_flags & IBV_SEND_INLINE) {
-		if (wr->opcode == IBV_WR_RDMA_READ)
-			return EINVAL;
-
-		if (sge_info->total_len > qp->max_inline_data)
-			return EINVAL;
-
-		for (i = 0; i < wr->num_sge; i++) {
-			memcpy(dseg, (void *)(uintptr_t)(wr->sg_list[i].addr),
-			       wr->sg_list[i].length);
-			dseg += wr->sg_list[i].length;
-		}
-		roce_set_bit(rc_sq_wqe->byte_4, RC_SQ_WQE_BYTE_4_INLINE_S, 1);
-	}
-
-	return 0;
+	return ret;
 }
 
 int hns_roce_u_v2_post_send(struct ibv_qp *ibvqp, struct ibv_send_wr *wr,
@@ -888,8 +995,9 @@ out:
 		qp->sq.head += nreq;
 		qp->next_sge = sge_info.start_idx;
 
-		hns_roce_update_sq_db(ctx, qp->ibv_qp.qp_num, qp->sl,
-				      qp->sq.head & ((qp->sq.wqe_cnt << 1) - 1));
+		udma_to_device_barrier();
+
+		hns_roce_update_sq_db(ctx, ibvqp->qp_num, qp->sl, qp->sq.head);
 
 		if (qp->flags & HNS_ROCE_SUPPORT_SQ_RECORD_DB)
 			*(qp->sdb) = qp->sq.head & 0xffff;
@@ -991,8 +1099,7 @@ out:
 		if (qp->flags & HNS_ROCE_SUPPORT_RQ_RECORD_DB)
 			*qp->rdb = qp->rq.head & 0xffff;
 		else
-			hns_roce_update_rq_db(ctx, qp->ibv_qp.qp_num,
-					      qp->rq.head & ((qp->rq.wqe_cnt << 1) - 1));
+			hns_roce_update_rq_db(ctx, ibvqp->qp_num, qp->rq.head);
 	}
 
 	pthread_spin_unlock(&qp->rq.lock);
@@ -1063,6 +1170,23 @@ static void hns_roce_v2_cq_clean(struct hns_roce_cq *cq, unsigned int qpn,
 	pthread_spin_unlock(&cq->lock);
 }
 
+static void record_qp_attr(struct ibv_qp *qp, struct ibv_qp_attr *attr,
+			   int attr_mask)
+{
+	struct hns_roce_qp *hr_qp = to_hr_qp(qp);
+
+	if (attr_mask & IBV_QP_PORT)
+		hr_qp->port_num = attr->port_num;
+
+	if (attr_mask & IBV_QP_AV)
+		hr_qp->sl = attr->ah_attr.sl;
+
+	if (qp->qp_type == IBV_QPT_UD)
+		hr_qp->path_mtu = IBV_MTU_4096;
+	else if (attr_mask & IBV_QP_PATH_MTU)
+		hr_qp->path_mtu = attr->path_mtu;
+}
+
 static int hns_roce_u_v2_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
 				   int attr_mask)
 {
@@ -1100,11 +1224,7 @@ static int hns_roce_u_v2_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr,
 		hns_roce_init_qp_indices(to_hr_qp(qp));
 	}
 
-	if (attr_mask & IBV_QP_PORT)
-		hr_qp->port_num = attr->port_num;
-
-	if (attr_mask & IBV_QP_AV)
-		hr_qp->sl = attr->ah_attr.sl;
+	record_qp_attr(qp, attr, attr_mask);
 
 	return ret;
 }
@@ -1195,27 +1315,21 @@ static int hns_roce_u_v2_destroy_qp(struct ibv_qp *ibqp)
 	return ret;
 }
 
-static void fill_idx_queue(struct hns_roce_idx_que *idx_que,
-			   int cur_idx, int wqe_idx)
-{
-	unsigned int *addr;
-
-	addr = idx_que->buf.buf + cur_idx * idx_que->entry_sz;
-	*addr = wqe_idx;
-}
-
 static int find_empty_entry(struct hns_roce_idx_que *idx_que)
 {
 	int bit_num;
 	int i;
 
 	/* bitmap[i] is set zero if all bits are allocated */
-	for (i = 0; idx_que->bitmap[i] == 0; ++i)
+	for (i = 0; i < idx_que->bitmap_cnt && idx_que->bitmap[i] == 0; ++i)
 		;
+	if (i == idx_que->bitmap_cnt)
+		return ENOMEM;
+
 	bit_num = ffsl(idx_que->bitmap[i]);
 	idx_que->bitmap[i] &= ~(1ULL << (bit_num - 1));
 
-	return i * BIT_CNT_PER_U64 + (bit_num - 1);
+	return i * BIT_CNT_PER_LONG + (bit_num - 1);
 }
 
 static int hns_roce_u_v2_post_srq_recv(struct ibv_srq *ib_srq,
@@ -1226,6 +1340,7 @@ static int hns_roce_u_v2_post_srq_recv(struct ibv_srq *ib_srq,
 	struct hns_roce_srq *srq = to_hr_srq(ib_srq);
 	struct hns_roce_v2_wqe_data_seg *dseg;
 	struct hns_roce_db srq_db;
+	__le32 *srq_idx;
 	int ret = 0;
 	int wqe_idx;
 	void *wqe;
@@ -1236,7 +1351,7 @@ static int hns_roce_u_v2_post_srq_recv(struct ibv_srq *ib_srq,
 	pthread_spin_lock(&srq->lock);
 
 	/* current idx of srqwq */
-	ind = srq->head & (srq->max_wqe - 1);
+	ind = srq->head & (srq->wqe_cnt - 1);
 
 	for (nreq = 0; wr; ++nreq, wr = wr->next) {
 		if (wr->num_sge > srq->max_gs) {
@@ -1253,7 +1368,11 @@ static int hns_roce_u_v2_post_srq_recv(struct ibv_srq *ib_srq,
 		}
 
 		wqe_idx = find_empty_entry(&srq->idx_que);
-		fill_idx_queue(&srq->idx_que, ind, wqe_idx);
+		if (wqe_idx < 0 || wqe_idx >= srq->wqe_cnt) {
+			ret = -ENOMEM;
+			*bad_wr = wr;
+			break;
+		}
 
 		wqe = get_srq_wqe(srq, wqe_idx);
 		dseg = (struct hns_roce_v2_wqe_data_seg *)wqe;
@@ -1271,8 +1390,11 @@ static int hns_roce_u_v2_post_srq_recv(struct ibv_srq *ib_srq,
 			dseg[i].addr = 0;
 		}
 
+		srq_idx = (__le32 *)get_idx_buf(&srq->idx_que, ind);
+		*srq_idx = htole32(wqe_idx);
+
 		srq->wrid[wqe_idx] = wr->wr_id;
-		ind = (ind + 1) & (srq->max_wqe - 1);
+		ind = (ind + 1) & (srq->wqe_cnt - 1);
 	}
 
 	if (nreq) {
@@ -1284,9 +1406,10 @@ static int hns_roce_u_v2_post_srq_recv(struct ibv_srq *ib_srq,
 		 */
 		udma_to_device_barrier();
 
-		srq_db.byte_4 = htole32(HNS_ROCE_V2_SRQ_DB << DB_BYTE_4_CMD_S
-					| srq->srqn);
-		srq_db.parameter = htole32(srq->head);
+		srq_db.byte_4 = htole32(HNS_ROCE_V2_SRQ_DB << DB_BYTE_4_CMD_S |
+					srq->srqn);
+		srq_db.parameter =
+			htole32(srq->head & DB_PARAM_SRQ_PRODUCER_COUNTER_M);
 
 		hns_roce_write64((uint32_t *)&srq_db, ctx,
 				 ROCEE_VF_DB_CFG0_OFFSET);
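As an illustrative aside (not part of the patch): with `set_rc_inl()` an inline payload must now fit both the QP's `max_inline_data` and the path MTU. From an application's point of view nothing changes in the API; a minimal sketch of posting a small inline RC send through standard verbs, assuming `qp` and `buf` already exist:

```c
#include <stdint.h>
#include <infiniband/verbs.h>

static int post_inline_send(struct ibv_qp *qp, void *buf, uint32_t len)
{
	struct ibv_sge sge = {
		.addr = (uintptr_t)buf,
		.length = len,
		/* No lkey needed: inline data is copied into the WQE at post
		 * time, so the buffer does not have to be registered. */
	};
	struct ibv_send_wr wr = {
		.wr_id = 1,
		.sg_list = &sge,
		.num_sge = 1,
		.opcode = IBV_WR_SEND,
		.send_flags = IBV_SEND_INLINE | IBV_SEND_SIGNALED,
	};
	struct ibv_send_wr *bad_wr;

	return ibv_post_send(qp, &wr, &bad_wr);
}
```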
{}; - struct hns_roce_create_cq_resp resp = {}; - struct hns_roce_cq *cq; - int ret; + struct hns_roce_context *hr_ctx = to_hr_ctx(context); + struct hns_roce_create_cq_resp resp = {}; + struct hns_roce_create_cq cmd = {}; + struct hns_roce_cq *cq; + int ret; - if (hns_roce_verify_cq(&cqe, to_hr_ctx(context))) + if (hns_roce_verify_cq(&cqe, hr_ctx)) return NULL; cq = malloc(sizeof(*cq)); @@ -317,6 +305,9 @@ struct ibv_cq *hns_roce_u_create_cq(struct ibv_context *context, int cqe, cq->cons_index = 0; + cq->cqe_size = hr_ctx->cqe_size; + cmd.cqe_size = cq->cqe_size; + if (pthread_spin_init(&cq->lock, PTHREAD_PROCESS_PRIVATE)) goto err; @@ -325,7 +316,7 @@ struct ibv_cq *hns_roce_u_create_cq(struct ibv_context *context, int cqe, else cqe = align_queue_size(cqe); - if (hns_roce_alloc_cq_buf(hr_dev, &cq->buf, cqe)) + if (hns_roce_alloc_cq_buf(cq, cqe)) goto err; cmd.buf_addr = (uintptr_t) cq->buf.buf; @@ -402,67 +393,53 @@ int hns_roce_u_destroy_cq(struct ibv_cq *cq) return ret; } -static int hns_roce_create_idx_que(struct ibv_pd *pd, struct hns_roce_srq *srq) +static int hns_roce_create_idx_que(struct hns_roce_srq *srq) { struct hns_roce_idx_que *idx_que = &srq->idx_que; - uint32_t bitmap_num; + unsigned int buf_size; int i; - idx_que->entry_sz = HNS_ROCE_IDX_QUE_ENTRY_SZ; - - /* bits needed in bitmap */ - bitmap_num = align(srq->max_wqe, BIT_CNT_PER_U64); - - idx_que->bitmap = calloc(1, bitmap_num / BIT_CNT_PER_BYTE); + idx_que->entry_shift = hr_ilog32(HNS_ROCE_IDX_QUE_ENTRY_SZ); + idx_que->bitmap_cnt = align(srq->wqe_cnt, BIT_CNT_PER_LONG) / + BIT_CNT_PER_LONG; + idx_que->bitmap = calloc(idx_que->bitmap_cnt, sizeof(unsigned long)); if (!idx_que->bitmap) - return -1; - - /* bitmap_num indicates amount of u64 */ - bitmap_num = bitmap_num / BIT_CNT_PER_U64; + return ENOMEM; - idx_que->buf_size = srq->max_wqe * idx_que->entry_sz; - if (hns_roce_alloc_buf(&idx_que->buf, idx_que->buf_size, - to_hr_dev(pd->context->device)->page_size)) { + buf_size = to_hr_hem_entries_size(srq->wqe_cnt, idx_que->entry_shift); + if (hns_roce_alloc_buf(&idx_que->buf, buf_size, HNS_HW_PAGE_SIZE)) { free(idx_que->bitmap); idx_que->bitmap = NULL; - return -1; + return ENOMEM; } /* init the idx_que bitmap */ - for (i = 0; i < bitmap_num; ++i) + for (i = 0; i < idx_que->bitmap_cnt; ++i) idx_que->bitmap[i] = ~(0UL); return 0; } -static int hns_roce_alloc_srq_buf(struct ibv_pd *pd, struct ibv_srq_attr *attr, - struct hns_roce_srq *srq) +static int hns_roce_alloc_srq_buf(struct hns_roce_srq *srq) { int srq_buf_size; - int srq_size; - srq->wrid = calloc(srq->max_wqe, sizeof(unsigned long)); + srq->wrid = calloc(srq->wqe_cnt, sizeof(unsigned long)); if (!srq->wrid) - return -1; - - /* srq size */ - srq_size = srq->max_gs * sizeof(struct hns_roce_v2_wqe_data_seg); - - for (srq->wqe_shift = HNS_ROCE_SGE_SHIFT; - 1 << srq->wqe_shift < srq_size; ++srq->wqe_shift) - ; /* nothing */ + return ENOMEM; - srq_buf_size = srq->max_wqe << srq->wqe_shift; + srq->wqe_shift = hr_ilog32(roundup_pow_of_two(HNS_ROCE_SGE_SIZE * + srq->max_gs)); + srq_buf_size = to_hr_hem_entries_size(srq->wqe_cnt, srq->wqe_shift); /* allocate srq wqe buf */ - if (hns_roce_alloc_buf(&srq->buf, srq_buf_size, - to_hr_dev(pd->context->device)->page_size)) { + if (hns_roce_alloc_buf(&srq->buf, srq_buf_size, HNS_HW_PAGE_SIZE)) { free(srq->wrid); - return -1; + return ENOMEM; } srq->head = 0; - srq->tail = srq->max_wqe - 1; + srq->tail = srq->wqe_cnt - 1; return 0; } @@ -486,19 +463,16 @@ struct ibv_srq *hns_roce_u_create_srq(struct ibv_pd *pd, if 
(pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE)) goto out; - srq->max_wqe = align_queue_size(init_attr->attr.max_wr + 1); + srq->wqe_cnt = roundup_pow_of_two(init_attr->attr.max_wr + 1); srq->max_gs = init_attr->attr.max_sge; - ret = hns_roce_create_idx_que(pd, srq); - if (ret) { - fprintf(stderr, "hns_roce_create_idx_que failed!\n"); + ret = hns_roce_create_idx_que(srq); + if (ret) goto out; - } - if (hns_roce_alloc_srq_buf(pd, &init_attr->attr, srq)) { - fprintf(stderr, "hns_roce_alloc_srq_buf failed!\n"); + ret = hns_roce_alloc_srq_buf(srq); + if (ret) goto err_idx_que; - } srq->db = hns_roce_alloc_db(to_hr_ctx(pd->context), HNS_ROCE_QP_TYPE_DB); @@ -591,40 +565,32 @@ static int hns_roce_verify_qp(struct ibv_qp_init_attr *attr, if (attr->cap.max_recv_wr && attr->cap.max_recv_wr < min_wqe_num) attr->cap.max_recv_wr = min_wqe_num; - if (attr->cap.max_recv_sge < 1) - attr->cap.max_recv_sge = 1; - if ((attr->qp_type != IBV_QPT_RC) && (attr->qp_type != IBV_QPT_UD)) return EINVAL; - if ((attr->qp_type == IBV_QPT_RC) && - (attr->cap.max_inline_data > HNS_ROCE_MAX_INLINE_DATA_LEN)) - return EINVAL; - return 0; } static int hns_roce_alloc_recv_inl_buf(struct ibv_qp_cap *cap, struct hns_roce_qp *qp) { + unsigned int cnt; int i; - qp->rq_rinl_buf.wqe_list = calloc(qp->rq.wqe_cnt, + cnt = qp->rq_rinl_buf.wqe_cnt; + qp->rq_rinl_buf.wqe_list = calloc(cnt, sizeof(struct hns_roce_rinl_wqe)); if (!qp->rq_rinl_buf.wqe_list) - return -1; - - qp->rq_rinl_buf.wqe_cnt = qp->rq.wqe_cnt; + return ENOMEM; - qp->rq_rinl_buf.wqe_list[0].sg_list = - calloc(qp->rq.wqe_cnt * cap->max_recv_sge, - sizeof(struct hns_roce_rinl_sge)); + qp->rq_rinl_buf.wqe_list[0].sg_list = calloc(cnt * cap->max_recv_sge, + sizeof(struct hns_roce_rinl_sge)); if (!qp->rq_rinl_buf.wqe_list[0].sg_list) { free(qp->rq_rinl_buf.wqe_list); - return -1; + return ENOMEM; } - for (i = 0; i < qp->rq_rinl_buf.wqe_cnt; i++) { + for (i = 0; i < cnt; i++) { int wqe_size = i * cap->max_recv_sge; qp->rq_rinl_buf.wqe_list[i].sg_list = @@ -634,72 +600,64 @@ static int hns_roce_alloc_recv_inl_buf(struct ibv_qp_cap *cap, return 0; } -static int hns_roce_calc_qp_buff_size(struct ibv_pd *pd, struct ibv_qp_cap *cap, - enum ibv_qp_type type, - struct hns_roce_qp *qp) +static void hns_roce_free_recv_inl_buf(struct hns_roce_qp *qp) { - int page_size = to_hr_dev(pd->context->device)->page_size; + if (qp->rq_rinl_buf.wqe_list) { + if (qp->rq_rinl_buf.wqe_list[0].sg_list) { + free(qp->rq_rinl_buf.wqe_list[0].sg_list); + qp->rq_rinl_buf.wqe_list[0].sg_list = NULL; + } - if (to_hr_dev(pd->context->device)->hw_version == HNS_ROCE_HW_VER1) { - qp->rq.wqe_shift = hr_ilog32(sizeof(struct hns_roce_rc_rq_wqe)); + free(qp->rq_rinl_buf.wqe_list); + qp->rq_rinl_buf.wqe_list = NULL; + } +} - qp->buf_size = align((qp->sq.wqe_cnt << qp->sq.wqe_shift), - page_size) + - (qp->rq.wqe_cnt << qp->rq.wqe_shift); - - if (qp->rq.wqe_shift > qp->sq.wqe_shift) { - qp->rq.offset = 0; - qp->sq.offset = qp->rq.wqe_cnt << qp->rq.wqe_shift; - } else { - qp->rq.offset = align((qp->sq.wqe_cnt << - qp->sq.wqe_shift), page_size); - qp->sq.offset = 0; - } +static int calc_qp_buff_size(struct ibv_pd *pd, struct hns_roce_qp *qp) +{ + struct hns_roce_wq *sq, *rq; + unsigned int size; + + if (to_hr_dev(pd->context->device)->hw_version == HNS_ROCE_HW_VER1 && + qp->rq.wqe_shift > qp->sq.wqe_shift) { + sq = &qp->rq; + rq = &qp->sq; } else { - unsigned int rqwqe_size = HNS_ROCE_SGE_SIZE * cap->max_recv_sge; + sq = &qp->sq; + rq = &qp->rq; + } - qp->rq.wqe_shift = hr_ilog32(rqwqe_size); + qp->buf_size 
= 0; - if (qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE || type == IBV_QPT_UD) - qp->ex_sge.sge_shift = HNS_ROCE_SGE_SHIFT; - else - qp->ex_sge.sge_shift = 0; + /* SQ WQE */ + sq->offset = 0; + size = to_hr_hem_entries_size(sq->wqe_cnt, sq->wqe_shift); + qp->buf_size += size; - /* alloc recv inline buf */ - if (hns_roce_alloc_recv_inl_buf(cap, qp)) - return -1; - - qp->buf_size = align((qp->sq.wqe_cnt << qp->sq.wqe_shift), - page_size) + - align((qp->ex_sge.sge_cnt << - qp->ex_sge.sge_shift), - page_size) + - (qp->rq.wqe_cnt << qp->rq.wqe_shift); - - if (qp->ex_sge.sge_cnt) { - qp->sq.offset = 0; - qp->ex_sge.offset = align((qp->sq.wqe_cnt << - qp->sq.wqe_shift), - page_size); - qp->rq.offset = qp->ex_sge.offset + - align((qp->ex_sge.sge_cnt << - qp->ex_sge.sge_shift), - page_size); - } else { - qp->sq.offset = 0; - qp->ex_sge.offset = 0; - qp->rq.offset = align((qp->sq.wqe_cnt << - qp->sq.wqe_shift), page_size); - } + /* extend SGE WQE in SQ */ + qp->ex_sge.offset = qp->buf_size; + if (qp->ex_sge.sge_cnt > 0) { + size = to_hr_hem_entries_size(qp->ex_sge.sge_cnt, + qp->ex_sge.sge_shift); + qp->buf_size += size; } + /* RQ WQE */ + rq->offset = qp->buf_size; + size = to_hr_hem_entries_size(rq->wqe_cnt, rq->wqe_shift); + qp->buf_size += size; + + if (qp->buf_size < 1) + return EINVAL; + return 0; } static int hns_roce_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, - enum ibv_qp_type type, struct hns_roce_qp *qp) + struct hns_roce_qp *qp) { - int page_size = to_hr_dev(pd->context->device)->page_size; + if (calc_qp_buff_size(pd, qp)) + return EINVAL; qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(uint64_t)); if (!qp->sq.wrid) @@ -707,28 +665,51 @@ static int hns_roce_alloc_qp_buf(struct ibv_pd *pd, struct ibv_qp_cap *cap, if (qp->rq.wqe_cnt) { qp->rq.wrid = malloc(qp->rq.wqe_cnt * sizeof(uint64_t)); - if (!qp->rq.wrid) { - free(qp->sq.wrid); - return -1; - } + if (!qp->rq.wrid) + goto err_alloc; } - if (hns_roce_calc_qp_buff_size(pd, cap, type, qp)) { - if (qp->rq.wqe_cnt) - free(qp->rq.wrid); - free(qp->sq.wrid); - return -1; + if (qp->rq_rinl_buf.wqe_cnt) { + if (hns_roce_alloc_recv_inl_buf(cap, qp)) + goto err_alloc; } - if (hns_roce_alloc_buf(&qp->buf, align(qp->buf_size, page_size), - to_hr_dev(pd->context->device)->page_size)) { - if (qp->rq.wqe_cnt) - free(qp->rq.wrid); + if (hns_roce_alloc_buf(&qp->buf, qp->buf_size, HNS_HW_PAGE_SIZE)) + goto err_alloc; + + return 0; + +err_alloc: + hns_roce_free_recv_inl_buf(qp); + if (qp->rq.wrid) + free(qp->rq.wrid); + + if (qp->sq.wrid) free(qp->sq.wrid); - return -1; + + return ENOMEM; +} + +static void set_extend_sge_param(struct hns_roce_device *hr_dev, + struct ibv_qp_init_attr *attr, + struct hns_roce_qp *qp, unsigned int wr_cnt) +{ + int cnt = 0; + + if (hr_dev->hw_version == HNS_ROCE_HW_VER1) { + qp->sq.max_gs = HNS_ROCE_SGE_IN_WQE; + } else { + qp->sq.max_gs = attr->cap.max_send_sge; + if (attr->qp_type == IBV_QPT_UD) + cnt = roundup_pow_of_two(wr_cnt * qp->sq.max_gs); + else if (qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) + cnt = roundup_pow_of_two(wr_cnt * + (qp->sq.max_gs - + HNS_ROCE_SGE_IN_WQE)); } - return 0; + qp->ex_sge.sge_shift = HNS_ROCE_SGE_SHIFT; + qp->ex_sge.sge_cnt = cnt; } static void hns_roce_set_qp_params(struct ibv_pd *pd, @@ -736,43 +717,42 @@ static void hns_roce_set_qp_params(struct ibv_pd *pd, struct hns_roce_qp *qp, struct hns_roce_context *ctx) { - unsigned int sge_ex_count; + struct hns_roce_device *hr_dev = to_hr_dev(pd->context->device); + unsigned int cnt; - if (to_hr_dev(pd->context->device)->hw_version == 
HNS_ROCE_HW_VER1) { - qp->sq.wqe_cnt = align_qp_size(attr->cap.max_send_wr); - qp->rq.wqe_cnt = align_qp_size(attr->cap.max_recv_wr); - } else { - qp->sq.wqe_cnt = align_queue_size(attr->cap.max_send_wr); - qp->rq.wqe_cnt = align_queue_size(attr->cap.max_recv_wr); - } + /* RQ WQE */ + qp->rq.max_gs = max(1U, attr->cap.max_recv_sge); + if (hr_dev->hw_version == HNS_ROCE_HW_VER1) + qp->rq.wqe_shift = hr_ilog32(sizeof(struct hns_roce_rc_rq_wqe)); + else + qp->rq.wqe_shift = hr_ilog32(HNS_ROCE_SGE_SIZE * qp->rq.max_gs); + cnt = roundup_pow_of_two(attr->cap.max_recv_wr); + qp->rq.wqe_cnt = cnt; + qp->rq.shift = hr_ilog32(cnt); + if (to_hr_dev(pd->context->device)->hw_version == HNS_ROCE_HW_VER1) + qp->rq_rinl_buf.wqe_cnt = 0; + else + qp->rq_rinl_buf.wqe_cnt = cnt; + + /* SQ WQE */ qp->sq.wqe_shift = hr_ilog32(sizeof(struct hns_roce_rc_send_wqe)); - qp->sq.shift = hr_ilog32(qp->sq.wqe_cnt); - qp->rq.max_gs = attr->cap.max_recv_sge; + cnt = roundup_pow_of_two(attr->cap.max_send_wr); + qp->sq.wqe_cnt = cnt; + qp->sq.shift = hr_ilog32(cnt); - if (to_hr_dev(pd->context->device)->hw_version == HNS_ROCE_HW_VER1) { - qp->sq.max_gs = HNS_ROCE_SGE_IN_WQE; - } else { - qp->sq.max_gs = attr->cap.max_send_sge; - if (qp->sq.max_gs > HNS_ROCE_SGE_IN_WQE) { - sge_ex_count = qp->sq.wqe_cnt * - (qp->sq.max_gs - HNS_ROCE_SGE_IN_WQE); - qp->ex_sge.sge_cnt = align_queue_size(sge_ex_count); - } else { - qp->ex_sge.sge_cnt = 0; - } - } + set_extend_sge_param(hr_dev, attr, qp, cnt); + + qp->ibv_qp.qp_type = attr->qp_type; /* limit by the context queried during alloc context */ - qp->sq.max_post = min(ctx->max_qp_wr, qp->sq.wqe_cnt); + qp->sq.max_post = min(ctx->max_qp_wr, cnt); qp->sq.max_gs = min(ctx->max_sge, qp->sq.max_gs); qp->sq_signal_bits = attr->sq_sig_all ? 0 : 1; - qp->max_inline_data = HNS_ROCE_MAX_INLINE_DATA_LEN; /* update attr for creating qp */ attr->cap.max_send_wr = qp->sq.max_post; - attr->cap.max_inline_data = qp->max_inline_data; } static int get_sq_db_addr(struct ibv_pd *pd, struct ibv_qp_init_attr *attr, @@ -845,18 +825,14 @@ struct ibv_qp *hns_roce_u_create_qp(struct ibv_pd *pd, return NULL; } - qp = malloc(sizeof(*qp)); - if (!qp) { - fprintf(stderr, "malloc failed!\n"); + qp = calloc(1, sizeof(*qp)); + if (!qp) return NULL; - } hns_roce_set_qp_params(pd, attr, qp, context); - if (hns_roce_alloc_qp_buf(pd, &attr->cap, attr->qp_type, qp)) { - fprintf(stderr, "hns_roce_alloc_qp_buf failed!\n"); + if (hns_roce_alloc_qp_buf(pd, &attr->cap, qp)) goto err_buf; - } hns_roce_init_qp_indices(qp); @@ -894,6 +870,7 @@ struct ibv_qp *hns_roce_u_create_qp(struct ibv_pd *pd, } pthread_mutex_unlock(&context->qp_table_mutex); + qp->max_inline_data = attr->cap.max_inline_data; /* adjust rq maxima to not exceed reported device maxima */ attr->cap.max_recv_wr = min(context->max_qp_wr, attr->cap.max_recv_wr); attr->cap.max_recv_sge = min(context->max_sge, attr->cap.max_recv_sge); diff --git a/providers/mlx4/cq.c b/providers/mlx4/cq.c index be3009c..61313b8 100644 --- a/providers/mlx4/cq.c +++ b/providers/mlx4/cq.c @@ -58,11 +58,11 @@ static struct mlx4_cqe *get_cqe(struct mlx4_cq *cq, int entry) static void *get_sw_cqe(struct mlx4_cq *cq, int n) { - struct mlx4_cqe *cqe = get_cqe(cq, n & cq->ibv_cq.cqe); + struct mlx4_cqe *cqe = get_cqe(cq, n & cq->verbs_cq.cq.cqe); struct mlx4_cqe *tcqe = cq->cqe_size == 64 ? cqe + 1 : cqe; return (!!(tcqe->owner_sr_opcode & MLX4_CQE_OWNER_MASK) ^ - !!(n & (cq->ibv_cq.cqe + 1))) ? NULL : cqe; + !!(n & (cq->verbs_cq.cq.cqe + 1))) ? 
NULL : cqe; } static struct mlx4_cqe *next_cqe_sw(struct mlx4_cq *cq) @@ -206,7 +206,7 @@ static inline int mlx4_parse_cqe(struct mlx4_cq *cq, int is_send; enum ibv_wc_status *pstatus; - mctx = to_mctx(cq->ibv_cq.context); + mctx = to_mctx(cq->verbs_cq.cq.context); qpn = be32toh(cqe->vlan_my_qpn) & MLX4_CQE_QPN_MASK; if (lazy) { cq->cqe = cqe; @@ -243,7 +243,7 @@ static inline int mlx4_parse_cqe(struct mlx4_cq *cq, to_msrq((*cur_qp)->verbs_qp.qp.srq) : NULL; } - pwr_id = lazy ? &cq->ibv_cq.wr_id : &wc->wr_id; + pwr_id = lazy ? &cq->verbs_cq.cq_ex.wr_id : &wc->wr_id; if (is_send) { wq = &(*cur_qp)->sq; wqe_index = be16toh(cqe->wqe_index); @@ -260,7 +260,7 @@ static inline int mlx4_parse_cqe(struct mlx4_cq *cq, ++wq->tail; } - pstatus = lazy ? &cq->ibv_cq.status : &wc->status; + pstatus = lazy ? &cq->verbs_cq.cq_ex.status : &wc->status; if (is_error) { ecqe = (struct mlx4_err_cqe *)cqe; *pstatus = mlx4_handle_error_cqe(ecqe); @@ -610,33 +610,33 @@ void mlx4_cq_fill_pfns(struct mlx4_cq *cq, const struct ibv_cq_init_attr_ex *cq_ { if (cq->flags & MLX4_CQ_FLAGS_SINGLE_THREADED) { - cq->ibv_cq.start_poll = mlx4_start_poll; - cq->ibv_cq.end_poll = mlx4_end_poll; + cq->verbs_cq.cq_ex.start_poll = mlx4_start_poll; + cq->verbs_cq.cq_ex.end_poll = mlx4_end_poll; } else { - cq->ibv_cq.start_poll = mlx4_start_poll_lock; - cq->ibv_cq.end_poll = mlx4_end_poll_lock; + cq->verbs_cq.cq_ex.start_poll = mlx4_start_poll_lock; + cq->verbs_cq.cq_ex.end_poll = mlx4_end_poll_lock; } - cq->ibv_cq.next_poll = mlx4_next_poll; + cq->verbs_cq.cq_ex.next_poll = mlx4_next_poll; - cq->ibv_cq.read_opcode = mlx4_cq_read_wc_opcode; - cq->ibv_cq.read_vendor_err = mlx4_cq_read_wc_vendor_err; - cq->ibv_cq.read_wc_flags = mlx4_cq_read_wc_flags; + cq->verbs_cq.cq_ex.read_opcode = mlx4_cq_read_wc_opcode; + cq->verbs_cq.cq_ex.read_vendor_err = mlx4_cq_read_wc_vendor_err; + cq->verbs_cq.cq_ex.read_wc_flags = mlx4_cq_read_wc_flags; if (cq_attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN) - cq->ibv_cq.read_byte_len = mlx4_cq_read_wc_byte_len; + cq->verbs_cq.cq_ex.read_byte_len = mlx4_cq_read_wc_byte_len; if (cq_attr->wc_flags & IBV_WC_EX_WITH_IMM) - cq->ibv_cq.read_imm_data = mlx4_cq_read_wc_imm_data; + cq->verbs_cq.cq_ex.read_imm_data = mlx4_cq_read_wc_imm_data; if (cq_attr->wc_flags & IBV_WC_EX_WITH_QP_NUM) - cq->ibv_cq.read_qp_num = mlx4_cq_read_wc_qp_num; + cq->verbs_cq.cq_ex.read_qp_num = mlx4_cq_read_wc_qp_num; if (cq_attr->wc_flags & IBV_WC_EX_WITH_SRC_QP) - cq->ibv_cq.read_src_qp = mlx4_cq_read_wc_src_qp; + cq->verbs_cq.cq_ex.read_src_qp = mlx4_cq_read_wc_src_qp; if (cq_attr->wc_flags & IBV_WC_EX_WITH_SLID) - cq->ibv_cq.read_slid = mlx4_cq_read_wc_slid; + cq->verbs_cq.cq_ex.read_slid = mlx4_cq_read_wc_slid; if (cq_attr->wc_flags & IBV_WC_EX_WITH_SL) - cq->ibv_cq.read_sl = mlx4_cq_read_wc_sl; + cq->verbs_cq.cq_ex.read_sl = mlx4_cq_read_wc_sl; if (cq_attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) - cq->ibv_cq.read_dlid_path_bits = mlx4_cq_read_wc_dlid_path_bits; + cq->verbs_cq.cq_ex.read_dlid_path_bits = mlx4_cq_read_wc_dlid_path_bits; if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP) - cq->ibv_cq.read_completion_ts = mlx4_cq_read_wc_completion_ts; + cq->verbs_cq.cq_ex.read_completion_ts = mlx4_cq_read_wc_completion_ts; } int mlx4_arm_cq(struct ibv_cq *ibvcq, int solicited) @@ -693,7 +693,7 @@ void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) * from our QP and therefore don't need to be checked. 
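 *
 * [Editorial aside, not part of the upstream patch: the cq->ibv_cq to
 * cq->verbs_cq renames throughout this file track the switch of struct
 * mlx4_cq's embedded CQ from a bare ibv_cq_ex to the libibverbs container
 * type, which (roughly, assuming the verbs_cq definition in
 * libibverbs/verbs.h at this rdma-core generation) overlays both views:
 *
 *	struct verbs_cq {
 *		union {
 *			struct ibv_cq		cq;
 *			struct ibv_cq_ex	cq_ex;
 *		};
 *	};
 *
 * Because the union members share one address, to_mcq() can recover the
 * provider CQ from a plain ibv_cq pointer via
 * container_of(ibcq, struct mlx4_cq, verbs_cq.cq), while the extended-CQ
 * poll callbacks are filled in through verbs_cq.cq_ex; see that header
 * for the authoritative layout.]
 *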
*/ for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index) - if (prod_index == cq->cons_index + cq->ibv_cq.cqe) + if (prod_index == cq->cons_index + cq->verbs_cq.cq.cqe) break; /* @@ -701,7 +701,7 @@ void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) * that match our QP by copying older entries on top of them. */ while ((int) --prod_index - (int) cq->cons_index >= 0) { - cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); + cqe = get_cqe(cq, prod_index & cq->verbs_cq.cq.cqe); cqe += cqe_inc; if (srq && srq->ext_srq && (be32toh(cqe->g_mlpath_rqpn) & MLX4_CQE_QPN_MASK) == srq->verbs_srq.srq_num && @@ -713,7 +713,7 @@ void __mlx4_cq_clean(struct mlx4_cq *cq, uint32_t qpn, struct mlx4_srq *srq) mlx4_free_srq_wqe(srq, be16toh(cqe->wqe_index)); ++nfreed; } else if (nfreed) { - dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe); + dest = get_cqe(cq, (prod_index + nfreed) & cq->verbs_cq.cq.cqe); dest += cqe_inc; owner_bit = dest->owner_sr_opcode & MLX4_CQE_OWNER_MASK; memcpy(dest, cqe, sizeof *cqe); @@ -762,8 +762,8 @@ void mlx4_cq_resize_copy_cqes(struct mlx4_cq *cq, void *buf, int old_cqe) while ((mlx4dv_get_cqe_opcode(cqe)) != MLX4_CQE_OPCODE_RESIZE) { cqe->owner_sr_opcode = (cqe->owner_sr_opcode & ~MLX4_CQE_OWNER_MASK) | - (((i + 1) & (cq->ibv_cq.cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); - memcpy(buf + ((i + 1) & cq->ibv_cq.cqe) * cq->cqe_size, + (((i + 1) & (cq->verbs_cq.cq.cqe + 1)) ? MLX4_CQE_OWNER_MASK : 0); + memcpy(buf + ((i + 1) & cq->verbs_cq.cq.cqe) * cq->cqe_size, cqe - cqe_inc, cq->cqe_size); ++i; cqe = get_cqe(cq, (i & old_cqe)); diff --git a/providers/mlx4/mlx4.c b/providers/mlx4/mlx4.c index 0842ff0..c4a3c55 100644 --- a/providers/mlx4/mlx4.c +++ b/providers/mlx4/mlx4.c @@ -126,7 +126,7 @@ static const struct verbs_context_ops mlx4_ctx_ops = { .destroy_flow = mlx4_destroy_flow, .destroy_rwq_ind_table = mlx4_destroy_rwq_ind_table, .destroy_wq = mlx4_destroy_wq, - .get_srq_num = verbs_get_srq_num, + .get_srq_num = mlx4_get_srq_num, .modify_cq = mlx4_modify_cq, .modify_wq = mlx4_modify_wq, .open_qp = mlx4_open_qp, @@ -354,7 +354,7 @@ static int mlx4dv_get_cq(struct ibv_cq *cq_in, cq_out->arm_db = mcq->arm_db; cq_out->arm_sn = mcq->arm_sn; cq_out->cqe_size = mcq->cqe_size; - cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1; + cq_out->cqe_cnt = mcq->verbs_cq.cq.cqe + 1; mcq->flags |= MLX4_CQ_FLAGS_DV_OWNED; diff --git a/providers/mlx4/mlx4.h b/providers/mlx4/mlx4.h index 3c161e8..479c39d 100644 --- a/providers/mlx4/mlx4.h +++ b/providers/mlx4/mlx4.h @@ -159,7 +159,7 @@ enum { }; struct mlx4_cq { - struct ibv_cq_ex ibv_cq; + struct verbs_cq verbs_cq; struct mlx4_buf buf; struct mlx4_buf resize_buf; pthread_spinlock_t lock; @@ -268,7 +268,7 @@ static inline struct mlx4_pd *to_mpd(struct ibv_pd *ibpd) static inline struct mlx4_cq *to_mcq(struct ibv_cq *ibcq) { - return container_of((struct ibv_cq_ex *)ibcq, struct mlx4_cq, ibv_cq); + return container_of(ibcq, struct mlx4_cq, verbs_cq.cq); } static inline struct mlx4_srq *to_msrq(struct ibv_srq *ibsrq) @@ -319,6 +319,7 @@ int mlx4_free_pd(struct ibv_pd *pd); struct ibv_xrcd *mlx4_open_xrcd(struct ibv_context *context, struct ibv_xrcd_init_attr *attr); int mlx4_close_xrcd(struct ibv_xrcd *xrcd); +int mlx4_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num); struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access); diff --git a/providers/mlx4/srq.c b/providers/mlx4/srq.c index a02a932..987b7b7 100644 --- a/providers/mlx4/srq.c +++ b/providers/mlx4/srq.c @@ 
-267,7 +267,6 @@ struct ibv_srq *mlx4_create_xrc_srq(struct ibv_context *context, cmd.db_addr = (uintptr_t) srq->db; ret = ibv_cmd_create_srq_ex(context, &srq->verbs_srq, - sizeof(srq->verbs_srq), attr_ex, &cmd.ibv_cmd, sizeof cmd, &resp.ibv_resp, sizeof resp); diff --git a/providers/mlx4/verbs.c b/providers/mlx4/verbs.c index 9f39ecd..512297f 100644 --- a/providers/mlx4/verbs.c +++ b/providers/mlx4/verbs.c @@ -274,6 +274,17 @@ int mlx4_close_xrcd(struct ibv_xrcd *ib_xrcd) return 0; } +int mlx4_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) +{ + struct mlx4_srq *msrq = + container_of(srq, struct mlx4_srq, verbs_srq.srq); + + if (!msrq->verbs_srq.xrcd) + return EOPNOTSUPP; + *srq_num = msrq->verbs_srq.srq_num; + return 0; +} + struct ibv_mr *mlx4_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int access) { @@ -304,9 +315,6 @@ int mlx4_rereg_mr(struct verbs_mr *vmr, struct ibv_rereg_mr cmd; struct ib_uverbs_rereg_mr_resp resp; - if (flags & IBV_REREG_MR_KEEP_VALID) - return ENOTSUP; - return ibv_cmd_rereg_mr(vmr, flags, addr, length, (uintptr_t)addr, access, pd, @@ -416,7 +424,7 @@ static int mlx4_cmd_create_cq(struct ibv_context *context, ret = ibv_cmd_create_cq(context, cq_attr->cqe, cq_attr->channel, cq_attr->comp_vector, - ibv_cq_ex_to_cq(&cq->ibv_cq), + &cq->verbs_cq.cq, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (!ret) @@ -438,7 +446,7 @@ static int mlx4_cmd_create_cq_ex(struct ibv_context *context, cmd.db_addr = (uintptr_t) cq->set_ci_db; ret = ibv_cmd_create_cq_ex(context, cq_attr, - &cq->ibv_cq, &cmd.ibv_cmd, + &cq->verbs_cq, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); @@ -530,7 +538,7 @@ static struct ibv_cq_ex *create_cq(struct ibv_context *context, if (cq_alloc_flags & MLX4_CQ_FLAGS_EXTENDED) mlx4_cq_fill_pfns(cq, cq_attr); - return &cq->ibv_cq; + return &cq->verbs_cq.cq_ex; err_db: mlx4_free_db(to_mctx(context), MLX4_DB_TYPE_CQ, cq->set_ci_db); @@ -779,7 +787,7 @@ static int mlx4_cmd_create_qp_ex_rss(struct ibv_context *context, sizeof(cmd_ex.rx_hash_key)); ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, - sizeof(qp->verbs_qp), attr, &cmd_ex.ibv_cmd, + attr, &cmd_ex.ibv_cmd, sizeof(cmd_ex), &resp.ibv_resp, sizeof(resp)); return ret; @@ -838,7 +846,7 @@ static int mlx4_cmd_create_qp_ex(struct ibv_context *context, cmd_ex.drv_payload = cmd->drv_payload; ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, - sizeof(qp->verbs_qp), attr, &cmd_ex.ibv_cmd, + attr, &cmd_ex.ibv_cmd, sizeof(cmd_ex), &resp.ibv_resp, sizeof(resp)); return ret; @@ -970,7 +978,7 @@ static struct ibv_qp *create_qp_ex(struct ibv_context *context, ret = mlx4_cmd_create_qp_ex(context, attr, &cmd, qp); else ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, - sizeof(qp->verbs_qp), attr, + attr, &cmd.ibv_cmd, sizeof(cmd), &resp, sizeof(resp)); if (ret) diff --git a/providers/mlx5/CMakeLists.txt b/providers/mlx5/CMakeLists.txt index dc97642..b50e000 100644 --- a/providers/mlx5/CMakeLists.txt +++ b/providers/mlx5/CMakeLists.txt @@ -11,11 +11,12 @@ if (MLX5_MW_DEBUG) endif() rdma_shared_provider(mlx5 libmlx5.map - 1 1.13.${PACKAGE_VERSION} + 1 1.16.${PACKAGE_VERSION} buf.c cq.c dbrec.c dr_action.c + dr_buddy.c dr_crc32.c dr_dbg.c dr_devx.c @@ -24,6 +25,8 @@ rdma_shared_provider(mlx5 libmlx5.map dr_domain.c dr_rule.c dr_ste.c + dr_ste_v0.c + dr_ste_v1.c dr_table.c dr_send.c mlx5.c diff --git a/providers/mlx5/cq.c b/providers/mlx5/cq.c index 2b4f189..365623e 100644 --- a/providers/mlx5/cq.c +++ b/providers/mlx5/cq.c @@ -121,13 +121,13 @@ static void 
*get_cqe(struct mlx5_cq *cq, int n) static void *get_sw_cqe(struct mlx5_cq *cq, int n) { - void *cqe = get_cqe(cq, n & cq->ibv_cq.cqe); + void *cqe = get_cqe(cq, n & cq->verbs_cq.cq.cqe); struct mlx5_cqe64 *cqe64; cqe64 = (cq->cqe_sz == 64) ? cqe : cqe + 64; if (likely(mlx5dv_get_cqe_opcode(cqe64) != MLX5_CQE_INVALID) && - !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->ibv_cq.cqe + 1)))) { + !((cqe64->op_own & MLX5_CQE_OWNER_MASK) ^ !!(n & (cq->verbs_cq.cq.cqe + 1)))) { return cqe; } else { return NULL; @@ -191,7 +191,7 @@ static inline int handle_responder_lazy(struct mlx5_cq *cq, struct mlx5_cqe64 *c if (srq) { wqe_ctr = be16toh(cqe->wqe_counter); - cq->ibv_cq.wr_id = srq->wrid[wqe_ctr]; + cq->verbs_cq.cq_ex.wr_id = srq->wrid[wqe_ctr]; mlx5_free_srq_wqe(srq, wqe_ctr); if (cqe->op_own & MLX5_INLINE_SCATTER_32) err = mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe, @@ -209,7 +209,7 @@ static inline int handle_responder_lazy(struct mlx5_cq *cq, struct mlx5_cqe64 *c } wqe_ctr = wq->tail & (wq->wqe_cnt - 1); - cq->ibv_cq.wr_id = wq->wrid[wqe_ctr]; + cq->verbs_cq.cq_ex.wr_id = wq->wrid[wqe_ctr]; ++wq->tail; if (cqe->op_own & MLX5_INLINE_SCATTER_32) err = mlx5_copy_to_recv_wqe(qp, wqe_ctr, cqe, @@ -535,7 +535,7 @@ static inline int mlx5_get_next_cqe(struct mlx5_cq *cq, #ifdef MLX5_DEBUG { - struct mlx5_context *mctx = to_mctx(cq->ibv_cq.context); + struct mlx5_context *mctx = to_mctx(cq->verbs_cq.cq_ex.context); if (mlx5_debug_mask & MLX5_DBG_CQ_CQE) { FILE *fp = mctx->dbg_fp; @@ -560,12 +560,12 @@ static int handle_tag_matching(struct mlx5_cq *cq, struct mlx5_srq_op *op; uint16_t wqe_ctr; - cq->ibv_cq.status = IBV_WC_SUCCESS; + cq->verbs_cq.cq_ex.status = IBV_WC_SUCCESS; switch (cqe64->app_op) { case MLX5_CQE_APP_OP_TM_CONSUMED_MSG_SW_RDNV: case MLX5_CQE_APP_OP_TM_CONSUMED_SW_RDNV: case MLX5_CQE_APP_OP_TM_MSG_COMPLETION_CANCELED: - cq->ibv_cq.status = IBV_WC_TM_RNDV_INCOMPLETE; + cq->verbs_cq.cq_ex.status = IBV_WC_TM_RNDV_INCOMPLETE; SWITCH_FALLTHROUGH; case MLX5_CQE_APP_OP_TM_CONSUMED_MSG: @@ -576,17 +576,17 @@ static int handle_tag_matching(struct mlx5_cq *cq, if (!tag->expect_cqe) { mlx5_dbg(fp, MLX5_DBG_CQ, "got idx %d which wasn't added\n", be16toh(cqe64->app_info)); - cq->ibv_cq.status = IBV_WC_GENERAL_ERR; + cq->verbs_cq.cq_ex.status = IBV_WC_GENERAL_ERR; mlx5_spin_unlock(&srq->lock); return CQ_OK; } - cq->ibv_cq.wr_id = tag->wr_id; + cq->verbs_cq.cq_ex.wr_id = tag->wr_id; if (mlx5_cqe_app_op_tm_is_complete(cqe64->app_op)) mlx5_tm_release_tag(srq, tag); /* inline scatter 32 not supported for TM */ if (cqe64->op_own & MLX5_INLINE_SCATTER_64) { if (be32toh(cqe64->byte_cnt) > tag->size) - cq->ibv_cq.status = IBV_WC_LOC_LEN_ERR; + cq->verbs_cq.cq_ex.status = IBV_WC_LOC_LEN_ERR; else memcpy(tag->ptr, cqe64 - 1, be32toh(cqe64->byte_cnt)); @@ -596,7 +596,7 @@ static int handle_tag_matching(struct mlx5_cq *cq, case MLX5_CQE_APP_OP_TM_REMOVE: if (!(be32toh(cqe64->tm_cqe.success) & MLX5_TMC_SUCCESS)) - cq->ibv_cq.status = IBV_WC_TM_ERR; + cq->verbs_cq.cq_ex.status = IBV_WC_TM_ERR; SWITCH_FALLTHROUGH; case MLX5_CQE_APP_OP_TM_APPEND: @@ -605,7 +605,7 @@ static int handle_tag_matching(struct mlx5_cq *cq, #ifdef MLX5_DEBUG if (srq->op_tail == srq->op_head) { mlx5_dbg(fp, MLX5_DBG_CQ, "got unexpected list op CQE\n"); - cq->ibv_cq.status = IBV_WC_GENERAL_ERR; + cq->verbs_cq.cq_ex.status = IBV_WC_GENERAL_ERR; mlx5_spin_unlock(&srq->lock); return CQ_OK; } @@ -615,7 +615,7 @@ static int handle_tag_matching(struct mlx5_cq *cq, if (op->tag) { /* APPEND or REMOVE */ mlx5_tm_release_tag(srq, op->tag); if 
(cqe64->app_op == MLX5_CQE_APP_OP_TM_REMOVE && - cq->ibv_cq.status == IBV_WC_SUCCESS) + cq->verbs_cq.cq_ex.status == IBV_WC_SUCCESS) /* * If tag entry was successfully removed we * don't expect consumption completion for it @@ -629,7 +629,7 @@ static int handle_tag_matching(struct mlx5_cq *cq, } to_mqp(srq->cmd_qp)->sq.tail = op->wqe_head + 1; - cq->ibv_cq.wr_id = op->wr_id; + cq->verbs_cq.cq_ex.wr_id = op->wr_id; mlx5_spin_unlock(&srq->lock); break; @@ -642,7 +642,7 @@ static int handle_tag_matching(struct mlx5_cq *cq, case MLX5_CQE_APP_OP_TM_NO_TAG: wqe_ctr = be16toh(cqe64->wqe_counter); - cq->ibv_cq.wr_id = srq->wrid[wqe_ctr]; + cq->verbs_cq.cq_ex.wr_id = srq->wrid[wqe_ctr]; mlx5_free_srq_wqe(srq, wqe_ctr); if (cqe64->op_own & MLX5_INLINE_SCATTER_32) return mlx5_copy_to_recv_srq(srq, wqe_ctr, cqe64, @@ -698,7 +698,7 @@ again: is_srq = 0; err = 0; - mctx = to_mctx(ibv_cq_ex_to_cq(&cq->ibv_cq)->context); + mctx = to_mctx(cq->verbs_cq.cq.context); qpn = be32toh(cqe64->sop_drop_qpn) & 0xffffff; if (lazy) { cq->cqe64 = cqe64; @@ -745,8 +745,8 @@ again: break; } - cq->ibv_cq.wr_id = wq->wrid[idx]; - cq->ibv_cq.status = err; + cq->verbs_cq.cq_ex.wr_id = wq->wrid[idx]; + cq->verbs_cq.cq_ex.status = err; } else { handle_good_req(wc, cqe64, wq, idx); @@ -776,7 +776,7 @@ again: if (lazy) { if (likely(cqe64->app != MLX5_CQE_APP_TAG_MATCHING)) { - cq->ibv_cq.status = handle_responder_lazy + cq->verbs_cq.cq_ex.status = handle_responder_lazy (cq, cqe64, *cur_rsc, is_srq ? *cur_srq : NULL); } else { @@ -813,7 +813,7 @@ again: srqn_uidx = be32toh(cqe64->srqn_uidx) & 0xffffff; ecqe = (struct mlx5_err_cqe *)cqe64; { - enum ibv_wc_status *pstatus = lazy ? &cq->ibv_cq.status : &wc->status; + enum ibv_wc_status *pstatus = lazy ? &cq->verbs_cq.cq_ex.status : &wc->status; *pstatus = mlx5_handle_error_cqe(ecqe); } @@ -844,7 +844,7 @@ again: wqe_ctr = be16toh(cqe64->wqe_counter); idx = wqe_ctr & (wq->wqe_cnt - 1); if (lazy) - cq->ibv_cq.wr_id = wq->wrid[idx]; + cq->verbs_cq.cq_ex.wr_id = wq->wrid[idx]; else wc->wr_id = wq->wrid[idx]; wq->tail = wq->wqe_head[idx] + 1; @@ -868,7 +868,7 @@ again: } if (lazy) - cq->ibv_cq.wr_id = (*cur_srq)->wrid[wqe_ctr]; + cq->verbs_cq.cq_ex.wr_id = (*cur_srq)->wrid[wqe_ctr]; else wc->wr_id = (*cur_srq)->wrid[wqe_ctr]; mlx5_free_srq_wqe(*cur_srq, wqe_ctr); @@ -883,7 +883,7 @@ again: } if (lazy) - cq->ibv_cq.wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; + cq->verbs_cq.cq_ex.wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; else wc->wr_id = wq->wrid[wq->tail & (wq->wqe_cnt - 1)]; ++wq->tail; @@ -1608,39 +1608,39 @@ int mlx5_cq_fill_pfns(struct mlx5_cq *cq, ((cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK) ? 
CLOCK_UPDATE : 0)]; - cq->ibv_cq.start_poll = poll_ops->start_poll; - cq->ibv_cq.next_poll = poll_ops->next_poll; - cq->ibv_cq.end_poll = poll_ops->end_poll; + cq->verbs_cq.cq_ex.start_poll = poll_ops->start_poll; + cq->verbs_cq.cq_ex.next_poll = poll_ops->next_poll; + cq->verbs_cq.cq_ex.end_poll = poll_ops->end_poll; - cq->ibv_cq.read_opcode = mlx5_cq_read_wc_opcode; - cq->ibv_cq.read_vendor_err = mlx5_cq_read_wc_vendor_err; - cq->ibv_cq.read_wc_flags = mlx5_cq_read_wc_flags; + cq->verbs_cq.cq_ex.read_opcode = mlx5_cq_read_wc_opcode; + cq->verbs_cq.cq_ex.read_vendor_err = mlx5_cq_read_wc_vendor_err; + cq->verbs_cq.cq_ex.read_wc_flags = mlx5_cq_read_wc_flags; if (cq_attr->wc_flags & IBV_WC_EX_WITH_BYTE_LEN) - cq->ibv_cq.read_byte_len = mlx5_cq_read_wc_byte_len; + cq->verbs_cq.cq_ex.read_byte_len = mlx5_cq_read_wc_byte_len; if (cq_attr->wc_flags & IBV_WC_EX_WITH_IMM) - cq->ibv_cq.read_imm_data = mlx5_cq_read_wc_imm_data; + cq->verbs_cq.cq_ex.read_imm_data = mlx5_cq_read_wc_imm_data; if (cq_attr->wc_flags & IBV_WC_EX_WITH_QP_NUM) - cq->ibv_cq.read_qp_num = mlx5_cq_read_wc_qp_num; + cq->verbs_cq.cq_ex.read_qp_num = mlx5_cq_read_wc_qp_num; if (cq_attr->wc_flags & IBV_WC_EX_WITH_SRC_QP) - cq->ibv_cq.read_src_qp = mlx5_cq_read_wc_src_qp; + cq->verbs_cq.cq_ex.read_src_qp = mlx5_cq_read_wc_src_qp; if (cq_attr->wc_flags & IBV_WC_EX_WITH_SLID) - cq->ibv_cq.read_slid = mlx5_cq_read_wc_slid; + cq->verbs_cq.cq_ex.read_slid = mlx5_cq_read_wc_slid; if (cq_attr->wc_flags & IBV_WC_EX_WITH_SL) - cq->ibv_cq.read_sl = mlx5_cq_read_wc_sl; + cq->verbs_cq.cq_ex.read_sl = mlx5_cq_read_wc_sl; if (cq_attr->wc_flags & IBV_WC_EX_WITH_DLID_PATH_BITS) - cq->ibv_cq.read_dlid_path_bits = mlx5_cq_read_wc_dlid_path_bits; + cq->verbs_cq.cq_ex.read_dlid_path_bits = mlx5_cq_read_wc_dlid_path_bits; if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP) - cq->ibv_cq.read_completion_ts = mlx5_cq_read_wc_completion_ts; + cq->verbs_cq.cq_ex.read_completion_ts = mlx5_cq_read_wc_completion_ts; if (cq_attr->wc_flags & IBV_WC_EX_WITH_CVLAN) - cq->ibv_cq.read_cvlan = mlx5_cq_read_wc_cvlan; + cq->verbs_cq.cq_ex.read_cvlan = mlx5_cq_read_wc_cvlan; if (cq_attr->wc_flags & IBV_WC_EX_WITH_FLOW_TAG) - cq->ibv_cq.read_flow_tag = mlx5_cq_read_flow_tag; + cq->verbs_cq.cq_ex.read_flow_tag = mlx5_cq_read_flow_tag; if (cq_attr->wc_flags & IBV_WC_EX_WITH_TM_INFO) - cq->ibv_cq.read_tm_info = mlx5_cq_read_wc_tm_info; + cq->verbs_cq.cq_ex.read_tm_info = mlx5_cq_read_wc_tm_info; if (cq_attr->wc_flags & IBV_WC_EX_WITH_COMPLETION_TIMESTAMP_WALLCLOCK) { if (!mctx->clock_info_page) return EOPNOTSUPP; - cq->ibv_cq.read_completion_wallclock_ns = + cq->verbs_cq.cq_ex.read_completion_wallclock_ns = mlx5_cq_read_wc_completion_wallclock_ns; } @@ -1750,21 +1750,21 @@ void __mlx5_cq_clean(struct mlx5_cq *cq, uint32_t rsn, struct mlx5_srq *srq) * from our QP and therefore don't need to be checked. */ for (prod_index = cq->cons_index; get_sw_cqe(cq, prod_index); ++prod_index) - if (prod_index == cq->cons_index + cq->ibv_cq.cqe) + if (prod_index == cq->cons_index + cq->verbs_cq.cq.cqe) break; /* * Now sweep backwards through the CQ, removing CQ entries * that match our QP by copying older entries on top of them. */ - cqe_version = (to_mctx(cq->ibv_cq.context))->cqe_version; + cqe_version = (to_mctx(cq->verbs_cq.cq.context))->cqe_version; while ((int) --prod_index - (int) cq->cons_index >= 0) { - cqe = get_cqe(cq, prod_index & cq->ibv_cq.cqe); + cqe = get_cqe(cq, prod_index & cq->verbs_cq.cq.cqe); cqe64 = (cq->cqe_sz == 64) ? 
cqe : cqe + 64; if (free_res_cqe(cqe64, rsn, srq, cqe_version)) { ++nfreed; } else if (nfreed) { - dest = get_cqe(cq, (prod_index + nfreed) & cq->ibv_cq.cqe); + dest = get_cqe(cq, (prod_index + nfreed) & cq->verbs_cq.cq.cqe); dest64 = (cq->cqe_sz == 64) ? dest : dest + 64; owner_bit = dest64->op_own & MLX5_CQE_OWNER_MASK; memcpy(dest, cqe, cq->cqe_sz); diff --git a/providers/mlx5/dr_action.c b/providers/mlx5/dr_action.c index 5f457fa..de1fbb0 100644 --- a/providers/mlx5/dr_action.c +++ b/providers/mlx5/dr_action.c @@ -33,7 +33,6 @@ #include #include #include -#include #include "mlx5dv_dr.h" enum dr_action_domain { @@ -65,9 +64,12 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_NON_TERM, [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_MISS] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_REFORMAT] = { [DR_ACTION_TYP_QP] = DR_ACTION_STATE_TERM, @@ -75,6 +77,8 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, }, [DR_ACTION_STATE_MODIFY_HDR] = { @@ -83,6 +87,8 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_MODIFY_HDR, [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_NON_TERM] = { [DR_ACTION_TYP_DROP] = DR_ACTION_STATE_TERM, @@ -91,9 +97,12 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_TAG] = DR_ACTION_STATE_NON_TERM, [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_MISS] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_TERM] = { [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, @@ -108,6 +117,7 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_MISS] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_REFORMAT] = { [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, @@ -129,6 +139,7 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, + [DR_ACTION_TYP_MISS] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_TERM] = { [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, @@ 
-140,15 +151,20 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_MISS] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_REFORMAT] = { [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, }, @@ -156,6 +172,8 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_NON_TERM] = { @@ -163,10 +181,13 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_TNL_L2_TO_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_TNL_L3_TO_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_MISS] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_TERM] = { [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, @@ -179,20 +200,27 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, [DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_MISS] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_REFORMAT] = { [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_MODIFY_HDR] = { [DR_ACTION_TYP_FT] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_MODIFY_HDR, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, @@ -203,9 +231,12 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_NON_TERM, 
[DR_ACTION_TYP_MODIFY_HDR] = DR_ACTION_STATE_MODIFY_HDR, [DR_ACTION_TYP_METER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_SAMPLER] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_DEST_ARRAY] = DR_ACTION_STATE_TERM, [DR_ACTION_TYP_L2_TO_TNL_L2] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_L2_TO_TNL_L3] = DR_ACTION_STATE_REFORMAT, [DR_ACTION_TYP_VPORT] = DR_ACTION_STATE_TERM, + [DR_ACTION_TYP_MISS] = DR_ACTION_STATE_TERM, }, [DR_ACTION_STATE_TERM] = { [DR_ACTION_TYP_CTR] = DR_ACTION_STATE_TERM, @@ -213,150 +244,6 @@ static const enum dr_action_valid_state next_action_state[DR_ACTION_DOMAIN_MAX] }, }; -struct dr_action_modify_field_conv { - uint16_t hw_field; - uint8_t start; - uint8_t end; - uint8_t l3_type; - uint8_t l4_type; -}; - -static const struct dr_action_modify_field_conv dr_action_conv_arr[] = { - [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_1, .start = 16, .end = 47, - }, - [MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_1, .start = 0, .end = 15, - }, - [MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_2, .start = 32, .end = 47, - }, - [MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_0, .start = 16, .end = 47, - }, - [MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_0, .start = 0, .end = 15, - }, - [MLX5_ACTION_IN_FIELD_OUT_IP_DSCP] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_1, .start = 0, .end = 5, - }, - [MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_0, .start = 48, .end = 56, - .l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_TCP, - }, - [MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_0, .start = 0, .end = 15, - .l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_TCP, - }, - [MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_0, .start = 16, .end = 31, - .l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_TCP, - }, - [MLX5_ACTION_IN_FIELD_OUT_IP_TTL] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_1, .start = 8, .end = 15, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV4, - }, - [MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_1, .start = 8, .end = 15, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_0, .start = 0, .end = 15, - .l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_UDP, - }, - [MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_0, .start = 16, .end = 31, - .l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_UDP, - }, - [MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_3, .start = 32, .end = 63, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_3, .start = 0, .end = 31, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_4, .start = 32, .end = 63, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_4, .start = 0, .end = 31, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_0, .start = 32, .end = 63, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, - }, - 
[MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_0, .start = 0, .end = 31, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_2, .start = 32, .end = 63, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_2, .start = 0, .end = 31, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6, - }, - [MLX5_ACTION_IN_FIELD_OUT_SIPV4] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_0, .start = 0, .end = 31, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV4, - }, - [MLX5_ACTION_IN_FIELD_OUT_DIPV4] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L3_0, .start = 32, .end = 63, - .l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV4, - }, - [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGA] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_METADATA, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGB] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_METADATA, .start = 32, .end = 63, - }, - [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_0] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_0, .start = 32, .end = 63, - }, - [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_1] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_0, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_2] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_1, .start = 32, .end = 63, - }, - [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_3] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_1, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_4] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_2, .start = 32, .end = 63, - }, - [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_5] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_REG_2, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_1, .start = 32, .end = 63, - }, - [MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L4_1, .start = 0, .end = 31, - }, - [MLX5_ACTION_IN_FIELD_OUT_FIRST_VID] = { - .hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_L2_2, .start = 0, .end = 15, - }, -}; - -struct dr_action_apply_attr { - uint32_t modify_index; - uint16_t modify_actions; - uint32_t decap_index; - uint16_t decap_actions; - bool decap_with_vlan; - uint64_t final_icm_addr; - uint32_t flow_tag; - uint32_t ctr_id; - uint16_t gvmi; - uint32_t reformat_id; - uint32_t reformat_size; -}; - static enum mlx5dv_flow_action_packet_reformat_type dr_action_type_to_reformat_enum(enum dr_action_type action_type) { @@ -393,118 +280,27 @@ dr_action_reformat_to_action_type(enum mlx5dv_flow_action_packet_reformat_type t } } -static void dr_actions_init_next_ste(uint8_t **last_ste, - uint32_t *added_stes, - enum dr_ste_entry_type entry_type, - uint16_t gvmi) -{ - (*added_stes)++; - *last_ste += DR_STE_SIZE; - dr_ste_init(*last_ste, DR_STE_LU_TYPE_DONT_CARE, entry_type, gvmi); -} - -static void dr_actions_apply_tx(uint8_t *action_type_set, - uint8_t *last_ste, - struct dr_action_apply_attr *attr, - uint32_t *added_stes) -{ - /* We want to make sure the modify header comes before L2 - * encapsulation. 
The reason for that is that we support - * modify headers for outer headers only - */ - if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) { - dr_ste_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT); - dr_ste_set_rewrite_actions(last_ste, - attr->modify_actions, - attr->modify_index); - } - - if (action_type_set[DR_ACTION_TYP_L2_TO_TNL_L2] || - action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]) { - /* Modify header and encapsulation require a different STEs. - * Since modify header STE format doesn't support encapsulation - * tunneling_action. - */ - if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) - dr_actions_init_next_ste(&last_ste, - added_stes, - DR_STE_TYPE_TX, - attr->gvmi); - - dr_ste_set_tx_encap(last_ste, - attr->reformat_id, - attr->reformat_size, - action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]); - } - - if (action_type_set[DR_ACTION_TYP_CTR]) - dr_ste_set_counter_id(last_ste, attr->ctr_id); -} - -static void dr_actions_apply_rx(uint8_t *action_type_set, - uint8_t *last_ste, - struct dr_action_apply_attr *attr, - uint32_t *added_stes) -{ - if (action_type_set[DR_ACTION_TYP_CTR]) - dr_ste_set_counter_id(last_ste, attr->ctr_id); - - if (action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) { - dr_ste_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT); - dr_ste_set_rx_decap_l3(last_ste, attr->decap_with_vlan); - dr_ste_set_rewrite_actions(last_ste, - attr->decap_actions, - attr->decap_index); - } - - if (action_type_set[DR_ACTION_TYP_TNL_L2_TO_L2]) - dr_ste_set_rx_decap(last_ste); - - if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) { - if (dr_ste_get_entry_type(last_ste) == DR_STE_TYPE_MODIFY_PKT) - dr_actions_init_next_ste(&last_ste, - added_stes, - DR_STE_TYPE_MODIFY_PKT, - attr->gvmi); - else - dr_ste_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT); - - dr_ste_set_rewrite_actions(last_ste, - attr->modify_actions, - attr->modify_index); - } - - if (action_type_set[DR_ACTION_TYP_TAG]) { - if (dr_ste_get_entry_type(last_ste) == DR_STE_TYPE_MODIFY_PKT) - dr_actions_init_next_ste(&last_ste, - added_stes, - DR_STE_TYPE_RX, - attr->gvmi); - - dr_ste_rx_set_flow_tag(last_ste, attr->flow_tag); - } -} - /* Apply the actions on the rule STE array starting from the last_ste. * Actions might require more than one STE, new_num_stes will return * the new size of the STEs array, rule with actions. 
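 *
 * [Editorial aside, not part of the upstream patch: the rewrite below stops
 * building STE action bits inline and instead dispatches through the
 * per-device STE context, so one binary can drive both the STEv0 and STEv1
 * formats added in dr_ste_v0.c/dr_ste_v1.c. A minimal sketch of the
 * dispatch, using names introduced by this diff (dr_ste_ctx is assumed to
 * be a table of version-specific callbacks):
 *
 *	struct dr_ste_ctx *ste_ctx = dmn->ste_ctx;
 *
 *	if (ste_type == DR_STE_TYPE_RX)
 *		dr_ste_set_actions_rx(ste_ctx, action_type_set,
 *				      last_ste, attr, &added_stes);
 *	else
 *		dr_ste_set_actions_tx(ste_ctx, action_type_set,
 *				      last_ste, attr, &added_stes);
 *
 * Note also that writing the final hit address (previously the
 * dr_ste_set_hit_addr() call at the end of this function) appears to move
 * into the version-specific rx/tx helpers, which is why the tail of the
 * function shrinks to just accumulating *new_num_stes.]
 *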
 */
-static void dr_actions_apply(enum dr_ste_entry_type ste_type,
+static void dr_actions_apply(struct mlx5dv_dr_domain *dmn,
+			     enum dr_ste_entry_type ste_type,
 			     uint8_t *action_type_set,
 			     uint8_t *last_ste,
-			     struct dr_action_apply_attr *attr,
+			     struct dr_ste_actions_attr *attr,
 			     uint32_t *new_num_stes)
 {
+	struct dr_ste_ctx *ste_ctx = dmn->ste_ctx;
 	uint32_t added_stes = 0;

 	if (ste_type == DR_STE_TYPE_RX)
-		dr_actions_apply_rx(action_type_set, last_ste, attr, &added_stes);
+		dr_ste_set_actions_rx(ste_ctx, action_type_set,
+				      last_ste, attr, &added_stes);
 	else
-		dr_actions_apply_tx(action_type_set, last_ste, attr, &added_stes);
+		dr_ste_set_actions_tx(ste_ctx, action_type_set,
+				      last_ste, attr, &added_stes);

-	last_ste += added_stes * DR_STE_SIZE;
 	*new_num_stes += added_stes;
-
-	dr_ste_set_hit_addr(last_ste, attr->final_icm_addr, 1);
 }

 static enum dr_action_domain
@@ -556,7 +352,7 @@ int dr_actions_build_ste_arr(struct mlx5dv_dr_matcher *matcher,
 	struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn;
 	uint8_t action_type_set[DR_ACTION_TYP_MAX] = {};
 	uint32_t state = DR_ACTION_STATE_NO_ACTION;
-	struct dr_action_apply_attr attr = {};
+	struct dr_ste_actions_attr attr = {};
 	enum dr_action_domain action_domain;
 	uint8_t *last_ste;
 	int i;
@@ -590,14 +386,14 @@
 				action->dest_tbl->tx.s_anchor->chunk->icm_addr;
 			break;
 		case DR_ACTION_TYP_QP:
-		{
-			struct mlx5_qp *mlx5_qp = to_mqp(action->qp);
+			if (action->dest_qp.is_qp)
+				attr.final_icm_addr = to_mqp(action->dest_qp.qp)->tir_icm_addr;
+			else
+				attr.final_icm_addr = action->dest_qp.devx_tir->rx_icm_addr;

-			if (!mlx5_qp->tir_icm_addr) {
-				dr_dbg(dmn, "Unsupported QP for action\n");
-				goto out_invalid_arg;
-			}
-			attr.final_icm_addr = mlx5_qp->tir_icm_addr;
+			if (!attr.final_icm_addr) {
+				dr_dbg(dmn, "Unsupported TIR/QP for action\n");
+				goto out_invalid_arg;
 			}
 			break;
 		case DR_ACTION_TYP_CTR:
@@ -607,6 +403,7 @@
 		case DR_ACTION_TYP_TAG:
 			attr.flow_tag = action->flow_tag;
 			break;
+		case DR_ACTION_TYP_MISS:
 		case DR_ACTION_TYP_TNL_L2_TO_L2:
 			break;
 		case DR_ACTION_TYP_TNL_L3_TO_L2:
@@ -650,6 +447,25 @@
 				action->meter.rx_icm_addr :
 				action->meter.tx_icm_addr;
 			break;
+		case DR_ACTION_TYP_SAMPLER:
+			if (action->sampler.dmn != dmn) {
+				dr_dbg(dmn, "Sampler belongs to a different domain\n");
+				goto out_invalid_arg;
+			}
+			if (action->sampler.sampler_default->next_ft->level <=
+			    matcher->tbl->level) {
+				dr_dbg(dmn, "Sampler next table level should be higher than source table\n");
+				goto out_invalid_arg;
+			}
+
+			if (rx_rule) {
+				attr.final_icm_addr = action->sampler.sampler_default->rx_icm_addr;
+			} else {
+				attr.final_icm_addr = (action->sampler.sampler_restore) ?
+					action->sampler.sampler_restore->tx_icm_addr :
+					action->sampler.sampler_default->tx_icm_addr;
+			}
+			break;
 		case DR_ACTION_TYP_VPORT:
 			if (action->vport.dmn != dmn) {
 				dr_dbg(dmn, "Destination vport belongs to a different domain\n");
@@ -665,6 +481,16 @@
 				attr.final_icm_addr = action->vport.caps->icm_address_tx;
 			}
 			break;
+		case DR_ACTION_TYP_DEST_ARRAY:
+			if (action->dest_array.dmn != dmn) {
+				dr_dbg(dmn, "Destination array belongs to a different domain\n");
+				goto out_invalid_arg;
+			}
+
+			attr.final_icm_addr = rx_rule ?
+ action->dest_array.rx_icm_addr : + action->dest_array.tx_icm_addr; + break; default: goto out_invalid_arg; } @@ -687,7 +513,8 @@ int dr_actions_build_ste_arr(struct mlx5dv_dr_matcher *matcher, *new_hw_ste_arr_sz = nic_matcher->num_of_builders; last_ste = ste_arr + DR_STE_SIZE * (nic_matcher->num_of_builders - 1); - dr_actions_apply(nic_dmn->ste_type, + dr_actions_apply(dmn, + nic_dmn->ste_type, action_type_set, last_ste, &attr, @@ -721,6 +548,15 @@ int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, attr[i].type = MLX5DV_FLOW_ACTION_DEST_DEVX; attr[i].obj = actions[i]->dest_tbl->devx_obj; break; + case DR_ACTION_TYP_DEST_ARRAY: + if (actions[i]->dest_array.dmn != dmn) { + dr_dbg(dmn, "Destination array belongs to a different domain\n"); + errno = EINVAL; + return errno; + } + attr[i].type = MLX5DV_FLOW_ACTION_DEST_DEVX; + attr[i].obj = actions[i]->dest_array.devx_tbl->ft_dvo; + break; case DR_ACTION_TYP_TNL_L2_TO_L2: case DR_ACTION_TYP_L2_TO_TNL_L2: case DR_ACTION_TYP_TNL_L3_TO_L2: @@ -733,8 +569,13 @@ int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, attr[i].action = actions[i]->rewrite.flow_action; break; case DR_ACTION_TYP_QP: - attr[i].type = MLX5DV_FLOW_ACTION_DEST_IBV_QP; - attr[i].qp = actions[i]->qp; + if (actions[i]->dest_qp.is_qp) { + attr[i].type = MLX5DV_FLOW_ACTION_DEST_IBV_QP; + attr[i].qp = actions[i]->dest_qp.qp; + } else { + attr[i].type = MLX5DV_FLOW_ACTION_DEST_DEVX; + attr[i].obj = actions[i]->dest_qp.devx_tir; + } break; case DR_ACTION_TYP_CTR: attr[i].type = MLX5DV_FLOW_ACTION_COUNTERS_DEVX; @@ -749,6 +590,12 @@ int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, attr[i].type = MLX5DV_FLOW_ACTION_TAG; attr[i].tag_value = actions[i]->flow_tag; break; + case DR_ACTION_TYP_MISS: + attr[i].type = MLX5DV_FLOW_ACTION_DEFAULT_MISS; + break; + case DR_ACTION_TYP_DROP: + attr[i].type = MLX5DV_FLOW_ACTION_DROP; + break; default: dr_dbg(dmn, "Found unsupported action type: %d\n", actions[i]->action_type); @@ -759,104 +606,6 @@ int dr_actions_build_attr(struct mlx5dv_dr_matcher *matcher, return 0; } -#define SVLAN_ETHERTYPE 0x88a8 -#define HDR_LEN_L2_ONLY 14 -#define HDR_LEN_L2_VLAN 18 -#define REWRITE_HW_ACTION_NUM 6 - -static int dr_actions_l2_rewrite(struct mlx5dv_dr_domain *dmn, - struct mlx5dv_dr_action *action, - void *data, size_t data_sz) -{ - struct mlx5_ifc_l2_hdr_bits *l2_hdr = data; - uint64_t ops[REWRITE_HW_ACTION_NUM] = {}; - uint32_t hdr_fld_4b; - uint16_t hdr_fld_2b; - uint16_t vlan_type; - bool vlan; - int i = 0; - int ret; - - vlan = (data_sz != HDR_LEN_L2_ONLY); - - /* dmac_47_16 */ - DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); - DEVX_SET(dr_action_hw_set, ops + i, destination_length, 0); - DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_0); - DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 16); - hdr_fld_4b = DEVX_GET(l2_hdr, l2_hdr, dmac_47_16); - DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_4b); - i++; - - /* smac_47_16 */ - DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); - DEVX_SET(dr_action_hw_set, ops + i, destination_length, 0); - DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_1); - DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 16); - hdr_fld_4b = (DEVX_GET(l2_hdr, l2_hdr, smac_31_0) >> 16 | - DEVX_GET(l2_hdr, l2_hdr, smac_47_32) << 16); - DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_4b); - i++; - - /* dmac_15_0 */ - 
DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); - DEVX_SET(dr_action_hw_set, ops + i, destination_length, 16); - DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_0); - DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 0); - hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, dmac_15_0); - DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_2b); - i++; - - /* ethertype + (optional) vlan */ - DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); - DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_2); - DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 32); - if (!vlan) { - hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, ethertype); - DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_2b); - DEVX_SET(dr_action_hw_set, ops + i, destination_length, 16); - } else { - hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, ethertype); - vlan_type = hdr_fld_2b == SVLAN_ETHERTYPE ? DR_STE_SVLAN : DR_STE_CVLAN; - hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, vlan); - hdr_fld_4b = (vlan_type << 16) | hdr_fld_2b; - DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_4b); - DEVX_SET(dr_action_hw_set, ops + i, destination_length, 18); - } - i++; - - /* smac_15_0 */ - DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); - DEVX_SET(dr_action_hw_set, ops + i, destination_length, 16); - DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_1); - DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 0); - hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, smac_31_0); - DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_2b); - i++; - - if (vlan) { - DEVX_SET(dr_action_hw_set, ops + i, opcode, MLX5_DR_ACTION_MDFY_HW_OP_SET); - hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, vlan_type); - DEVX_SET(dr_action_hw_set, ops + i, inline_data, hdr_fld_2b); - DEVX_SET(dr_action_hw_set, ops + i, destination_length, 16); - DEVX_SET(dr_action_hw_set, ops + i, destination_field_code, MLX5_DR_ACTION_MDFY_HW_FLD_L2_2); - DEVX_SET(dr_action_hw_set, ops + i, destination_left_shifter, 0); - i++; - } - - action->rewrite.data = (void *)ops; - action->rewrite.num_of_actions = i; - action->rewrite.chunk->byte_size = i * sizeof(*ops); - - ret = dr_send_postsend_action(dmn, action); - if (ret) { - dr_dbg(dmn, "Writing encapsulation action to ICM failed\n"); - return ret; - } - - return 0; -} - static struct mlx5dv_dr_action * dr_action_create_generic(enum dr_action_type action_type) { @@ -879,6 +628,11 @@ struct mlx5dv_dr_action *mlx5dv_dr_action_create_drop(void) return dr_action_create_generic(DR_ACTION_TYP_DROP); } +struct mlx5dv_dr_action *mlx5dv_dr_action_create_default_miss(void) +{ + return dr_action_create_generic(DR_ACTION_TYP_MISS); +} + struct mlx5dv_dr_action * mlx5dv_dr_action_create_dest_ibv_qp(struct ibv_qp *ibqp) { @@ -893,8 +647,27 @@ mlx5dv_dr_action_create_dest_ibv_qp(struct ibv_qp *ibqp) if (!action) return NULL; - action->qp = ibqp; + action->dest_qp.is_qp = true; + action->dest_qp.qp = ibqp; + + return action; +} + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_dest_devx_tir(struct mlx5dv_devx_obj *devx_obj) +{ + struct mlx5dv_dr_action *action; + + if (devx_obj->type != MLX5_DEVX_TIR) { + errno = EINVAL; + return NULL; + } + + action = dr_action_create_generic(DR_ACTION_TYP_QP); + if (!action) + return NULL; + action->dest_qp.devx_tir = devx_obj; return action; } @@ -1058,25 +831,34 @@ dr_action_create_reformat_action(struct 
mlx5dv_dr_domain *dmn, } case DR_ACTION_TYP_TNL_L3_TO_L2: { + uint8_t hw_actions[ACTION_CACHE_LINE_SIZE] = {}; int ret; - /* Only Ethernet frame is supported, with VLAN (18) or without (14) */ - if (data_sz != HDR_LEN_L2_ONLY && data_sz != HDR_LEN_L2_VLAN) { - errno = EINVAL; - return errno; + ret = dr_ste_set_action_decap_l3_list(dmn->ste_ctx, + data, data_sz, + hw_actions, + ACTION_CACHE_LINE_SIZE, + &action->rewrite.num_of_actions); + if (ret) { + dr_dbg(dmn, "Failed creating decap l3 action list\n"); + return ret; } action->rewrite.chunk = dr_icm_alloc_chunk(dmn->action_icm_pool, DR_CHUNK_SIZE_8); - if (!action->rewrite.chunk) + if (!action->rewrite.chunk) { + dr_dbg(dmn, "Failed allocating modify header chunk\n"); return errno; + } + action->rewrite.data = (void *)hw_actions; action->rewrite.index = (action->rewrite.chunk->icm_addr - dmn->info.caps.hdr_modify_icm_addr) / ACTION_CACHE_LINE_SIZE; - ret = dr_actions_l2_rewrite(dmn, action, data, data_sz); + ret = dr_send_postsend_action(dmn, action); if (ret) { + dr_dbg(dmn, "Writing decap l3 actions to ICM failed\n"); dr_icm_free_chunk(action->rewrite.chunk); return ret; } @@ -1155,32 +937,13 @@ dec_ref: return NULL; } -static const struct dr_action_modify_field_conv * -dr_action_modify_get_hw_info(uint16_t sw_field) -{ - const struct dr_action_modify_field_conv *hw_action_info; - - if (sw_field >= ARRAY_SIZE(dr_action_conv_arr)) - goto not_found; - - hw_action_info = &dr_action_conv_arr[sw_field]; - if (!hw_action_info->end && !hw_action_info->start) - goto not_found; - - return hw_action_info; - -not_found: - errno = EINVAL; - return NULL; -} - static int dr_action_modify_sw_to_hw_add(struct mlx5dv_dr_domain *dmn, __be64 *sw_action, __be64 *hw_action, - const struct dr_action_modify_field_conv **ret_hw_info) + const struct dr_ste_action_modify_field **ret_hw_info) { - const struct dr_action_modify_field_conv *hw_action_info; + const struct dr_ste_action_modify_field *hw_action_info; uint8_t max_length; uint16_t sw_field; uint32_t data; @@ -1190,7 +953,7 @@ dr_action_modify_sw_to_hw_add(struct mlx5dv_dr_domain *dmn, data = DEVX_GET(set_action_in, sw_action, data); /* Convert SW data to HW modify action format */ - hw_action_info = dr_action_modify_get_hw_info(sw_field); + hw_action_info = dr_ste_conv_modify_hdr_sw_field(dmn->ste_ctx, sw_field); if (!hw_action_info) { dr_dbg(dmn, "Modify ADD action invalid field given\n"); errno = EINVAL; @@ -1199,15 +962,11 @@ dr_action_modify_sw_to_hw_add(struct mlx5dv_dr_domain *dmn, max_length = hw_action_info->end - hw_action_info->start + 1; - DEVX_SET(dr_action_hw_set, hw_action, opcode, - MLX5_DR_ACTION_MDFY_HW_OP_ADD); - DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, - hw_action_info->hw_field); - DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, - hw_action_info->start); - DEVX_SET(dr_action_hw_set, hw_action, destination_length, - max_length == 32 ? 
0 : max_length); - DEVX_SET(dr_action_hw_set, hw_action, inline_data, data); + dr_ste_set_action_add(dmn->ste_ctx, + hw_action, + hw_action_info->hw_field, + hw_action_info->start, + max_length, data); *ret_hw_info = hw_action_info; @@ -1218,9 +977,9 @@ static int dr_action_modify_sw_to_hw_set(struct mlx5dv_dr_domain *dmn, __be64 *sw_action, __be64 *hw_action, - const struct dr_action_modify_field_conv **ret_hw_info) + const struct dr_ste_action_modify_field **ret_hw_info) { - const struct dr_action_modify_field_conv *hw_action_info; + const struct dr_ste_action_modify_field *hw_action_info; uint8_t offset, length, max_length; uint16_t sw_field; uint32_t data; @@ -1232,7 +991,7 @@ dr_action_modify_sw_to_hw_set(struct mlx5dv_dr_domain *dmn, data = DEVX_GET(set_action_in, sw_action, data); /* Convert SW data to HW modify action format */ - hw_action_info = dr_action_modify_get_hw_info(sw_field); + hw_action_info = dr_ste_conv_modify_hdr_sw_field(dmn->ste_ctx, sw_field); if (!hw_action_info) { dr_dbg(dmn, "Modify SET action invalid field given\n"); errno = EINVAL; @@ -1249,15 +1008,11 @@ dr_action_modify_sw_to_hw_set(struct mlx5dv_dr_domain *dmn, return errno; } - DEVX_SET(dr_action_hw_set, hw_action, opcode, - MLX5_DR_ACTION_MDFY_HW_OP_SET); - DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, - hw_action_info->hw_field); - DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, - hw_action_info->start + offset); - DEVX_SET(dr_action_hw_set, hw_action, destination_length, - length == 32 ? 0 : length); - DEVX_SET(dr_action_hw_set, hw_action, inline_data, data); + dr_ste_set_action_set(dmn->ste_ctx, + hw_action, + hw_action_info->hw_field, + hw_action_info->start + offset, + length, data); *ret_hw_info = hw_action_info; @@ -1268,12 +1023,12 @@ static int dr_action_modify_sw_to_hw_copy(struct mlx5dv_dr_domain *dmn, __be64 *sw_action, __be64 *hw_action, - const struct dr_action_modify_field_conv **ret_dst_hw_info, - const struct dr_action_modify_field_conv **ret_src_hw_info) + const struct dr_ste_action_modify_field **ret_dst_hw_info, + const struct dr_ste_action_modify_field **ret_src_hw_info) { uint8_t src_offset, dst_offset, src_max_length, dst_max_length, length; - const struct dr_action_modify_field_conv *src_hw_action_info; - const struct dr_action_modify_field_conv *dst_hw_action_info; + const struct dr_ste_action_modify_field *src_hw_action_info; + const struct dr_ste_action_modify_field *dst_hw_action_info; uint16_t src_field, dst_field; /* Get SW modify action data */ @@ -1284,8 +1039,8 @@ dr_action_modify_sw_to_hw_copy(struct mlx5dv_dr_domain *dmn, length = DEVX_GET(copy_action_in, sw_action, length); /* Convert SW data to HW modify action format */ - src_hw_action_info = dr_action_modify_get_hw_info(src_field); - dst_hw_action_info = dr_action_modify_get_hw_info(dst_field); + src_hw_action_info = dr_ste_conv_modify_hdr_sw_field(dmn->ste_ctx, src_field); + dst_hw_action_info = dr_ste_conv_modify_hdr_sw_field(dmn->ste_ctx, dst_field); if (!src_hw_action_info || !dst_hw_action_info) { dr_dbg(dmn, "Modify COPY action invalid src/dst field given\n"); errno = EINVAL; @@ -1294,10 +1049,9 @@ dr_action_modify_sw_to_hw_copy(struct mlx5dv_dr_domain *dmn, /* Based on device specification value of 0 means 32 */ length = length ? 
length : 32; - src_max_length = src_hw_action_info->end - - src_hw_action_info->start + 1; - dst_max_length = dst_hw_action_info->end - - dst_hw_action_info->start + 1; + + src_max_length = src_hw_action_info->end - src_hw_action_info->start + 1; + dst_max_length = dst_hw_action_info->end - dst_hw_action_info->start + 1; if (length + src_offset > src_max_length || length + dst_offset > dst_max_length) { dr_dbg(dmn, "Modify action length exceeds limit\n"); @@ -1305,17 +1059,13 @@ dr_action_modify_sw_to_hw_copy(struct mlx5dv_dr_domain *dmn, return errno; } - DEVX_SET(dr_action_hw_copy, hw_action, opcode, - MLX5_DR_ACTION_MDFY_HW_OP_COPY); - DEVX_SET(dr_action_hw_copy, hw_action, destination_field_code, - dst_hw_action_info->hw_field); - DEVX_SET(dr_action_hw_copy, hw_action, destination_left_shifter, - dst_hw_action_info->start + dst_offset); - DEVX_SET(dr_action_hw_copy, hw_action, destination_length, length); - DEVX_SET(dr_action_hw_copy, hw_action, source_field_code, - src_hw_action_info->hw_field); - DEVX_SET(dr_action_hw_copy, hw_action, source_left_shifter, - src_hw_action_info->start + src_offset); + dr_ste_set_action_copy(dmn->ste_ctx, + hw_action, + dst_hw_action_info->hw_field, + dst_hw_action_info->start + dst_offset, + length, + src_hw_action_info->hw_field, + src_hw_action_info->start + src_offset); *ret_dst_hw_info = dst_hw_action_info; *ret_src_hw_info = src_hw_action_info; @@ -1327,8 +1077,8 @@ static int dr_action_modify_sw_to_hw(struct mlx5dv_dr_domain *dmn, __be64 *sw_action, __be64 *hw_action, - const struct dr_action_modify_field_conv **ret_dst_hw_info, - const struct dr_action_modify_field_conv **ret_src_hw_info) + const struct dr_ste_action_modify_field **ret_dst_hw_info, + const struct dr_ste_action_modify_field **ret_src_hw_info) { uint8_t action = DEVX_GET(set_action_in, sw_action, action_type); int ret = 0; @@ -1499,13 +1249,13 @@ static int dr_actions_convert_modify_header(struct mlx5dv_dr_action *action, __be64 hw_actions[], uint32_t *num_hw_actions) { - const struct dr_action_modify_field_conv *hw_dst_action_info; - const struct dr_action_modify_field_conv *hw_src_action_info; - uint16_t hw_field = MLX5_DR_ACTION_MDFY_HW_FLD_RESERVED; - uint32_t l3_type = MLX5_DR_ACTION_MDFY_HW_HDR_L3_NONE; - uint32_t l4_type = MLX5_DR_ACTION_MDFY_HW_HDR_L4_NONE; + const struct dr_ste_action_modify_field *hw_dst_action_info; + const struct dr_ste_action_modify_field *hw_src_action_info; struct mlx5dv_dr_domain *dmn = action->rewrite.dmn; int ret, i, hw_idx = 0; + uint16_t hw_field = 0; + uint32_t l3_type = 0; + uint32_t l4_type = 0; __be64 *sw_action; __be64 hw_action; @@ -1824,6 +1574,597 @@ struct mlx5dv_dr_action return action; } +static int +dr_action_convert_to_fte_dest(struct mlx5dv_dr_domain *dmn, + struct mlx5dv_dr_action *dest, + struct mlx5dv_dr_action *dest_reformat, + struct dr_devx_flow_fte_attr *fte_attr) +{ + struct dr_devx_flow_dest_info *dest_info = + &fte_attr->dest_arr[fte_attr->dest_size]; + + switch (dest->action_type) { + case DR_ACTION_TYP_MISS: + if (dmn->type != MLX5DV_DR_DOMAIN_TYPE_FDB) + goto err_exit; + + fte_attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest_info->type = MLX5_FLOW_DEST_TYPE_VPORT; + break; + case DR_ACTION_TYP_VPORT: + if (dmn->type != MLX5DV_DR_DOMAIN_TYPE_FDB) + goto err_exit; + + fte_attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + dest_info->type = MLX5_FLOW_DEST_TYPE_VPORT; + dest_info->vport_num = dest->vport.num; + break; + case DR_ACTION_TYP_QP: + fte_attr->action |= MLX5_FLOW_CONTEXT_ACTION_FWD_DEST; + 
dest_info->type = MLX5_FLOW_DEST_TYPE_TIR; + + if (dest->dest_qp.is_qp) + dest_info->tir_num = to_mqp(dest->dest_qp.qp)->tirn; + else + dest_info->tir_num = dest->dest_qp.devx_tir->object_id; + + break; + case DR_ACTION_TYP_CTR: + fte_attr->action |= MLX5_FLOW_CONTEXT_ACTION_COUNT; + dest_info->type = MLX5_FLOW_DEST_TYPE_COUNTER; + dest_info->counter_id = + dest->ctr.devx_obj->object_id + dest->ctr.offset; + break; + default: + goto err_exit; + } + + if (dest_reformat) { + switch (dest_reformat->action_type) { + case DR_ACTION_TYP_L2_TO_TNL_L2: + case DR_ACTION_TYP_L2_TO_TNL_L3: + if (dest_reformat->reformat.is_root_level) + goto err_exit; + + fte_attr->extended_dest = true; + dest_info->has_reformat = true; + dest_info->reformat_id = dest_reformat->reformat.dvo->object_id; + break; + default: + goto err_exit; + } + } + + fte_attr->dest_size++; + return 0; + +err_exit: + errno = EOPNOTSUPP; + return errno; +} + +static struct dr_devx_tbl_with_refs * +dr_action_create_sampler_term_tbl(struct mlx5dv_dr_domain *dmn, + struct mlx5dv_dr_flow_sampler_attr *attr) +{ + struct dr_devx_flow_table_attr ft_attr = {}; + struct dr_devx_flow_group_attr fg_attr = {}; + struct dr_devx_flow_fte_attr fte_attr = {}; + struct dr_devx_flow_dest_info *dest_info; + struct dr_devx_tbl_with_refs *term_tbl; + struct mlx5dv_dr_action **ref_actions; + uint32_t ref_index = 0; + uint32_t tbl_type; + uint32_t i; + + tbl_type = attr->default_next_table->table_type; + + dest_info = calloc(attr->num_sample_actions, + sizeof(struct dr_devx_flow_dest_info)); + if (!dest_info) { + errno = ENOMEM; + return NULL; + } + + term_tbl = calloc(1, sizeof(struct dr_devx_tbl_with_refs)); + if (!term_tbl) { + errno = ENOMEM; + goto free_dest_info; + } + + ref_actions = calloc(attr->num_sample_actions, + sizeof(struct mlx5dv_dr_action *)); + if (!ref_actions) { + errno = ENOMEM; + goto free_term_tbl; + } + + ft_attr.type = tbl_type; + ft_attr.level = dmn->info.caps.max_ft_level - 1; + ft_attr.term_tbl = true; + fte_attr.dest_arr = dest_info; + + for (i = 0; i < attr->num_sample_actions; i++) { + enum dr_action_type action_type = + attr->sample_actions[i]->action_type; + + atomic_fetch_add(&attr->sample_actions[i]->refcount, 1); + ref_actions[ref_index++] = attr->sample_actions[i]; + + switch (action_type) { + case DR_ACTION_TYP_MISS: + case DR_ACTION_TYP_VPORT: + if (dr_action_convert_to_fte_dest(dmn, attr->sample_actions[i], + NULL, &fte_attr)) + goto free_ref_actions; + + break; + case DR_ACTION_TYP_QP: + case DR_ACTION_TYP_CTR: + if (tbl_type != FS_FT_NIC_RX) { + errno = EOPNOTSUPP; + goto free_ref_actions; + } + + if (dr_action_convert_to_fte_dest(dmn, attr->sample_actions[i], + NULL, &fte_attr)) + goto free_ref_actions; + + break; + case DR_ACTION_TYP_TAG: + if (tbl_type != FS_FT_NIC_RX) { + errno = EOPNOTSUPP; + goto free_ref_actions; + } + + fte_attr.flow_tag = attr->sample_actions[i]->flow_tag; + break; + default: + errno = EOPNOTSUPP; + goto free_ref_actions; + } + } + + term_tbl->devx_tbl = dr_devx_create_always_hit_ft(dmn->ctx, &ft_attr, + &fg_attr, &fte_attr); + if (!term_tbl->devx_tbl) + goto free_ref_actions; + + term_tbl->ref_actions = ref_actions; + term_tbl->ref_actions_num = attr->num_sample_actions; + + free(dest_info); + return term_tbl; + +free_ref_actions: + for (i = 0; i < ref_index; i++) + atomic_fetch_sub(&ref_actions[i]->refcount, 1); + free(ref_actions); +free_term_tbl: + free(term_tbl); +free_dest_info: + free(dest_info); + + return NULL; +} + +static void +dr_action_destroy_sampler_term_tbl(struct 
dr_devx_tbl_with_refs *term_tbl) +{ + uint32_t i; + + dr_devx_destroy_always_hit_ft(term_tbl->devx_tbl); + + for (i = 0; i < term_tbl->ref_actions_num; i++) + atomic_fetch_sub(&term_tbl->ref_actions[i]->refcount, 1); + free(term_tbl->ref_actions); + free(term_tbl); +} + +static struct dr_flow_sampler * +dr_action_create_sampler(struct mlx5dv_dr_domain *dmn, + struct mlx5dv_dr_flow_sampler_attr *attr, + struct dr_devx_tbl_with_refs *term_tbl, + struct dr_flow_sampler_restore_tbl *restore) +{ + struct dr_devx_flow_sampler_attr sampler_attr = {}; + struct dr_flow_sampler *sampler; + uint64_t icm_rx, icm_tx; + int ret; + + sampler = calloc(1, sizeof(struct dr_flow_sampler)); + if (!sampler) { + errno = ENOMEM; + return NULL; + } + + sampler->next_ft = restore ? restore->tbl : attr->default_next_table; + atomic_fetch_add(&sampler->next_ft->refcount, 1); + + /* Sampler HW level equals term_tbl HW level, need to set ignore level */ + sampler_attr.ignore_flow_level = true; + sampler_attr.sample_ratio = attr->sample_ratio; + sampler_attr.table_type = term_tbl->devx_tbl->type; + sampler_attr.level = term_tbl->devx_tbl->level; + sampler_attr.sample_table_id = term_tbl->devx_tbl->ft_dvo->object_id; + sampler_attr.default_next_table_id = sampler->next_ft->devx_obj->object_id; + + sampler->devx_obj = dr_devx_create_flow_sampler(dmn->ctx, &sampler_attr); + if (!sampler->devx_obj) + goto dec_next_ft_ref; + + ret = dr_devx_query_flow_sampler(sampler->devx_obj, &icm_rx, &icm_tx); + if (ret) + goto destroy_sampler_dvo; + + sampler->rx_icm_addr = icm_rx; + sampler->tx_icm_addr = icm_tx; + + return sampler; + +destroy_sampler_dvo: + mlx5dv_devx_obj_destroy(sampler->devx_obj); +dec_next_ft_ref: + atomic_fetch_sub(&sampler->next_ft->refcount, 1); + + free(sampler); + + return NULL; +} + +static void dr_action_destroy_sampler(struct dr_flow_sampler *sampler) +{ + mlx5dv_devx_obj_destroy(sampler->devx_obj); + atomic_fetch_sub(&sampler->next_ft->refcount, 1); + free(sampler); +} + +static struct dr_flow_sampler_restore_tbl * +dr_action_create_sampler_restore_tbl(struct mlx5dv_dr_domain *dmn, + struct mlx5dv_dr_flow_sampler_attr *attr) +{ + struct mlx5dv_flow_match_parameters *mask; + struct dr_flow_sampler_restore_tbl *restore; + uint32_t action_field; + uint32_t action_type; + uint32_t mask_size; + + action_type = DEVX_GET(set_action_in, &(attr->action), action_type); + action_field = DEVX_GET(set_action_in, &(attr->action), field); + + /* Currently only support restore of setting Reg_C0 */ + if (action_type != MLX5_ACTION_TYPE_SET || + action_field != MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_0) { + errno = EOPNOTSUPP; + return NULL; + } + + mask_size = sizeof(struct mlx5dv_flow_match_parameters) + + sizeof(struct dr_match_param); + mask = calloc(1, mask_size); + if (!mask) { + errno = ENOMEM; + return NULL; + } + mask->match_sz = sizeof(struct dr_match_param); + + restore = calloc(1, sizeof(struct dr_flow_sampler_restore_tbl)); + if (!restore) { + errno = ENOMEM; + goto free_mask; + } + + restore->tbl = mlx5dv_dr_table_create(dmn, attr->default_next_table->level - 1); + if (!restore->tbl) + goto free_restore; + + restore->matcher = mlx5dv_dr_matcher_create(restore->tbl, 0, 0, mask); + if (!restore->matcher) + goto destroy_restore_tbl; + + restore->num_of_actions = 2; + restore->actions = calloc(restore->num_of_actions, + sizeof(struct mlx5dv_dr_action *)); + if (!restore->actions) { + errno = ENOMEM; + goto destroy_restore_matcher; + } + + restore->actions[0] = + mlx5dv_dr_action_create_modify_header(dmn, 0, + DR_MODIFY_ACTION_SIZE, + &(attr->action)); + if (!restore->actions[0]) + goto free_action_list; + + restore->actions[1] = + mlx5dv_dr_action_create_dest_table(attr->default_next_table); + if (!restore->actions[1]) + goto destroy_modify_hdr_action; + + restore->rule = mlx5dv_dr_rule_create(restore->matcher, mask, + restore->num_of_actions, + restore->actions); + if (!restore->rule) + goto destroy_dest_action; + + free(mask); + return restore; + +destroy_dest_action: + mlx5dv_dr_action_destroy(restore->actions[1]); +destroy_modify_hdr_action: + mlx5dv_dr_action_destroy(restore->actions[0]); +free_action_list: + free(restore->actions); +destroy_restore_matcher: + mlx5dv_dr_matcher_destroy(restore->matcher); +destroy_restore_tbl: + mlx5dv_dr_table_destroy(restore->tbl); +free_restore: + free(restore); +free_mask: + free(mask); + + return NULL; +} + +static void dr_action_destroy_sampler_restore_tbl(struct dr_flow_sampler_restore_tbl *restore) +{ + uint32_t i; + + mlx5dv_dr_rule_destroy(restore->rule); + for (i = 0; i < restore->num_of_actions; i++) + mlx5dv_dr_action_destroy(restore->actions[i]); + free(restore->actions); + + mlx5dv_dr_matcher_destroy(restore->matcher); + mlx5dv_dr_table_destroy(restore->tbl); + free(restore); +} + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_flow_sampler(struct mlx5dv_dr_flow_sampler_attr *attr) +{ + struct mlx5dv_dr_action *action; + struct mlx5dv_dr_domain *dmn; + bool restore = false; + + if (!attr->default_next_table || attr->sample_ratio == 0 || + !attr->sample_actions || attr->num_sample_actions == 0) { + errno = EINVAL; + return NULL; + } + + /* Validate the table before dereferencing it for its domain */ + dmn = attr->default_next_table->dmn; + if (!dmn) { + errno = EINVAL; + return NULL; + } + + if (dmn->type != MLX5DV_DR_DOMAIN_TYPE_NIC_RX && + dmn->type != MLX5DV_DR_DOMAIN_TYPE_FDB) { + errno = EOPNOTSUPP; + return NULL; + } + + if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_FDB && + dmn->info.caps.sw_format_ver == MLX5_HW_CONNECTX_5) + restore = true; + + atomic_fetch_add(&dmn->refcount, 1); + + action = dr_action_create_generic(DR_ACTION_TYP_SAMPLER); + if (!action) + goto dec_ref; + + action->sampler.dmn = dmn; + + action->sampler.term_tbl = dr_action_create_sampler_term_tbl(dmn, attr); + if (!action->sampler.term_tbl) + goto free_action; + + action->sampler.sampler_default = dr_action_create_sampler(dmn, attr, + action->sampler.term_tbl, + NULL); + if (!action->sampler.sampler_default) + goto destroy_term_tbl; + + if (restore) { + struct dr_flow_sampler *sampler_restore; + + action->sampler.restore_tbl = dr_action_create_sampler_restore_tbl(dmn, attr); + if (!action->sampler.restore_tbl) + goto destroy_sampler_default; + + sampler_restore = dr_action_create_sampler(dmn, attr, + action->sampler.term_tbl, + action->sampler.restore_tbl); + if (!sampler_restore) + goto destroy_restore; + + action->sampler.sampler_restore = sampler_restore; + } + + return action; + +destroy_restore: + if (action->sampler.restore_tbl) + dr_action_destroy_sampler_restore_tbl(action->sampler.restore_tbl); +destroy_sampler_default: + dr_action_destroy_sampler(action->sampler.sampler_default); +destroy_term_tbl: + dr_action_destroy_sampler_term_tbl(action->sampler.term_tbl); +free_action: + free(action); +dec_ref: + atomic_fetch_sub(&dmn->refcount, 1); + + return NULL; +} +
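The entry point above stitches together the termination table, the flow-sampler DevX object and, on ConnectX-5 FDB domains, a Reg_C0 restore table. A minimal caller sketch follows (assumed usage, not part of the patch; `next_tbl` and `ctr` are hypothetical objects created elsewhere with mlx5dv_dr_table_create() and mlx5dv_dr_action_create_flow_counter()):

```c
/* Sketch, assuming a NIC RX domain: sample 1 of every 1000 packets into
 * a flow counter and forward the rest to next_tbl. Fragment of some
 * int-returning helper; error handling trimmed. */
struct mlx5dv_dr_action *sample_actions[1] = { ctr };
struct mlx5dv_dr_flow_sampler_attr sampler_attr = {
	.sample_ratio = 1000,
	.default_next_table = next_tbl,
	.num_sample_actions = 1,
	.sample_actions = sample_actions,
};
struct mlx5dv_dr_action *sampler;

sampler = mlx5dv_dr_action_create_flow_sampler(&sampler_attr);
if (!sampler)
	return errno;
/* use it as a rule action, then mlx5dv_dr_action_destroy(sampler) */
```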
+static int dr_action_add_action_member(struct list_head *ref_list, + struct mlx5dv_dr_action *action) +{ + struct dr_rule_action_member *action_mem; + + action_mem = calloc(1, sizeof(*action_mem)); + if (!action_mem) { + errno = ENOMEM; + return errno; + } + + action_mem->action = action; + list_node_init(&action_mem->list); + list_add_tail(ref_list, &action_mem->list); + atomic_fetch_add(&action_mem->action->refcount, 1); + + return 0; +} + +static void dr_action_remove_action_members(struct list_head *ref_list) +{ + struct dr_rule_action_member *action_mem; + struct dr_rule_action_member *tmp; + + list_for_each_safe(ref_list, action_mem, tmp, list) { + list_del(&action_mem->list); + atomic_fetch_sub(&action_mem->action->refcount, 1); + free(action_mem); + } +} + +static int +dr_action_create_dest_array_tbl(struct mlx5dv_dr_action *action, + size_t num_dest, + struct mlx5dv_dr_action_dest_attr *dests[]) +{ + struct mlx5dv_dr_domain *dmn = action->dest_array.dmn; + struct dr_devx_flow_table_attr ft_attr = {}; + struct dr_devx_flow_group_attr fg_attr = {}; + struct dr_devx_flow_fte_attr fte_attr = {}; + uint32_t i; + int ret; + + switch (dmn->type) { + case MLX5DV_DR_DOMAIN_TYPE_FDB: + ft_attr.type = FS_FT_FDB; + ft_attr.level = dmn->info.caps.max_ft_level - 1; + break; + case MLX5DV_DR_DOMAIN_TYPE_NIC_RX: + ft_attr.type = FS_FT_NIC_RX; + ft_attr.level = MLX5_MULTI_PATH_FT_MAX_LEVEL - 1; + break; + default: + errno = EOPNOTSUPP; + return errno; + } + + fte_attr.dest_arr = calloc(num_dest, sizeof(struct dr_devx_flow_dest_info)); + if (!fte_attr.dest_arr) { + errno = ENOMEM; + return errno; + } + + for (i = 0; i < num_dest; i++) { + struct mlx5dv_dr_action *reformat_action; + struct mlx5dv_dr_action *dest_action; + + switch (dests[i]->type) { + case MLX5DV_DR_ACTION_DEST_REFORMAT: + dest_action = dests[i]->dest_reformat->dest; + reformat_action = dests[i]->dest_reformat->reformat; + ft_attr.reformat_en = true; + break; + case MLX5DV_DR_ACTION_DEST: + dest_action = dests[i]->dest; + reformat_action = NULL; + break; + default: + errno = EINVAL; + goto clear_actions_list; + } + + switch (dest_action->action_type) { + case DR_ACTION_TYP_MISS: + case DR_ACTION_TYP_VPORT: + case DR_ACTION_TYP_QP: + case DR_ACTION_TYP_CTR: + if (dr_action_add_action_member(&action->dest_array.actions_list, + dest_action)) + goto clear_actions_list; + + break; + default: + errno = EOPNOTSUPP; + goto clear_actions_list; + } + + if (reformat_action) + if (dr_action_add_action_member(&action->dest_array.actions_list, + reformat_action)) + goto clear_actions_list; + + if (dr_action_convert_to_fte_dest(dmn, dest_action, + reformat_action, &fte_attr)) + goto clear_actions_list; + } + + action->dest_array.devx_tbl = dr_devx_create_always_hit_ft(dmn->ctx, + &ft_attr, + &fg_attr, + &fte_attr); + if (!action->dest_array.devx_tbl) + goto clear_actions_list; + + ret = dr_devx_query_flow_table(action->dest_array.devx_tbl->ft_dvo, + ft_attr.type, + &action->dest_array.rx_icm_addr, + &action->dest_array.tx_icm_addr); + if (ret) + goto destroy_devx_tbl; + + free(fte_attr.dest_arr); + return 0; + +destroy_devx_tbl: + dr_devx_destroy_always_hit_ft(action->dest_array.devx_tbl); +clear_actions_list: + dr_action_remove_action_members(&action->dest_array.actions_list); + + free(fte_attr.dest_arr); + return errno; +} + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_dest_array(struct mlx5dv_dr_domain *dmn, + size_t num_dest, + struct mlx5dv_dr_action_dest_attr *dests[]) +{ + struct mlx5dv_dr_action *action; + + if (num_dest <= 1) { + errno = EINVAL; + return NULL; + } + + atomic_fetch_add(&dmn->refcount, 1); + + action = dr_action_create_generic(DR_ACTION_TYP_DEST_ARRAY); + if (!action) + goto dec_ref; + + action->dest_array.dmn = dmn; + list_head_init(&action->dest_array.actions_list); + + if
(dr_action_create_dest_array_tbl(action, num_dest, dests)) + goto free_action; + + return action; + +free_action: + free(action); +dec_ref: + atomic_fetch_sub(&dmn->refcount, 1); + return NULL; +} + int mlx5dv_dr_action_destroy(struct mlx5dv_dr_action *action) { if (atomic_load(&action->refcount) > 1) @@ -1866,6 +2207,20 @@ int mlx5dv_dr_action_destroy(struct mlx5dv_dr_action *action) mlx5dv_devx_obj_destroy(action->meter.devx_obj); atomic_fetch_sub(&action->meter.next_ft->refcount, 1); break; + case DR_ACTION_TYP_SAMPLER: + if (action->sampler.sampler_restore) { + dr_action_destroy_sampler(action->sampler.sampler_restore); + dr_action_destroy_sampler_restore_tbl(action->sampler.restore_tbl); + } + dr_action_destroy_sampler(action->sampler.sampler_default); + dr_action_destroy_sampler_term_tbl(action->sampler.term_tbl); + atomic_fetch_sub(&action->sampler.dmn->refcount, 1); + break; + case DR_ACTION_TYP_DEST_ARRAY: + dr_devx_destroy_always_hit_ft(action->dest_array.devx_tbl); + dr_action_remove_action_members(&action->dest_array.actions_list); + atomic_fetch_sub(&action->dest_array.dmn->refcount, 1); + break; default: break; } diff --git a/providers/mlx5/dr_buddy.c b/providers/mlx5/dr_buddy.c new file mode 100644 index 0000000..e153677 --- /dev/null +++ b/providers/mlx5/dr_buddy.c @@ -0,0 +1,237 @@ +/* + * Copyright (c) 2004 Topspin Communications. All rights reserved. + * Copyright (c) 2005, 2006, 2007, 2008 Mellanox Technologies. All rights reserved. + * Copyright (c) 2006, 2007 Cisco Systems, Inc. All rights reserved. + * Copyright (c) 2019, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include <stdlib.h> +#include <stdint.h> +#include "mlx5dv_dr.h" + +struct dr_icm_pool; +struct dr_icm_buddy_mem; + +static int dr_find_first_bit(const bitmap *set_addr, + const bitmap *addr, + unsigned int size) +{ + unsigned int set_size = (size - 1) / BITS_PER_LONG + 1; + unsigned long set_idx; + + /* find the first free in the first level */ + set_idx = bitmap_ffs(set_addr, 0, set_size); + /* find the next level */ + return bitmap_ffs(addr, set_idx * BITS_PER_LONG, size); +} + +int dr_buddy_init(struct dr_icm_buddy_mem *buddy, uint32_t max_order) +{ + int i, s; + + buddy->max_order = max_order; + + list_node_init(&buddy->list_node); + list_head_init(&buddy->used_list); + list_head_init(&buddy->hot_list); + + buddy->bits = calloc(buddy->max_order + 1, sizeof(long *)); + if (!buddy->bits) { + errno = ENOMEM; + return ENOMEM; + } + + buddy->num_free = calloc(buddy->max_order + 1, sizeof(*buddy->num_free)); + if (!buddy->num_free) + goto err_out_free_bits; + + buddy->set_bit = calloc(buddy->max_order + 1, sizeof(long *)); + if (!buddy->set_bit) + goto err_out_free_num_free; + + /* Allocating max_order bitmaps, one for each order. + * Only the bitmap for the maximum size will be available for use and + * the first bit there will be set. + */ + for (i = 0; i <= buddy->max_order; ++i) { + s = 1 << (buddy->max_order - i); + buddy->bits[i] = bitmap_alloc0(s); + if (!buddy->bits[i]) + goto err_out_free_each_bit_per_order; + } + + for (i = 0; i <= buddy->max_order; ++i) { + s = BITS_TO_LONGS(1 << (buddy->max_order - i)); + buddy->set_bit[i] = bitmap_alloc0(s); + if (!buddy->set_bit[i]) + goto err_out_free_set; + } + + bitmap_set_bit(buddy->bits[buddy->max_order], 0); + bitmap_set_bit(buddy->set_bit[buddy->max_order], 0); + + buddy->num_free[buddy->max_order] = 1; + + return 0; + +err_out_free_set: + for (i = 0; i <= buddy->max_order; ++i) + free(buddy->set_bit[i]); + +err_out_free_each_bit_per_order: + free(buddy->set_bit); + + for (i = 0; i <= buddy->max_order; ++i) + free(buddy->bits[i]); + +err_out_free_num_free: + free(buddy->num_free); + +err_out_free_bits: + free(buddy->bits); + errno = ENOMEM; + return ENOMEM; +} + +void dr_buddy_cleanup(struct dr_icm_buddy_mem *buddy) +{ + int i; + + list_del(&buddy->list_node); + + for (i = 0; i <= buddy->max_order; ++i) { + free(buddy->bits[i]); + free(buddy->set_bit[i]); + } + + free(buddy->set_bit); + free(buddy->num_free); + free(buddy->bits); +} + +/* + * Find the borders (high and low) of specific seg (segment location) + * of the lower level of the bitmap in order to mark the upper layer + * of bitmap. + */ +static void dr_buddy_get_seg_borders(uint32_t seg, + uint32_t *low, + uint32_t *high) +{ + *low = (seg / BITS_PER_LONG) * BITS_PER_LONG; + *high = ((seg / BITS_PER_LONG) + 1) * BITS_PER_LONG; +} + +/* + * We have two layers of searching in the bitmaps, so when needed update the + * second layer of search. + */ +static void dr_buddy_update_upper_bitmap(struct dr_icm_buddy_mem *buddy, + uint32_t seg, int order) +{ + uint32_t h, l, m; + + /* clear upper layer of search if needed */ + dr_buddy_get_seg_borders(seg, &l, &h); + m = bitmap_ffs(buddy->bits[order], l, h); + if (m == h) /* nothing in the long that includes seg */ + bitmap_clear_bit(buddy->set_bit[order], seg / BITS_PER_LONG); +}
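The two-layer search depends on a single invariant between bits[order] and its summary set_bit[order]. A hedged sketch of a checker for that invariant follows (hypothetical, not part of the patch; assumes the bitmap helpers behave as used above, and borrows min_t as used later in this series):

```c
/* Hypothetical debug-only checker: bit j of set_bit[order] must be set
 * iff the j-th long of bits[order] still contains a free (set) bit. */
static bool dr_buddy_check_summary(struct dr_icm_buddy_mem *buddy, int order)
{
	uint32_t nbits = 1 << (buddy->max_order - order);
	uint32_t j;

	for (j = 0; j < BITS_TO_LONGS(nbits); j++) {
		uint32_t end = min_t(uint32_t, (j + 1) * BITS_PER_LONG, nbits);
		bool has_free = bitmap_ffs(buddy->bits[order],
					   j * BITS_PER_LONG, end) < end;

		if (has_free != bitmap_test_bit(buddy->set_bit[order], j))
			return false; /* summary bitmap out of sync */
	}

	return true;
}
```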
+ +/* + * This function finds the first area of memory managed by the buddy. + * It uses the buddy-system data structures to find the first free area, + * starting from the requested order up to the maximum order in the system. + * The function returns the location (seg) in the whole buddy memory area; + * this is the index of the memory segment to use. + */ +int dr_buddy_alloc_mem(struct dr_icm_buddy_mem *buddy, int order) +{ + int seg; + int o, m; + + for (o = order; o <= buddy->max_order; ++o) + if (buddy->num_free[o]) { + m = 1 << (buddy->max_order - o); + seg = dr_find_first_bit(buddy->set_bit[o], buddy->bits[o], m); + if (m <= seg) { + /* no free segment found, although num_free says there is one */ + assert(false); + return -1; + } + goto found; + } + + return -1; + +found: + bitmap_clear_bit(buddy->bits[o], seg); + /* clear upper layer of search if needed */ + dr_buddy_update_upper_bitmap(buddy, seg, o); + --buddy->num_free[o]; + /* If we found free memory in an order bigger than the required one, + * split each order between the found and the required order in two, + * and mark accordingly. + */ + while (o > order) { + --o; + seg <<= 1; + bitmap_set_bit(buddy->bits[o], seg ^ 1); + bitmap_set_bit(buddy->set_bit[o], (seg ^ 1) / BITS_PER_LONG); + + ++buddy->num_free[o]; + } + + seg <<= order; + + return seg; +} + +void +dr_buddy_free_mem(struct dr_icm_buddy_mem *buddy, uint32_t seg, int order) +{ + seg >>= order; + + /* whenever a segment is freed, the memory is returned to the buddy that gave it */ + while (bitmap_test_bit(buddy->bits[order], seg ^ 1)) { + bitmap_clear_bit(buddy->bits[order], seg ^ 1); + dr_buddy_update_upper_bitmap(buddy, seg ^ 1, order); + --buddy->num_free[order]; + seg >>= 1; + ++order; + } + bitmap_set_bit(buddy->bits[order], seg); + bitmap_set_bit(buddy->set_bit[order], seg / BITS_PER_LONG); + + ++buddy->num_free[order]; +} +
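Taken together, dr_buddy.c implements a classic power-of-two buddy allocator over segment indices. A short usage sketch (hypothetical caller, not part of the patch):

```c
/* Fragment of some int-returning helper: manage 2^10 segments with the
 * buddy allocator above. */
struct dr_icm_buddy_mem buddy = {};
int seg;

if (dr_buddy_init(&buddy, 10))		/* 1024 order-0 segments */
	return ENOMEM;

seg = dr_buddy_alloc_mem(&buddy, 2);	/* one aligned run of 4 segments */
if (seg >= 0)
	dr_buddy_free_mem(&buddy, seg, 2);	/* buddies coalesce back to order 10 */

dr_buddy_cleanup(&buddy);
```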
PRIx64 ",0x%" PRIx64 "\n", + DR_DUMP_REC_TYPE_ACTION_DEVX_TIR, action_id, + rule_id, action->dest_qp.devx_tir->rx_icm_addr); break; case DR_ACTION_TYP_CTR: ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n", @@ -113,12 +125,12 @@ static int dr_dump_rule_action_mem(FILE *f, const uint64_t rule_id, action->ctr.offset); break; case DR_ACTION_TYP_TAG: - ret = fprintf(f, "%d,,0x%" PRIx64 ",0x%" PRIx64 "0x%x\n", + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n", DR_DUMP_REC_TYPE_ACTION_TAG, action_id, rule_id, action->flow_tag); break; case DR_ACTION_TYP_MODIFY_HDR: - ret = fprintf(f, "%d,,0x%" PRIx64 ",0x%" PRIx64 "0x%x\n", + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x\n", DR_DUMP_REC_TYPE_ACTION_MODIFY_HDR, action_id, rule_id, action->rewrite.index); break; @@ -147,6 +159,36 @@ static int dr_dump_rule_action_mem(FILE *f, const uint64_t rule_id, DR_DUMP_REC_TYPE_ACTION_ENCAP_L3, action_id, rule_id, action->reformat.dvo->object_id); break; + case DR_ACTION_TYP_METER: + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%" PRIx64 ",0x%x,0x%" PRIx64 ",0x%" PRIx64 "\n", + DR_DUMP_REC_TYPE_ACTION_METER, + action_id, + rule_id, + (uint64_t)(uintptr_t)action->meter.next_ft, + action->meter.devx_obj->object_id, + action->meter.rx_icm_addr, + action->meter.tx_icm_addr); + break; + case DR_ACTION_TYP_SAMPLER: + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%" PRIx64 ",0x%x,0x%x,0x%" PRIx64 ",0x%" PRIx64 "\n", + DR_DUMP_REC_TYPE_ACTION_SAMPLER, + action_id, + rule_id, + (uint64_t)(uintptr_t)action->sampler.sampler_default->next_ft, + action->sampler.term_tbl->devx_tbl->ft_dvo->object_id, + action->sampler.sampler_default->devx_obj->object_id, + action->sampler.sampler_default->rx_icm_addr, + (action->sampler.sampler_restore) ? + action->sampler.sampler_restore->tx_icm_addr : + action->sampler.sampler_default->tx_icm_addr); + break; + case DR_ACTION_TYP_DEST_ARRAY: + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",0x%x,0x%" PRIx64 ",0x%" PRIx64 "\n", + DR_DUMP_REC_TYPE_ACTION_DEST_ARRAY, action_id, rule_id, + action->dest_array.devx_tbl->ft_dvo->object_id, + action->dest_array.rx_icm_addr, + action->dest_array.tx_icm_addr); + break; default: return 0; } @@ -158,14 +200,20 @@ static int dr_dump_rule_action_mem(FILE *f, const uint64_t rule_id, } static int dr_dump_rule_mem(FILE *f, struct dr_rule_member *rule_mem, - bool is_rx, const uint64_t rule_id) + bool is_rx, const uint64_t rule_id, + enum mlx5_ifc_steering_format_version format_ver) { char hw_ste_dump[BUFF_SIZE] = {}; enum dr_dump_rec_type mem_rec_type; int ret; - mem_rec_type = is_rx ? DR_DUMP_REC_TYPE_RULE_RX_ENTRY : - DR_DUMP_REC_TYPE_RULE_TX_ENTRY; + if (format_ver == MLX5_HW_CONNECTX_5) { + mem_rec_type = is_rx ? DR_DUMP_REC_TYPE_RULE_RX_ENTRY_V0 : + DR_DUMP_REC_TYPE_RULE_TX_ENTRY_V0; + } else { + mem_rec_type = is_rx ? 
DR_DUMP_REC_TYPE_RULE_RX_ENTRY_V1 : + DR_DUMP_REC_TYPE_RULE_TX_ENTRY_V1; + } dump_hex_print(hw_ste_dump, (char *)rule_mem->ste->hw_ste, DR_STE_SIZE_REDUCED); ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 ",%s\n", @@ -180,13 +228,14 @@ static int dr_dump_rule_mem(FILE *f, struct dr_rule_member *rule_mem, } static int dr_dump_rule_rx_tx(FILE *f, struct dr_rule_rx_tx *rule_rx_tx, - bool is_rx, const uint64_t rule_id) + bool is_rx, const uint64_t rule_id, + enum mlx5_ifc_steering_format_version format_ver) { struct dr_rule_member *rule_mem; int ret; list_for_each(&rule_rx_tx->rule_members_list, rule_mem, list) { - ret = dr_dump_rule_mem(f, rule_mem, is_rx, rule_id); + ret = dr_dump_rule_mem(f, rule_mem, is_rx, rule_id, format_ver); if (ret < 0) return ret; } @@ -195,12 +244,15 @@ static int dr_dump_rule_rx_tx(FILE *f, struct dr_rule_rx_tx *rule_rx_tx, static int dr_dump_rule(FILE *f, struct mlx5dv_dr_rule *rule) { - struct dr_rule_action_member *action_mem; const uint64_t rule_id = (uint64_t) (uintptr_t) rule; + enum mlx5_ifc_steering_format_version format_ver; + struct dr_rule_action_member *action_mem; struct dr_rule_rx_tx *rx = &rule->rx; struct dr_rule_rx_tx *tx = &rule->tx; int ret; + format_ver = rule->matcher->tbl->dmn->info.caps.sw_format_ver; + ret = fprintf(f, "%d,0x%" PRIx64 ",0x%" PRIx64 "\n", DR_DUMP_REC_TYPE_RULE, rule_id, @@ -210,13 +262,15 @@ static int dr_dump_rule(FILE *f, struct mlx5dv_dr_rule *rule) if (!dr_is_root_table(rule->matcher->tbl)) { if (rx->nic_matcher) { - ret = dr_dump_rule_rx_tx(f, rx, true, rule_id); + ret = dr_dump_rule_rx_tx(f, rx, true, rule_id, + format_ver); if (ret < 0) return ret; } if (tx->nic_matcher) { - ret = dr_dump_rule_rx_tx(f, tx, false, rule_id); + ret = dr_dump_rule_rx_tx(f, tx, false, rule_id, + format_ver); if (ret < 0) return ret; } diff --git a/providers/mlx5/dr_devx.c b/providers/mlx5/dr_devx.c index 61157b8..7057b47 100644 --- a/providers/mlx5/dr_devx.c +++ b/providers/mlx5/dr_devx.c @@ -123,9 +123,12 @@ int dr_devx_query_esw_caps(struct ibv_context *ctx, struct dr_esw_caps *caps) caps->uplink_icm_address_tx = DEVX_GET64(flow_table_eswitch_cap, esw_caps, sw_steering_uplink_icm_address_tx); - caps->sw_owner = - DEVX_GET(flow_table_eswitch_cap, esw_caps, - flow_table_properties_nic_esw_fdb.sw_owner); + caps->sw_owner_v2 = DEVX_GET(flow_table_eswitch_cap, esw_caps, + flow_table_properties_nic_esw_fdb.sw_owner_v2); + if (!caps->sw_owner_v2) + caps->sw_owner = + DEVX_GET(flow_table_eswitch_cap, esw_caps, + flow_table_properties_nic_esw_fdb.sw_owner); return 0; } @@ -133,6 +136,7 @@ int dr_devx_query_device(struct ibv_context *ctx, struct dr_devx_caps *caps) { uint32_t out[DEVX_ST_SZ_DW(query_hca_cap_out)] = {}; uint32_t in[DEVX_ST_SZ_DW(query_hca_cap_in)] = {}; + bool roce; int err; DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); @@ -151,8 +155,13 @@ int dr_devx_query_device(struct ibv_context *ctx, struct dr_devx_caps *caps) caps->gvmi = DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.vhca_id); caps->flex_protocols = DEVX_GET(query_hca_cap_out, out, capability.cmd_hca_cap.flex_parser_protocols); + roce = DEVX_GET(query_hca_cap_out, out, + capability.cmd_hca_cap.roce); - if (dr_matcher_supp_flex_parser_icmp_v4(caps)) { + caps->sw_format_ver = DEVX_GET(query_hca_cap_out, out, + capability.cmd_hca_cap.steering_format_version); + + if (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED) { caps->flex_parser_id_icmp_dw0 = DEVX_GET(query_hca_cap_out, out, @@ -163,7 +172,7 @@ int dr_devx_query_device(struct 
ibv_context *ctx, struct dr_devx_caps *caps) capability.cmd_hca_cap.flex_parser_id_icmp_dw1); } - if (dr_matcher_supp_flex_parser_icmp_v6(caps)) { + if (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V6_ENABLED) { caps->flex_parser_id_icmpv6_dw0 = DEVX_GET(query_hca_cap_out, out, @@ -193,12 +202,20 @@ int dr_devx_query_device(struct ibv_context *ctx, struct dr_devx_caps *caps) caps->nic_tx_allow_address = DEVX_GET64(query_hca_cap_out, out, capability.flow_table_nic_cap. sw_steering_nic_tx_action_allow_icm_address); - caps->rx_sw_owner = DEVX_GET(query_hca_cap_out, out, - capability.flow_table_nic_cap. - flow_table_properties_nic_receive.sw_owner); - caps->tx_sw_owner = DEVX_GET(query_hca_cap_out, out, - capability.flow_table_nic_cap. - flow_table_properties_nic_transmit.sw_owner); + caps->rx_sw_owner_v2 = DEVX_GET(query_hca_cap_out, out, + capability.flow_table_nic_cap. + flow_table_properties_nic_receive.sw_owner_v2); + caps->tx_sw_owner_v2 = DEVX_GET(query_hca_cap_out, out, + capability.flow_table_nic_cap. + flow_table_properties_nic_transmit.sw_owner_v2); + if (!caps->rx_sw_owner_v2) + caps->rx_sw_owner = DEVX_GET(query_hca_cap_out, out, + capability.flow_table_nic_cap. + flow_table_properties_nic_receive.sw_owner); + if (!caps->tx_sw_owner_v2) + caps->tx_sw_owner = DEVX_GET(query_hca_cap_out, out, + capability.flow_table_nic_cap. + flow_table_properties_nic_transmit.sw_owner); caps->max_ft_level = DEVX_GET(query_hca_cap_out, out, capability.flow_table_nic_cap. flow_table_properties_nic_receive.max_ft_level); @@ -217,6 +234,24 @@ int dr_devx_query_device(struct ibv_context *ctx, struct dr_devx_caps *caps) caps->hdr_modify_icm_addr = DEVX_GET64(query_hca_cap_out, out, capability.device_mem_cap. header_modify_sw_icm_start_address); + caps->log_modify_hdr_icm_size = DEVX_GET(query_hca_cap_out, out, + capability.device_mem_cap.log_header_modify_sw_icm_size); + + /* RoCE caps */ + if (roce) { + DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + DEVX_SET(query_hca_cap_in, in, op_mod, + MLX5_SET_HCA_CAP_OP_MOD_ROCE | + HCA_CAP_OPMOD_GET_CUR); + + err = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (err) { + dr_dbg_ctx(ctx, "Query RoCE capabilities failed %d\n", err); + return err; + } + caps->roce_caps.fl_rc_qp_when_roce_enabled = DEVX_GET(query_hca_cap_out, out, + capability.roce_caps.fl_rc_qp_when_roce_enabled); + } return 0; } @@ -236,41 +271,321 @@ int dr_devx_sync_steering(struct ibv_context *ctx) return err; } -struct mlx5dv_devx_obj *dr_devx_create_flow_table(struct ibv_context *ctx, - uint32_t table_type, - uint64_t icm_addr_rx, - uint64_t icm_addr_tx, - u8 level) +struct mlx5dv_devx_obj * +dr_devx_create_flow_table(struct ibv_context *ctx, + struct dr_devx_flow_table_attr *ft_attr) { uint32_t out[DEVX_ST_SZ_DW(create_flow_table_out)] = {}; uint32_t in[DEVX_ST_SZ_DW(create_flow_table_in)] = {}; void *ft_ctx; DEVX_SET(create_flow_table_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_TABLE); - DEVX_SET(create_flow_table_in, in, table_type, table_type); + DEVX_SET(create_flow_table_in, in, table_type, ft_attr->type); ft_ctx = DEVX_ADDR_OF(create_flow_table_in, in, flow_table_context); - DEVX_SET(flow_table_context, ft_ctx, sw_owner, 1); - - DEVX_SET(flow_table_context, ft_ctx, level, level); - /* - * icm_addr_0 used for FDB RX / NIC TX / NIC_RX - * icm_addr_1 used for FDB TX - */ - if (table_type == FS_FT_NIC_RX) { - DEVX_SET64(flow_table_context, ft_ctx, sw_owner_icm_root_0, icm_addr_rx); - } else if (table_type == FS_FT_NIC_TX) { - 
DEVX_SET64(flow_table_context, ft_ctx, sw_owner_icm_root_0, icm_addr_tx); - } else if (table_type == FS_FT_FDB) { - DEVX_SET64(flow_table_context, ft_ctx, sw_owner_icm_root_0, icm_addr_rx); - DEVX_SET64(flow_table_context, ft_ctx, sw_owner_icm_root_1, icm_addr_tx); - } else { - assert(false); + DEVX_SET(flow_table_context, ft_ctx, termination_table, ft_attr->term_tbl); + DEVX_SET(flow_table_context, ft_ctx, sw_owner, ft_attr->sw_owner); + DEVX_SET(flow_table_context, ft_ctx, level, ft_attr->level); + DEVX_SET(flow_table_context, ft_ctx, reformat_en, ft_attr->reformat_en); + + if (ft_attr->sw_owner) { + /* icm_addr_0 used for FDB RX / NIC TX / NIC_RX + * icm_addr_1 used for FDB TX + */ + if (ft_attr->type == FS_FT_NIC_RX) { + DEVX_SET64(flow_table_context, ft_ctx, + sw_owner_icm_root_0, ft_attr->icm_addr_rx); + } else if (ft_attr->type == FS_FT_NIC_TX) { + DEVX_SET64(flow_table_context, ft_ctx, + sw_owner_icm_root_0, ft_attr->icm_addr_tx); + } else if (ft_attr->type == FS_FT_FDB) { + DEVX_SET64(flow_table_context, ft_ctx, + sw_owner_icm_root_0, ft_attr->icm_addr_rx); + DEVX_SET64(flow_table_context, ft_ctx, + sw_owner_icm_root_1, ft_attr->icm_addr_tx); + } else { + assert(false); + } + } + + return mlx5dv_devx_obj_create(ctx, in, sizeof(in), out, sizeof(out)); +} + +int dr_devx_query_flow_table(struct mlx5dv_devx_obj *obj, uint32_t type, + uint64_t *rx_icm_addr, uint64_t *tx_icm_addr) +{ + uint32_t out[DEVX_ST_SZ_DW(query_flow_table_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(query_flow_table_in)] = {}; + int ret; + + DEVX_SET(query_flow_table_in, in, opcode, MLX5_CMD_OP_QUERY_FLOW_TABLE); + DEVX_SET(query_flow_table_in, in, table_type, type); + DEVX_SET(query_flow_table_in, in, table_id, obj->object_id); + + ret = mlx5dv_devx_obj_query(obj, in, sizeof(in), out, sizeof(out)); + if (ret) { + dr_dbg_ctx(obj->context, "Failed to query flow table id %u\n", + obj->object_id); + return ret; + } + + *tx_icm_addr = DEVX_GET64(query_flow_table_out, out, + flow_table_context.sw_owner_icm_root_1); + *rx_icm_addr = DEVX_GET64(query_flow_table_out, out, + flow_table_context.sw_owner_icm_root_0); + + return 0; +} + +static struct mlx5dv_devx_obj * +dr_devx_create_flow_group(struct ibv_context *ctx, + struct dr_devx_flow_group_attr *fg_attr) +{ + uint32_t out[DEVX_ST_SZ_DW(create_flow_group_out)] = {}; + uint32_t inlen = DEVX_ST_SZ_BYTES(create_flow_group_in); + struct mlx5dv_devx_obj *obj; + uint32_t *in; + + in = calloc(1, inlen); + if (!in) { + errno = ENOMEM; + return NULL; + } + + DEVX_SET(create_flow_group_in, in, opcode, MLX5_CMD_OP_CREATE_FLOW_GROUP); + DEVX_SET(create_flow_group_in, in, table_type, fg_attr->table_type); + DEVX_SET(create_flow_group_in, in, table_id, fg_attr->table_id); + + obj = mlx5dv_devx_obj_create(ctx, in, inlen, out, sizeof(out)); + free(in); + + return obj; +} + +static struct mlx5dv_devx_obj * +dr_devx_set_fte(struct ibv_context *ctx, + struct dr_devx_flow_fte_attr *fte_attr) +{ + uint32_t out[DEVX_ST_SZ_DW(set_fte_out)] = {}; + struct mlx5dv_devx_obj *obj; + uint32_t dest_entry_size; + void *in_flow_context; + uint32_t list_size; + uint8_t *in_dests; + uint32_t inlen; + uint32_t *in; + uint32_t i; + + if (fte_attr->extended_dest) + dest_entry_size = DEVX_ST_SZ_BYTES(extended_dest_format); + else + dest_entry_size = DEVX_ST_SZ_BYTES(dest_format); + inlen = DEVX_ST_SZ_BYTES(set_fte_in) + fte_attr->dest_size * dest_entry_size; + in = calloc(1, inlen); + if (!in) { + errno = ENOMEM; + return NULL; + } + + DEVX_SET(set_fte_in, in, opcode, MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY); + 
DEVX_SET(set_fte_in, in, table_type, fte_attr->table_type); + DEVX_SET(set_fte_in, in, table_id, fte_attr->table_id); + + in_flow_context = DEVX_ADDR_OF(set_fte_in, in, flow_context); + DEVX_SET(flow_context, in_flow_context, group_id, fte_attr->group_id); + DEVX_SET(flow_context, in_flow_context, flow_tag, fte_attr->flow_tag); + DEVX_SET(flow_context, in_flow_context, action, fte_attr->action); + DEVX_SET(flow_context, in_flow_context, extended_destination, + fte_attr->extended_dest); + + in_dests = DEVX_ADDR_OF(flow_context, in_flow_context, destination); + if (fte_attr->action & MLX5_FLOW_CONTEXT_ACTION_FWD_DEST) { + list_size = 0; + + for (i = 0; i < fte_attr->dest_size; i++) { + uint32_t id; + uint32_t type = fte_attr->dest_arr[i].type; + + if (type == MLX5_FLOW_DEST_TYPE_COUNTER) + continue; + + switch (type) { + case MLX5_FLOW_DEST_TYPE_VPORT: + id = fte_attr->dest_arr[i].vport_num; + break; + case MLX5_FLOW_DEST_TYPE_TIR: + id = fte_attr->dest_arr[i].tir_num; + break; + default: + errno = EOPNOTSUPP; + goto err_out; + } + + DEVX_SET(dest_format, in_dests, destination_type, type); + DEVX_SET(dest_format, in_dests, destination_id, id); + if (fte_attr->dest_arr[i].has_reformat) { + if (!fte_attr->extended_dest) { + errno = EINVAL; + goto err_out; + } + + DEVX_SET(dest_format, in_dests, packet_reformat, 1); + DEVX_SET(extended_dest_format, in_dests, + packet_reformat_id, + fte_attr->dest_arr[i].reformat_id); + } + + in_dests += dest_entry_size; + list_size++; + } + + DEVX_SET(flow_context, in_flow_context, destination_list_size, list_size); + } + + if (fte_attr->action & MLX5_FLOW_CONTEXT_ACTION_COUNT) { + list_size = 0; + + for (i = 0; i < fte_attr->dest_size; i++) { + if (fte_attr->dest_arr[i].type != MLX5_FLOW_DEST_TYPE_COUNTER) + continue; + + DEVX_SET(flow_counter_list, in_dests, flow_counter_id, + fte_attr->dest_arr[i].counter_id); + in_dests += dest_entry_size; + list_size++; + } + + DEVX_SET(flow_context, in_flow_context, flow_counter_list_size, list_size); } + obj = mlx5dv_devx_obj_create(ctx, in, inlen, out, sizeof(out)); + + free(in); + return obj; + +err_out: + free(in); + return NULL; +} + +struct dr_devx_tbl * +dr_devx_create_always_hit_ft(struct ibv_context *ctx, + struct dr_devx_flow_table_attr *ft_attr, + struct dr_devx_flow_group_attr *fg_attr, + struct dr_devx_flow_fte_attr *fte_attr) +{ + struct mlx5dv_devx_obj *fte_dvo; + struct mlx5dv_devx_obj *fg_dvo; + struct mlx5dv_devx_obj *ft_dvo; + struct dr_devx_tbl *tbl; + + tbl = calloc(1, sizeof(*tbl)); + if (!tbl) { + errno = ENOMEM; + return NULL; + } + + ft_dvo = dr_devx_create_flow_table(ctx, ft_attr); + if (!ft_dvo) + goto free_tbl; + + fg_attr->table_id = ft_dvo->object_id; + fg_attr->table_type = ft_attr->type; + fg_dvo = dr_devx_create_flow_group(ctx, fg_attr); + if (!fg_dvo) + goto free_ft_dvo; + + fte_attr->table_id = ft_dvo->object_id; + fte_attr->table_type = ft_attr->type; + fte_attr->group_id = fg_dvo->object_id; + fte_dvo = dr_devx_set_fte(ctx, fte_attr); + if (!fte_dvo) + goto free_fg_dvo; + + tbl->type = ft_attr->type; + tbl->level = ft_attr->level; + tbl->ft_dvo = ft_dvo; + tbl->fg_dvo = fg_dvo; + tbl->fte_dvo = fte_dvo; + + return tbl; + +free_fg_dvo: + mlx5dv_devx_obj_destroy(fg_dvo); +free_ft_dvo: + mlx5dv_devx_obj_destroy(ft_dvo); +free_tbl: + free(tbl); + + return NULL; +} + +void dr_devx_destroy_always_hit_ft(struct dr_devx_tbl *devx_tbl) +{ + mlx5dv_devx_obj_destroy(devx_tbl->fte_dvo); + mlx5dv_devx_obj_destroy(devx_tbl->fg_dvo); + mlx5dv_devx_obj_destroy(devx_tbl->ft_dvo); + free(devx_tbl); +} + 
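dr_devx_create_always_hit_ft() chains a flow table, a catch-all flow group and a single FTE, so any packet reaching the table hits the programmed destinations; both the sampler termination table and the new dest-array action are built on it. A sketch of the dest-array API from the caller's side (hypothetical names, not part of the patch):

```c
/* Sketch: a multi-destination action forwarding to a vport and a DevX
 * TIR; vport_action and tir_action are hypothetical actions created with
 * mlx5dv_dr_action_create_dest_vport() and
 * mlx5dv_dr_action_create_dest_devx_tir(). */
struct mlx5dv_dr_action_dest_attr vport_attr = {
	.type = MLX5DV_DR_ACTION_DEST,
	.dest = vport_action,
};
struct mlx5dv_dr_action_dest_attr tir_attr = {
	.type = MLX5DV_DR_ACTION_DEST,
	.dest = tir_action,
};
struct mlx5dv_dr_action_dest_attr *dests[] = { &vport_attr, &tir_attr };

struct mlx5dv_dr_action *dest_array =
	mlx5dv_dr_action_create_dest_array(dmn, 2, dests);
```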
+struct mlx5dv_devx_obj * +dr_devx_create_flow_sampler(struct ibv_context *ctx, + struct dr_devx_flow_sampler_attr *sampler_attr) +{ + uint32_t out[DEVX_ST_SZ_DW(general_obj_out_cmd_hdr)] = {}; + uint32_t in[DEVX_ST_SZ_DW(create_flow_sampler_in)] = {}; + void *attr; + + attr = DEVX_ADDR_OF(create_flow_sampler_in, in, hdr); + DEVX_SET(general_obj_in_cmd_hdr, + attr, opcode, MLX5_CMD_OP_CREATE_GENERAL_OBJECT); + DEVX_SET(general_obj_in_cmd_hdr, + attr, obj_type, MLX5_OBJ_TYPE_FLOW_SAMPLER); + + attr = DEVX_ADDR_OF(create_flow_sampler_in, in, sampler); + DEVX_SET(flow_sampler, attr, table_type, sampler_attr->table_type); + DEVX_SET(flow_sampler, attr, level, sampler_attr->level); + DEVX_SET(flow_sampler, attr, sample_ratio, sampler_attr->sample_ratio); + DEVX_SET(flow_sampler, attr, ignore_flow_level, + sampler_attr->ignore_flow_level); + DEVX_SET(flow_sampler, attr, default_table_id, + sampler_attr->default_next_table_id); + DEVX_SET(flow_sampler, attr, sample_table_id, + sampler_attr->sample_table_id); + return mlx5dv_devx_obj_create(ctx, in, sizeof(in), out, sizeof(out)); } +int dr_devx_query_flow_sampler(struct mlx5dv_devx_obj *obj, + uint64_t *rx_icm_addr, uint64_t *tx_icm_addr) +{ + uint32_t out[DEVX_ST_SZ_DW(query_flow_sampler_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(general_obj_in_cmd_hdr)] = {}; + void *attr; + int ret; + + DEVX_SET(general_obj_in_cmd_hdr, in, opcode, + MLX5_CMD_OP_QUERY_GENERAL_OBJECT); + DEVX_SET(general_obj_in_cmd_hdr, in, obj_type, + MLX5_OBJ_TYPE_FLOW_SAMPLER); + DEVX_SET(general_obj_in_cmd_hdr, in, obj_id, obj->object_id); + + ret = mlx5dv_devx_obj_query(obj, in, sizeof(in), out, sizeof(out)); + if (ret) { + dr_dbg_ctx(obj->context, "Failed to query flow sampler id %u\n", + obj->object_id); + return ret; + } + + attr = DEVX_ADDR_OF(query_flow_sampler_out, out, obj); + *rx_icm_addr = DEVX_GET64(flow_sampler, attr, + sw_steering_icm_address_rx); + *tx_icm_addr = DEVX_GET64(flow_sampler, attr, + sw_steering_icm_address_tx); + + return 0; +} + struct mlx5dv_devx_obj *dr_devx_create_reformat_ctx(struct ibv_context *ctx, enum reformat_type rt, size_t reformat_size, @@ -467,16 +782,20 @@ int dr_devx_modify_qp_init2rtr(struct ibv_context *ctx, DEVX_SET(qpc, qpc, mtu, attr->mtu); DEVX_SET(qpc, qpc, log_msg_max, DR_CHUNK_SIZE_MAX - 1); DEVX_SET(qpc, qpc, remote_qpn, attr->qp_num); - memcpy(DEVX_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32), - attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac)); - memcpy(DEVX_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip), - attr->dgid_attr.gid.raw, sizeof(attr->dgid_attr.gid.raw)); - DEVX_SET(qpc, qpc, primary_address_path.src_addr_index, - attr->sgid_index); - - if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2) - DEVX_SET(qpc, qpc, primary_address_path.udp_sport, - DR_DEVX_ICM_UDP_PORT); + + if (attr->fl) { + DEVX_SET(qpc, qpc, primary_address_path.fl, attr->fl); + } else { + memcpy(DEVX_ADDR_OF(qpc, qpc, primary_address_path.rmac_47_32), + attr->dgid_attr.mac, sizeof(attr->dgid_attr.mac)); + memcpy(DEVX_ADDR_OF(qpc, qpc, primary_address_path.rgid_rip), + attr->dgid_attr.gid.raw, sizeof(attr->dgid_attr.gid.raw)); + DEVX_SET(qpc, qpc, primary_address_path.src_addr_index, + attr->sgid_index); + if (attr->dgid_attr.roce_ver == MLX5_ROCE_VERSION_2) + DEVX_SET(qpc, qpc, primary_address_path.udp_sport, + DR_DEVX_ICM_UDP_PORT); + } DEVX_SET(qpc, qpc, primary_address_path.vhca_port_num, attr->port_num); DEVX_SET(qpc, qpc, min_rnr_nak, 1); diff --git a/providers/mlx5/dr_domain.c b/providers/mlx5/dr_domain.c index 339208e..5e172d0 100644 
--- a/providers/mlx5/dr_domain.c +++ b/providers/mlx5/dr_domain.c @@ -37,13 +37,20 @@ enum { MLX5DV_DR_DOMAIN_SYNC_SUP_FLAGS = (MLX5DV_DR_DOMAIN_SYNC_FLAGS_SW | - MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW), + MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW | + MLX5DV_DR_DOMAIN_SYNC_FLAGS_MEM), }; static int dr_domain_init_resources(struct mlx5dv_dr_domain *dmn) { int ret = -1; + dmn->ste_ctx = dr_ste_get_ctx(dmn->info.caps.sw_format_ver); + if (!dmn->ste_ctx) { + dr_dbg(dmn, "Couldn't initialize STE context\n"); + return errno; + } + dmn->pd = ibv_alloc_pd(dmn->ctx); if (!dmn->pd) { dr_dbg(dmn, "Couldn't allocate PD\n"); @@ -151,6 +158,7 @@ static int dr_domain_query_fdb_caps(struct ibv_context *ctx, goto err; dmn->info.caps.fdb_sw_owner = esw_caps.sw_owner; + dmn->info.caps.fdb_sw_owner_v2 = esw_caps.sw_owner_v2; dmn->info.caps.vports_caps[i].icm_address_rx = esw_caps.uplink_icm_address_rx; dmn->info.caps.vports_caps[i].icm_address_tx = esw_caps.uplink_icm_address_tx; dmn->info.caps.esw_rx_drop_address = esw_caps.drop_icm_address_rx; @@ -188,10 +196,6 @@ static int dr_domain_caps_init(struct ibv_context *ctx, if (ret) return ret; - /* Non FDB type is supported only over root table */ - if (dmn->type != MLX5DV_DR_DOMAIN_TYPE_FDB) - return 0; - ret = dr_devx_query_device(ctx, &dmn->info.caps); if (ret) /* Ignore devx query failure to allow steering on root level @@ -199,13 +203,22 @@ static int dr_domain_caps_init(struct ibv_context *ctx, */ return 0; + /* Non FDB type is supported over root table or when we can enable + * force-loopback. + */ + if ((dmn->type != MLX5DV_DR_DOMAIN_TYPE_FDB) && + !dmn->info.caps.roce_caps.fl_rc_qp_when_roce_enabled) + return 0; + ret = dr_domain_query_fdb_caps(ctx, dmn); if (ret) return ret; switch (dmn->type) { case MLX5DV_DR_DOMAIN_TYPE_NIC_RX: - if (!dmn->info.caps.rx_sw_owner) + if (!dmn->info.caps.rx_sw_owner && + !(dmn->info.caps.rx_sw_owner_v2 && + dmn->info.caps.sw_format_ver <= MLX5_HW_CONNECTX_6DX)) return 0; dmn->info.supp_sw_steering = true; @@ -214,7 +227,9 @@ static int dr_domain_caps_init(struct ibv_context *ctx, dmn->info.rx.drop_icm_addr = dmn->info.caps.nic_rx_drop_address; break; case MLX5DV_DR_DOMAIN_TYPE_NIC_TX: - if (!dmn->info.caps.tx_sw_owner) + if (!dmn->info.caps.tx_sw_owner && + !(dmn->info.caps.tx_sw_owner_v2 && + dmn->info.caps.sw_format_ver <= MLX5_HW_CONNECTX_6DX)) return 0; dmn->info.supp_sw_steering = true; @@ -226,7 +241,9 @@ static int dr_domain_caps_init(struct ibv_context *ctx, if (!dmn->info.caps.eswitch_manager) return 0; - if (!dmn->info.caps.fdb_sw_owner) + if (!dmn->info.caps.fdb_sw_owner && + !(dmn->info.caps.fdb_sw_owner_v2 && + dmn->info.caps.sw_format_ver <= MLX5_HW_CONNECTX_6DX)) return 0; dmn->info.rx.ste_type = DR_STE_TYPE_RX; @@ -258,6 +275,30 @@ static void dr_domain_caps_uninit(struct mlx5dv_dr_domain *dmn) free(dmn->info.caps.vports_caps); } +static int dr_domain_check_icm_memory_caps(struct mlx5dv_dr_domain *dmn) +{ + if (dmn->info.caps.log_modify_hdr_icm_size < DR_CHUNK_SIZE_4K + + DR_MODIFY_ACTION_LOG_SIZE) { + errno = ENOMEM; + return errno; + } + + dmn->info.max_log_action_icm_sz = min_t(uint32_t, + DR_CHUNK_SIZE_1024K, + dmn->info.caps.log_modify_hdr_icm_size + - DR_MODIFY_ACTION_LOG_SIZE); + + if (dmn->info.caps.log_icm_size < DR_CHUNK_SIZE_1024K + + DR_STE_LOG_SIZE) { + errno = ENOMEM; + return errno; + } + + dmn->info.max_log_sw_icm_sz = DR_CHUNK_SIZE_1024K; + + return 0; +} + struct mlx5dv_dr_domain * mlx5dv_dr_domain_create(struct ibv_context *ctx, enum mlx5dv_dr_domain_type type) @@ -286,22 +327,22 @@ 
mlx5dv_dr_domain_create(struct ibv_context *ctx, goto free_domain; } - dmn->info.max_log_action_icm_sz = DR_CHUNK_SIZE_4K; - dmn->info.max_log_sw_icm_sz = min_t(uint32_t, DR_CHUNK_SIZE_1024K, - dmn->info.caps.log_icm_size); - /* Allocate resources */ if (dmn->info.supp_sw_steering) { + + if (dr_domain_check_icm_memory_caps(dmn)) + goto uninit_caps; + ret = dr_domain_init_resources(dmn); if (ret) { dr_dbg(dmn, "Failed init domain resources for %s\n", ibv_get_device_name(ctx->device)); goto uninit_caps; } - /* Init CRC table for htbl CRC calculation */ dr_crc32_init_table(); } + return dmn; uninit_caps: @@ -334,8 +375,22 @@ int mlx5dv_dr_domain_sync(struct mlx5dv_dr_domain *dmn, uint32_t flags) pthread_mutex_unlock(&dmn->mutex); } - if (flags & MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW) + if (flags & MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW) { ret = dr_devx_sync_steering(dmn->ctx); + if (ret) + return ret; + } + + if (flags & MLX5DV_DR_DOMAIN_SYNC_FLAGS_MEM) { + if (dmn->ste_icm_pool) { + ret = dr_icm_pool_sync_pool(dmn->ste_icm_pool); + if (ret) + return ret; + } + + if (dmn->action_icm_pool) + ret = dr_icm_pool_sync_pool(dmn->action_icm_pool); + } return ret; @@ -344,6 +399,17 @@ out_unlock: return ret; } +void mlx5dv_dr_domain_set_reclaim_device_memory(struct mlx5dv_dr_domain *dmn, + bool enable) +{ + pthread_mutex_lock(&dmn->mutex); + if (enable) + dmn->flags |= DR_DOMAIN_FLAG_MEMORY_RECLAIM; + else + dmn->flags &= ~DR_DOMAIN_FLAG_MEMORY_RECLAIM; + pthread_mutex_unlock(&dmn->mutex); +} + int mlx5dv_dr_domain_destroy(struct mlx5dv_dr_domain *dmn) { if (atomic_load(&dmn->refcount) > 1) diff --git a/providers/mlx5/dr_icm_pool.c b/providers/mlx5/dr_icm_pool.c index 1e28539..429a9aa 100644 --- a/providers/mlx5/dr_icm_pool.c +++ b/providers/mlx5/dr_icm_pool.c @@ -33,72 +33,92 @@ #include "mlx5dv_dr.h" #define DR_ICM_MODIFY_HDR_ALIGN_BASE 64 - -struct dr_icm_pool; - -#define DR_ICM_SYNC_THRESHOLD (64 * 1024 * 1024) - -struct dr_icm_bucket { - struct dr_icm_pool *pool; - - /* It is safe to allocate chunks from this list, now HW is guaranteed - * to not access this memory - */ - struct list_head free_list; - unsigned int free_list_count; - - /* This is the list of used chunks, HW may be accessing this memory */ - struct list_head used_list; - unsigned int used_list_count; - - /* HW may be accessing this memory but at some future, - * undetermined time, it might cease to do so. Before deciding to call - * sync_ste, this list is moved to tmp_list - */ - struct list_head hot_list; - unsigned int hot_list_count; - - /* Temporary list, entries from the hot list are moved to this list. 
- * sync_ste is executed and then tmp_list is concatenated to the free list - */ - struct list_head tmp_list; - unsigned int tmp_list_count; - - uint32_t total_chunks; - uint32_t num_of_entries; - uint32_t entry_size; - pthread_mutex_t mutex; -}; +#define DR_ICM_SYNC_THRESHOLD_POOL (64 * 1024 * 1024) struct dr_icm_pool { - struct dr_icm_bucket *buckets; enum dr_icm_type icm_type; - enum dr_icm_chunk_size max_log_chunk_sz; - enum dr_icm_chunk_size num_of_buckets; - struct list_head icm_mr_list; - pthread_mutex_t mr_mutex; struct mlx5dv_dr_domain *dmn; + enum dr_icm_chunk_size max_log_chunk_sz; + /* memory management */ + pthread_mutex_t mutex; + struct list_head buddy_mem_list; + uint64_t hot_memory_size; }; struct dr_icm_mr { - struct dr_icm_pool *pool; struct ibv_mr *mr; struct ibv_dm *dm; - size_t used_length; uint64_t icm_start_addr; - struct list_node mr_list; }; -static struct dr_icm_mr * -dr_icm_pool_mr_create(struct dr_icm_pool *pool, - enum mlx5_ib_uapi_dm_type dm_type, - size_t align_base) +static int +dr_icm_allocate_aligned_dm(struct dr_icm_pool *pool, + struct dr_icm_mr *icm_mr, + struct ibv_alloc_dm_attr *dm_attr) { struct mlx5dv_alloc_dm_attr mlx5_dm_attr = {}; + size_t log_align_base = 0; + bool fallback = false; + struct mlx5_dm *dm; + size_t size; + + /* create dm/mr for this pool */ + size = dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, + pool->icm_type); + + if (pool->icm_type == DR_ICM_TYPE_STE) { + mlx5_dm_attr.type = MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM; + /* Align base is the biggest chunk size */ + log_align_base = ilog32(size - 1); + } else if (pool->icm_type == DR_ICM_TYPE_MODIFY_ACTION) { + mlx5_dm_attr.type = MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM; + /* Align base is 64B */ + log_align_base = ilog32(DR_ICM_MODIFY_HDR_ALIGN_BASE - 1); + } + + dm_attr->length = size; + +alloc_dm: + icm_mr->dm = mlx5dv_alloc_dm(pool->dmn->ctx, dm_attr, &mlx5_dm_attr); + if (!icm_mr->dm) { + dr_dbg(pool->dmn, "Failed allocating DM\n"); + return errno; + } + + dm = to_mdm(icm_mr->dm); + icm_mr->icm_start_addr = dm->remote_va; + + if (icm_mr->icm_start_addr & ((1UL << log_align_base) - 1)) { + uint64_t align_base; + uint64_t align_diff; + + /* Fallback to previous implementation, ask for double size */ + dr_dbg(pool->dmn, "Got not aligned memory: %zu last_try: %d\n", + log_align_base, fallback); + if (fallback) { + align_base = 1UL << log_align_base; + align_diff = icm_mr->icm_start_addr % align_base; + /* increase the address to start from aligned size */ + icm_mr->icm_start_addr = icm_mr->icm_start_addr + + (align_base - align_diff); + return 0; + } + + mlx5_free_dm(icm_mr->dm); + /* retry to allocate, now double the size */ + dm_attr->length = size * 2; + fallback = true; + goto alloc_dm; + } + + return 0; +} + +static struct dr_icm_mr * +dr_icm_pool_mr_create(struct dr_icm_pool *pool) +{ struct ibv_alloc_dm_attr dm_attr = {}; struct dr_icm_mr *icm_mr; - struct mlx5_dm *dm; - size_t align_diff; icm_mr = calloc(1, sizeof(struct dr_icm_mr)); if (!icm_mr) { @@ -106,20 +126,8 @@ dr_icm_pool_mr_create(struct dr_icm_pool *pool, return NULL; } - icm_mr->pool = pool; - list_node_init(&icm_mr->mr_list); - - mlx5_dm_attr.type = dm_type; - - /* 2^log_biggest_table * entry-size * double-for-alignment */ - dm_attr.length = dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, - pool->icm_type) * 2; - - icm_mr->dm = mlx5dv_alloc_dm(pool->dmn->ctx, &dm_attr, &mlx5_dm_attr); - if (!icm_mr->dm) { - dr_dbg(pool->dmn, "Failed allocating DM\n"); + if (dr_icm_allocate_aligned_dm(pool, 
icm_mr, &dm_attr)) goto free_icm_mr; - } /* Register device memory */ icm_mr->mr = ibv_reg_dm_mr(pool->dmn->pd, icm_mr->dm, 0, @@ -133,15 +141,6 @@ dr_icm_pool_mr_create(struct dr_icm_pool *pool, goto free_dm; } - dm = to_mdm(icm_mr->dm); - icm_mr->icm_start_addr = dm->remote_va; - - align_diff = icm_mr->icm_start_addr % align_base; - if (align_diff) - icm_mr->used_length = align_base - align_diff; - - list_add_tail(&pool->icm_mr_list, &icm_mr->mr_list); - return icm_mr; free_dm: @@ -153,35 +152,34 @@ free_icm_mr: static void dr_icm_pool_mr_destroy(struct dr_icm_mr *icm_mr) { - list_del(&icm_mr->mr_list); ibv_dereg_mr(icm_mr->mr); mlx5_free_dm(icm_mr->dm); free(icm_mr); } -static int dr_icm_chunk_ste_init(struct dr_icm_chunk *chunk) +static enum dr_icm_type +get_chunk_icm_type(struct dr_icm_chunk *chunk) { - struct dr_icm_bucket *bucket = chunk->bucket; - struct dr_icm_pool *pool = bucket->pool; + return chunk->buddy_mem->pool->icm_type; +} - chunk->ste_arr = calloc(bucket->num_of_entries, sizeof(struct dr_ste)); +static int dr_icm_chunk_ste_init(struct dr_icm_chunk *chunk) +{ + chunk->ste_arr = calloc(chunk->num_of_entries, sizeof(struct dr_ste)); if (!chunk->ste_arr) { - dr_dbg(pool->dmn, "Failed allocating ste_arr for chunk\n"); errno = ENOMEM; return errno; } - chunk->hw_ste_arr = calloc(bucket->num_of_entries, DR_STE_SIZE_REDUCED); + chunk->hw_ste_arr = calloc(chunk->num_of_entries, DR_STE_SIZE_REDUCED); if (!chunk->hw_ste_arr) { - dr_dbg(pool->dmn, "Failed allocating hw_ste_arr for chunk\n"); errno = ENOMEM; goto out_free_ste_arr; } - chunk->miss_list = malloc(bucket->num_of_entries * + chunk->miss_list = malloc(chunk->num_of_entries * sizeof(struct list_head)); if (!chunk->miss_list) { - dr_dbg(pool->dmn, "Failed allocating miss_list for chunk\n"); errno = ENOMEM; goto out_free_hw_ste_arr; } @@ -195,76 +193,6 @@ out_free_ste_arr: return errno; } -static int dr_icm_chunks_create(struct dr_icm_bucket *bucket) -{ - size_t mr_free_size, mr_req_size, mr_row_size; - struct dr_icm_pool *pool = bucket->pool; - enum mlx5_ib_uapi_dm_type dm_type; - struct dr_icm_chunk *chunk; - struct dr_icm_mr *icm_mr; - size_t align_base; - int i; - - mr_req_size = bucket->num_of_entries * bucket->entry_size; - mr_row_size = dr_icm_pool_chunk_size_to_byte(pool->max_log_chunk_sz, - pool->icm_type); - - if (pool->icm_type == DR_ICM_TYPE_STE) { - dm_type = MLX5_IB_UAPI_DM_TYPE_STEERING_SW_ICM; - /* Align base is the biggest chunk size / row size */ - align_base = mr_row_size; - } else { - dm_type = MLX5_IB_UAPI_DM_TYPE_HEADER_MODIFY_SW_ICM; - /* Align base is 64B */ - align_base = DR_ICM_MODIFY_HDR_ALIGN_BASE; - } - - pthread_mutex_lock(&pool->mr_mutex); - icm_mr = list_tail(&pool->icm_mr_list, struct dr_icm_mr, mr_list); - if (icm_mr) - mr_free_size = icm_mr->mr->length - icm_mr->used_length; - - if (!icm_mr || mr_free_size < mr_row_size) { - icm_mr = dr_icm_pool_mr_create(pool, dm_type, align_base); - if (!icm_mr) - goto out_err; - } - - /* Create memory aligned chunks */ - for (i = 0; i < mr_row_size / mr_req_size; i++) { - chunk = calloc(1, sizeof(struct dr_icm_chunk)); - if (!chunk) { - errno = ENOMEM; - goto out_err; - } - - chunk->bucket = bucket; - chunk->rkey = icm_mr->mr->rkey; - chunk->mr_addr = (uintptr_t)icm_mr->mr->addr + icm_mr->used_length; - chunk->icm_addr = (uintptr_t)icm_mr->icm_start_addr + icm_mr->used_length; - icm_mr->used_length += mr_req_size; - chunk->num_of_entries = bucket->num_of_entries; - chunk->byte_size = chunk->num_of_entries * bucket->entry_size; - - if (pool->icm_type == 
DR_ICM_TYPE_STE) - if (dr_icm_chunk_ste_init(chunk)) - goto out_free_chunk; - - list_node_init(&chunk->chunk_list); - list_add(&bucket->free_list, &chunk->chunk_list); - bucket->free_list_count++; - bucket->total_chunks++; - } - pthread_mutex_unlock(&pool->mr_mutex); - return 0; - -out_free_chunk: - free(chunk); -out_err: - pthread_mutex_unlock(&pool->mr_mutex); - return errno; -} - static void dr_icm_chunk_ste_cleanup(struct dr_icm_chunk *chunk) { free(chunk->miss_list); @@ -274,164 +202,199 @@ static void dr_icm_chunk_ste_cleanup(struct dr_icm_chunk *chunk) static void dr_icm_chunk_destroy(struct dr_icm_chunk *chunk) { - struct dr_icm_bucket *bucket = chunk->bucket; + enum dr_icm_type icm_type = get_chunk_icm_type(chunk); list_del(&chunk->chunk_list); - bucket->total_chunks--; - if (bucket->pool->icm_type == DR_ICM_TYPE_STE) + if (icm_type == DR_ICM_TYPE_STE) dr_icm_chunk_ste_cleanup(chunk); free(chunk); } -static void dr_icm_bucket_init(struct dr_icm_pool *pool, - struct dr_icm_bucket *bucket, - enum dr_icm_chunk_size chunk_size) +static int dr_icm_buddy_create(struct dr_icm_pool *pool) { - if (pool->icm_type == DR_ICM_TYPE_STE) - bucket->entry_size = DR_STE_SIZE; - else - bucket->entry_size = DR_MODIFY_ACTION_SIZE; - - bucket->num_of_entries = dr_icm_pool_chunk_size_to_entries(chunk_size); - bucket->pool = pool; - pthread_mutex_init(&bucket->mutex, NULL); - list_head_init(&bucket->free_list); - list_head_init(&bucket->used_list); - list_head_init(&bucket->hot_list); - list_head_init(&bucket->tmp_list); + struct dr_icm_buddy_mem *buddy; + struct dr_icm_mr *icm_mr; + + icm_mr = dr_icm_pool_mr_create(pool); + if (!icm_mr) + return ENOMEM; + + buddy = calloc(1, sizeof(*buddy)); + if (!buddy) { + errno = ENOMEM; + goto free_mr; + } + + if (dr_buddy_init(buddy, pool->max_log_chunk_sz)) + goto err_free_buddy; + + buddy->icm_mr = icm_mr; + buddy->pool = pool; + + /* add it to the -start- of the list in order to search in it first */ + list_add(&pool->buddy_mem_list, &buddy->list_node); + + return 0; + +err_free_buddy: + free(buddy); +free_mr: + dr_icm_pool_mr_destroy(icm_mr); + return errno; } -static void dr_icm_bucket_cleanup(struct dr_icm_bucket *bucket) +static void dr_icm_buddy_destroy(struct dr_icm_buddy_mem *buddy) { struct dr_icm_chunk *chunk, *next; - pthread_mutex_destroy(&bucket->mutex); - list_append_list(&bucket->free_list, &bucket->tmp_list); - list_append_list(&bucket->free_list, &bucket->hot_list); + list_for_each_safe(&buddy->hot_list, chunk, next, chunk_list) + dr_icm_chunk_destroy(chunk); - list_for_each_safe(&bucket->free_list, chunk, next, chunk_list) + list_for_each_safe(&buddy->used_list, chunk, next, chunk_list) dr_icm_chunk_destroy(chunk); - assert(bucket->total_chunks == 0); + dr_icm_pool_mr_destroy(buddy->icm_mr); - /* Cleanup of unreturned chunks */ - list_for_each_safe(&bucket->used_list, chunk, next, chunk_list) - dr_icm_chunk_destroy(chunk); + dr_buddy_cleanup(buddy); + + free(buddy); } -static uint64_t dr_icm_hot_mem_size(struct dr_icm_pool *pool) +static struct dr_icm_chunk * +dr_icm_chunk_create(struct dr_icm_pool *pool, + enum dr_icm_chunk_size chunk_size, + struct dr_icm_buddy_mem *buddy_mem_pool, + int seg) { - uint64_t hot_size = 0; - int i; + struct dr_icm_chunk *chunk; + int offset; - for (i = 0; i < pool->num_of_buckets; i++) - hot_size += pool->buckets[i].hot_list_count * - dr_icm_pool_chunk_size_to_byte(i, pool->icm_type); + chunk = calloc(1, sizeof(struct dr_icm_chunk)); + if (!chunk) { + errno = ENOMEM; + return NULL; + } - return hot_size; -} + 
offset = dr_icm_pool_dm_type_to_entry_size(pool->icm_type) * seg;
-static bool dr_icm_reuse_hot_entries(struct dr_icm_pool *pool,
-				     struct dr_icm_bucket *bucket)
-{
-	uint64_t bytes_for_sync;
+	chunk->rkey = buddy_mem_pool->icm_mr->mr->rkey;
+	chunk->mr_addr = (uintptr_t)buddy_mem_pool->icm_mr->mr->addr + offset;
+	chunk->icm_addr = (uintptr_t)buddy_mem_pool->icm_mr->icm_start_addr + offset;
+	chunk->num_of_entries = dr_icm_pool_chunk_size_to_entries(chunk_size);
+	chunk->byte_size = dr_icm_pool_chunk_size_to_byte(chunk_size, pool->icm_type);
+	chunk->seg = seg;
+
+	if (pool->icm_type == DR_ICM_TYPE_STE && dr_icm_chunk_ste_init(chunk)) {
+		dr_dbg(pool->dmn, "Failed init ste arrays: order: %d\n",
+		       chunk_size);
+		goto out_free_chunk;
+	}

-	bytes_for_sync = dr_icm_hot_mem_size(pool);
-	if (bytes_for_sync < DR_ICM_SYNC_THRESHOLD || !bucket->hot_list_count)
-		return false;
+	buddy_mem_pool->used_memory += chunk->byte_size;
+	chunk->buddy_mem = buddy_mem_pool;
+	list_node_init(&chunk->chunk_list);

-	return true;
-}
+	/* chunk now is part of the used_list */
+	list_add_tail(&buddy_mem_pool->used_list, &chunk->chunk_list);

-static void dr_icm_chill_bucket_start(struct dr_icm_bucket *bucket)
-{
-	list_append_list(&bucket->tmp_list, &bucket->hot_list);
-	bucket->tmp_list_count += bucket->hot_list_count;
-	bucket->hot_list_count = 0;
-}
+	return chunk;

-static void dr_icm_chill_bucket_end(struct dr_icm_bucket *bucket)
-{
-	list_append_list(&bucket->free_list, &bucket->tmp_list);
-	bucket->free_list_count += bucket->tmp_list_count;
-	bucket->tmp_list_count = 0;
+out_free_chunk:
+	free(chunk);
+	return NULL;
 }

-static void dr_icm_chill_bucket_abort(struct dr_icm_bucket *bucket)
+static bool dr_icm_pool_is_sync_required(struct dr_icm_pool *pool)
 {
-	list_append_list(&bucket->hot_list, &bucket->tmp_list);
-	bucket->hot_list_count += bucket->tmp_list_count;
-	bucket->tmp_list_count = 0;
+	if (pool->hot_memory_size > DR_ICM_SYNC_THRESHOLD_POOL)
+		return true;
+
+	return false;
 }

-static void dr_icm_chill_buckets_start(struct dr_icm_pool *pool,
-				       struct dr_icm_bucket *cb,
-				       bool bucks[DR_CHUNK_SIZE_MAX])
+static int dr_icm_pool_sync_pool_buddies(struct dr_icm_pool *pool)
 {
-	struct dr_icm_bucket *bucket;
-	int i;
-
-	for (i = 0; i < pool->num_of_buckets; i++) {
-		bucket = &pool->buckets[i];
-		if (bucket == cb) {
-			dr_icm_chill_bucket_start(bucket);
-			continue;
-		}
+	struct dr_icm_buddy_mem *buddy, *tmp_buddy;
+	int err;
+
+	err = dr_devx_sync_steering(pool->dmn->ctx);
+	if (err) {
+		dr_dbg(pool->dmn, "Failed devx sync hw\n");
+		return err;
+	}

-		/* Freeing the mutex is done at the end of that process, after
-		 * sync_ste was executed at dr_icm_chill_buckets_end func.
- */ - if (!pthread_mutex_trylock(&bucket->mutex)) { - dr_icm_chill_bucket_start(bucket); - bucks[i] = true; + list_for_each_safe(&pool->buddy_mem_list, buddy, tmp_buddy, list_node) { + struct dr_icm_chunk *chunk, *tmp_chunk; + + list_for_each_safe(&buddy->hot_list, chunk, tmp_chunk, chunk_list) { + dr_buddy_free_mem(buddy, chunk->seg, + ilog32(chunk->num_of_entries - 1)); + buddy->used_memory -= chunk->byte_size; + pool->hot_memory_size -= chunk->byte_size; + dr_icm_chunk_destroy(chunk); } + + if ((pool->dmn->flags & DR_DOMAIN_FLAG_MEMORY_RECLAIM) && + !buddy->used_memory) + dr_icm_buddy_destroy(buddy); } + + return 0; } -static void dr_icm_chill_buckets_end(struct dr_icm_pool *pool, - struct dr_icm_bucket *cb, - bool bucks[DR_CHUNK_SIZE_MAX]) +int dr_icm_pool_sync_pool(struct dr_icm_pool *pool) { - struct dr_icm_bucket *bucket; - int i; - - for (i = 0; i < pool->num_of_buckets; i++) { - bucket = &pool->buckets[i]; - if (bucket == cb) { - dr_icm_chill_bucket_end(bucket); - continue; - } + int ret; - if (!bucks[i]) - continue; + pthread_mutex_lock(&pool->mutex); + ret = dr_icm_pool_sync_pool_buddies(pool); + pthread_mutex_unlock(&pool->mutex); - dr_icm_chill_bucket_end(bucket); - pthread_mutex_unlock(&bucket->mutex); - } + return ret; } -static void dr_icm_chill_buckets_abort(struct dr_icm_pool *pool, - struct dr_icm_bucket *cb, - bool bucks[DR_CHUNK_SIZE_MAX]) +static int dr_icm_handle_buddies_get_mem(struct dr_icm_pool *pool, + enum dr_icm_chunk_size chunk_size, + struct dr_icm_buddy_mem **buddy, + int *seg) { - struct dr_icm_bucket *bucket; - int i; - - for (i = 0; i < pool->num_of_buckets; i++) { - bucket = &pool->buckets[i]; - if (bucket == cb) { - dr_icm_chill_bucket_abort(bucket); - continue; + struct dr_icm_buddy_mem *buddy_mem_pool; + bool new_mem = false; + int err = 0; + + *seg = -1; + + /* find the next free place from the buddy list */ + while (*seg == -1) { + list_for_each(&pool->buddy_mem_list, buddy_mem_pool, list_node) { + *seg = dr_buddy_alloc_mem(buddy_mem_pool, chunk_size); + if (*seg != -1) + goto found; + + if (new_mem) { + /* We have new memory pool, first in the list */ + assert(false); + dr_dbg(pool->dmn, "No memory for order: %d\n", + chunk_size); + errno = ENOMEM; + err = ENOMEM; + goto out; + } } - - if (!bucks[i]) - continue; - - dr_icm_chill_bucket_abort(bucket); - pthread_mutex_unlock(&bucket->mutex); + /* no more available allocators in that pool, create new */ + err = dr_icm_buddy_create(pool); + if (err) + goto out; + /* mark we have new memory, first in list */ + new_mem = true; } + +found: + *buddy = buddy_mem_pool; +out: + return err; } /* Allocate an ICM chunk, each chunk holds a piece of ICM memory and @@ -440,66 +403,50 @@ static void dr_icm_chill_buckets_abort(struct dr_icm_pool *pool, struct dr_icm_chunk *dr_icm_alloc_chunk(struct dr_icm_pool *pool, enum dr_icm_chunk_size chunk_size) { - bool bucks[DR_CHUNK_SIZE_MAX] = {}; - struct dr_icm_bucket *bucket; - struct dr_icm_chunk *chunk; - int err; + struct dr_icm_buddy_mem *buddy; + struct dr_icm_chunk *chunk = NULL; + int ret; + int seg; if (chunk_size > pool->max_log_chunk_sz) { errno = EINVAL; return NULL; } - bucket = &pool->buckets[chunk_size]; + pthread_mutex_lock(&pool->mutex); + /* find mem, get back the relevant buddy pool and seg in that mem */ + ret = dr_icm_handle_buddies_get_mem(pool, chunk_size, &buddy, &seg); + if (ret) + goto out; - pthread_mutex_lock(&bucket->mutex); + chunk = dr_icm_chunk_create(pool, chunk_size, buddy, seg); + if (!chunk) + goto out_err; - /* Take chunk from pool if 
available, otherwise allocate new chunks */ - if (list_empty(&bucket->free_list)) { - if (dr_icm_reuse_hot_entries(pool, bucket)) { - dr_icm_chill_buckets_start(pool, bucket, bucks); - err = dr_devx_sync_steering(pool->dmn->ctx); - if (err) { - dr_icm_chill_buckets_abort(pool, bucket, bucks); - dr_dbg(pool->dmn, "Sync_steering failed\n"); - chunk = NULL; - goto out; - } - dr_icm_chill_buckets_end(pool, bucket, bucks); - } else { - dr_icm_chunks_create(bucket); - } - } + goto out; - chunk = list_tail(&bucket->free_list, struct dr_icm_chunk, chunk_list); - if (chunk) { - list_del_init(&chunk->chunk_list); - list_add_tail(&bucket->used_list, &chunk->chunk_list); - bucket->free_list_count--; - bucket->used_list_count++; - } +out_err: + dr_buddy_free_mem(buddy, seg, chunk_size); out: - pthread_mutex_unlock(&bucket->mutex); + pthread_mutex_unlock(&pool->mutex); return chunk; } void dr_icm_free_chunk(struct dr_icm_chunk *chunk) { - struct dr_icm_bucket *bucket = chunk->bucket; - - if (bucket->pool->icm_type == DR_ICM_TYPE_STE) { - memset(chunk->ste_arr, 0, - bucket->num_of_entries * sizeof(struct dr_ste)); - memset(chunk->hw_ste_arr, 0, - bucket->num_of_entries * DR_STE_SIZE_REDUCED); - } + struct dr_icm_buddy_mem *buddy = chunk->buddy_mem; - pthread_mutex_lock(&bucket->mutex); + /* move the memory to the waiting list AKA "hot" */ + pthread_mutex_lock(&buddy->pool->mutex); list_del_init(&chunk->chunk_list); - list_add_tail(&bucket->hot_list, &chunk->chunk_list); - bucket->hot_list_count++; - bucket->used_list_count--; - pthread_mutex_unlock(&bucket->mutex); + list_add_tail(&buddy->hot_list, &chunk->chunk_list); + buddy->pool->hot_memory_size += chunk->byte_size; + + /* Check if we have chunks that are waiting for sync-ste */ + if (dr_icm_pool_is_sync_required(buddy->pool)) + dr_icm_pool_sync_pool_buddies(buddy->pool); + + pthread_mutex_unlock(&buddy->pool->mutex); } struct dr_icm_pool *dr_icm_pool_create(struct mlx5dv_dr_domain *dmn, @@ -507,7 +454,6 @@ struct dr_icm_pool *dr_icm_pool_create(struct mlx5dv_dr_domain *dmn, { enum dr_icm_chunk_size max_log_chunk_sz; struct dr_icm_pool *pool; - int i; if (icm_type == DR_ICM_TYPE_STE) max_log_chunk_sz = dmn->info.max_log_sw_icm_sz; @@ -520,43 +466,25 @@ struct dr_icm_pool *dr_icm_pool_create(struct mlx5dv_dr_domain *dmn, return NULL; } - pool->buckets = calloc(max_log_chunk_sz + 1, sizeof(struct dr_icm_bucket)); - if (!pool->buckets) { - errno = ENOMEM; - goto free_pool; - } - pool->dmn = dmn; pool->icm_type = icm_type; pool->max_log_chunk_sz = max_log_chunk_sz; - pool->num_of_buckets = max_log_chunk_sz + 1; - list_head_init(&pool->icm_mr_list); - for (i = 0; i < pool->num_of_buckets; i++) - dr_icm_bucket_init(pool, &pool->buckets[i], i); + list_head_init(&pool->buddy_mem_list); - pthread_mutex_init(&pool->mr_mutex, NULL); + pthread_mutex_init(&pool->mutex, NULL); return pool; - -free_pool: - free(pool); - return NULL; } void dr_icm_pool_destroy(struct dr_icm_pool *pool) { - struct dr_icm_mr *icm_mr, *next; - int i; - - pthread_mutex_destroy(&pool->mr_mutex); + struct dr_icm_buddy_mem *buddy, *tmp_buddy; - list_for_each_safe(&pool->icm_mr_list, icm_mr, next, mr_list) - dr_icm_pool_mr_destroy(icm_mr); + list_for_each_safe(&pool->buddy_mem_list, buddy, tmp_buddy, list_node) + dr_icm_buddy_destroy(buddy); - for (i = 0; i < pool->num_of_buckets; i++) - dr_icm_bucket_cleanup(&pool->buckets[i]); + pthread_mutex_destroy(&pool->mutex); - free(pool->buckets); free(pool); } diff --git a/providers/mlx5/dr_matcher.c b/providers/mlx5/dr_matcher.c index 
717ee9b..1d40874 100644 --- a/providers/mlx5/dr_matcher.c +++ b/providers/mlx5/dr_matcher.c @@ -120,7 +120,7 @@ static bool dr_mask_is_ttl_set(struct dr_match_spec *spec) (_misc2)._inner_outer##_first_mpls_s_bos || \ (_misc2)._inner_outer##_first_mpls_ttl) -static bool dr_mask_is_gre_set(struct dr_match_misc *misc) +static bool dr_mask_is_tnl_gre_set(struct dr_match_misc *misc) { return (misc->gre_key_h || misc->gre_key_l || misc->gre_protocol || misc->gre_c_present || @@ -133,12 +133,12 @@ static bool dr_mask_is_gre_set(struct dr_match_misc *misc) (_misc2).outer_first_mpls_over_##gre_udp##_s_bos || \ (_misc2).outer_first_mpls_over_##gre_udp##_ttl) -#define DR_MASK_IS_FLEX_PARSER_0_SET(_misc2) ( \ +#define DR_MASK_IS_TNL_MPLS_SET(_misc2) ( \ DR_MASK_IS_OUTER_MPLS_OVER_GRE_UDP_SET(_misc2, gre) || \ DR_MASK_IS_OUTER_MPLS_OVER_GRE_UDP_SET(_misc2, udp)) static bool -dr_mask_is_misc3_vxlan_gpe_set(struct dr_match_misc3 *misc3) +dr_mask_is_vxlan_gpe_set(struct dr_match_misc3 *misc3) { return misc3->outer_vxlan_gpe_vni || misc3->outer_vxlan_gpe_next_protocol || @@ -146,21 +146,21 @@ dr_mask_is_misc3_vxlan_gpe_set(struct dr_match_misc3 *misc3) } static bool -dr_matcher_supp_flex_parser_vxlan_gpe(struct dr_devx_caps *caps) +dr_matcher_supp_vxlan_gpe(struct dr_devx_caps *caps) { - return caps->flex_protocols & - MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED; + return (caps->sw_format_ver == MLX5_HW_CONNECTX_6DX) || + (caps->flex_protocols & MLX5_FLEX_PARSER_VXLAN_GPE_ENABLED); } static bool -dr_mask_is_flex_parser_tnl_vxlan_gpe_set(struct dr_match_param *mask, - struct mlx5dv_dr_domain *dmn) +dr_mask_is_tnl_vxlan_gpe(struct dr_match_param *mask, + struct mlx5dv_dr_domain *dmn) { - return dr_mask_is_misc3_vxlan_gpe_set(&mask->misc3) && - dr_matcher_supp_flex_parser_vxlan_gpe(&dmn->info.caps); + return dr_mask_is_vxlan_gpe_set(&mask->misc3) && + dr_matcher_supp_vxlan_gpe(&dmn->info.caps); } -static bool dr_mask_is_misc_geneve_set(struct dr_match_misc *misc) +static bool dr_mask_is_tnl_geneve_set(struct dr_match_misc *misc) { return misc->geneve_vni || misc->geneve_oam || @@ -169,46 +169,66 @@ static bool dr_mask_is_misc_geneve_set(struct dr_match_misc *misc) } static bool -dr_matcher_supp_flex_parser_geneve(struct dr_devx_caps *caps) +dr_matcher_supp_tnl_geneve(struct dr_devx_caps *caps) { - return caps->flex_protocols & - MLX5_FLEX_PARSER_GENEVE_ENABLED; + return (caps->sw_format_ver == MLX5_HW_CONNECTX_6DX) || + (caps->flex_protocols & MLX5_FLEX_PARSER_GENEVE_ENABLED); } static bool -dr_mask_is_flex_parser_tnl_geneve_set(struct dr_match_param *mask, - struct mlx5dv_dr_domain *dmn) +dr_mask_is_tnl_geneve(struct dr_match_param *mask, + struct mlx5dv_dr_domain *dmn) { - return dr_mask_is_misc_geneve_set(&mask->misc) && - dr_matcher_supp_flex_parser_geneve(&dmn->info.caps); + return dr_mask_is_tnl_geneve_set(&mask->misc) && + dr_matcher_supp_tnl_geneve(&dmn->info.caps); } -static bool dr_mask_is_misc3_gtpu_set(struct dr_match_misc3 *misc3) +static bool dr_mask_is_tnl_gtpu_set(struct dr_match_misc3 *misc3) { - return misc3->gtpu_flags || - misc3->gtpu_msg_type || - misc3->gtpu_teid; + return misc3->gtpu_flags || misc3->gtpu_msg_type || misc3->gtpu_teid; } -static bool dr_matcher_supp_flex_parser_gtpu(struct dr_devx_caps *caps) +static bool dr_matcher_supp_tnl_gtpu(struct dr_devx_caps *caps) { - return caps->flex_protocols & - MLX5_FLEX_PARSER_GTPU_ENABLED; + return caps->flex_protocols & MLX5_FLEX_PARSER_GTPU_ENABLED; } -static bool dr_mask_is_flex_parser_tnl_gtpu_set(struct dr_match_param *mask, - struct 
mlx5dv_dr_domain *dmn)
+static bool dr_mask_is_tnl_gtpu(struct dr_match_param *mask,
+				struct mlx5dv_dr_domain *dmn)
 {
-	return dr_mask_is_misc3_gtpu_set(&mask->misc3) &&
-	       dr_matcher_supp_flex_parser_gtpu(&dmn->info.caps);
+	return dr_mask_is_tnl_gtpu_set(&mask->misc3) &&
+	       dr_matcher_supp_tnl_gtpu(&dmn->info.caps);
 }

-static bool dr_mask_is_flex_parser_icmpv6_set(struct dr_match_misc3 *misc3)
+static inline int dr_matcher_supp_icmp_v4(struct dr_devx_caps *caps)
+{
+	return (caps->sw_format_ver == MLX5_HW_CONNECTX_6DX) ||
+	       (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED);
+}
+
+static inline int dr_matcher_supp_icmp_v6(struct dr_devx_caps *caps)
+{
+	return (caps->sw_format_ver == MLX5_HW_CONNECTX_6DX) ||
+	       (caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V6_ENABLED);
+}
+
+static bool dr_mask_is_icmpv6_set(struct dr_match_misc3 *misc3)
 {
 	return (misc3->icmpv6_type || misc3->icmpv6_code ||
 		misc3->icmpv6_header_data);
 }

+static bool dr_mask_is_icmp(struct dr_match_param *mask,
+			    struct mlx5dv_dr_domain *dmn)
+{
+	if (DR_MASK_IS_ICMPV4_SET(&mask->misc3))
+		return dr_matcher_supp_icmp_v4(&dmn->info.caps);
+	else if (dr_mask_is_icmpv6_set(&mask->misc3))
+		return dr_matcher_supp_icmp_v6(&dmn->info.caps);
+
+	return false;
+}
+
 static bool dr_mask_is_wqe_metadata_set(struct dr_match_misc2 *misc2)
 {
 	return misc2->metadata_reg_a;
@@ -237,8 +257,9 @@ static int dr_matcher_set_ste_builders(struct mlx5dv_dr_matcher *matcher,
 	struct dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn;
 	struct dr_ste_build *sb = nic_matcher->ste_builder;
 	struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn;
+	struct dr_ste_ctx *ste_ctx = dmn->ste_ctx;
 	struct dr_match_param mask = {};
-	struct dr_match_misc3 *misc3;
+	bool allow_empty_match = false;
 	bool inner, rx;
 	uint8_t ipv;
 	int idx = 0;
@@ -267,6 +288,15 @@ static int dr_matcher_set_ste_builders(struct mlx5dv_dr_matcher *matcher,
 	if (ret)
 		return ret;

+	/* Optimize RX pipe by reducing source port match, since
+	 * the FDB RX part is connected only to the wire.
+ */ + if (dmn->type == MLX5DV_DR_DOMAIN_TYPE_FDB && + rx && mask.misc.source_port) { + mask.misc.source_port = 0; + allow_empty_match = true; + } + /* Outer */ if (matcher->match_criteria & (DR_MATCHER_CRITERIA_OUTER | DR_MATCHER_CRITERIA_MISC | @@ -276,92 +306,90 @@ static int dr_matcher_set_ste_builders(struct mlx5dv_dr_matcher *matcher, ipv = mask.outer.ip_version; if (dr_mask_is_wqe_metadata_set(&mask.misc2)) - dr_ste_build_general_purpose(&sb[idx++], &mask, inner, rx); + dr_ste_build_general_purpose(ste_ctx, &sb[idx++], + &mask, inner, rx); if (dr_mask_is_reg_c_0_3_set(&mask.misc2)) - dr_ste_build_register_0(&sb[idx++], &mask, inner, rx); + dr_ste_build_register_0(ste_ctx, &sb[idx++], + &mask, inner, rx); if (dr_mask_is_reg_c_4_7_set(&mask.misc2)) - dr_ste_build_register_1(&sb[idx++], &mask, inner, rx); + dr_ste_build_register_1(ste_ctx, &sb[idx++], + &mask, inner, rx); if (dr_mask_is_gvmi_or_qpn_set(&mask.misc) && (dmn->type == MLX5DV_DR_DOMAIN_TYPE_FDB || - dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_RX)) { - ret = dr_ste_build_src_gvmi_qpn(&sb[idx++], &mask, - &dmn->info.caps, - inner, rx); - if (ret) - return ret; - } + dmn->type == MLX5DV_DR_DOMAIN_TYPE_NIC_RX)) + dr_ste_build_src_gvmi_qpn(ste_ctx, &sb[idx++], + &mask, &dmn->info.caps, + inner, rx); if (dr_mask_is_smac_set(&mask.outer) && - dr_mask_is_dmac_set(&mask.outer)) { - ret = dr_ste_build_eth_l2_src_des(&sb[idx++], &mask, - inner, rx); - if (ret) - return ret; - } + dr_mask_is_dmac_set(&mask.outer)) + dr_ste_build_eth_l2_src_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); if (dr_mask_is_smac_set(&mask.outer)) - dr_ste_build_eth_l2_src(&sb[idx++], &mask, inner, rx); + dr_ste_build_eth_l2_src(ste_ctx, &sb[idx++], + &mask, inner, rx); if (DR_MASK_IS_L2_DST(mask.outer, mask.misc, outer)) - dr_ste_build_eth_l2_dst(&sb[idx++], &mask, inner, rx); + dr_ste_build_eth_l2_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); if (ipv == 4) { - if (dr_mask_is_ipv4_5_tuple_set(&mask.outer)) - dr_ste_build_eth_l3_ipv4_5_tuple(&sb[idx++], &mask, - inner, rx); - if (dr_mask_is_ttl_set(&mask.outer)) - dr_ste_build_eth_l3_ipv4_misc(&sb[idx++], &mask, - inner, rx); + dr_ste_build_eth_l3_ipv4_misc(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_ipv4_5_tuple_set(&mask.outer)) + dr_ste_build_eth_l3_ipv4_5_tuple(ste_ctx, &sb[idx++], + &mask, inner, rx); } else if (ipv == 6) { if (dr_mask_is_dst_addr_set(&mask.outer)) - dr_ste_build_eth_l3_ipv6_dst(&sb[idx++], &mask, - inner, rx); + dr_ste_build_eth_l3_ipv6_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); if (dr_mask_is_src_addr_set(&mask.outer)) - dr_ste_build_eth_l3_ipv6_src(&sb[idx++], &mask, - inner, rx); + dr_ste_build_eth_l3_ipv6_src(ste_ctx, &sb[idx++], + &mask, inner, rx); if (DR_MASK_IS_ETH_L4_SET(mask.outer, mask.misc, outer)) - dr_ste_build_ipv6_l3_l4(&sb[idx++], &mask, - inner, rx); + dr_ste_build_eth_ipv6_l3_l4(ste_ctx, &sb[idx++], + &mask, inner, rx); } - if (dr_mask_is_flex_parser_tnl_vxlan_gpe_set(&mask, dmn)) - dr_ste_build_flex_parser_tnl_vxlan_gpe(&sb[idx++], &mask, - inner, rx); - else if (dr_mask_is_flex_parser_tnl_geneve_set(&mask, dmn)) - dr_ste_build_flex_parser_tnl_geneve(&sb[idx++], &mask, - inner, rx); - else if (dr_mask_is_flex_parser_tnl_gtpu_set(&mask, dmn)) - dr_ste_build_flex_parser_tnl_gtpu(&sb[idx++], &mask, - inner, rx); + if (dr_mask_is_tnl_vxlan_gpe(&mask, dmn)) + dr_ste_build_tnl_vxlan_gpe(ste_ctx, &sb[idx++], + &mask, inner, rx); + else if (dr_mask_is_tnl_geneve(&mask, dmn)) + dr_ste_build_tnl_geneve(ste_ctx, &sb[idx++], + &mask, inner, rx); + else if 
(dr_mask_is_tnl_gtpu(&mask, dmn)) + dr_ste_build_tnl_gtpu(ste_ctx, &sb[idx++], + &mask, inner, rx); if (DR_MASK_IS_ETH_L4_MISC_SET(mask.misc3, outer)) - dr_ste_build_eth_l4_misc(&sb[idx++], &mask, inner, rx); + dr_ste_build_eth_l4_misc(ste_ctx, &sb[idx++], + &mask, inner, rx); if (DR_MASK_IS_FIRST_MPLS_SET(mask.misc2, outer)) - dr_ste_build_mpls(&sb[idx++], &mask, inner, rx); - - if (DR_MASK_IS_FLEX_PARSER_0_SET(mask.misc2)) - dr_ste_build_flex_parser_0(&sb[idx++], &mask, inner, rx); - - misc3 = &mask.misc3; - if ((DR_MASK_IS_FLEX_PARSER_ICMPV4_SET(misc3) && - dr_matcher_supp_flex_parser_icmp_v4(&dmn->info.caps)) || - (dr_mask_is_flex_parser_icmpv6_set(&mask.misc3) && - dr_matcher_supp_flex_parser_icmp_v6(&dmn->info.caps))) { - ret = dr_ste_build_flex_parser_1(&sb[idx++], - &mask, &dmn->info.caps, - inner, rx); + dr_ste_build_mpls(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (DR_MASK_IS_TNL_MPLS_SET(mask.misc2)) + dr_ste_build_tnl_mpls(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_icmp(&mask, dmn)) { + ret = dr_ste_build_icmp(ste_ctx, &sb[idx++], + &mask, &dmn->info.caps, + inner, rx); if (ret) return ret; } - if (dr_mask_is_gre_set(&mask.misc)) - dr_ste_build_gre(&sb[idx++], &mask, inner, rx); + if (dr_mask_is_tnl_gre_set(&mask.misc)) + dr_ste_build_tnl_gre(ste_ctx, &sb[idx++], &mask, inner, rx); } /* Inner */ @@ -373,55 +401,60 @@ static int dr_matcher_set_ste_builders(struct mlx5dv_dr_matcher *matcher, ipv = mask.inner.ip_version; if (dr_mask_is_eth_l2_tnl_set(&mask.misc)) - dr_ste_build_eth_l2_tnl(&sb[idx++], &mask, inner, rx); + dr_ste_build_eth_l2_tnl(ste_ctx, &sb[idx++], &mask, + inner, rx); if (dr_mask_is_smac_set(&mask.inner) && - dr_mask_is_dmac_set(&mask.inner)) { - ret = dr_ste_build_eth_l2_src_des(&sb[idx++], - &mask, inner, rx); - if (ret) - return ret; - } + dr_mask_is_dmac_set(&mask.inner)) + dr_ste_build_eth_l2_src_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); if (dr_mask_is_smac_set(&mask.inner)) - dr_ste_build_eth_l2_src(&sb[idx++], &mask, inner, rx); + dr_ste_build_eth_l2_src(ste_ctx, &sb[idx++], + &mask, inner, rx); if (DR_MASK_IS_L2_DST(mask.inner, mask.misc, inner)) - dr_ste_build_eth_l2_dst(&sb[idx++], &mask, inner, rx); + dr_ste_build_eth_l2_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); if (ipv == 4) { - if (dr_mask_is_ipv4_5_tuple_set(&mask.inner)) - dr_ste_build_eth_l3_ipv4_5_tuple(&sb[idx++], &mask, - inner, rx); - if (dr_mask_is_ttl_set(&mask.inner)) - dr_ste_build_eth_l3_ipv4_misc(&sb[idx++], &mask, - inner, rx); + dr_ste_build_eth_l3_ipv4_misc(ste_ctx, &sb[idx++], + &mask, inner, rx); + + if (dr_mask_is_ipv4_5_tuple_set(&mask.inner)) + dr_ste_build_eth_l3_ipv4_5_tuple(ste_ctx, &sb[idx++], + &mask, inner, rx); } else if (ipv == 6) { if (dr_mask_is_dst_addr_set(&mask.inner)) - dr_ste_build_eth_l3_ipv6_dst(&sb[idx++], &mask, - inner, rx); + dr_ste_build_eth_l3_ipv6_dst(ste_ctx, &sb[idx++], + &mask, inner, rx); if (dr_mask_is_src_addr_set(&mask.inner)) - dr_ste_build_eth_l3_ipv6_src(&sb[idx++], &mask, - inner, rx); + dr_ste_build_eth_l3_ipv6_src(ste_ctx, &sb[idx++], + &mask, inner, rx); if (DR_MASK_IS_ETH_L4_SET(mask.inner, mask.misc, inner)) - dr_ste_build_ipv6_l3_l4(&sb[idx++], &mask, - inner, rx); + dr_ste_build_eth_ipv6_l3_l4(ste_ctx, &sb[idx++], + &mask, inner, rx); } if (DR_MASK_IS_ETH_L4_MISC_SET(mask.misc3, inner)) - dr_ste_build_eth_l4_misc(&sb[idx++], &mask, inner, rx); + dr_ste_build_eth_l4_misc(ste_ctx, &sb[idx++], + &mask, inner, rx); if (DR_MASK_IS_FIRST_MPLS_SET(mask.misc2, inner)) - dr_ste_build_mpls(&sb[idx++], &mask, inner, 
rx); + dr_ste_build_mpls(ste_ctx, &sb[idx++], + &mask, inner, rx); - if (DR_MASK_IS_FLEX_PARSER_0_SET(mask.misc2)) - dr_ste_build_flex_parser_0(&sb[idx++], &mask, inner, rx); + if (DR_MASK_IS_TNL_MPLS_SET(mask.misc2)) + dr_ste_build_tnl_mpls(ste_ctx, &sb[idx++], + &mask, inner, rx); } + /* Empty matcher, takes all */ - if (matcher->match_criteria == DR_MATCHER_CRITERIA_EMPTY) + if ((!idx && allow_empty_match) || + matcher->match_criteria == DR_MATCHER_CRITERIA_EMPTY) dr_ste_build_empty_always_hit(&sb[idx++], rx); if (idx == 0) { diff --git a/providers/mlx5/dr_rule.c b/providers/mlx5/dr_rule.c index 1870a8b..9239fe5 100644 --- a/providers/mlx5/dr_rule.c +++ b/providers/mlx5/dr_rule.c @@ -36,7 +36,8 @@ #define DR_RULE_MAX_STE_CHAIN (DR_RULE_MAX_STES + DR_ACTION_MAX_STES) -static int dr_rule_append_to_miss_list(struct dr_ste *new_last_ste, +static int dr_rule_append_to_miss_list(struct dr_ste_ctx *ste_ctx, + struct dr_ste *new_last_ste, struct list_head *miss_list, struct list_head *send_list) { @@ -53,10 +54,13 @@ static int dr_rule_append_to_miss_list(struct dr_ste *new_last_ste, return errno; } - dr_ste_set_miss_addr(last_ste->hw_ste, dr_ste_get_icm_addr(new_last_ste)); + dr_ste_set_miss_addr(ste_ctx, + last_ste->hw_ste, + dr_ste_get_icm_addr(new_last_ste)); + list_add_tail(miss_list, &new_last_ste->miss_list_node); - dr_send_fill_and_append_ste_send_info(last_ste, DR_STE_SIZE_REDUCED, + dr_send_fill_and_append_ste_send_info(last_ste, DR_STE_SIZE_CTRL, 0, last_ste->hw_ste, ste_info_last, send_list, true); @@ -69,6 +73,7 @@ static struct dr_ste uint8_t *hw_ste) { struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_ste_ctx *ste_ctx = dmn->ste_ctx; struct dr_ste_htbl *new_htbl; struct dr_ste *ste; @@ -84,7 +89,7 @@ static struct dr_ste /* One and only entry, never grows */ ste = new_htbl->ste_arr; - dr_ste_set_miss_addr(hw_ste, nic_matcher->e_anchor->chunk->icm_addr); + dr_ste_set_miss_addr(ste_ctx, hw_ste, nic_matcher->e_anchor->chunk->icm_addr); dr_htbl_get(new_htbl); return ste; @@ -118,7 +123,7 @@ static struct dr_ste *dr_rule_create_collision_entry(struct mlx5dv_dr_matcher *m return ste; free_tbl: - dr_ste_free(ste, matcher, nic_matcher); + dr_htbl_put(ste->htbl); return NULL; } @@ -128,14 +133,19 @@ static int dr_rule_handle_one_ste_in_update_list(struct dr_ste_send_info *ste_in int ret; list_del(&ste_info->send_list); + + /* Copy data to ste, only reduced size or control, the last 16B (mask) + * is already written to the hw. + */ + if (ste_info->size == DR_STE_SIZE_CTRL) + memcpy(ste_info->ste->hw_ste, ste_info->data, DR_STE_SIZE_CTRL); + else + memcpy(ste_info->ste->hw_ste, ste_info->data, DR_STE_SIZE_REDUCED); + ret = dr_send_postsend_ste(dmn, ste_info->ste, ste_info->data, ste_info->size, ste_info->offset); if (ret) goto out; - /* Copy data to ste, only reduced size, the last 16B (mask) - * is already written to the hw. 
- */ - memcpy(ste_info->ste->hw_ste, ste_info->data, DR_STE_SIZE_REDUCED); out: free(ste_info); @@ -190,6 +200,7 @@ dr_rule_rehash_handle_collision(struct mlx5dv_dr_matcher *matcher, struct dr_ste *col_ste, uint8_t *hw_ste) { + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; struct dr_ste *new_ste; int ret; @@ -201,18 +212,18 @@ dr_rule_rehash_handle_collision(struct mlx5dv_dr_matcher *matcher, new_ste->htbl->miss_list = dr_ste_get_miss_list(col_ste); /* Update the previous from the list */ - ret = dr_rule_append_to_miss_list(new_ste, + ret = dr_rule_append_to_miss_list(dmn->ste_ctx, new_ste, dr_ste_get_miss_list(col_ste), update_list); if (ret) { - dr_dbg(matcher->tbl->dmn, "Failed update dup entry\n"); + dr_dbg(dmn, "Failed update dup entry\n"); goto err_exit; } return new_ste; err_exit: - dr_ste_free(new_ste, matcher, nic_matcher); + dr_htbl_put(new_ste->htbl); return NULL; } @@ -245,6 +256,7 @@ static struct dr_ste *dr_rule_rehash_copy_ste(struct mlx5dv_dr_matcher *matcher, struct dr_ste_htbl *new_htbl, struct list_head *update_list) { + struct dr_ste_ctx *ste_ctx = matcher->tbl->dmn->ste_ctx; uint8_t hw_ste[DR_STE_SIZE] = {}; struct dr_ste_send_info *ste_info; bool use_update_list = false; @@ -258,12 +270,12 @@ static struct dr_ste *dr_rule_rehash_copy_ste(struct mlx5dv_dr_matcher *matcher, /* Copy STE control and tag */ memcpy(hw_ste, cur_ste->hw_ste, DR_STE_SIZE_REDUCED); - dr_ste_set_miss_addr(hw_ste, nic_matcher->e_anchor->chunk->icm_addr); + dr_ste_set_miss_addr(ste_ctx, hw_ste, nic_matcher->e_anchor->chunk->icm_addr); new_idx = dr_ste_calc_hash_index(hw_ste, new_htbl); new_ste = &new_htbl->ste_arr[new_idx]; - if (dr_ste_not_used_ste(new_ste)) { + if (dr_ste_is_not_used(new_ste)) { dr_htbl_get(new_htbl); list_add_tail(dr_ste_get_miss_list(new_ste), &new_ste->miss_list_node); } else { @@ -349,7 +361,7 @@ static int dr_rule_rehash_copy_htbl(struct mlx5dv_dr_matcher *matcher, for (i = 0; i < cur_entries; i++) { cur_ste = &cur_htbl->ste_arr[i]; - if (dr_ste_not_used_ste(cur_ste)) /* Empty, nothing to copy */ + if (dr_ste_is_not_used(cur_ste)) /* Empty, nothing to copy */ continue; err = dr_rule_rehash_copy_miss_list(matcher, @@ -403,7 +415,8 @@ static struct dr_ste_htbl *dr_rule_rehash_htbl(struct mlx5dv_dr_rule *rule, /* Write new table to HW */ info.type = CONNECT_MISS; info.miss_icm_addr = nic_matcher->e_anchor->chunk->icm_addr; - dr_ste_set_formated_ste(dmn->info.caps.gvmi, + dr_ste_set_formated_ste(dmn->ste_ctx, + dmn->info.caps.gvmi, nic_dmn, new_htbl, formated_ste, @@ -450,18 +463,20 @@ static struct dr_ste_htbl *dr_rule_rehash_htbl(struct mlx5dv_dr_rule *rule, * It is safe to operate dr_ste_set_hit_addr on the hw_ste here * (48B len) which works only on first 32B */ - dr_ste_set_hit_addr(prev_htbl->ste_arr[0].hw_ste, + dr_ste_set_hit_addr(dmn->ste_ctx, + prev_htbl->ste_arr[0].hw_ste, new_htbl->chunk->icm_addr, new_htbl->chunk->num_of_entries); ste_to_update = &prev_htbl->ste_arr[0]; } else { - dr_ste_set_hit_addr_by_next_htbl(cur_htbl->pointing_ste->hw_ste, + dr_ste_set_hit_addr_by_next_htbl(dmn->ste_ctx, + cur_htbl->pointing_ste->hw_ste, new_htbl); ste_to_update = cur_htbl->pointing_ste; } - dr_send_fill_and_append_ste_send_info(ste_to_update, DR_STE_SIZE_REDUCED, + dr_send_fill_and_append_ste_send_info(ste_to_update, DR_STE_SIZE_CTRL, 0, ste_to_update->hw_ste, ste_info, update_list, false); @@ -508,24 +523,26 @@ static struct dr_ste *dr_rule_handle_collision(struct mlx5dv_dr_matcher *matcher struct list_head *miss_list, struct list_head *send_list) { + struct 
mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_ste_ctx *ste_ctx = dmn->ste_ctx; struct dr_ste_send_info *ste_info; struct dr_ste *new_ste; ste_info = calloc(1, sizeof(*ste_info)); if (!ste_info) { - dr_dbg(matcher->tbl->dmn, "Failed allocating ste_info\n"); + dr_dbg(dmn, "Failed allocating ste_info\n"); errno = ENOMEM; return NULL; } new_ste = dr_rule_create_collision_entry(matcher, nic_matcher, hw_ste, ste); if (!new_ste) { - dr_dbg(matcher->tbl->dmn, "Failed creating collision entry\n"); + dr_dbg(dmn, "Failed creating collision entry\n"); goto free_send_info; } - if (dr_rule_append_to_miss_list(new_ste, miss_list, send_list)) { - dr_dbg(matcher->tbl->dmn, "Failed to update prev miss_list\n"); + if (dr_rule_append_to_miss_list(ste_ctx, new_ste, miss_list, send_list)) { + dr_dbg(dmn, "Failed to update prev miss_list\n"); goto err_exit; } @@ -538,7 +555,8 @@ static struct dr_ste *dr_rule_handle_collision(struct mlx5dv_dr_matcher *matcher return new_ste; err_exit: - dr_ste_free(new_ste, matcher, nic_matcher); + dr_htbl_put(new_ste->htbl); + free_send_info: free(ste_info); return NULL; @@ -675,6 +693,7 @@ static int dr_rule_handle_action_stes(struct mlx5dv_dr_rule *rule, struct dr_ste_send_info *ste_info_arr[DR_ACTION_MAX_STES]; uint8_t num_of_builders = nic_matcher->num_of_builders; struct mlx5dv_dr_matcher *matcher = rule->matcher; + struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; uint8_t *curr_hw_ste, *prev_hw_ste; struct dr_ste *action_ste; int i, k, ret; @@ -704,17 +723,19 @@ static int dr_rule_handle_action_stes(struct mlx5dv_dr_rule *rule, ste_info_arr[k] = calloc(1, sizeof(struct dr_ste_send_info)); if (!ste_info_arr[k]) { - dr_dbg(matcher->tbl->dmn, "Failed allocate ste_info, k: %d\n", k); + dr_dbg(dmn, "Failed allocate ste_info, k: %d\n", k); errno = ENOMEM; ret = errno; goto err_exit; } /* Point current ste to the new action */ - dr_ste_set_hit_addr_by_next_htbl(prev_hw_ste, action_ste->htbl); + dr_ste_set_hit_addr_by_next_htbl(dmn->ste_ctx, + prev_hw_ste, + action_ste->htbl); ret = dr_rule_add_member(nic_rule, action_ste); if (ret) { - dr_dbg(matcher->tbl->dmn, "Failed adding rule member\n"); + dr_dbg(dmn, "Failed adding rule member\n"); goto free_ste_info; } dr_send_fill_and_append_ste_send_info(action_ste, DR_STE_SIZE, 0, @@ -741,6 +762,7 @@ static int dr_rule_handle_empty_entry(struct mlx5dv_dr_matcher *matcher, struct list_head *miss_list, struct list_head *send_list) { + struct dr_ste_ctx *ste_ctx = matcher->tbl->dmn->ste_ctx; struct dr_ste_send_info *ste_info; /* Take ref on table, only on first time this ste is used */ @@ -749,7 +771,7 @@ static int dr_rule_handle_empty_entry(struct mlx5dv_dr_matcher *matcher, /* new entry -> new branch */ list_add_tail(miss_list, &ste->miss_list_node); - dr_ste_set_miss_addr(hw_ste, nic_matcher->e_anchor->chunk->icm_addr); + dr_ste_set_miss_addr(ste_ctx, hw_ste, nic_matcher->e_anchor->chunk->icm_addr); ste->ste_chain_location = ste_location; @@ -810,7 +832,7 @@ again: miss_list = &cur_htbl->chunk->miss_list[index]; ste = &cur_htbl->ste_arr[index]; - if (dr_ste_not_used_ste(ste)) { + if (dr_ste_is_not_used(ste)) { if (dr_rule_handle_empty_entry(matcher, nic_matcher, cur_htbl, ste, ste_location, hw_ste, miss_list, @@ -895,8 +917,10 @@ static bool dr_rule_verify(struct mlx5dv_dr_matcher *matcher, size_t value_size = value->match_sz; uint32_t s_idx, e_idx; - if (!value_size || - (value_size > DEVX_ST_SZ_BYTES(dr_match_param) || + if (!value_size) + return true; + + if ((value_size > DEVX_ST_SZ_BYTES(dr_match_param) || (value_size % 
sizeof(uint32_t)))) { dr_dbg(dmn, "Rule parameters length is incorrect\n"); errno = EINVAL; diff --git a/providers/mlx5/dr_send.c b/providers/mlx5/dr_send.c index a0237ac..cf58c02 100644 --- a/providers/mlx5/dr_send.c +++ b/providers/mlx5/dr_send.c @@ -692,6 +692,8 @@ int dr_send_postsend_ste(struct mlx5dv_dr_domain *dmn, struct dr_ste *ste, { struct postsend_info send_info = {}; + dr_ste_prepare_for_postsend(dmn->ste_ctx, data, size); + send_info.write.addr = (uintptr_t) data; send_info.write.length = size; send_info.write.lkey = 0; @@ -714,6 +716,8 @@ int dr_send_postsend_htbl(struct mlx5dv_dr_domain *dmn, struct dr_ste_htbl *htbl if (ret) return ret; + dr_ste_prepare_for_postsend(dmn->ste_ctx, formated_ste, DR_STE_SIZE); + /* Send the data iteration times */ for (i = 0; i < iterations; i++) { uint32_t ste_index = i * (byte_size / DR_STE_SIZE); @@ -721,7 +725,7 @@ int dr_send_postsend_htbl(struct mlx5dv_dr_domain *dmn, struct dr_ste_htbl *htbl /* Copy all ste's on the data buffer, need to add the bit_mask */ for (j = 0; j < num_stes_per_iter; j++) { - if (dr_ste_is_not_valid_entry(htbl->ste_arr[ste_index + j].hw_ste)) { + if (dr_ste_is_not_used(&htbl->ste_arr[ste_index + j])) { memcpy(data + (j * DR_STE_SIZE), formated_ste, DR_STE_SIZE); } else { @@ -731,6 +735,11 @@ int dr_send_postsend_htbl(struct mlx5dv_dr_domain *dmn, struct dr_ste_htbl *htbl /* Copy bit_mask */ memcpy(data + (j * DR_STE_SIZE) + DR_STE_SIZE_REDUCED, mask, DR_STE_SIZE_MASK); + + /* Prepare STE to specific HW format */ + dr_ste_prepare_for_postsend(dmn->ste_ctx, + data + (j * DR_STE_SIZE), + DR_STE_SIZE); } } @@ -758,6 +767,7 @@ int dr_send_postsend_formated_htbl(struct mlx5dv_dr_domain *dmn, { uint32_t byte_size = htbl->chunk->byte_size; int i, num_stes, iterations, ret; + uint8_t *copy_dst; uint8_t *data; ret = dr_get_tbl_copy_details(dmn, htbl, &data, &byte_size, @@ -765,20 +775,22 @@ int dr_send_postsend_formated_htbl(struct mlx5dv_dr_domain *dmn, if (ret) return ret; - for (i = 0; i < num_stes; i++) { - uint8_t *copy_dst; - - /* Copy the same ste on the data buffer */ - copy_dst = data + i * DR_STE_SIZE; - memcpy(copy_dst, ste_init_data, DR_STE_SIZE); - - if (update_hw_ste) { - /* Copy the reduced ste to hash table ste_arr */ + if (update_hw_ste) { + /* Copy the reduced STE to hash table ste_arr */ + for (i = 0; i < num_stes; i++) { copy_dst = htbl->hw_ste_arr + i * DR_STE_SIZE_REDUCED; memcpy(copy_dst, ste_init_data, DR_STE_SIZE_REDUCED); } } + dr_ste_prepare_for_postsend(dmn->ste_ctx, ste_init_data, DR_STE_SIZE); + + /* Copy the same STE on the data buffer */ + for (i = 0; i < num_stes; i++) { + copy_dst = data + i * DR_STE_SIZE; + memcpy(copy_dst, ste_init_data, DR_STE_SIZE); + } + /* Send the data iteration times */ for (i = 0; i < iterations; i++) { uint32_t ste_index = i * (byte_size / DR_STE_SIZE); @@ -838,17 +850,22 @@ static int dr_prepare_qp_to_rts(struct mlx5dv_dr_domain *dmn) } /* RTR */ - ret = dr_devx_query_gid(dmn->ctx, port, gid_index, &rtr_attr.dgid_attr); - if (ret) { - dr_dbg(dmn, "can't read sgid of index %d\n", gid_index); - return ret; - } - rtr_attr.mtu = mtu; rtr_attr.qp_num = dr_qp->obj->object_id; rtr_attr.min_rnr_timer = 12; rtr_attr.port_num = port; - rtr_attr.sgid_index = gid_index; + + /* Enable force-loopback on the QP */ + if (dmn->info.caps.roce_caps.fl_rc_qp_when_roce_enabled) { + rtr_attr.fl = true; + } else { + ret = dr_devx_query_gid(dmn->ctx, port, gid_index, &rtr_attr.dgid_attr); + if (ret) { + dr_dbg(dmn, "can't read sgid of index %d\n", gid_index); + return ret; + } + 
rtr_attr.sgid_index = gid_index; + } ret = dr_devx_modify_qp_init2rtr(dmn->ctx, dr_qp->obj, &rtr_attr); if (ret) { diff --git a/providers/mlx5/dr_ste.c b/providers/mlx5/dr_ste.c index 2c401fd..c2f8af2 100644 --- a/providers/mlx5/dr_ste.c +++ b/providers/mlx5/dr_ste.c @@ -30,117 +30,7 @@ * SOFTWARE. */ -#include -#include -#include "mlx5dv_dr.h" - -#define IPV4_ETHERTYPE 0x0800 -#define IPV6_ETHERTYPE 0x86DD -#define STE_IPV4 0x1 -#define STE_IPV6 0x2 -#define STE_TCP 0x1 -#define STE_UDP 0x2 -#define STE_SPI 0x3 -#define IP_VERSION_IPV4 0x4 -#define IP_VERSION_IPV6 0x6 -#define IP_PROTOCOL_UDP 0x11 -#define IP_PROTOCOL_TCP 0x06 -#define IP_PROTOCOL_IPSEC 0x33 -#define TCP_PROTOCOL 0x6 -#define UDP_PROTOCOL 0x11 -#define IPSEC_PROTOCOL 0x33 - -#define DR_STE_ENABLE_FLOW_TAG (1 << 31) - -/* Read from layout struct */ -#define DR_STE_GET(typ, p, fld) DEVX_GET(ste_##typ, p, fld) - -/* Write to layout a value */ -#define DR_STE_SET(typ, p, fld, v) DEVX_SET(ste_##typ, p, fld, v) - -#define DR_STE_SET_BOOL(typ, p, fld, v) DEVX_SET(ste_##typ, p, fld, !!(v)) - -/* Set to STE a specific value using DR_STE_SET */ -#define DR_STE_SET_VAL(lookup_type, tag, t_fname, spec, s_fname, value) do { \ - if ((spec)->s_fname) { \ - DR_STE_SET(lookup_type, tag, t_fname, value); \ - (spec)->s_fname = 0; \ - } \ -} while (0) - -/* Set to STE spec->s_fname to tag->t_fname */ -#define DR_STE_SET_TAG(lookup_type, tag, t_fname, spec, s_fname) \ - DR_STE_SET_VAL(lookup_type, tag, t_fname, spec, s_fname, (spec)->s_fname); - -/* Set to STE -1 to bit_mask->bm_fname and set spec->s_fname as used */ -#define DR_STE_SET_MASK(lookup_type, bit_mask, bm_fname, spec, s_fname) \ - DR_STE_SET_VAL(lookup_type, bit_mask, bm_fname, spec, s_fname, -1); - -/* Set to STE spec->s_fname to bit_mask->bm_fname and set spec->s_fname as used */ -#define DR_STE_SET_MASK_V(lookup_type, bit_mask, bm_fname, spec, s_fname) \ - DR_STE_SET_VAL(lookup_type, bit_mask, bm_fname, spec, s_fname, (spec)->s_fname); - -#define DR_STE_SET_TCP_FLAGS(lookup_type, tag, spec) do { \ - DR_STE_SET_BOOL(lookup_type, tag, tcp_ns, (spec)->tcp_flags & (1 << 8)); \ - DR_STE_SET_BOOL(lookup_type, tag, tcp_cwr, (spec)->tcp_flags & (1 << 7)); \ - DR_STE_SET_BOOL(lookup_type, tag, tcp_ece, (spec)->tcp_flags & (1 << 6)); \ - DR_STE_SET_BOOL(lookup_type, tag, tcp_urg, (spec)->tcp_flags & (1 << 5)); \ - DR_STE_SET_BOOL(lookup_type, tag, tcp_ack, (spec)->tcp_flags & (1 << 4)); \ - DR_STE_SET_BOOL(lookup_type, tag, tcp_psh, (spec)->tcp_flags & (1 << 3)); \ - DR_STE_SET_BOOL(lookup_type, tag, tcp_rst, (spec)->tcp_flags & (1 << 2)); \ - DR_STE_SET_BOOL(lookup_type, tag, tcp_syn, (spec)->tcp_flags & (1 << 1)); \ - DR_STE_SET_BOOL(lookup_type, tag, tcp_fin, (spec)->tcp_flags & (1 << 0)); \ -} while (0) - -#define DR_STE_SET_MPLS_MASK(lookup_type, mask, in_out, bit_mask) do { \ - DR_STE_SET_MASK_V(lookup_type, mask, mpls0_label, mask, \ - in_out##_first_mpls_label);\ - DR_STE_SET_MASK_V(lookup_type, mask, mpls0_s_bos, mask, \ - in_out##_first_mpls_s_bos); \ - DR_STE_SET_MASK_V(lookup_type, mask, mpls0_exp, mask, \ - in_out##_first_mpls_exp); \ - DR_STE_SET_MASK_V(lookup_type, mask, mpls0_ttl, mask, \ - in_out##_first_mpls_ttl); \ -} while (0) - -#define DR_STE_SET_MPLS_TAG(lookup_type, mask, in_out, tag) do { \ - DR_STE_SET_TAG(lookup_type, tag, mpls0_label, mask, \ - in_out##_first_mpls_label);\ - DR_STE_SET_TAG(lookup_type, tag, mpls0_s_bos, mask, \ - in_out##_first_mpls_s_bos); \ - DR_STE_SET_TAG(lookup_type, tag, mpls0_exp, mask, \ - in_out##_first_mpls_exp); \ - 
DR_STE_SET_TAG(lookup_type, tag, mpls0_ttl, mask, \ - in_out##_first_mpls_ttl); \ -} while (0) - -#define DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(_misc) (\ - (_misc)->outer_first_mpls_over_gre_label || \ - (_misc)->outer_first_mpls_over_gre_exp || \ - (_misc)->outer_first_mpls_over_gre_s_bos || \ - (_misc)->outer_first_mpls_over_gre_ttl) -#define DR_STE_IS_OUTER_MPLS_OVER_UDP_SET(_misc) (\ - (_misc)->outer_first_mpls_over_udp_label || \ - (_misc)->outer_first_mpls_over_udp_exp || \ - (_misc)->outer_first_mpls_over_udp_s_bos || \ - (_misc)->outer_first_mpls_over_udp_ttl) - -#define DR_STE_CALC_LU_TYPE(lookup_type, rx, inner) \ - ((inner) ? DR_STE_LU_TYPE_##lookup_type##_I : \ - (rx) ? DR_STE_LU_TYPE_##lookup_type##_D : \ - DR_STE_LU_TYPE_##lookup_type##_O) - -enum dr_ste_tunl_action { - DR_STE_TUNL_ACTION_NONE = 0, - DR_STE_TUNL_ACTION_ENABLE = 1, - DR_STE_TUNL_ACTION_DECAP = 2, - DR_STE_TUNL_ACTION_L3_DECAP = 3, -}; - -enum dr_ste_action_type { - DR_STE_ACTION_TYPE_ENCAP_L3 = 3, - DR_STE_ACTION_TYPE_ENCAP = 4, -}; +#include "dr_ste.h" struct dr_hw_ste_format { uint8_t ctrl[DR_STE_SIZE_CTRL]; @@ -176,7 +66,7 @@ uint32_t dr_ste_calc_hash_index(uint8_t *hw_ste_p, return index; } -static uint16_t dr_ste_conv_bit_to_byte_mask(uint8_t *bit_mask) +uint16_t dr_ste_conv_bit_to_byte_mask(uint8_t *bit_mask) { uint16_t byte_mask = 0; int i; @@ -189,80 +79,18 @@ static uint16_t dr_ste_conv_bit_to_byte_mask(uint8_t *bit_mask) return byte_mask; } -void dr_ste_set_bit_mask(uint8_t *hw_ste_p, uint8_t *bit_mask) +static uint8_t *dr_ste_get_tag(uint8_t *hw_ste_p) { struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - memcpy(hw_ste->mask, bit_mask, DR_STE_SIZE_MASK); -} - -void dr_ste_rx_set_flow_tag(uint8_t *hw_ste_p, uint32_t flow_tag) -{ - DR_STE_SET(rx_steering_mult, hw_ste_p, qp_list_pointer, - DR_STE_ENABLE_FLOW_TAG | flow_tag); -} - -void dr_ste_set_counter_id(uint8_t *hw_ste_p, uint32_t ctr_id) -{ - /* This can be used for both rx_steering_mult and for sx_transmit */ - DR_STE_SET(rx_steering_mult, hw_ste_p, counter_trigger_15_0, ctr_id); - DR_STE_SET(rx_steering_mult, hw_ste_p, counter_trigger_23_16, ctr_id >> 16); -} - -void dr_ste_set_tx_encap(void *hw_ste_p, uint32_t reformat_id, int size, bool encap_l3) -{ - DR_STE_SET(sx_transmit, hw_ste_p, action_type, - encap_l3 ? DR_STE_ACTION_TYPE_ENCAP_L3 : DR_STE_ACTION_TYPE_ENCAP); - /* The hardware expects here size in words (2 byte) */ - DR_STE_SET(sx_transmit, hw_ste_p, action_description, size / 2); - DR_STE_SET(sx_transmit, hw_ste_p, encap_pointer_vlan_data, reformat_id); + return hw_ste->tag; } -void dr_ste_set_rx_decap(uint8_t *hw_ste_p) -{ - DR_STE_SET(rx_steering_mult, hw_ste_p, tunneling_action, - DR_STE_TUNL_ACTION_DECAP); -} - -void dr_ste_set_rx_decap_l3(uint8_t *hw_ste_p, bool vlan) -{ - DR_STE_SET(rx_steering_mult, hw_ste_p, tunneling_action, - DR_STE_TUNL_ACTION_L3_DECAP); - DR_STE_SET(modify_packet, hw_ste_p, action_description, vlan ? 
1 : 0); -} - -void dr_ste_set_entry_type(uint8_t *hw_ste_p, uint8_t entry_type) -{ - DR_STE_SET(general, hw_ste_p, entry_type, entry_type); -} - -uint8_t dr_ste_get_entry_type(uint8_t *hw_ste_p) -{ - return DR_STE_GET(general, hw_ste_p, entry_type); -} - -void dr_ste_set_rewrite_actions(uint8_t *hw_ste_p, uint16_t num_of_actions, - uint32_t re_write_index) -{ - DR_STE_SET(modify_packet, hw_ste_p, number_of_re_write_actions, - num_of_actions); - DR_STE_SET(modify_packet, hw_ste_p, header_re_write_actions_pointer, - re_write_index); -} - -void dr_ste_init(uint8_t *hw_ste_p, uint8_t lu_type, uint8_t entry_type, - uint16_t gvmi) +void dr_ste_set_bit_mask(uint8_t *hw_ste_p, uint8_t *bit_mask) { - DR_STE_SET(general, hw_ste_p, entry_type, entry_type); - DR_STE_SET(general, hw_ste_p, entry_sub_type, lu_type); - DR_STE_SET(general, hw_ste_p, next_lu_type, DR_STE_LU_TYPE_DONT_CARE); + struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - /* Set GVMI once, this is the same for RX/TX - * bits 63_48 of next table base / miss address encode the next GVMI - */ - DR_STE_SET(rx_steering_mult, hw_ste_p, gvmi, gvmi); - DR_STE_SET(rx_steering_mult, hw_ste_p, next_table_base_63_48, gvmi); - DR_STE_SET(rx_steering_mult, hw_ste_p, miss_address_63_48, gvmi); + memcpy(hw_ste->mask, bit_mask, DR_STE_SIZE_MASK); } static void dr_ste_set_always_hit(struct dr_hw_ste_format *hw_ste) @@ -277,21 +105,27 @@ static void dr_ste_set_always_miss(struct dr_hw_ste_format *hw_ste) hw_ste->mask[0] = 0; } -uint64_t dr_ste_get_miss_addr(uint8_t *hw_ste) +void dr_ste_set_miss_addr(struct dr_ste_ctx *ste_ctx, uint8_t *hw_ste_p, + uint64_t miss_addr) { - uint64_t index = - (DR_STE_GET(rx_steering_mult, hw_ste, miss_address_31_6) | - DR_STE_GET(rx_steering_mult, hw_ste, miss_address_39_32) << 26); - - return index << 6; + ste_ctx->set_miss_addr(hw_ste_p, miss_addr); } -void dr_ste_set_hit_addr(uint8_t *hw_ste, uint64_t icm_addr, uint32_t ht_size) +static void dr_ste_always_miss_addr(struct dr_ste_ctx *ste_ctx, + struct dr_ste *ste, + uint64_t miss_addr) { - uint64_t index = (icm_addr >> 5) | ht_size; + uint8_t *hw_ste_p = ste->hw_ste; + + ste_ctx->set_next_lu_type(hw_ste_p, DR_STE_LU_TYPE_DONT_CARE); + ste_ctx->set_miss_addr(hw_ste_p, miss_addr); + dr_ste_set_always_miss((struct dr_hw_ste_format *)ste->hw_ste); +} - DR_STE_SET(general, hw_ste, next_table_base_39_32_size, index >> 27); - DR_STE_SET(general, hw_ste, next_table_base_31_5_size, index); +void dr_ste_set_hit_addr(struct dr_ste_ctx *ste_ctx, uint8_t *hw_ste_p, + uint64_t icm_addr, uint32_t ht_size) +{ + ste_ctx->set_hit_addr(hw_ste_p, icm_addr, ht_size); } uint64_t dr_ste_get_icm_addr(struct dr_ste *ste) @@ -315,14 +149,16 @@ struct list_head *dr_ste_get_miss_list(struct dr_ste *ste) return &ste->htbl->miss_list[index]; } -void dr_ste_always_hit_htbl(struct dr_ste *ste, struct dr_ste_htbl *next_htbl) +static void dr_ste_always_hit_htbl(struct dr_ste_ctx *ste_ctx, + struct dr_ste *ste, + struct dr_ste_htbl *next_htbl) { struct dr_icm_chunk *chunk = next_htbl->chunk; uint8_t *hw_ste = ste->hw_ste; - DR_STE_SET(general, hw_ste, byte_mask, next_htbl->byte_mask); - DR_STE_SET(general, hw_ste, next_lu_type, next_htbl->lu_type); - dr_ste_set_hit_addr(hw_ste, chunk->icm_addr, chunk->num_of_entries); + ste_ctx->set_byte_mask(hw_ste, next_htbl->byte_mask); + ste_ctx->set_next_lu_type(hw_ste, next_htbl->lu_type); + ste_ctx->set_hit_addr(hw_ste, chunk->icm_addr, chunk->num_of_entries); dr_ste_set_always_hit((struct dr_hw_ste_format *)ste->hw_ste); } @@ -361,7 +197,8 
@@ static void dr_ste_replace(struct dr_ste *dst, struct dr_ste *src) /* Free ste which is the head and the only one in miss_list */ static void -dr_ste_remove_head_ste(struct dr_ste *ste, +dr_ste_remove_head_ste(struct dr_ste_ctx *ste_ctx, + struct dr_ste *ste, struct dr_matcher_rx_tx *nic_matcher, struct dr_ste_send_info *ste_info_head, struct list_head *send_ste_list, @@ -378,7 +215,7 @@ dr_ste_remove_head_ste(struct dr_ste *ste, */ memcpy(tmp_ste.hw_ste, ste->hw_ste, DR_STE_SIZE_REDUCED); miss_addr = nic_matcher->e_anchor->chunk->icm_addr; - dr_ste_always_miss_addr(&tmp_ste, miss_addr); + dr_ste_always_miss_addr(ste_ctx, &tmp_ste, miss_addr); memcpy(ste->hw_ste, tmp_ste.hw_ste, DR_STE_SIZE_REDUCED); list_del_init(&ste->miss_list_node); @@ -398,13 +235,16 @@ dr_ste_remove_head_ste(struct dr_ste *ste, * |_ste_| --> |_next_ste_| -->|__| -->|__| -->/0 */ static void -dr_ste_replace_head_ste(struct dr_ste *ste, struct dr_ste *next_ste, +dr_ste_replace_head_ste(struct dr_matcher_rx_tx *nic_matcher, + struct dr_ste *ste, struct dr_ste *next_ste, struct dr_ste_send_info *ste_info_head, struct list_head *send_ste_list, struct dr_ste_htbl *stats_tbl) { struct dr_ste_htbl *next_miss_htbl; + uint8_t hw_ste[DR_STE_SIZE] = {}; + int sb_idx; next_miss_htbl = next_ste->htbl; @@ -417,14 +257,19 @@ dr_ste_replace_head_ste(struct dr_ste *ste, struct dr_ste *next_ste, /* Move data from next into ste */ dr_ste_replace(ste, next_ste); + /* Copy all 64 hw_ste bytes */ + memcpy(hw_ste, ste->hw_ste, DR_STE_SIZE_REDUCED); + sb_idx = ste->ste_chain_location - 1; + dr_ste_set_bit_mask(hw_ste, nic_matcher->ste_builder[sb_idx].bit_mask); + /* * Del the htbl that contains the next_ste. * The origin htbl stay with the same number of entries. */ dr_htbl_put(next_miss_htbl); - dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE_REDUCED, - 0, ste->hw_ste, + dr_send_fill_and_append_ste_send_info(ste, DR_STE_SIZE, + 0, hw_ste, ste_info_head, send_ste_list, true /* Copy data */); @@ -437,7 +282,8 @@ dr_ste_replace_head_ste(struct dr_ste *ste, struct dr_ste *next_ste, * Free ste that is located in the middle of the miss list: * |__| -->|_prev_ste_|->|_ste_|-->|_next_ste_| */ -static void dr_ste_remove_middle_ste(struct dr_ste *ste, +static void dr_ste_remove_middle_ste(struct dr_ste_ctx *ste_ctx, + struct dr_ste *ste, struct dr_ste_send_info *ste_info, struct list_head *send_ste_list, struct dr_ste_htbl *stats_tbl) @@ -448,10 +294,10 @@ static void dr_ste_remove_middle_ste(struct dr_ste *ste, prev_ste = list_prev(dr_ste_get_miss_list(ste), ste, miss_list_node); assert(prev_ste); - miss_addr = dr_ste_get_miss_addr(ste->hw_ste); - dr_ste_set_miss_addr(prev_ste->hw_ste, miss_addr); + miss_addr = ste_ctx->get_miss_addr(ste->hw_ste); + ste_ctx->set_miss_addr(prev_ste->hw_ste, miss_addr); - dr_send_fill_and_append_ste_send_info(prev_ste, DR_STE_SIZE_REDUCED, 0, + dr_send_fill_and_append_ste_send_info(prev_ste, DR_STE_SIZE_CTRL, 0, prev_ste->hw_ste, ste_info, send_ste_list, true /* Copy data*/); @@ -467,11 +313,12 @@ void dr_ste_free(struct dr_ste *ste, { struct dr_ste_send_info *cur_ste_info, *tmp_ste_info; struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_ste_ctx *ste_ctx = dmn->ste_ctx; struct dr_ste_send_info ste_info_head; struct dr_ste *next_ste, *first_ste; - LIST_HEAD(send_ste_list); bool put_on_origin_table = true; struct dr_ste_htbl *stats_tbl; + LIST_HEAD(send_ste_list); first_ste = list_top(dr_ste_get_miss_list(ste), struct dr_ste, miss_list_node); stats_tbl = first_ste->htbl; @@ -486,18 +333,21 @@ void 
dr_ste_free(struct dr_ste *ste, next_ste = list_next(dr_ste_get_miss_list(ste), ste, miss_list_node); if (!next_ste) { /* One and only entry in the list */ - dr_ste_remove_head_ste(ste, nic_matcher, + dr_ste_remove_head_ste(ste_ctx, ste, + nic_matcher, &ste_info_head, &send_ste_list, stats_tbl); } else { /* First but not only entry in the list */ - dr_ste_replace_head_ste(ste, next_ste, &ste_info_head, - &send_ste_list, stats_tbl); + dr_ste_replace_head_ste(nic_matcher, ste, next_ste, + &ste_info_head, &send_ste_list, + stats_tbl); put_on_origin_table = false; } } else { /* Ste in the middle of the list */ - dr_ste_remove_middle_ste(ste, &ste_info_head, &send_ste_list, stats_tbl); + dr_ste_remove_middle_ste(ste_ctx, ste, &ste_info_head, + &send_ste_list, stats_tbl); } /* Update HW */ @@ -520,54 +370,25 @@ bool dr_ste_equal_tag(void *src, void *dst) return !memcmp(s_hw_ste->tag, d_hw_ste->tag, DR_STE_SIZE_TAG); } -void dr_ste_set_hit_addr_by_next_htbl(uint8_t *hw_ste, +void dr_ste_set_hit_addr_by_next_htbl(struct dr_ste_ctx *ste_ctx, + uint8_t *hw_ste, struct dr_ste_htbl *next_htbl) { struct dr_icm_chunk *chunk = next_htbl->chunk; - dr_ste_set_hit_addr(hw_ste, chunk->icm_addr, chunk->num_of_entries); -} - -void dr_ste_set_miss_addr(uint8_t *hw_ste_p, uint64_t miss_addr) -{ - uint64_t index = miss_addr >> 6; - - /* Miss address for TX and RX STEs located in the same offsets */ - DR_STE_SET(rx_steering_mult, hw_ste_p, miss_address_39_32, index >> 26); - DR_STE_SET(rx_steering_mult, hw_ste_p, miss_address_31_6, index); + ste_ctx->set_hit_addr(hw_ste, chunk->icm_addr, chunk->num_of_entries); } -void dr_ste_always_miss_addr(struct dr_ste *ste, uint64_t miss_addr) +void dr_ste_prepare_for_postsend(struct dr_ste_ctx *ste_ctx, + uint8_t *hw_ste_p, uint32_t ste_size) { - uint8_t *hw_ste = ste->hw_ste; - - DR_STE_SET(rx_steering_mult, hw_ste, next_lu_type, DR_STE_LU_TYPE_DONT_CARE); - dr_ste_set_miss_addr(hw_ste, miss_addr); - dr_ste_set_always_miss((struct dr_hw_ste_format *)ste->hw_ste); -} - -/* - * The assumption here is that we don't update the ste->hw_ste if it is not - * used ste, so it will be all zero, checking the next_lu_type. 
- */ -bool dr_ste_is_not_valid_entry(uint8_t *p_hw_ste) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)p_hw_ste; - - if (DR_STE_GET(general, hw_ste, next_lu_type) == - DR_STE_LU_TYPE_NOP) - return true; - - return false; -} - -bool dr_ste_not_used_ste(struct dr_ste *ste) -{ - return !atomic_load(&ste->refcount); + if (ste_ctx->prepare_for_postsend) + ste_ctx->prepare_for_postsend(hw_ste_p, ste_size); } /* Init one ste as a pattern for ste data array */ -void dr_ste_set_formated_ste(uint16_t gvmi, +void dr_ste_set_formated_ste(struct dr_ste_ctx *ste_ctx, + uint16_t gvmi, struct dr_domain_rx_tx *nic_dmn, struct dr_ste_htbl *htbl, uint8_t *formated_ste, @@ -575,13 +396,13 @@ void dr_ste_set_formated_ste(uint16_t gvmi, { struct dr_ste ste = {}; - dr_ste_init(formated_ste, htbl->lu_type, nic_dmn->ste_type, gvmi); + ste_ctx->ste_init(formated_ste, htbl->lu_type, nic_dmn->ste_type, gvmi); ste.hw_ste = formated_ste; if (connect_info->type == CONNECT_HIT) - dr_ste_always_hit_htbl(&ste, connect_info->hit_next_htbl); + dr_ste_always_hit_htbl(ste_ctx, &ste, connect_info->hit_next_htbl); else - dr_ste_always_miss_addr(&ste, connect_info->miss_icm_addr); + dr_ste_always_miss_addr(ste_ctx, &ste, connect_info->miss_icm_addr); } int dr_ste_htbl_init_and_postsend(struct mlx5dv_dr_domain *dmn, @@ -592,7 +413,8 @@ int dr_ste_htbl_init_and_postsend(struct mlx5dv_dr_domain *dmn, { uint8_t formated_ste[DR_STE_SIZE] = {}; - dr_ste_set_formated_ste(dmn->info.caps.gvmi, + dr_ste_set_formated_ste(dmn->ste_ctx, + dmn->info.caps.gvmi, nic_dmn, htbl, formated_ste, @@ -607,18 +429,18 @@ int dr_ste_create_next_htbl(struct mlx5dv_dr_matcher *matcher, uint8_t *cur_hw_ste, enum dr_icm_chunk_size log_table_size) { - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)cur_hw_ste; struct dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_ste_ctx *ste_ctx = dmn->ste_ctx; struct dr_htbl_connect_info info; struct dr_ste_htbl *next_htbl; if (!dr_ste_is_last_in_rule(nic_matcher, ste->ste_chain_location)) { - uint8_t next_lu_type; + uint16_t next_lu_type; uint16_t byte_mask; - next_lu_type = DR_STE_GET(general, hw_ste, next_lu_type); - byte_mask = DR_STE_GET(general, hw_ste, byte_mask); + next_lu_type = ste_ctx->get_next_lu_type(cur_hw_ste); + byte_mask = ste_ctx->get_byte_mask(cur_hw_ste); next_htbl = dr_ste_htbl_alloc(dmn->ste_icm_pool, log_table_size, @@ -638,7 +460,7 @@ int dr_ste_create_next_htbl(struct mlx5dv_dr_matcher *matcher, goto free_table; } - dr_ste_set_hit_addr_by_next_htbl(cur_hw_ste, next_htbl); + dr_ste_set_hit_addr_by_next_htbl(ste_ctx, cur_hw_ste, next_htbl); ste->next_htbl = next_htbl; next_htbl->pointing_ste = ste; } @@ -667,7 +489,7 @@ static void dr_ste_set_ctrl(struct dr_ste_htbl *htbl) struct dr_ste_htbl *dr_ste_htbl_alloc(struct dr_icm_pool *pool, enum dr_icm_chunk_size chunk_size, - uint8_t lu_type, uint16_t byte_mask) + uint16_t lu_type, uint16_t byte_mask) { struct dr_icm_chunk *chunk; struct dr_ste_htbl *htbl; @@ -721,6 +543,94 @@ int dr_ste_htbl_free(struct dr_ste_htbl *htbl) return 0; } +void dr_ste_set_actions_tx(struct dr_ste_ctx *ste_ctx, + uint8_t *action_type_set, + uint8_t *hw_ste_arr, + struct dr_ste_actions_attr *attr, + uint32_t *added_stes) +{ + ste_ctx->set_actions_tx(action_type_set, hw_ste_arr, attr, added_stes); +} + +void dr_ste_set_actions_rx(struct dr_ste_ctx *ste_ctx, + uint8_t *action_type_set, + uint8_t *hw_ste_arr, + struct dr_ste_actions_attr *attr, + uint32_t *added_stes) +{ + 
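+	/* As with dr_ste_set_actions_tx() above, the wrapper only dispatches
+	 * through the per-generation ops table; dr_ste.c itself no longer
+	 * encodes any ConnectX-5 vs ConnectX-6 DX STE layout details.
+	 */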
ste_ctx->set_actions_rx(action_type_set, hw_ste_arr, attr, added_stes); +} + +const struct dr_ste_action_modify_field * +dr_ste_conv_modify_hdr_sw_field(struct dr_ste_ctx *ste_ctx, uint16_t sw_field) +{ + const struct dr_ste_action_modify_field *hw_field; + + if (sw_field >= ste_ctx->modify_field_arr_sz) + goto not_found; + + hw_field = &ste_ctx->modify_field_arr[sw_field]; + if (!hw_field->end && !hw_field->start) + goto not_found; + + return hw_field; + +not_found: + errno = EINVAL; + return NULL; +} + +void dr_ste_set_action_set(struct dr_ste_ctx *ste_ctx, + __be64 *hw_action, + uint8_t hw_field, + uint8_t shifter, + uint8_t length, + uint32_t data) +{ + ste_ctx->set_action_set((uint8_t *)hw_action, + hw_field, shifter, length, data); +} + +void dr_ste_set_action_add(struct dr_ste_ctx *ste_ctx, + __be64 *hw_action, + uint8_t hw_field, + uint8_t shifter, + uint8_t length, + uint32_t data) +{ + ste_ctx->set_action_add((uint8_t *)hw_action, + hw_field, shifter, length, data); +} + +void dr_ste_set_action_copy(struct dr_ste_ctx *ste_ctx, + __be64 *hw_action, + uint8_t dst_hw_field, + uint8_t dst_shifter, + uint8_t dst_len, + uint8_t src_hw_field, + uint8_t src_shifter) +{ + ste_ctx->set_action_copy((uint8_t *)hw_action, + dst_hw_field, dst_shifter, dst_len, + src_hw_field, src_shifter); +} + +int dr_ste_set_action_decap_l3_list(struct dr_ste_ctx *ste_ctx, + void *data, uint32_t data_sz, + uint8_t *hw_action, uint32_t hw_action_sz, + uint16_t *used_hw_action_num) +{ + /* Only Ethernet frame is supported, with VLAN (18) or without (14) */ + if (data_sz != HDR_LEN_L2 && data_sz != HDR_LEN_L2_W_VLAN) { + errno = EINVAL; + return errno; + } + + return ste_ctx->set_action_decap_l3_list(data, data_sz, + hw_action, hw_action_sz, + used_hw_action_num); +} + static int dr_ste_build_pre_check_spec(struct mlx5dv_dr_domain *dmn, struct dr_match_spec *m_spec, struct dr_match_spec *v_spec) @@ -782,6 +692,7 @@ int dr_ste_build_ste_arr(struct mlx5dv_dr_matcher *matcher, { struct dr_domain_rx_tx *nic_dmn = nic_matcher->nic_tbl->nic_dmn; struct mlx5dv_dr_domain *dmn = matcher->tbl->dmn; + struct dr_ste_ctx *ste_ctx = dmn->ste_ctx; struct dr_ste_build *sb; int ret, i; @@ -792,14 +703,14 @@ int dr_ste_build_ste_arr(struct mlx5dv_dr_matcher *matcher, sb = nic_matcher->ste_builder; for (i = 0; i < nic_matcher->num_of_builders; i++) { - dr_ste_init(ste_arr, - sb->lu_type, - nic_dmn->ste_type, - dmn->info.caps.gvmi); + ste_ctx->ste_init(ste_arr, + sb->lu_type, + nic_dmn->ste_type, + dmn->info.caps.gvmi); dr_ste_set_bit_mask(ste_arr, sb->bit_mask); - ret = sb->ste_build_tag_func(value, sb, ste_arr); + ret = sb->ste_build_tag_func(value, sb, dr_ste_get_tag(ste_arr)); if (ret) return ret; @@ -809,52 +720,14 @@ int dr_ste_build_ste_arr(struct mlx5dv_dr_matcher *matcher, * not relevant for the last ste in the chain. */ sb++; - DR_STE_SET(general, ste_arr, next_lu_type, sb->lu_type); - DR_STE_SET(general, ste_arr, byte_mask, sb->byte_mask); + ste_ctx->set_next_lu_type(ste_arr, sb->lu_type); + ste_ctx->set_byte_mask(ste_arr, sb->byte_mask); } ste_arr += DR_STE_SIZE; } return 0; } -static int dr_ste_build_eth_l2_src_des_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; - - DR_STE_SET_MASK_V(eth_l2_src_dst, bit_mask, dmac_47_16, mask, dmac_47_16); - DR_STE_SET_MASK_V(eth_l2_src_dst, bit_mask, dmac_15_0, mask, dmac_15_0); - - if (mask->smac_47_16 || mask->smac_15_0) { - DR_STE_SET(eth_l2_src_dst, bit_mask, smac_47_32, - mask->smac_47_16 >> 16); - DR_STE_SET(eth_l2_src_dst, bit_mask, smac_31_0, - mask->smac_47_16 << 16 | mask->smac_15_0); - mask->smac_47_16 = 0; - mask->smac_15_0 = 0; - } - - DR_STE_SET_MASK_V(eth_l2_src_dst, bit_mask, first_vlan_id, mask, first_vid); - DR_STE_SET_MASK_V(eth_l2_src_dst, bit_mask, first_cfi, mask, first_cfi); - DR_STE_SET_MASK_V(eth_l2_src_dst, bit_mask, first_priority, mask, first_prio); - DR_STE_SET_MASK(eth_l2_src_dst, bit_mask, l3_type, mask, ip_version); - - if (mask->cvlan_tag) { - DR_STE_SET(eth_l2_src_dst, bit_mask, first_vlan_qualifier, -1); - mask->cvlan_tag = 0; - } else if (mask->svlan_tag) { - DR_STE_SET(eth_l2_src_dst, bit_mask, first_vlan_qualifier, -1); - mask->svlan_tag = 0; - } - - if (mask->cvlan_tag || mask->svlan_tag) { - errno = EINVAL; - return errno; - } - - return 0; -} - static void dr_ste_copy_mask_misc(char *mask, struct dr_match_misc *spec) { spec->gre_c_present = DEVX_GET(dr_match_set_misc, mask, gre_c_present); @@ -1096,598 +969,99 @@ void dr_ste_copy_param(uint8_t match_criteria, } } -static int dr_ste_build_eth_l2_src_des_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(eth_l2_src_dst, tag, dmac_47_16, spec, dmac_47_16); - DR_STE_SET_TAG(eth_l2_src_dst, tag, dmac_15_0, spec, dmac_15_0); - - if (spec->smac_47_16 || spec->smac_15_0) { - DR_STE_SET(eth_l2_src_dst, tag, smac_47_32, - spec->smac_47_16 >> 16); - DR_STE_SET(eth_l2_src_dst, tag, smac_31_0, - spec->smac_47_16 << 16 | spec->smac_15_0); - spec->smac_47_16 = 0; - spec->smac_15_0 = 0; - } - - if (spec->ip_version) { - if (spec->ip_version == IP_VERSION_IPV4) { - DR_STE_SET(eth_l2_src_dst, tag, l3_type, STE_IPV4); - spec->ip_version = 0; - } else if (spec->ip_version == IP_VERSION_IPV6) { - DR_STE_SET(eth_l2_src_dst, tag, l3_type, STE_IPV6); - spec->ip_version = 0; - } else { - errno = EINVAL; - return errno; - } - } - - DR_STE_SET_TAG(eth_l2_src_dst, tag, first_vlan_id, spec, first_vid); - DR_STE_SET_TAG(eth_l2_src_dst, tag, first_cfi, spec, first_cfi); - DR_STE_SET_TAG(eth_l2_src_dst, tag, first_priority, spec, first_prio); - - if (spec->cvlan_tag) { - DR_STE_SET(eth_l2_src_dst, tag, first_vlan_qualifier, DR_STE_CVLAN); - spec->cvlan_tag = 0; - } else if (spec->svlan_tag) { - DR_STE_SET(eth_l2_src_dst, tag, first_vlan_qualifier, DR_STE_SVLAN); - spec->svlan_tag = 0; - } - return 0; -} - -int dr_ste_build_eth_l2_src_des(struct dr_ste_build *sb, - struct dr_match_param *mask, - bool inner, bool rx) +void dr_ste_build_eth_l2_src_dst(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) { - int ret; - - ret = dr_ste_build_eth_l2_src_des_bit_mask(mask, inner, sb->bit_mask); - if (ret) - return ret; - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_SRC_DST, rx, inner); - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_eth_l2_src_des_tag; - - return 0; -} - -static void dr_ste_build_eth_l3_ipv6_dst_bit_mask(struct dr_match_param *value, - bool inner, 
uint8_t *bit_mask) -{ - struct dr_match_spec *mask = inner ? &value->inner : &value->outer; - - DR_STE_SET_MASK_V(eth_l3_ipv6_dst, bit_mask, dst_ip_127_96, mask, dst_ip_127_96); - DR_STE_SET_MASK_V(eth_l3_ipv6_dst, bit_mask, dst_ip_95_64, mask, dst_ip_95_64); - DR_STE_SET_MASK_V(eth_l3_ipv6_dst, bit_mask, dst_ip_63_32, mask, dst_ip_63_32); - DR_STE_SET_MASK_V(eth_l3_ipv6_dst, bit_mask, dst_ip_31_0, mask, dst_ip_31_0); + ste_ctx->build_eth_l2_src_dst_init(sb, mask); } -static int dr_ste_build_eth_l3_ipv6_dst_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_127_96, spec, dst_ip_127_96); - DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_95_64, spec, dst_ip_95_64); - DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_63_32, spec, dst_ip_63_32); - DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_31_0, spec, dst_ip_31_0); - - return 0; -} - -void dr_ste_build_eth_l3_ipv6_dst(struct dr_ste_build *sb, +void dr_ste_build_eth_l3_ipv6_dst(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_eth_l3_ipv6_dst_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV6_DST, rx, inner); - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_eth_l3_ipv6_dst_tag; -} - -static void dr_ste_build_eth_l3_ipv6_src_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_spec *mask = inner ? &value->inner : &value->outer; - - DR_STE_SET_MASK_V(eth_l3_ipv6_src, bit_mask, src_ip_127_96, mask, src_ip_127_96); - DR_STE_SET_MASK_V(eth_l3_ipv6_src, bit_mask, src_ip_95_64, mask, src_ip_95_64); - DR_STE_SET_MASK_V(eth_l3_ipv6_src, bit_mask, src_ip_63_32, mask, src_ip_63_32); - DR_STE_SET_MASK_V(eth_l3_ipv6_src, bit_mask, src_ip_31_0, mask, src_ip_31_0); + ste_ctx->build_eth_l3_ipv6_dst_init(sb, mask); } -static int dr_ste_build_eth_l3_ipv6_src_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_127_96, spec, src_ip_127_96); - DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_95_64, spec, src_ip_95_64); - DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_63_32, spec, src_ip_63_32); - DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_31_0, spec, src_ip_31_0); - - return 0; -} - -void dr_ste_build_eth_l3_ipv6_src(struct dr_ste_build *sb, +void dr_ste_build_eth_l3_ipv6_src(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_eth_l3_ipv6_src_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV6_SRC, rx, inner); - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_eth_l3_ipv6_src_tag; -} - -static void dr_ste_build_eth_l3_ipv4_5_tuple_bit_mask(struct dr_match_param *value, - bool inner, - uint8_t *bit_mask) -{ - struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; - - DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, destination_address, mask, dst_ip_31_0); - DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, source_address, mask, src_ip_31_0); - DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, destination_port, mask, tcp_dport); - DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, destination_port, mask, udp_dport); - DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, source_port, mask, tcp_sport); - DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, source_port, mask, udp_sport); - DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, protocol, mask, ip_protocol); - DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, fragmented, mask, frag); - DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, dscp, mask, ip_dscp); - DR_STE_SET_MASK_V(eth_l3_ipv4_5_tuple, bit_mask, ecn, mask, ip_ecn); - - if (mask->tcp_flags) { - DR_STE_SET_TCP_FLAGS(eth_l3_ipv4_5_tuple, bit_mask, mask); - mask->tcp_flags = 0; - } + ste_ctx->build_eth_l3_ipv6_src_init(sb, mask); } -static int dr_ste_build_eth_l3_ipv4_5_tuple_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_address, spec, dst_ip_31_0); - DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_address, spec, src_ip_31_0); - DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_port, spec, tcp_dport); - DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_port, spec, udp_dport); - DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_port, spec, tcp_sport); - DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_port, spec, udp_sport); - DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, protocol, spec, ip_protocol); - DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, fragmented, spec, frag); - DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, dscp, spec, ip_dscp); - DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, ecn, spec, ip_ecn); - - if (spec->tcp_flags) { - DR_STE_SET_TCP_FLAGS(eth_l3_ipv4_5_tuple, tag, spec); - spec->tcp_flags = 0; - } - - return 0; -} - -void dr_ste_build_eth_l3_ipv4_5_tuple(struct dr_ste_build *sb, +void dr_ste_build_eth_l3_ipv4_5_tuple(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_eth_l3_ipv4_5_tuple_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV4_5_TUPLE, rx, inner); - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_eth_l3_ipv4_5_tuple_tag; -} - -static void -dr_ste_build_eth_l2_src_or_dst_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; - struct dr_match_misc *misc_mask = &value->misc; - - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, first_vlan_id, mask, first_vid); - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, first_cfi, mask, first_cfi); - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, first_priority, mask, first_prio); - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, ip_fragmented, mask, frag); - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, l3_ethertype, mask, ethertype); - DR_STE_SET_MASK(eth_l2_src, bit_mask, l3_type, mask, ip_version); - - if (mask->svlan_tag || mask->cvlan_tag) { - DR_STE_SET(eth_l2_src, bit_mask, first_vlan_qualifier, -1); - mask->cvlan_tag = 0; - mask->svlan_tag = 0; - } - - if (inner) { - if (misc_mask->inner_second_cvlan_tag || - misc_mask->inner_second_svlan_tag) { - DR_STE_SET(eth_l2_src, bit_mask, second_vlan_qualifier, -1); - misc_mask->inner_second_cvlan_tag = 0; - misc_mask->inner_second_svlan_tag = 0; - } - - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_vlan_id, misc_mask, inner_second_vid); - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_cfi, misc_mask, inner_second_cfi); - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_priority, misc_mask, inner_second_prio); - } else { - if (misc_mask->outer_second_cvlan_tag || - misc_mask->outer_second_svlan_tag) { - DR_STE_SET(eth_l2_src, bit_mask, second_vlan_qualifier, -1); - misc_mask->outer_second_cvlan_tag = 0; - misc_mask->outer_second_svlan_tag = 0; - } - - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_vlan_id, misc_mask, outer_second_vid); - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_cfi, misc_mask, outer_second_cfi); - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, second_priority, misc_mask, outer_second_prio); - } + ste_ctx->build_eth_l3_ipv4_5_tuple_init(sb, mask); } -static int dr_ste_build_eth_l2_src_or_dst_tag(struct dr_match_param *value, - bool inner, uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_spec *spec = inner ? 
&value->inner : &value->outer; - struct dr_match_misc *misc_spec = &value->misc; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(eth_l2_src, tag, first_vlan_id, spec, first_vid); - DR_STE_SET_TAG(eth_l2_src, tag, first_cfi, spec, first_cfi); - DR_STE_SET_TAG(eth_l2_src, tag, first_priority, spec, first_prio); - DR_STE_SET_TAG(eth_l2_src, tag, ip_fragmented, spec, frag); - DR_STE_SET_TAG(eth_l2_src, tag, l3_ethertype, spec, ethertype); - - if (spec->ip_version) { - if (spec->ip_version == IP_VERSION_IPV4) { - DR_STE_SET(eth_l2_src, tag, l3_type, STE_IPV4); - spec->ip_version = 0; - } else if (spec->ip_version == IP_VERSION_IPV6) { - DR_STE_SET(eth_l2_src, tag, l3_type, STE_IPV6); - spec->ip_version = 0; - } else { - errno = EINVAL; - return errno; - } - } - - if (spec->cvlan_tag) { - DR_STE_SET(eth_l2_src, tag, first_vlan_qualifier, DR_STE_CVLAN); - spec->cvlan_tag = 0; - } else if (spec->svlan_tag) { - DR_STE_SET(eth_l2_src, tag, first_vlan_qualifier, DR_STE_SVLAN); - spec->svlan_tag = 0; - } - - if (inner) { - if (misc_spec->inner_second_cvlan_tag) { - DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_CVLAN); - misc_spec->inner_second_cvlan_tag = 0; - } else if (misc_spec->inner_second_svlan_tag) { - DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_SVLAN); - misc_spec->inner_second_svlan_tag = 0; - } - - DR_STE_SET_TAG(eth_l2_src, tag, second_vlan_id, misc_spec, inner_second_vid); - DR_STE_SET_TAG(eth_l2_src, tag, second_cfi, misc_spec, inner_second_cfi); - DR_STE_SET_TAG(eth_l2_src, tag, second_priority, misc_spec, inner_second_prio); - } else { - if (misc_spec->outer_second_cvlan_tag) { - DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_CVLAN); - misc_spec->outer_second_cvlan_tag = 0; - } else if (misc_spec->outer_second_svlan_tag) { - DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_SVLAN); - misc_spec->outer_second_svlan_tag = 0; - } - DR_STE_SET_TAG(eth_l2_src, tag, second_vlan_id, misc_spec, outer_second_vid); - DR_STE_SET_TAG(eth_l2_src, tag, second_cfi, misc_spec, outer_second_cfi); - DR_STE_SET_TAG(eth_l2_src, tag, second_priority, misc_spec, outer_second_prio); - } - - return 0; -} - -static void dr_ste_build_eth_l2_src_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_spec *mask = inner ? &value->inner : &value->outer; - - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, smac_47_16, mask, smac_47_16); - DR_STE_SET_MASK_V(eth_l2_src, bit_mask, smac_15_0, mask, smac_15_0); - - dr_ste_build_eth_l2_src_or_dst_bit_mask(value, inner, bit_mask); -} - -static int dr_ste_build_eth_l2_src_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(eth_l2_src, tag, smac_47_16, spec, smac_47_16); - DR_STE_SET_TAG(eth_l2_src, tag, smac_15_0, spec, smac_15_0); - - return dr_ste_build_eth_l2_src_or_dst_tag(value, sb->inner, hw_ste_p); -} - -void dr_ste_build_eth_l2_src(struct dr_ste_build *sb, +void dr_ste_build_eth_l2_src(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_eth_l2_src_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_SRC, rx, inner); - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_eth_l2_src_tag; -} - -static void dr_ste_build_eth_l2_dst_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_spec *mask = inner ? &value->inner : &value->outer; - - DR_STE_SET_MASK_V(eth_l2_dst, bit_mask, dmac_47_16, mask, dmac_47_16); - DR_STE_SET_MASK_V(eth_l2_dst, bit_mask, dmac_15_0, mask, dmac_15_0); - - dr_ste_build_eth_l2_src_or_dst_bit_mask(value, inner, bit_mask); + ste_ctx->build_eth_l2_src_init(sb, mask); } -static int dr_ste_build_eth_l2_dst_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(eth_l2_dst, tag, dmac_47_16, spec, dmac_47_16); - DR_STE_SET_TAG(eth_l2_dst, tag, dmac_15_0, spec, dmac_15_0); - - return dr_ste_build_eth_l2_src_or_dst_tag(value, sb->inner, hw_ste_p); -} - -void dr_ste_build_eth_l2_dst(struct dr_ste_build *sb, +void dr_ste_build_eth_l2_dst(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_eth_l2_dst_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_DST, rx, inner); - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_eth_l2_dst_tag; + ste_ctx->build_eth_l2_dst_init(sb, mask); } -static void dr_ste_build_eth_l2_tnl_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_spec *mask = inner ? &value->inner : &value->outer; - struct dr_match_misc *misc = &value->misc; - - DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, dmac_47_16, mask, dmac_47_16); - DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, dmac_15_0, mask, dmac_15_0); - DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, first_vlan_id, mask, first_vid); - DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, first_cfi, mask, first_cfi); - DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, first_priority, mask, first_prio); - DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, ip_fragmented, mask, frag); - DR_STE_SET_MASK_V(eth_l2_tnl, bit_mask, l3_ethertype, mask, ethertype); - DR_STE_SET_MASK(eth_l2_tnl, bit_mask, l3_type, mask, ip_version); - - if (misc->vxlan_vni) { - DR_STE_SET(eth_l2_tnl, bit_mask, l2_tunneling_network_id, (misc->vxlan_vni << 8)); - misc->vxlan_vni = 0; - } - - if (mask->svlan_tag || mask->cvlan_tag) { - DR_STE_SET(eth_l2_tnl, bit_mask, first_vlan_qualifier, -1); - mask->cvlan_tag = 0; - mask->svlan_tag = 0; - } -} - -static int dr_ste_build_eth_l2_tnl_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; - struct dr_match_misc *misc = &value->misc; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(eth_l2_tnl, tag, dmac_47_16, spec, dmac_47_16); - DR_STE_SET_TAG(eth_l2_tnl, tag, dmac_15_0, spec, dmac_15_0); - DR_STE_SET_TAG(eth_l2_tnl, tag, first_vlan_id, spec, first_vid); - DR_STE_SET_TAG(eth_l2_tnl, tag, first_cfi, spec, first_cfi); - DR_STE_SET_TAG(eth_l2_tnl, tag, ip_fragmented, spec, frag); - DR_STE_SET_TAG(eth_l2_tnl, tag, first_priority, spec, first_prio); - DR_STE_SET_TAG(eth_l2_tnl, tag, l3_ethertype, spec, ethertype); - - if (misc->vxlan_vni) { - DR_STE_SET(eth_l2_tnl, tag, l2_tunneling_network_id, - (misc->vxlan_vni << 8)); - misc->vxlan_vni = 0; - } - - if (spec->cvlan_tag) { - DR_STE_SET(eth_l2_tnl, tag, first_vlan_qualifier, DR_STE_CVLAN); - spec->cvlan_tag = 0; - } else if (spec->svlan_tag) { - DR_STE_SET(eth_l2_tnl, tag, first_vlan_qualifier, DR_STE_SVLAN); - spec->svlan_tag = 0; - } - - if (spec->ip_version) { - if (spec->ip_version == IP_VERSION_IPV4) { - DR_STE_SET(eth_l2_tnl, tag, l3_type, STE_IPV4); - spec->ip_version = 0; - } else if (spec->ip_version == IP_VERSION_IPV6) { - DR_STE_SET(eth_l2_tnl, tag, l3_type, STE_IPV6); - spec->ip_version = 0; - } else { - errno = EINVAL; - return errno; - } - } - - return 0; -} - -void dr_ste_build_eth_l2_tnl(struct dr_ste_build *sb, - struct dr_match_param *mask, bool inner, bool rx) +void dr_ste_build_eth_l2_tnl(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) { - dr_ste_build_eth_l2_tnl_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_LU_TYPE_ETHL2_TUNNELING_I; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_eth_l2_tnl_tag; -} - -static void dr_ste_build_eth_l3_ipv4_misc_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_spec *mask = inner ? &value->inner : &value->outer; - - DR_STE_SET_MASK_V(eth_l3_ipv4_misc, bit_mask, time_to_live, mask, ip_ttl_hoplimit); -} - -static int dr_ste_build_eth_l3_ipv4_misc_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(eth_l3_ipv4_misc, tag, time_to_live, spec, ip_ttl_hoplimit); - - return 0; + ste_ctx->build_eth_l2_tnl_init(sb, mask); } -void dr_ste_build_eth_l3_ipv4_misc(struct dr_ste_build *sb, +void dr_ste_build_eth_l3_ipv4_misc(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_eth_l3_ipv4_misc_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV4_MISC, rx, inner); - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_eth_l3_ipv4_misc_tag; -} - -static void dr_ste_build_ipv6_l3_l4_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; - struct dr_match_misc *misc = &value->misc; - - DR_STE_SET_MASK_V(eth_l4, bit_mask, dst_port, mask, tcp_dport); - DR_STE_SET_MASK_V(eth_l4, bit_mask, src_port, mask, tcp_sport); - DR_STE_SET_MASK_V(eth_l4, bit_mask, dst_port, mask, udp_dport); - DR_STE_SET_MASK_V(eth_l4, bit_mask, src_port, mask, udp_sport); - DR_STE_SET_MASK_V(eth_l4, bit_mask, protocol, mask, ip_protocol); - DR_STE_SET_MASK_V(eth_l4, bit_mask, fragmented, mask, frag); - DR_STE_SET_MASK_V(eth_l4, bit_mask, dscp, mask, ip_dscp); - DR_STE_SET_MASK_V(eth_l4, bit_mask, ecn, mask, ip_ecn); - DR_STE_SET_MASK_V(eth_l4, bit_mask, ipv6_hop_limit, mask, ip_ttl_hoplimit); - if (inner) { - DR_STE_SET_MASK_V(eth_l4, bit_mask, flow_label, - misc, inner_ipv6_flow_label); - } else { - DR_STE_SET_MASK_V(eth_l4, bit_mask, flow_label, - misc, outer_ipv6_flow_label); - } - - if (mask->tcp_flags) { - DR_STE_SET_TCP_FLAGS(eth_l4, bit_mask, mask); - mask->tcp_flags = 0; - } + ste_ctx->build_eth_l3_ipv4_misc_init(sb, mask); } -static int dr_ste_build_ipv6_l3_l4_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; - struct dr_match_misc *misc = &value->misc; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(eth_l4, tag, dst_port, spec, tcp_dport); - DR_STE_SET_TAG(eth_l4, tag, src_port, spec, tcp_sport); - DR_STE_SET_TAG(eth_l4, tag, dst_port, spec, udp_dport); - DR_STE_SET_TAG(eth_l4, tag, src_port, spec, udp_sport); - DR_STE_SET_TAG(eth_l4, tag, protocol, spec, ip_protocol); - DR_STE_SET_TAG(eth_l4, tag, fragmented, spec, frag); - DR_STE_SET_TAG(eth_l4, tag, dscp, spec, ip_dscp); - DR_STE_SET_TAG(eth_l4, tag, ecn, spec, ip_ecn); - DR_STE_SET_TAG(eth_l4, tag, ipv6_hop_limit, spec, ip_ttl_hoplimit); - if (sb->inner) { - DR_STE_SET_TAG(eth_l4, tag, flow_label, - misc, inner_ipv6_flow_label); - } else { - DR_STE_SET_TAG(eth_l4, tag, flow_label, - misc, outer_ipv6_flow_label); - } - - if (spec->tcp_flags) { - DR_STE_SET_TCP_FLAGS(eth_l4, tag, spec); - spec->tcp_flags = 0; - } - - return 0; -} - -void dr_ste_build_ipv6_l3_l4(struct dr_ste_build *sb, - struct dr_match_param *mask, - bool inner, bool rx) +void dr_ste_build_eth_ipv6_l3_l4(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) { - dr_ste_build_ipv6_l3_l4_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL4, rx, inner); - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_ipv6_l3_l4_tag; + ste_ctx->build_eth_ipv6_l3_l4_init(sb, mask); } static int dr_ste_build_empty_always_hit_tag(struct dr_match_param *value, struct dr_ste_build *sb, - uint8_t *hw_ste_p) + uint8_t *tag) { return 0; } @@ -1700,729 +1074,138 @@ void dr_ste_build_empty_always_hit(struct dr_ste_build *sb, bool rx) sb->ste_build_tag_func = &dr_ste_build_empty_always_hit_tag; } -static void dr_ste_build_mpls_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_misc2 *misc2_mask = &value->misc2; - - if (inner) - DR_STE_SET_MPLS_MASK(mpls, misc2_mask, inner, bit_mask); - else - DR_STE_SET_MPLS_MASK(mpls, misc2_mask, outer, bit_mask); -} - -static int dr_ste_build_mpls_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; 
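A note on the cast visible just above: every legacy *_tag() builder opened by re-deriving the tag pointer from the raw STE in exactly this way. The patch hoists that step into dr_ste_build_ste_arr() through the new dr_ste_get_tag() helper, so tag builders now receive the tag directly (the ste_build_tag_func signature change from hw_ste_p to tag is visible in the dr_ste_build_empty_always_hit_tag hunk). A minimal sketch of the two conventions; the struct layout and sizes here are illustrative stand-ins, not the real dr_ste.h definitions:

```c
#include <stdint.h>
#include <string.h>

/* Stand-in for struct dr_hw_ste_format; field sizes are assumptions. */
struct hw_ste_fmt {
	uint8_t ctrl[32];
	uint8_t tag[16];
};

/* The cast now happens exactly once, in the core (cf. dr_ste_get_tag()). */
static uint8_t *get_tag(uint8_t *hw_ste_p)
{
	return ((struct hw_ste_fmt *)hw_ste_p)->tag;
}

/* New-style builder: works on the tag alone, no layout knowledge needed. */
static int build_some_tag(uint8_t *tag)
{
	memset(tag, 0, 16);	/* placeholder for the DR_STE_SET_TAG() calls */
	return 0;
}

/* Mirrors: ret = sb->ste_build_tag_func(value, sb, dr_ste_get_tag(ste_arr)); */
int build_one_ste(uint8_t *ste_arr)
{
	return build_some_tag(get_tag(ste_arr));
}
```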
- struct dr_match_misc2 *misc2_mask = &value->misc2; - uint8_t *tag = hw_ste->tag; - - if (sb->inner) - DR_STE_SET_MPLS_TAG(mpls, misc2_mask, inner, tag); - else - DR_STE_SET_MPLS_TAG(mpls, misc2_mask, outer, tag); - - return 0; -} - -void dr_ste_build_mpls(struct dr_ste_build *sb, struct dr_match_param *mask, +void dr_ste_build_mpls(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_mpls_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_CALC_LU_TYPE(MPLS_FIRST, rx, inner); - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_mpls_tag; -} - -static void dr_ste_build_gre_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_misc *misc_mask = &value->misc; - - DR_STE_SET_MASK_V(gre, bit_mask, gre_protocol, misc_mask, gre_protocol); - DR_STE_SET_MASK_V(gre, bit_mask, gre_k_present, misc_mask, gre_k_present); - DR_STE_SET_MASK_V(gre, bit_mask, gre_key_h, misc_mask, gre_key_h); - DR_STE_SET_MASK_V(gre, bit_mask, gre_key_l, misc_mask, gre_key_l); - - DR_STE_SET_MASK_V(gre, bit_mask, gre_c_present, misc_mask, gre_c_present); - DR_STE_SET_MASK_V(gre, bit_mask, gre_s_present, misc_mask, gre_s_present); -} - -static int dr_ste_build_gre_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc *misc = &value->misc; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(gre, tag, gre_protocol, misc, gre_protocol); - - DR_STE_SET_TAG(gre, tag, gre_k_present, misc, gre_k_present); - DR_STE_SET_TAG(gre, tag, gre_key_h, misc, gre_key_h); - DR_STE_SET_TAG(gre, tag, gre_key_l, misc, gre_key_l); - - DR_STE_SET_TAG(gre, tag, gre_c_present, misc, gre_c_present); - - DR_STE_SET_TAG(gre, tag, gre_s_present, misc, gre_s_present); - - return 0; + ste_ctx->build_mpls_init(sb, mask); } -void dr_ste_build_gre(struct dr_ste_build *sb, struct dr_match_param *mask, - bool inner, bool rx) +void dr_ste_build_tnl_gre(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) { - dr_ste_build_gre_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_LU_TYPE_GRE; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_gre_tag; + ste_ctx->build_tnl_gre_init(sb, mask); } -static void dr_ste_build_flex_parser_0_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_misc2 *misc_2_mask = &value->misc2; - - if (DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(misc_2_mask)) { - DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_label, - misc_2_mask, outer_first_mpls_over_gre_label); - - DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_exp, - misc_2_mask, outer_first_mpls_over_gre_exp); - - DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_s_bos, - misc_2_mask, outer_first_mpls_over_gre_s_bos); - - DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_ttl, - misc_2_mask, outer_first_mpls_over_gre_ttl); - } else { - DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_label, - misc_2_mask, outer_first_mpls_over_udp_label); - - DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_exp, - misc_2_mask, outer_first_mpls_over_udp_exp); - - DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_s_bos, - misc_2_mask, outer_first_mpls_over_udp_s_bos); - - 
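Context for the builder being deleted here: flex parser 0 is shared between the MPLS-over-GRE and MPLS-over-UDP match fields, and the builder selects the flavor with the DR_STE_IS_OUTER_MPLS_OVER_GRE_SET() predicate removed earlier in this file: any nonzero GRE sub-mask wins, otherwise the UDP fields are programmed. A self-contained sketch of that selection, with a reduced stand-in for struct dr_match_misc2:

```c
#include <stdbool.h>
#include <stdint.h>

/* Reduced stand-in; the real dr_match_misc2 carries many more fields. */
struct misc2_view {
	uint32_t over_gre_label, over_gre_exp, over_gre_s_bos, over_gre_ttl;
	uint32_t over_udp_label, over_udp_exp, over_udp_s_bos, over_udp_ttl;
};

/* Mirrors DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(): any nonzero sub-field
 * selects the GRE flavor. */
static bool mpls_over_gre_set(const struct misc2_view *m)
{
	return m->over_gre_label || m->over_gre_exp ||
	       m->over_gre_s_bos || m->over_gre_ttl;
}

/* The builder then programs the parser_3_* fields from one family only. */
uint32_t pick_label(const struct misc2_view *m)
{
	return mpls_over_gre_set(m) ? m->over_gre_label : m->over_udp_label;
}
```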
DR_STE_SET_MASK_V(flex_parser_0, bit_mask, parser_3_ttl, - misc_2_mask, outer_first_mpls_over_udp_ttl); - } -} - -static int dr_ste_build_flex_parser_0_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc2 *misc_2_mask = &value->misc2; - uint8_t *tag = hw_ste->tag; - - if (DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(misc_2_mask)) { - DR_STE_SET_TAG(flex_parser_0, tag, parser_3_label, - misc_2_mask, outer_first_mpls_over_gre_label); - - DR_STE_SET_TAG(flex_parser_0, tag, parser_3_exp, - misc_2_mask, outer_first_mpls_over_gre_exp); - - DR_STE_SET_TAG(flex_parser_0, tag, parser_3_s_bos, - misc_2_mask, outer_first_mpls_over_gre_s_bos); - - DR_STE_SET_TAG(flex_parser_0, tag, parser_3_ttl, - misc_2_mask, outer_first_mpls_over_gre_ttl); - } else { - DR_STE_SET_TAG(flex_parser_0, tag, parser_3_label, - misc_2_mask, outer_first_mpls_over_udp_label); - - DR_STE_SET_TAG(flex_parser_0, tag, parser_3_exp, - misc_2_mask, outer_first_mpls_over_udp_exp); - - DR_STE_SET_TAG(flex_parser_0, tag, parser_3_s_bos, - misc_2_mask, outer_first_mpls_over_udp_s_bos); - - DR_STE_SET_TAG(flex_parser_0, tag, parser_3_ttl, - misc_2_mask, outer_first_mpls_over_udp_ttl); - } - return 0; -} - -void dr_ste_build_flex_parser_0(struct dr_ste_build *sb, - struct dr_match_param *mask, - bool inner, bool rx) +void dr_ste_build_tnl_mpls(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) { - dr_ste_build_flex_parser_0_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_LU_TYPE_FLEX_PARSER_0; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_flex_parser_0_tag; -} - -#define ICMP_TYPE_OFFSET_FIRST_DW 24 -#define ICMP_CODE_OFFSET_FIRST_DW 16 -#define ICMP_HEADER_DATA_OFFSET_SECOND_DW 0 - -static int dr_ste_build_flex_parser_1_bit_mask(struct dr_match_param *mask, - struct dr_devx_caps *caps, - uint8_t *bit_mask) -{ - struct dr_match_misc3 *misc_3_mask = &mask->misc3; - bool is_ipv4_mask = DR_MASK_IS_FLEX_PARSER_ICMPV4_SET(misc_3_mask); - uint32_t icmp_header_data_mask; - uint32_t icmp_type_mask; - uint32_t icmp_code_mask; - int dw0_location; - int dw1_location; - - if (is_ipv4_mask) { - icmp_header_data_mask = misc_3_mask->icmpv4_header_data; - icmp_type_mask = misc_3_mask->icmpv4_type; - icmp_code_mask = misc_3_mask->icmpv4_code; - dw0_location = caps->flex_parser_id_icmp_dw0; - dw1_location = caps->flex_parser_id_icmp_dw1; - } else { - icmp_header_data_mask = misc_3_mask->icmpv6_header_data; - icmp_type_mask = misc_3_mask->icmpv6_type; - icmp_code_mask = misc_3_mask->icmpv6_code; - dw0_location = caps->flex_parser_id_icmpv6_dw0; - dw1_location = caps->flex_parser_id_icmpv6_dw1; - } - - switch (dw0_location) { - case 4: - if (icmp_type_mask) { - DR_STE_SET(flex_parser_1, bit_mask, flex_parser_4, - (icmp_type_mask << ICMP_TYPE_OFFSET_FIRST_DW)); - if (is_ipv4_mask) - misc_3_mask->icmpv4_type = 0; - else - misc_3_mask->icmpv6_type = 0; - } - if (icmp_code_mask) { - uint32_t cur_val = DR_STE_GET(flex_parser_1, bit_mask, - flex_parser_4); - DR_STE_SET(flex_parser_1, bit_mask, flex_parser_4, - cur_val | (icmp_code_mask << ICMP_CODE_OFFSET_FIRST_DW)); - if (is_ipv4_mask) - misc_3_mask->icmpv4_code = 0; - else - misc_3_mask->icmpv6_code = 0; - } - break; - default: - errno = ENOTSUP; - return errno; - } - - switch (dw1_location) { - case 5: - if (icmp_header_data_mask) { - 
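The deleted code around this point also documents how ICMP matching is packed into the generic flex-parser dwords: type and code share the first dword (offsets 24 and 16), header data fills the second, and only parser slots 4 and 5 are handled, with anything else failing ENOTSUP. A compact sketch of that packing, reusing the ICMP_*_OFFSET_* values defined above:

```c
#include <errno.h>
#include <stdint.h>

#define ICMP_TYPE_OFFSET_FIRST_DW	  24
#define ICMP_CODE_OFFSET_FIRST_DW	  16
#define ICMP_HEADER_DATA_OFFSET_SECOND_DW 0

/* dw0/dw1 locations come from the device caps (flex_parser_id_icmp*_dw0/1). */
static int pack_icmp(uint32_t type, uint32_t code, uint32_t header_data,
		     int dw0_location, int dw1_location,
		     uint32_t *dw0, uint32_t *dw1)
{
	if (dw0_location != 4 || dw1_location != 5) {
		errno = ENOTSUP;
		return errno;
	}
	*dw0 = (type << ICMP_TYPE_OFFSET_FIRST_DW) |
	       (code << ICMP_CODE_OFFSET_FIRST_DW);
	*dw1 = header_data << ICMP_HEADER_DATA_OFFSET_SECOND_DW;
	return 0;
}
```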
DR_STE_SET(flex_parser_1, bit_mask, flex_parser_5, - (icmp_header_data_mask << ICMP_HEADER_DATA_OFFSET_SECOND_DW)); - if (is_ipv4_mask) - misc_3_mask->icmpv4_header_data = 0; - else - misc_3_mask->icmpv6_header_data = 0; - } - break; - default: - errno = ENOTSUP; - return errno; - } - - return 0; -} - -static int dr_ste_build_flex_parser_1_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc3 *misc_3 = &value->misc3; - bool is_ipv4 = DR_MASK_IS_FLEX_PARSER_ICMPV4_SET(misc_3); - uint8_t *tag = hw_ste->tag; - uint32_t icmp_header_data; - uint32_t icmp_type; - uint32_t icmp_code; - int dw0_location; - int dw1_location; - - if (is_ipv4) { - icmp_header_data = misc_3->icmpv4_header_data; - icmp_type = misc_3->icmpv4_type; - icmp_code = misc_3->icmpv4_code; - dw0_location = sb->caps->flex_parser_id_icmp_dw0; - dw1_location = sb->caps->flex_parser_id_icmp_dw1; - } else { - icmp_header_data = misc_3->icmpv6_header_data; - icmp_type = misc_3->icmpv6_type; - icmp_code = misc_3->icmpv6_code; - dw0_location = sb->caps->flex_parser_id_icmpv6_dw0; - dw1_location = sb->caps->flex_parser_id_icmpv6_dw1; - } - - switch (dw0_location) { - case 4: - if (icmp_type) { - DR_STE_SET(flex_parser_1, tag, flex_parser_4, - (icmp_type << ICMP_TYPE_OFFSET_FIRST_DW)); - if (is_ipv4) - misc_3->icmpv4_type = 0; - else - misc_3->icmpv6_type = 0; - } - - if (icmp_code) { - uint32_t cur_val = DR_STE_GET(flex_parser_1, tag, - flex_parser_4); - DR_STE_SET(flex_parser_1, tag, flex_parser_4, - cur_val | (icmp_code << ICMP_CODE_OFFSET_FIRST_DW)); - if (is_ipv4) - misc_3->icmpv4_code = 0; - else - misc_3->icmpv6_code = 0; - } - break; - default: - errno = ENOTSUP; - return errno; - } - - switch (dw1_location) { - case 5: - if (icmp_header_data) { - DR_STE_SET(flex_parser_1, tag, flex_parser_5, - (icmp_header_data << ICMP_HEADER_DATA_OFFSET_SECOND_DW)); - if (is_ipv4) - misc_3->icmpv4_header_data = 0; - else - misc_3->icmpv6_header_data = 0; - } - break; - default: - errno = ENOTSUP; - return errno; - } - - return 0; + ste_ctx->build_tnl_mpls_init(sb, mask); } -int dr_ste_build_flex_parser_1(struct dr_ste_build *sb, - struct dr_match_param *mask, - struct dr_devx_caps *caps, - bool inner, bool rx) +int dr_ste_build_icmp(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + struct dr_devx_caps *caps, + bool inner, bool rx) { - int ret; - - ret = dr_ste_build_flex_parser_1_bit_mask(mask, caps, sb->bit_mask); - if (ret) - return ret; - sb->rx = rx; - sb->inner = inner; sb->caps = caps; - sb->lu_type = DR_STE_LU_TYPE_FLEX_PARSER_1; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_flex_parser_1_tag; - - return 0; -} - -static void dr_ste_build_general_purpose_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_misc2 *misc_2_mask = &value->misc2; - - DR_STE_SET_MASK_V(general_purpose, bit_mask, - general_purpose_lookup_field, misc_2_mask, - metadata_reg_a); -} - -static int dr_ste_build_general_purpose_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc2 *misc_2_mask = &value->misc2; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(general_purpose, tag, general_purpose_lookup_field, - misc_2_mask, metadata_reg_a); - - return 0; + sb->inner = inner; + return 
ste_ctx->build_icmp_init(sb, mask); } -void dr_ste_build_general_purpose(struct dr_ste_build *sb, +void dr_ste_build_general_purpose(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_general_purpose_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_LU_TYPE_GENERAL_PURPOSE; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_general_purpose_tag; -} - -static void dr_ste_build_eth_l4_misc_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_misc3 *misc_3_mask = &value->misc3; - - if (inner) { - DR_STE_SET_MASK_V(eth_l4_misc, bit_mask, seq_num, misc_3_mask, - inner_tcp_seq_num); - DR_STE_SET_MASK_V(eth_l4_misc, bit_mask, ack_num, misc_3_mask, - inner_tcp_ack_num); - } else { - DR_STE_SET_MASK_V(eth_l4_misc, bit_mask, seq_num, misc_3_mask, - outer_tcp_seq_num); - DR_STE_SET_MASK_V(eth_l4_misc, bit_mask, ack_num, misc_3_mask, - outer_tcp_ack_num); - } + ste_ctx->build_general_purpose_init(sb, mask); } -static int dr_ste_build_eth_l4_misc_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc3 *misc3 = &value->misc3; - uint8_t *tag = hw_ste->tag; - - if (sb->inner) { - DR_STE_SET_TAG(eth_l4_misc, tag, seq_num, misc3, inner_tcp_seq_num); - DR_STE_SET_TAG(eth_l4_misc, tag, ack_num, misc3, inner_tcp_ack_num); - } else { - DR_STE_SET_TAG(eth_l4_misc, tag, seq_num, misc3, outer_tcp_seq_num); - DR_STE_SET_TAG(eth_l4_misc, tag, ack_num, misc3, outer_tcp_ack_num); - } - - return 0; -} - -void dr_ste_build_eth_l4_misc(struct dr_ste_build *sb, +void dr_ste_build_eth_l4_misc(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_eth_l4_misc_bit_mask(mask, inner, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL4_MISC, rx, inner); - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_eth_l4_misc_tag; + ste_ctx->build_eth_l4_misc_init(sb, mask); } -static void -dr_ste_build_flex_parser_tnl_vxlan_gpe_bit_mask(struct dr_match_param *value, - bool inner, uint8_t *bit_mask) -{ - struct dr_match_misc3 *misc_3_mask = &value->misc3; - - DR_STE_SET_MASK_V(flex_parser_tnl_vxlan_gpe, bit_mask, - outer_vxlan_gpe_flags, - misc_3_mask, outer_vxlan_gpe_flags); - DR_STE_SET_MASK_V(flex_parser_tnl_vxlan_gpe, bit_mask, - outer_vxlan_gpe_next_protocol, - misc_3_mask, outer_vxlan_gpe_next_protocol); - DR_STE_SET_MASK_V(flex_parser_tnl_vxlan_gpe, bit_mask, - outer_vxlan_gpe_vni, - misc_3_mask, outer_vxlan_gpe_vni); -} - -static int -dr_ste_build_flex_parser_tnl_vxlan_gpe_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc3 *misc3 = &value->misc3; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, - outer_vxlan_gpe_flags, misc3, - outer_vxlan_gpe_flags); - DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, - outer_vxlan_gpe_next_protocol, misc3, - outer_vxlan_gpe_next_protocol); - DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, - outer_vxlan_gpe_vni, misc3, - outer_vxlan_gpe_vni); - - return 0; -} - -void dr_ste_build_flex_parser_tnl_vxlan_gpe(struct dr_ste_build *sb, - struct dr_match_param *mask, 
- bool inner, bool rx) +void dr_ste_build_tnl_vxlan_gpe(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) { - dr_ste_build_flex_parser_tnl_vxlan_gpe_bit_mask(mask, inner, - sb->bit_mask); sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_LU_TYPE_FLEX_PARSER_TNL_HEADER; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_flex_parser_tnl_vxlan_gpe_tag; -} - -static void -dr_ste_build_flex_parser_tnl_geneve_bit_mask(struct dr_match_param *value, - uint8_t *bit_mask) -{ - struct dr_match_misc *misc_mask = &value->misc; - - DR_STE_SET_MASK_V(flex_parser_tnl_geneve, bit_mask, - geneve_protocol_type, - misc_mask, geneve_protocol_type); - DR_STE_SET_MASK_V(flex_parser_tnl_geneve, bit_mask, - geneve_oam, - misc_mask, geneve_oam); - DR_STE_SET_MASK_V(flex_parser_tnl_geneve, bit_mask, - geneve_opt_len, - misc_mask, geneve_opt_len); - DR_STE_SET_MASK_V(flex_parser_tnl_geneve, bit_mask, - geneve_vni, - misc_mask, geneve_vni); -} - -static int -dr_ste_build_flex_parser_tnl_geneve_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc *misc = &value->misc; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, - geneve_protocol_type, misc, geneve_protocol_type); - DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, - geneve_oam, misc, geneve_oam); - DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, - geneve_opt_len, misc, geneve_opt_len); - DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, - geneve_vni, misc, geneve_vni); - - return 0; + ste_ctx->build_tnl_vxlan_gpe_init(sb, mask); } -void dr_ste_build_flex_parser_tnl_geneve(struct dr_ste_build *sb, - struct dr_match_param *mask, - bool inner, bool rx) +void dr_ste_build_tnl_geneve(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx) { - dr_ste_build_flex_parser_tnl_geneve_bit_mask(mask, sb->bit_mask); sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_LU_TYPE_FLEX_PARSER_TNL_HEADER; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_flex_parser_tnl_geneve_tag; + ste_ctx->build_tnl_geneve_init(sb, mask); } -static void -dr_ste_build_flex_parser_tnl_gtpu_bit_mask(struct dr_match_param *value, - uint8_t *bit_mask) -{ - struct dr_match_misc3 *misc3 = &value->misc3; - - DR_STE_SET_MASK_V(flex_parser_tnl_gtpu, bit_mask, - gtpu_flags, misc3, - gtpu_flags); - DR_STE_SET_MASK_V(flex_parser_tnl_gtpu, bit_mask, - gtpu_msg_type, misc3, - gtpu_msg_type); - DR_STE_SET_MASK_V(flex_parser_tnl_gtpu, bit_mask, - gtpu_teid, misc3, - gtpu_teid); -} - -static int -dr_ste_build_flex_parser_tnl_gtpu_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc3 *misc3 = &value->misc3; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, - gtpu_flags, misc3, - gtpu_flags); - DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, - gtpu_msg_type, misc3, - gtpu_msg_type); - DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, - gtpu_teid, misc3, - gtpu_teid); - - return 0; -} - -void dr_ste_build_flex_parser_tnl_gtpu(struct dr_ste_build *sb, - struct dr_match_param *mask, - bool inner, bool rx) +void dr_ste_build_tnl_gtpu(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool 
inner, bool rx) { - dr_ste_build_flex_parser_tnl_gtpu_bit_mask(mask, sb->bit_mask); sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_LU_TYPE_FLEX_PARSER_TNL_HEADER; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_flex_parser_tnl_gtpu_tag; -} - -static void dr_ste_build_register_0_bit_mask(struct dr_match_param *value, - uint8_t *bit_mask) -{ - struct dr_match_misc2 *misc_2_mask = &value->misc2; - - DR_STE_SET_MASK_V(register_0, bit_mask, register_0_h, - misc_2_mask, metadata_reg_c_0); - DR_STE_SET_MASK_V(register_0, bit_mask, register_0_l, - misc_2_mask, metadata_reg_c_1); - DR_STE_SET_MASK_V(register_0, bit_mask, register_1_h, - misc_2_mask, metadata_reg_c_2); - DR_STE_SET_MASK_V(register_0, bit_mask, register_1_l, - misc_2_mask, metadata_reg_c_3); + ste_ctx->build_tnl_gtpu_init(sb, mask); } -static int dr_ste_build_register_0_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc2 *misc2 = &value->misc2; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(register_0, tag, register_0_h, misc2, metadata_reg_c_0); - DR_STE_SET_TAG(register_0, tag, register_0_l, misc2, metadata_reg_c_1); - DR_STE_SET_TAG(register_0, tag, register_1_h, misc2, metadata_reg_c_2); - DR_STE_SET_TAG(register_0, tag, register_1_l, misc2, metadata_reg_c_3); - - return 0; -} - -void dr_ste_build_register_0(struct dr_ste_build *sb, +void dr_ste_build_register_0(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_register_0_bit_mask(mask, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_LU_TYPE_STEERING_REGISTERS_0; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_register_0_tag; -} - -static void dr_ste_build_register_1_bit_mask(struct dr_match_param *value, - uint8_t *bit_mask) -{ - struct dr_match_misc2 *misc_2_mask = &value->misc2; - - DR_STE_SET_MASK_V(register_1, bit_mask, register_2_h, - misc_2_mask, metadata_reg_c_4); - DR_STE_SET_MASK_V(register_1, bit_mask, register_2_l, - misc_2_mask, metadata_reg_c_5); - DR_STE_SET_MASK_V(register_1, bit_mask, register_3_h, - misc_2_mask, metadata_reg_c_6); - DR_STE_SET_MASK_V(register_1, bit_mask, register_3_l, - misc_2_mask, metadata_reg_c_7); + ste_ctx->build_register_0_init(sb, mask); } -static int dr_ste_build_register_1_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) -{ - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc2 *misc2 = &value->misc2; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(register_1, tag, register_2_h, misc2, metadata_reg_c_4); - DR_STE_SET_TAG(register_1, tag, register_2_l, misc2, metadata_reg_c_5); - DR_STE_SET_TAG(register_1, tag, register_3_h, misc2, metadata_reg_c_6); - DR_STE_SET_TAG(register_1, tag, register_3_l, misc2, metadata_reg_c_7); - - return 0; -} - -void dr_ste_build_register_1(struct dr_ste_build *sb, +void dr_ste_build_register_1(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx) { - dr_ste_build_register_1_bit_mask(mask, sb->bit_mask); - sb->rx = rx; sb->inner = inner; - sb->lu_type = DR_STE_LU_TYPE_STEERING_REGISTERS_1; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_register_1_tag; -} - -static int 
dr_ste_build_src_gvmi_qpn_bit_mask(struct dr_match_param *value, - uint8_t *bit_mask) -{ - struct dr_match_misc *misc_mask = &value->misc; - - if (misc_mask->source_port && misc_mask->source_port != 0xffff) { - errno = EINVAL; - return errno; - } - DR_STE_SET_MASK(src_gvmi_qp, bit_mask, source_gvmi, misc_mask, source_port); - DR_STE_SET_MASK(src_gvmi_qp, bit_mask, source_qp, misc_mask, source_sqn); - - return 0; + ste_ctx->build_register_1_init(sb, mask); } -static int dr_ste_build_src_gvmi_qpn_tag(struct dr_match_param *value, - struct dr_ste_build *sb, - uint8_t *hw_ste_p) +void dr_ste_build_src_gvmi_qpn(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + struct dr_devx_caps *caps, + bool inner, bool rx) { - struct dr_hw_ste_format *hw_ste = (struct dr_hw_ste_format *)hw_ste_p; - struct dr_match_misc *misc = &value->misc; - struct dr_devx_vport_cap *vport_cap; - uint8_t *tag = hw_ste->tag; - - DR_STE_SET_TAG(src_gvmi_qp, tag, source_qp, misc, source_sqn); - - vport_cap = dr_get_vport_cap(sb->caps, misc->source_port); - if (!vport_cap) - return errno; - - if (vport_cap->gvmi) - DR_STE_SET(src_gvmi_qp, tag, source_gvmi, vport_cap->gvmi); - - misc->source_port = 0; - - return 0; + sb->rx = rx; + sb->caps = caps; + sb->inner = inner; + ste_ctx->build_src_gvmi_qpn_init(sb, mask); } -int dr_ste_build_src_gvmi_qpn(struct dr_ste_build *sb, - struct dr_match_param *mask, - struct dr_devx_caps *caps, - bool inner, bool rx) +struct dr_ste_ctx *dr_ste_get_ctx(uint8_t version) { - int ret; + if (version == MLX5_HW_CONNECTX_5) + return dr_ste_get_ctx_v0(); + else if (version == MLX5_HW_CONNECTX_6DX) + return dr_ste_get_ctx_v1(); - ret = dr_ste_build_src_gvmi_qpn_bit_mask(mask, sb->bit_mask); - if (ret) - return ret; + errno = EOPNOTSUPP; - sb->rx = rx; - sb->caps = caps; - sb->inner = inner; - sb->lu_type = DR_STE_LU_TYPE_SRC_GVMI_AND_QP; - sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); - sb->ste_build_tag_func = &dr_ste_build_src_gvmi_qpn_tag; - - return 0; + return NULL; } diff --git a/providers/mlx5/dr_ste.h b/providers/mlx5/dr_ste.h new file mode 100644 index 0000000..88ceef2 --- /dev/null +++ b/providers/mlx5/dr_ste.h @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2020, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef _DR_STE_
+#define _DR_STE_
+
+#include
+#include "mlx5dv_dr.h"
+
+#define IPV4_ETHERTYPE 0x0800
+#define IPV6_ETHERTYPE 0x86DD
+#define STE_IPV4 0x1
+#define STE_IPV6 0x2
+#define STE_TCP 0x1
+#define STE_UDP 0x2
+#define STE_SPI 0x3
+#define IP_VERSION_IPV4 0x4
+#define IP_VERSION_IPV6 0x6
+#define IP_PROTOCOL_UDP 0x11
+#define IP_PROTOCOL_TCP 0x06
+#define IP_PROTOCOL_IPSEC 0x33
+#define TCP_PROTOCOL 0x6
+#define UDP_PROTOCOL 0x11
+#define IPSEC_PROTOCOL 0x33
+#define HDR_LEN_L2_MACS 0xC
+#define HDR_LEN_L2_VLAN 0x4
+#define HDR_LEN_L2_ETHER 0x2
+#define HDR_LEN_L2 (HDR_LEN_L2_MACS + HDR_LEN_L2_ETHER)
+#define HDR_LEN_L2_W_VLAN (HDR_LEN_L2 + HDR_LEN_L2_VLAN)
+
+/* Read from layout struct */
+#define DR_STE_GET(typ, p, fld) DEVX_GET(ste_##typ, p, fld)
+
+/* Write a value to the layout */
+#define DR_STE_SET(typ, p, fld, v) DEVX_SET(ste_##typ, p, fld, v)
+
+#define DR_STE_SET_BOOL(typ, p, fld, v) DEVX_SET(ste_##typ, p, fld, !!(v))
+
+/* Set a specific value in the STE using DR_STE_SET */
+#define DR_STE_SET_VAL(lookup_type, tag, t_fname, spec, s_fname, value) do { \
+ if ((spec)->s_fname) { \
+ DR_STE_SET(lookup_type, tag, t_fname, value); \
+ (spec)->s_fname = 0; \
+ } \
+} while (0)
+
+/* Copy spec->s_fname to tag->t_fname and mark spec->s_fname as used */
+#define DR_STE_SET_TAG(lookup_type, tag, t_fname, spec, s_fname) \
+ DR_STE_SET_VAL(lookup_type, tag, t_fname, spec, s_fname, (spec)->s_fname)
+
+/* Set tag->t_fname to -1 and mark spec->s_fname as used */
+#define DR_STE_SET_ONES(lookup_type, tag, t_fname, spec, s_fname) \
+ DR_STE_SET_VAL(lookup_type, tag, t_fname, spec, s_fname, -1)
+
+#define DR_STE_SET_TCP_FLAGS(lookup_type, tag, spec) do { \
+ DR_STE_SET_BOOL(lookup_type, tag, tcp_ns, (spec)->tcp_flags & (1 << 8)); \
+ DR_STE_SET_BOOL(lookup_type, tag, tcp_cwr, (spec)->tcp_flags & (1 << 7)); \
+ DR_STE_SET_BOOL(lookup_type, tag, tcp_ece, (spec)->tcp_flags & (1 << 6)); \
+ DR_STE_SET_BOOL(lookup_type, tag, tcp_urg, (spec)->tcp_flags & (1 << 5)); \
+ DR_STE_SET_BOOL(lookup_type, tag, tcp_ack, (spec)->tcp_flags & (1 << 4)); \
+ DR_STE_SET_BOOL(lookup_type, tag, tcp_psh, (spec)->tcp_flags & (1 << 3)); \
+ DR_STE_SET_BOOL(lookup_type, tag, tcp_rst, (spec)->tcp_flags & (1 << 2)); \
+ DR_STE_SET_BOOL(lookup_type, tag, tcp_syn, (spec)->tcp_flags & (1 << 1)); \
+ DR_STE_SET_BOOL(lookup_type, tag, tcp_fin, (spec)->tcp_flags & (1 << 0)); \
+} while (0)
+
+#define DR_STE_SET_MPLS(lookup_type, mask, in_out, tag) do { \
+ DR_STE_SET_TAG(lookup_type, tag, mpls0_label, mask, \
+ in_out##_first_mpls_label);\
+ DR_STE_SET_TAG(lookup_type, tag, mpls0_s_bos, mask, \
+ in_out##_first_mpls_s_bos); \
+ DR_STE_SET_TAG(lookup_type, tag, mpls0_exp, mask, \
+ in_out##_first_mpls_exp); \
+ DR_STE_SET_TAG(lookup_type, tag, mpls0_ttl, mask, \
+ in_out##_first_mpls_ttl); \
+} while (0)
+
+#define DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(_misc) (\
+ (_misc)->outer_first_mpls_over_gre_label || \
+ (_misc)->outer_first_mpls_over_gre_exp || \
+ (_misc)->outer_first_mpls_over_gre_s_bos || \
+ (_misc)->outer_first_mpls_over_gre_ttl)
+
+#define DR_STE_IS_OUTER_MPLS_OVER_UDP_SET(_misc) (\
+ (_misc)->outer_first_mpls_over_udp_label || \
+ (_misc)->outer_first_mpls_over_udp_exp || \
+ (_misc)->outer_first_mpls_over_udp_s_bos || \
+
(_misc)->outer_first_mpls_over_udp_ttl) + +enum dr_ste_action_modify_type_l3 { + DR_STE_ACTION_MDFY_TYPE_L3_NONE = 0x0, + DR_STE_ACTION_MDFY_TYPE_L3_IPV4 = 0x1, + DR_STE_ACTION_MDFY_TYPE_L3_IPV6 = 0x2, +}; + +enum dr_ste_action_modify_type_l4 { + DR_STE_ACTION_MDFY_TYPE_L4_NONE = 0x0, + DR_STE_ACTION_MDFY_TYPE_L4_TCP = 0x1, + DR_STE_ACTION_MDFY_TYPE_L4_UDP = 0x2, +}; + +uint16_t dr_ste_conv_bit_to_byte_mask(uint8_t *bit_mask); + +typedef void (*dr_ste_builder_void_init)(struct dr_ste_build *sb, + struct dr_match_param *mask); + +typedef int (*dr_ste_builder_int_init)(struct dr_ste_build *sb, + struct dr_match_param *mask); +struct dr_ste_ctx { + /* Builders */ + dr_ste_builder_void_init build_eth_l2_src_dst_init; + dr_ste_builder_void_init build_eth_l3_ipv6_src_init; + dr_ste_builder_void_init build_eth_l3_ipv6_dst_init; + dr_ste_builder_void_init build_eth_l3_ipv4_5_tuple_init; + dr_ste_builder_void_init build_eth_l2_src_init; + dr_ste_builder_void_init build_eth_l2_dst_init; + dr_ste_builder_void_init build_eth_l2_tnl_init; + dr_ste_builder_void_init build_eth_l3_ipv4_misc_init; + dr_ste_builder_void_init build_eth_ipv6_l3_l4_init; + dr_ste_builder_void_init build_mpls_init; + dr_ste_builder_void_init build_tnl_gre_init; + dr_ste_builder_void_init build_tnl_mpls_init; + dr_ste_builder_int_init build_icmp_init; + dr_ste_builder_void_init build_general_purpose_init; + dr_ste_builder_void_init build_eth_l4_misc_init; + dr_ste_builder_void_init build_tnl_vxlan_gpe_init; + dr_ste_builder_void_init build_tnl_geneve_init; + dr_ste_builder_void_init build_tnl_gtpu_init; + dr_ste_builder_void_init build_register_0_init; + dr_ste_builder_void_init build_register_1_init; + dr_ste_builder_void_init build_src_gvmi_qpn_init; + + /* Getters and Setters */ + void (*ste_init)(uint8_t *hw_ste_p, uint16_t lu_type, + uint8_t entry_type, uint16_t gvmi); + void (*set_next_lu_type)(uint8_t *hw_ste_p, uint16_t lu_type); + uint16_t (*get_next_lu_type)(uint8_t *hw_ste_p); + void (*set_miss_addr)(uint8_t *hw_ste_p, uint64_t miss_addr); + uint64_t (*get_miss_addr)(uint8_t *hw_ste_p); + void (*set_hit_addr)(uint8_t *hw_ste_p, uint64_t icm_addr, uint32_t ht_size); + void (*set_byte_mask)(uint8_t *hw_ste_p, uint16_t byte_mask); + uint16_t (*get_byte_mask)(uint8_t *hw_ste_p); + + /* Actions */ + void (*set_actions_rx)(uint8_t *action_type_set, + uint8_t *hw_ste_arr, + struct dr_ste_actions_attr *attr, + uint32_t *added_stes); + void (*set_actions_tx)(uint8_t *action_type_set, + uint8_t *hw_ste_arr, + struct dr_ste_actions_attr *attr, + uint32_t *added_stes); + uint32_t modify_field_arr_sz; + const struct dr_ste_action_modify_field *modify_field_arr; + void (*set_action_set)(uint8_t *hw_action, + uint8_t hw_field, + uint8_t shifter, + uint8_t length, + uint32_t data); + void (*set_action_add)(uint8_t *hw_action, + uint8_t hw_field, + uint8_t shifter, + uint8_t length, + uint32_t data); + void (*set_action_copy)(uint8_t *hw_action, + uint8_t dst_hw_field, + uint8_t dst_shifter, + uint8_t dst_len, + uint8_t src_hw_field, + uint8_t src_shifter); + int (*set_action_decap_l3_list)(void *data, uint32_t data_sz, + uint8_t *hw_action, uint32_t hw_action_sz, + uint16_t *used_hw_action_num); + + /* Send */ + void (*prepare_for_postsend)(uint8_t *hw_ste_p, uint32_t ste_size); +}; + +struct dr_ste_ctx *dr_ste_get_ctx_v0(void); +struct dr_ste_ctx *dr_ste_get_ctx_v1(void); + +#endif diff --git a/providers/mlx5/dr_ste_v0.c b/providers/mlx5/dr_ste_v0.c new file mode 100644 index 0000000..75825e3 --- /dev/null +++ 
b/providers/mlx5/dr_ste_v0.c @@ -0,0 +1,1535 @@ +/* + * Copyright (c) 2020, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "dr_ste.h" + +#define SVLAN_ETHERTYPE 0x88a8 +#define DR_STE_ENABLE_FLOW_TAG (1 << 31) + +enum dr_ste_v0_action_tunl { + DR_STE_TUNL_ACTION_NONE = 0, + DR_STE_TUNL_ACTION_ENABLE = 1, + DR_STE_TUNL_ACTION_DECAP = 2, + DR_STE_TUNL_ACTION_L3_DECAP = 3, +}; + +enum dr_ste_v0_action_type { + DR_STE_ACTION_TYPE_ENCAP_L3 = 3, + DR_STE_ACTION_TYPE_ENCAP = 4, +}; + +enum dr_ste_v0_action_mdfy_op { + DR_STE_ACTION_MDFY_OP_COPY = 0x1, + DR_STE_ACTION_MDFY_OP_SET = 0x2, + DR_STE_ACTION_MDFY_OP_ADD = 0x3, +}; + +#define DR_STE_CALC_LU_TYPE(lookup_type, rx, inner) \ + ((inner) ? DR_STE_V0_LU_TYPE_##lookup_type##_I : \ + (rx) ? 
DR_STE_V0_LU_TYPE_##lookup_type##_D : \ + DR_STE_V0_LU_TYPE_##lookup_type##_O) + +enum dr_ste_v0_lu_type { + DR_STE_V0_LU_TYPE_NOP = 0x00, + DR_STE_V0_LU_TYPE_SRC_GVMI_AND_QP = 0x05, + DR_STE_V0_LU_TYPE_ETHL2_TUNNELING_I = 0x0a, + DR_STE_V0_LU_TYPE_ETHL2_DST_O = 0x06, + DR_STE_V0_LU_TYPE_ETHL2_DST_I = 0x07, + DR_STE_V0_LU_TYPE_ETHL2_DST_D = 0x1b, + DR_STE_V0_LU_TYPE_ETHL2_SRC_O = 0x08, + DR_STE_V0_LU_TYPE_ETHL2_SRC_I = 0x09, + DR_STE_V0_LU_TYPE_ETHL2_SRC_D = 0x1c, + DR_STE_V0_LU_TYPE_ETHL2_SRC_DST_O = 0x36, + DR_STE_V0_LU_TYPE_ETHL2_SRC_DST_I = 0x37, + DR_STE_V0_LU_TYPE_ETHL2_SRC_DST_D = 0x38, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_DST_O = 0x0d, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_DST_I = 0x0e, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_DST_D = 0x1e, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_SRC_O = 0x0f, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_SRC_I = 0x10, + DR_STE_V0_LU_TYPE_ETHL3_IPV6_SRC_D = 0x1f, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_5_TUPLE_O = 0x11, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_5_TUPLE_I = 0x12, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_5_TUPLE_D = 0x20, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_MISC_O = 0x29, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_MISC_I = 0x2a, + DR_STE_V0_LU_TYPE_ETHL3_IPV4_MISC_D = 0x2b, + DR_STE_V0_LU_TYPE_ETHL4_O = 0x13, + DR_STE_V0_LU_TYPE_ETHL4_I = 0x14, + DR_STE_V0_LU_TYPE_ETHL4_D = 0x21, + DR_STE_V0_LU_TYPE_ETHL4_MISC_O = 0x2c, + DR_STE_V0_LU_TYPE_ETHL4_MISC_I = 0x2d, + DR_STE_V0_LU_TYPE_ETHL4_MISC_D = 0x2e, + DR_STE_V0_LU_TYPE_MPLS_FIRST_O = 0x15, + DR_STE_V0_LU_TYPE_MPLS_FIRST_I = 0x24, + DR_STE_V0_LU_TYPE_MPLS_FIRST_D = 0x25, + DR_STE_V0_LU_TYPE_GRE = 0x16, + DR_STE_V0_LU_TYPE_FLEX_PARSER_0 = 0x22, + DR_STE_V0_LU_TYPE_FLEX_PARSER_1 = 0x23, + DR_STE_V0_LU_TYPE_FLEX_PARSER_TNL_HEADER = 0x19, + DR_STE_V0_LU_TYPE_GENERAL_PURPOSE = 0x18, + DR_STE_V0_LU_TYPE_STEERING_REGISTERS_0 = 0x2f, + DR_STE_V0_LU_TYPE_STEERING_REGISTERS_1 = 0x30, + DR_STE_V0_LU_TYPE_DONT_CARE = DR_STE_LU_TYPE_DONT_CARE, +}; + +enum { + DR_STE_V0_ACTION_MDFY_FLD_L2_0 = 0x00, + DR_STE_V0_ACTION_MDFY_FLD_L2_1 = 0x01, + DR_STE_V0_ACTION_MDFY_FLD_L2_2 = 0x02, + DR_STE_V0_ACTION_MDFY_FLD_L3_0 = 0x03, + DR_STE_V0_ACTION_MDFY_FLD_L3_1 = 0x04, + DR_STE_V0_ACTION_MDFY_FLD_L3_2 = 0x05, + DR_STE_V0_ACTION_MDFY_FLD_L3_3 = 0x06, + DR_STE_V0_ACTION_MDFY_FLD_L3_4 = 0x07, + DR_STE_V0_ACTION_MDFY_FLD_L4_0 = 0x08, + DR_STE_V0_ACTION_MDFY_FLD_L4_1 = 0x09, + DR_STE_V0_ACTION_MDFY_FLD_MPLS = 0x0a, + DR_STE_V0_ACTION_MDFY_FLD_L2_TNL_0 = 0x0b, + DR_STE_V0_ACTION_MDFY_FLD_REG_0 = 0x0c, + DR_STE_V0_ACTION_MDFY_FLD_REG_1 = 0x0d, + DR_STE_V0_ACTION_MDFY_FLD_REG_2 = 0x0e, + DR_STE_V0_ACTION_MDFY_FLD_REG_3 = 0x0f, + DR_STE_V0_ACTION_MDFY_FLD_L4_2 = 0x10, + DR_STE_V0_ACTION_MDFY_FLD_FLEX_0 = 0x11, + DR_STE_V0_ACTION_MDFY_FLD_FLEX_1 = 0x12, + DR_STE_V0_ACTION_MDFY_FLD_FLEX_2 = 0x13, + DR_STE_V0_ACTION_MDFY_FLD_FLEX_3 = 0x14, + DR_STE_V0_ACTION_MDFY_FLD_L2_TNL_1 = 0x15, + DR_STE_V0_ACTION_MDFY_FLD_METADATA = 0x16, + DR_STE_V0_ACTION_MDFY_FLD_RESERVED = 0x17, +}; + +static const struct dr_ste_action_modify_field dr_ste_v0_action_modify_field_arr[] = { + [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_1, .start = 16, .end = 47, + }, + [MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_1, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_2, .start = 32, .end = 47, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_0, .start = 16, .end = 47, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0] = { + .hw_field = 
DR_STE_V0_ACTION_MDFY_FLD_L2_0, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_DSCP] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_1, .start = 0, .end = 5, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_0, .start = 48, .end = 56, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_TTL] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_1, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_1, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_3, .start = 32, .end = 63, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_4, .start = 32, .end = 63, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_4, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_0, .start = 32, .end = 63, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_2, .start = 32, .end = 63, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV4] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV4] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L3_0, .start = 32, .end = 63, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGA] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_METADATA, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGB] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_METADATA, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_0] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_0, .start = 32, .end = 63, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_1] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_2] = { + .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_1, 
.start = 32, .end = 63,
+ },
+ [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_3] = {
+ .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_1, .start = 0, .end = 31,
+ },
+ [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_4] = {
+ .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_2, .start = 32, .end = 63,
+ },
+ [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_5] = {
+ .hw_field = DR_STE_V0_ACTION_MDFY_FLD_REG_2, .start = 0, .end = 31,
+ },
+ [MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM] = {
+ .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_1, .start = 32, .end = 63,
+ },
+ [MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM] = {
+ .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L4_1, .start = 0, .end = 31,
+ },
+ [MLX5_ACTION_IN_FIELD_OUT_FIRST_VID] = {
+ .hw_field = DR_STE_V0_ACTION_MDFY_FLD_L2_2, .start = 0, .end = 15,
+ },
+};
+
+static void dr_ste_v0_set_entry_type(uint8_t *hw_ste_p, uint8_t entry_type)
+{
+ DR_STE_SET(general, hw_ste_p, entry_type, entry_type);
+}
+
+static uint8_t dr_ste_v0_get_entry_type(uint8_t *hw_ste_p)
+{
+ return DR_STE_GET(general, hw_ste_p, entry_type);
+}
+
+static void dr_ste_v0_set_miss_addr(uint8_t *hw_ste_p, uint64_t miss_addr)
+{
+ uint64_t index = miss_addr >> 6;
+
+ /* The miss address for TX and RX STEs is located at the same offsets */
+ DR_STE_SET(rx_steering_mult, hw_ste_p, miss_address_39_32, index >> 26);
+ DR_STE_SET(rx_steering_mult, hw_ste_p, miss_address_31_6, index);
+}
+
+static uint64_t dr_ste_v0_get_miss_addr(uint8_t *hw_ste_p)
+{
+ uint64_t index =
+ (DR_STE_GET(rx_steering_mult, hw_ste_p, miss_address_31_6) |
+ DR_STE_GET(rx_steering_mult, hw_ste_p, miss_address_39_32) << 26);
+
+ return index << 6;
+}
+
+static void dr_ste_v0_set_byte_mask(uint8_t *hw_ste_p, uint16_t byte_mask)
+{
+ DR_STE_SET(general, hw_ste_p, byte_mask, byte_mask);
+}
+
+static uint16_t dr_ste_v0_get_byte_mask(uint8_t *hw_ste_p)
+{
+ return DR_STE_GET(general, hw_ste_p, byte_mask);
+}
+
+static void dr_ste_v0_set_lu_type(uint8_t *hw_ste_p, uint16_t lu_type)
+{
+ DR_STE_SET(general, hw_ste_p, entry_sub_type, lu_type);
+}
+
+static void dr_ste_v0_set_next_lu_type(uint8_t *hw_ste_p, uint16_t lu_type)
+{
+ DR_STE_SET(general, hw_ste_p, next_lu_type, lu_type);
+}
+
+static uint16_t dr_ste_v0_get_next_lu_type(uint8_t *hw_ste_p)
+{
+ return DR_STE_GET(general, hw_ste_p, next_lu_type);
+}
+
+static void dr_ste_v0_set_hit_addr(uint8_t *hw_ste_p, uint64_t icm_addr, uint32_t ht_size)
+{
+ uint64_t index = (icm_addr >> 5) | ht_size;
+
+ DR_STE_SET(general, hw_ste_p, next_table_base_39_32_size, index >> 27);
+ DR_STE_SET(general, hw_ste_p, next_table_base_31_5_size, index);
+}
+
+static void dr_ste_v0_init(uint8_t *hw_ste_p, uint16_t lu_type,
+ uint8_t entry_type, uint16_t gvmi)
+{
+ dr_ste_v0_set_entry_type(hw_ste_p, entry_type);
+ dr_ste_v0_set_lu_type(hw_ste_p, lu_type);
+ dr_ste_v0_set_next_lu_type(hw_ste_p, DR_STE_LU_TYPE_DONT_CARE);
+
+ DR_STE_SET(rx_steering_mult, hw_ste_p, gvmi, gvmi);
+ DR_STE_SET(rx_steering_mult, hw_ste_p, next_table_base_63_48, gvmi);
+ DR_STE_SET(rx_steering_mult, hw_ste_p, miss_address_63_48, gvmi);
+}
+
+static void dr_ste_v0_set_rx_flow_tag(uint8_t *hw_ste_p, uint32_t flow_tag)
+{
+ DR_STE_SET(rx_steering_mult, hw_ste_p, qp_list_pointer,
+ DR_STE_ENABLE_FLOW_TAG | flow_tag);
+}
+
+static void dr_ste_v0_set_counter_id(uint8_t *hw_ste_p, uint32_t ctr_id)
+{
+ /* This can be used for both rx_steering_mult and sx_transmit */
+ DR_STE_SET(rx_steering_mult, hw_ste_p, counter_trigger_15_0, ctr_id);
+ DR_STE_SET(rx_steering_mult, hw_ste_p, counter_trigger_23_16, ctr_id >> 16);
+}
+
+static void dr_ste_v0_set_tx_encap(void *hw_ste_p, uint32_t reformat_id,
+ int size, bool encap_l3)
+{
+ DR_STE_SET(sx_transmit, hw_ste_p, action_type,
+ encap_l3 ? DR_STE_ACTION_TYPE_ENCAP_L3 : DR_STE_ACTION_TYPE_ENCAP);
+ /* The hardware expects the size here in 2-byte words */
+ DR_STE_SET(sx_transmit, hw_ste_p, action_description, size / 2);
+ DR_STE_SET(sx_transmit, hw_ste_p, encap_pointer_vlan_data, reformat_id);
+}
+
+static void dr_ste_v0_set_rx_decap(uint8_t *hw_ste_p)
+{
+ DR_STE_SET(rx_steering_mult, hw_ste_p, tunneling_action,
+ DR_STE_TUNL_ACTION_DECAP);
+}
+
+static void dr_ste_v0_set_rx_decap_l3(uint8_t *hw_ste_p, bool vlan)
+{
+ DR_STE_SET(rx_steering_mult, hw_ste_p, tunneling_action,
+ DR_STE_TUNL_ACTION_L3_DECAP);
+ DR_STE_SET(modify_packet, hw_ste_p, action_description, vlan ? 1 : 0);
+}
+
+static void dr_ste_v0_set_rewrite_actions(uint8_t *hw_ste_p,
+ uint16_t num_of_actions,
+ uint32_t re_write_index)
+{
+ DR_STE_SET(modify_packet, hw_ste_p, number_of_re_write_actions,
+ num_of_actions);
+ DR_STE_SET(modify_packet, hw_ste_p, header_re_write_actions_pointer,
+ re_write_index);
+}
+
+static inline void dr_ste_v0_arr_init_next(uint8_t **last_ste,
+ uint32_t *added_stes,
+ enum dr_ste_entry_type entry_type,
+ uint16_t gvmi)
+{
+ (*added_stes)++;
+ *last_ste += DR_STE_SIZE;
+ dr_ste_v0_init(*last_ste, DR_STE_LU_TYPE_DONT_CARE, entry_type, gvmi);
+}
+
+static void dr_ste_v0_set_actions_tx(uint8_t *action_type_set,
+ uint8_t *last_ste,
+ struct dr_ste_actions_attr *attr,
+ uint32_t *added_stes)
+{
+ /* We want to make sure the modify header comes before L2
+ * encapsulation, because modify headers are supported
+ * for outer headers only
+ */
+ if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) {
+ dr_ste_v0_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT);
+ dr_ste_v0_set_rewrite_actions(last_ste,
+ attr->modify_actions,
+ attr->modify_index);
+ }
+
+ if (action_type_set[DR_ACTION_TYP_L2_TO_TNL_L2] ||
+ action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]) {
+ /* Modify header and encapsulation require different STEs,
+ * since the modify header STE format doesn't support the
+ * encapsulation tunneling_action.
+ */ + if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) + dr_ste_v0_arr_init_next(&last_ste, + added_stes, + DR_STE_TYPE_TX, + attr->gvmi); + + dr_ste_v0_set_tx_encap(last_ste, + attr->reformat_id, + attr->reformat_size, + action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]); + } + + if (action_type_set[DR_ACTION_TYP_CTR]) + dr_ste_v0_set_counter_id(last_ste, attr->ctr_id); + + dr_ste_v0_set_hit_addr(last_ste, attr->final_icm_addr, 1); +} + +static void dr_ste_v0_set_actions_rx(uint8_t *action_type_set, + uint8_t *last_ste, + struct dr_ste_actions_attr *attr, + uint32_t *added_stes) +{ + if (action_type_set[DR_ACTION_TYP_CTR]) + dr_ste_v0_set_counter_id(last_ste, attr->ctr_id); + + if (action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) { + dr_ste_v0_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT); + dr_ste_v0_set_rx_decap_l3(last_ste, attr->decap_with_vlan); + dr_ste_v0_set_rewrite_actions(last_ste, + attr->decap_actions, + attr->decap_index); + } + + if (action_type_set[DR_ACTION_TYP_TNL_L2_TO_L2]) + dr_ste_v0_set_rx_decap(last_ste); + + if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) { + if (dr_ste_v0_get_entry_type(last_ste) == DR_STE_TYPE_MODIFY_PKT) + dr_ste_v0_arr_init_next(&last_ste, + added_stes, + DR_STE_TYPE_MODIFY_PKT, + attr->gvmi); + else + dr_ste_v0_set_entry_type(last_ste, DR_STE_TYPE_MODIFY_PKT); + + dr_ste_v0_set_rewrite_actions(last_ste, + attr->modify_actions, + attr->modify_index); + } + + if (action_type_set[DR_ACTION_TYP_TAG]) { + if (dr_ste_v0_get_entry_type(last_ste) == DR_STE_TYPE_MODIFY_PKT) + dr_ste_v0_arr_init_next(&last_ste, + added_stes, + DR_STE_TYPE_RX, + attr->gvmi); + + dr_ste_v0_set_rx_flow_tag(last_ste, attr->flow_tag); + } + + dr_ste_v0_set_hit_addr(last_ste, attr->final_icm_addr, 1); +} + +static void dr_ste_v0_set_action_set(uint8_t *hw_action, + uint8_t hw_field, + uint8_t shifter, + uint8_t length, + uint32_t data) +{ + length = (length == 32) ? 0 : length; + DEVX_SET(dr_action_hw_set, hw_action, opcode, DR_STE_ACTION_MDFY_OP_SET); + DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, hw_field); + DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, shifter); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, length); + DEVX_SET(dr_action_hw_set, hw_action, inline_data, data); +} + +static void dr_ste_v0_set_action_add(uint8_t *hw_action, + uint8_t hw_field, + uint8_t shifter, + uint8_t length, + uint32_t data) +{ + length = (length == 32) ? 
0 : length; + DEVX_SET(dr_action_hw_set, hw_action, opcode, DR_STE_ACTION_MDFY_OP_ADD); + DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, hw_field); + DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, shifter); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, length); + DEVX_SET(dr_action_hw_set, hw_action, inline_data, data); +} + +static void dr_ste_v0_set_action_copy(uint8_t *hw_action, + uint8_t dst_hw_field, + uint8_t dst_shifter, + uint8_t dst_len, + uint8_t src_hw_field, + uint8_t src_shifter) +{ + DEVX_SET(dr_action_hw_copy, hw_action, opcode, DR_STE_ACTION_MDFY_OP_COPY); + DEVX_SET(dr_action_hw_copy, hw_action, destination_field_code, dst_hw_field); + DEVX_SET(dr_action_hw_copy, hw_action, destination_left_shifter, dst_shifter); + DEVX_SET(dr_action_hw_copy, hw_action, destination_length, dst_len); + DEVX_SET(dr_action_hw_copy, hw_action, source_field_code, src_hw_field); + DEVX_SET(dr_action_hw_copy, hw_action, source_left_shifter, src_shifter); +} + +#define DR_STE_DECAP_L3_MIN_ACTION_NUM 5 + +static int +dr_ste_v0_set_action_decap_l3_list(void *data, uint32_t data_sz, + uint8_t *hw_action, uint32_t hw_action_sz, + uint16_t *used_hw_action_num) +{ + struct mlx5_ifc_l2_hdr_bits *l2_hdr = data; + uint32_t hw_action_num; + int required_actions; + uint32_t hdr_fld_4b; + uint16_t hdr_fld_2b; + uint16_t vlan_type; + bool vlan; + + vlan = (data_sz != HDR_LEN_L2); + hw_action_num = hw_action_sz / DEVX_ST_SZ_BYTES(dr_action_hw_set); + required_actions = DR_STE_DECAP_L3_MIN_ACTION_NUM + !!vlan; + + if (hw_action_num < required_actions) { + errno = ENOMEM; + return errno; + } + + /* dmac_47_16 */ + DEVX_SET(dr_action_hw_set, hw_action, opcode, DR_STE_ACTION_MDFY_OP_SET); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, 0); + DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_0); + DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, 16); + hdr_fld_4b = DEVX_GET(l2_hdr, l2_hdr, dmac_47_16); + DEVX_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_4b); + hw_action += DEVX_ST_SZ_BYTES(dr_action_hw_set); + + /* smac_47_16 */ + DEVX_SET(dr_action_hw_set, hw_action, opcode, DR_STE_ACTION_MDFY_OP_SET); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, 0); + DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_1); + DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, 16); + hdr_fld_4b = (DEVX_GET(l2_hdr, l2_hdr, smac_31_0) >> 16 | + DEVX_GET(l2_hdr, l2_hdr, smac_47_32) << 16); + DEVX_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_4b); + hw_action += DEVX_ST_SZ_BYTES(dr_action_hw_set); + + /* dmac_15_0 */ + DEVX_SET(dr_action_hw_set, hw_action, opcode, DR_STE_ACTION_MDFY_OP_SET); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, 16); + DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_0); + DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, 0); + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, dmac_15_0); + DEVX_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_2b); + hw_action += DEVX_ST_SZ_BYTES(dr_action_hw_set); + + /* ethertype + (optional) vlan */ + DEVX_SET(dr_action_hw_set, hw_action, opcode, DR_STE_ACTION_MDFY_OP_SET); + DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_2); + DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, 32); + if (!vlan) { + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, ethertype); + 
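/* no VLAN present: write only the 16-bit ethertype */
+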
DEVX_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_2b); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, 16); + } else { + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, ethertype); + vlan_type = hdr_fld_2b == SVLAN_ETHERTYPE ? DR_STE_SVLAN : DR_STE_CVLAN; + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, vlan); + hdr_fld_4b = (vlan_type << 16) | hdr_fld_2b; + DEVX_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_4b); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, 18); + } + hw_action += DEVX_ST_SZ_BYTES(dr_action_hw_set); + + /* smac_15_0 */ + DEVX_SET(dr_action_hw_set, hw_action, opcode, DR_STE_ACTION_MDFY_OP_SET); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, 16); + DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_1); + DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, 0); + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, smac_31_0); + DEVX_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_2b); + hw_action += DEVX_ST_SZ_BYTES(dr_action_hw_set); + + if (vlan) { + DEVX_SET(dr_action_hw_set, hw_action, opcode, DR_STE_ACTION_MDFY_OP_SET); + hdr_fld_2b = DEVX_GET(l2_hdr, l2_hdr, vlan_type); + DEVX_SET(dr_action_hw_set, hw_action, inline_data, hdr_fld_2b); + DEVX_SET(dr_action_hw_set, hw_action, destination_length, 16); + DEVX_SET(dr_action_hw_set, hw_action, destination_field_code, DR_STE_V0_ACTION_MDFY_FLD_L2_2); + DEVX_SET(dr_action_hw_set, hw_action, destination_left_shifter, 0); + } + + *used_hw_action_num = required_actions; + + return 0; +} + +static void dr_ste_v0_build_eth_l2_src_dst_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_dst, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst, bit_mask, dmac_15_0, mask, dmac_15_0); + + if (mask->smac_47_16 || mask->smac_15_0) { + DR_STE_SET(eth_l2_src_dst, bit_mask, smac_47_32, + mask->smac_47_16 >> 16); + DR_STE_SET(eth_l2_src_dst, bit_mask, smac_31_0, + mask->smac_47_16 << 16 | mask->smac_15_0); + mask->smac_47_16 = 0; + mask->smac_15_0 = 0; + } + + DR_STE_SET_TAG(eth_l2_src_dst, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_src_dst, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_src_dst, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_ONES(eth_l2_src_dst, bit_mask, l3_type, mask, ip_version); + + if (mask->cvlan_tag) { + DR_STE_SET(eth_l2_src_dst, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + } else if (mask->svlan_tag) { + DR_STE_SET(eth_l2_src_dst, bit_mask, first_vlan_qualifier, -1); + mask->svlan_tag = 0; + } +} + +static int dr_ste_v0_build_eth_l2_src_dst_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_dst, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst, tag, dmac_15_0, spec, dmac_15_0); + + if (spec->smac_47_16 || spec->smac_15_0) { + DR_STE_SET(eth_l2_src_dst, tag, smac_47_32, + spec->smac_47_16 >> 16); + DR_STE_SET(eth_l2_src_dst, tag, smac_31_0, + spec->smac_47_16 << 16 | spec->smac_15_0); + spec->smac_47_16 = 0; + spec->smac_15_0 = 0; + } + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + DR_STE_SET(eth_l2_src_dst, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + DR_STE_SET(eth_l2_src_dst, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + errno = EINVAL; + return errno; + } + } + + DR_STE_SET_TAG(eth_l2_src_dst, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_src_dst, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_src_dst, tag, first_priority, spec, first_prio); + + if (spec->cvlan_tag) { + DR_STE_SET(eth_l2_src_dst, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + DR_STE_SET(eth_l2_src_dst, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + return 0; +} + +static void dr_ste_v0_build_eth_l2_src_dst_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_eth_l2_src_dst_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_SRC_DST, sb->rx, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l2_src_dst_tag; +} + +static int dr_ste_v0_build_eth_l3_ipv6_dst_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_127_96, spec, dst_ip_127_96); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_95_64, spec, dst_ip_95_64); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_63_32, spec, dst_ip_63_32); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_31_0, spec, dst_ip_31_0); + + return 0; +} + +static void dr_ste_v0_build_eth_l3_ipv6_dst_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_eth_l3_ipv6_dst_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV6_DST, sb->rx, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l3_ipv6_dst_tag; +} + +static int dr_ste_v0_build_eth_l3_ipv6_src_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_127_96, spec, src_ip_127_96); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_95_64, spec, src_ip_95_64); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_63_32, spec, src_ip_63_32); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_31_0, spec, src_ip_31_0); + + return 0; +} + +static void dr_ste_v0_build_eth_l3_ipv6_src_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_eth_l3_ipv6_src_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV6_SRC, sb->rx, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l3_ipv6_src_tag; +} + +static int dr_ste_v0_build_eth_l3_ipv4_5_tuple_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_address, spec, dst_ip_31_0); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_address, spec, src_ip_31_0); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_port, spec, tcp_dport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, destination_port, spec, udp_dport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_port, spec, tcp_sport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, source_port, spec, udp_sport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, protocol, spec, ip_protocol); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, fragmented, spec, frag); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, dscp, spec, ip_dscp); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple, tag, ecn, spec, ip_ecn); + + if (spec->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l3_ipv4_5_tuple, tag, spec); + spec->tcp_flags = 0; + } + + return 0; +} + +static void dr_ste_v0_build_eth_l3_ipv4_5_tuple_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_eth_l3_ipv4_5_tuple_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV4_5_TUPLE, sb->rx, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l3_ipv4_5_tuple_tag; +} + +static void dr_ste_v0_build_eth_l2_src_or_dst_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_TAG(eth_l2_src, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_src, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_src, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_TAG(eth_l2_src, bit_mask, ip_fragmented, mask, frag); + DR_STE_SET_TAG(eth_l2_src, bit_mask, l3_ethertype, mask, ethertype); + DR_STE_SET_ONES(eth_l2_src, bit_mask, l3_type, mask, ip_version); + + if (mask->svlan_tag || mask->cvlan_tag) { + DR_STE_SET(eth_l2_src, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + mask->svlan_tag = 0; + } + + if (inner) { + if (misc_mask->inner_second_cvlan_tag || + misc_mask->inner_second_svlan_tag) { + DR_STE_SET(eth_l2_src, bit_mask, second_vlan_qualifier, -1); + misc_mask->inner_second_cvlan_tag = 0; + misc_mask->inner_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src, bit_mask, second_vlan_id, misc_mask, inner_second_vid); + DR_STE_SET_TAG(eth_l2_src, bit_mask, second_cfi, misc_mask, inner_second_cfi); + DR_STE_SET_TAG(eth_l2_src, bit_mask, second_priority, misc_mask, inner_second_prio); + } else { + if (misc_mask->outer_second_cvlan_tag || + misc_mask->outer_second_svlan_tag) { + DR_STE_SET(eth_l2_src, bit_mask, second_vlan_qualifier, -1); + misc_mask->outer_second_cvlan_tag = 0; + misc_mask->outer_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src, bit_mask, second_vlan_id, misc_mask, outer_second_vid); + DR_STE_SET_TAG(eth_l2_src, bit_mask, second_cfi, misc_mask, outer_second_cfi); + DR_STE_SET_TAG(eth_l2_src, bit_mask, second_priority, misc_mask, outer_second_prio); + } +} + +static int dr_ste_v0_build_eth_l2_src_or_dst_tag(struct dr_match_param *value, + bool inner, uint8_t *tag) +{ + struct dr_match_spec *spec = inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc_spec = &value->misc; + + DR_STE_SET_TAG(eth_l2_src, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_src, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_src, tag, first_priority, spec, first_prio); + DR_STE_SET_TAG(eth_l2_src, tag, ip_fragmented, spec, frag); + DR_STE_SET_TAG(eth_l2_src, tag, l3_ethertype, spec, ethertype); + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + DR_STE_SET(eth_l2_src, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + DR_STE_SET(eth_l2_src, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + errno = EINVAL; + return errno; + } + } + + if (spec->cvlan_tag) { + DR_STE_SET(eth_l2_src, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + DR_STE_SET(eth_l2_src, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + + if (inner) { + if (misc_spec->inner_second_cvlan_tag) { + DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_CVLAN); + misc_spec->inner_second_cvlan_tag = 0; + } else if (misc_spec->inner_second_svlan_tag) { + DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_SVLAN); + misc_spec->inner_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src, tag, second_vlan_id, misc_spec, inner_second_vid); + DR_STE_SET_TAG(eth_l2_src, tag, second_cfi, misc_spec, inner_second_cfi); + DR_STE_SET_TAG(eth_l2_src, tag, second_priority, misc_spec, inner_second_prio); + } else { + if (misc_spec->outer_second_cvlan_tag) { + DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_CVLAN); + misc_spec->outer_second_cvlan_tag = 0; + } else if (misc_spec->outer_second_svlan_tag) { + DR_STE_SET(eth_l2_src, tag, second_vlan_qualifier, DR_STE_SVLAN); + misc_spec->outer_second_svlan_tag = 0; + } + DR_STE_SET_TAG(eth_l2_src, tag, second_vlan_id, misc_spec, outer_second_vid); + DR_STE_SET_TAG(eth_l2_src, tag, second_cfi, misc_spec, outer_second_cfi); + DR_STE_SET_TAG(eth_l2_src, tag, second_priority, misc_spec, outer_second_prio); + } + + return 0; +} + +static void dr_ste_v0_build_eth_l2_src_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src, bit_mask, smac_47_16, mask, smac_47_16); + DR_STE_SET_TAG(eth_l2_src, bit_mask, smac_15_0, mask, smac_15_0); + + dr_ste_v0_build_eth_l2_src_or_dst_bit_mask(value, inner, bit_mask); +} + +static int dr_ste_v0_build_eth_l2_src_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src, tag, smac_47_16, spec, smac_47_16); + DR_STE_SET_TAG(eth_l2_src, tag, smac_15_0, spec, smac_15_0); + + return dr_ste_v0_build_eth_l2_src_or_dst_tag(value, sb->inner, tag); +} + +static void dr_ste_v0_build_eth_l2_src_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_eth_l2_src_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_SRC, sb->rx, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l2_src_tag; +} + +static void dr_ste_v0_build_eth_l2_dst_bit_mask(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *bit_mask) +{ + struct dr_match_spec *mask = sb->inner ? 
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_dst, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_dst, bit_mask, dmac_15_0, mask, dmac_15_0); + + dr_ste_v0_build_eth_l2_src_or_dst_bit_mask(value, sb->inner, bit_mask); +} + +static int dr_ste_v0_build_eth_l2_dst_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_dst, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_dst, tag, dmac_15_0, spec, dmac_15_0); + + return dr_ste_v0_build_eth_l2_src_or_dst_tag(value, sb->inner, tag); +} + +static void dr_ste_v0_build_eth_l2_dst_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_eth_l2_dst_bit_mask(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL2_DST, sb->rx, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l2_dst_tag; +} + +static void dr_ste_v0_build_eth_l2_tnl_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, dmac_15_0, mask, dmac_15_0); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, ip_fragmented, mask, frag); + DR_STE_SET_TAG(eth_l2_tnl, bit_mask, l3_ethertype, mask, ethertype); + DR_STE_SET_ONES(eth_l2_tnl, bit_mask, l3_type, mask, ip_version); + + if (misc->vxlan_vni) { + DR_STE_SET(eth_l2_tnl, bit_mask, l2_tunneling_network_id, (misc->vxlan_vni << 8)); + misc->vxlan_vni = 0; + } + + if (mask->svlan_tag || mask->cvlan_tag) { + DR_STE_SET(eth_l2_tnl, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + mask->svlan_tag = 0; + } +} + +static int dr_ste_v0_build_eth_l2_tnl_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l2_tnl, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_tnl, tag, dmac_15_0, spec, dmac_15_0); + DR_STE_SET_TAG(eth_l2_tnl, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_tnl, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_tnl, tag, ip_fragmented, spec, frag); + DR_STE_SET_TAG(eth_l2_tnl, tag, first_priority, spec, first_prio); + DR_STE_SET_TAG(eth_l2_tnl, tag, l3_ethertype, spec, ethertype); + + if (misc->vxlan_vni) { + DR_STE_SET(eth_l2_tnl, tag, l2_tunneling_network_id, + (misc->vxlan_vni << 8)); + misc->vxlan_vni = 0; + } + + if (spec->cvlan_tag) { + DR_STE_SET(eth_l2_tnl, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + DR_STE_SET(eth_l2_tnl, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + DR_STE_SET(eth_l2_tnl, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + DR_STE_SET(eth_l2_tnl, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + errno = EINVAL; + return errno; + } + } + + return 0; +} + +static void dr_ste_v0_build_eth_l2_tnl_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_eth_l2_tnl_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_ETHL2_TUNNELING_I; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l2_tnl_tag; +} + +static int dr_ste_v0_build_eth_l3_ipv4_misc_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv4_misc, tag, time_to_live, spec, ip_ttl_hoplimit); + + return 0; +} + +static void dr_ste_v0_build_eth_l3_ipv4_misc_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_eth_l3_ipv4_misc_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL3_IPV4_MISC, sb->rx, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l3_ipv4_misc_tag; +} + +static int dr_ste_v0_build_eth_ipv6_l3_l4_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l4, tag, dst_port, spec, tcp_dport); + DR_STE_SET_TAG(eth_l4, tag, src_port, spec, tcp_sport); + DR_STE_SET_TAG(eth_l4, tag, dst_port, spec, udp_dport); + DR_STE_SET_TAG(eth_l4, tag, src_port, spec, udp_sport); + DR_STE_SET_TAG(eth_l4, tag, protocol, spec, ip_protocol); + DR_STE_SET_TAG(eth_l4, tag, fragmented, spec, frag); + DR_STE_SET_TAG(eth_l4, tag, dscp, spec, ip_dscp); + DR_STE_SET_TAG(eth_l4, tag, ecn, spec, ip_ecn); + DR_STE_SET_TAG(eth_l4, tag, ipv6_hop_limit, spec, ip_ttl_hoplimit); + + if (sb->inner) + DR_STE_SET_TAG(eth_l4, tag, flow_label, misc, inner_ipv6_flow_label); + else + DR_STE_SET_TAG(eth_l4, tag, flow_label, misc, outer_ipv6_flow_label); + + if (spec->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l4, tag, spec); + spec->tcp_flags = 0; + } + + return 0; +} + +static void dr_ste_v0_build_eth_ipv6_l3_l4_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_eth_ipv6_l3_l4_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL4, sb->rx, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_ipv6_l3_l4_tag; +} + +static int dr_ste_v0_build_mpls_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc2 *misc2 = &value->misc2; + + if (sb->inner) + DR_STE_SET_MPLS(mpls, misc2, inner, tag); + else + DR_STE_SET_MPLS(mpls, misc2, outer, tag); + + return 0; +} + +static void dr_ste_v0_build_mpls_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_mpls_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(MPLS_FIRST, sb->rx, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_mpls_tag; +} + +static int dr_ste_v0_build_tnl_gre_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(gre, tag, gre_protocol, misc, gre_protocol); + DR_STE_SET_TAG(gre, tag, gre_k_present, misc, gre_k_present); + DR_STE_SET_TAG(gre, tag, gre_key_h, misc, gre_key_h); + DR_STE_SET_TAG(gre, tag, gre_key_l, misc, gre_key_l); + + DR_STE_SET_TAG(gre, tag, gre_c_present, misc, gre_c_present); + DR_STE_SET_TAG(gre, tag, gre_s_present, misc, gre_s_present); + + return 0; +} + +static void dr_ste_v0_build_tnl_gre_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_tnl_gre_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_GRE; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_tnl_gre_tag; +} + +static int dr_ste_v0_build_tnl_mpls_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc2 *misc2 = &value->misc2; + + if (DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(misc2)) { + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_label, + misc2, outer_first_mpls_over_gre_label); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_exp, + misc2, outer_first_mpls_over_gre_exp); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_s_bos, + misc2, outer_first_mpls_over_gre_s_bos); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_ttl, + misc2, outer_first_mpls_over_gre_ttl); + } else { + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_label, + misc2, outer_first_mpls_over_udp_label); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_exp, + misc2, outer_first_mpls_over_udp_exp); 
+ + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_s_bos, + misc2, outer_first_mpls_over_udp_s_bos); + + DR_STE_SET_TAG(flex_parser_0, tag, parser_3_ttl, + misc2, outer_first_mpls_over_udp_ttl); + } + + return 0; +} + +static void dr_ste_v0_build_tnl_mpls_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_tnl_mpls_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_0; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_tnl_mpls_tag; +} + +#define ICMP_TYPE_OFFSET_FIRST_DW 24 +#define ICMP_CODE_OFFSET_FIRST_DW 16 + +static int dr_ste_v0_build_icmp_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc3 *misc3 = &value->misc3; + bool is_ipv4 = DR_MASK_IS_ICMPV4_SET(misc3); + uint32_t *icmp_header_data; + uint8_t *icmp_type; + uint8_t *icmp_code; + int dw0_location; + int dw1_location; + + if (is_ipv4) { + icmp_header_data = &misc3->icmpv4_header_data; + icmp_type = &misc3->icmpv4_type; + icmp_code = &misc3->icmpv4_code; + dw0_location = sb->caps->flex_parser_id_icmp_dw0; + dw1_location = sb->caps->flex_parser_id_icmp_dw1; + } else { + icmp_header_data = &misc3->icmpv6_header_data; + icmp_type = &misc3->icmpv6_type; + icmp_code = &misc3->icmpv6_code; + dw0_location = sb->caps->flex_parser_id_icmpv6_dw0; + dw1_location = sb->caps->flex_parser_id_icmpv6_dw1; + } + + switch (dw0_location) { + case 4: + DR_STE_SET(flex_parser_1, tag, flex_parser_4, + (*icmp_type << ICMP_TYPE_OFFSET_FIRST_DW) | + (*icmp_code << ICMP_CODE_OFFSET_FIRST_DW)); + + *icmp_type = 0; + *icmp_code = 0; + break; + default: + errno = ENOTSUP; + return errno; + } + + switch (dw1_location) { + case 5: + DR_STE_SET(flex_parser_1, tag, flex_parser_5, *icmp_header_data); + *icmp_header_data = 0; + break; + default: + errno = ENOTSUP; + return errno; + } + + return 0; +} + +static int dr_ste_v0_build_icmp_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + int ret; + + ret = dr_ste_v0_build_icmp_tag(mask, sb, sb->bit_mask); + if (ret) + return ret; + + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_1; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_icmp_tag; + + return 0; +} + +static int dr_ste_v0_build_general_purpose_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(general_purpose, tag, general_purpose_lookup_field, + misc2, metadata_reg_a); + + return 0; +} + +static void dr_ste_v0_build_general_purpose_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_general_purpose_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_GENERAL_PURPOSE; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_general_purpose_tag; +} + +static int dr_ste_v0_build_eth_l4_misc_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc3 *misc3 = &value->misc3; + + if (sb->inner) { + DR_STE_SET_TAG(eth_l4_misc, tag, seq_num, misc3, inner_tcp_seq_num); + DR_STE_SET_TAG(eth_l4_misc, tag, ack_num, misc3, inner_tcp_ack_num); + } else { + DR_STE_SET_TAG(eth_l4_misc, tag, seq_num, misc3, outer_tcp_seq_num); + DR_STE_SET_TAG(eth_l4_misc, tag, ack_num, misc3, outer_tcp_ack_num); + } + + return 0; +} + +static void dr_ste_v0_build_eth_l4_misc_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + 
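/* running the tag builder on the mask fills sb->bit_mask */
+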
dr_ste_v0_build_eth_l4_misc_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_LU_TYPE(ETHL4_MISC, sb->rx, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_eth_l4_misc_tag; +} + +static int +dr_ste_v0_build_flex_parser_tnl_vxlan_gpe_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc3 *misc3 = &value->misc3; + + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_flags, misc3, + outer_vxlan_gpe_flags); + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_next_protocol, misc3, + outer_vxlan_gpe_next_protocol); + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_vni, misc3, + outer_vxlan_gpe_vni); + + return 0; +} + +static void +dr_ste_v0_build_flex_parser_tnl_vxlan_gpe_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_flex_parser_tnl_vxlan_gpe_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_flex_parser_tnl_vxlan_gpe_tag; +} + +static int +dr_ste_v0_build_flex_parser_tnl_geneve_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_protocol_type, misc, geneve_protocol_type); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_oam, misc, geneve_oam); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_opt_len, misc, geneve_opt_len); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_vni, misc, geneve_vni); + + return 0; +} + +static void +dr_ste_v0_build_flex_parser_tnl_geneve_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_flex_parser_tnl_geneve_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_flex_parser_tnl_geneve_tag; +} + +static int dr_ste_v0_build_flex_parser_tnl_gtpu_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc3 *misc3 = &value->misc3; + + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_flags, misc3, + gtpu_flags); + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_msg_type, misc3, + gtpu_msg_type); + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_teid, misc3, + gtpu_teid); + + return 0; +} + +static void dr_ste_v0_build_flex_parser_tnl_gtpu_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_flex_parser_tnl_gtpu_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_flex_parser_tnl_gtpu_tag; +} + +static int dr_ste_v0_build_register_0_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(register_0, tag, register_0_h, misc2, metadata_reg_c_0); + DR_STE_SET_TAG(register_0, tag, register_0_l, misc2, metadata_reg_c_1); + DR_STE_SET_TAG(register_0, tag, register_1_h, misc2, metadata_reg_c_2); + DR_STE_SET_TAG(register_0, tag, register_1_l, misc2, metadata_reg_c_3); + + return 0; +} + +static void dr_ste_v0_build_register_0_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + 
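+	/*
+	 * STEERING_REGISTERS_0 covers metadata registers C0-C3; the tag
+	 * builder above packs them into the high/low halves of register_0
+	 * and register_1.
+	 */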
dr_ste_v0_build_register_0_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_STEERING_REGISTERS_0; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_register_0_tag; +} + +static int dr_ste_v0_build_register_1_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(register_1, tag, register_2_h, misc2, metadata_reg_c_4); + DR_STE_SET_TAG(register_1, tag, register_2_l, misc2, metadata_reg_c_5); + DR_STE_SET_TAG(register_1, tag, register_3_h, misc2, metadata_reg_c_6); + DR_STE_SET_TAG(register_1, tag, register_3_l, misc2, metadata_reg_c_7); + + return 0; +} + +static void dr_ste_v0_build_register_1_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_register_1_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_STEERING_REGISTERS_1; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_register_1_tag; +} + +static void dr_ste_v0_build_src_gvmi_qpn_bit_mask(struct dr_match_param *value, + uint8_t *bit_mask) +{ + struct dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_ONES(src_gvmi_qp, bit_mask, source_gvmi, misc_mask, source_port); + DR_STE_SET_ONES(src_gvmi_qp, bit_mask, source_qp, misc_mask, source_sqn); +} + +static int dr_ste_v0_build_src_gvmi_qpn_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc *misc = &value->misc; + struct dr_devx_vport_cap *vport_cap; + uint8_t *bit_mask = sb->bit_mask; + bool source_gvmi_set; + + DR_STE_SET_TAG(src_gvmi_qp, tag, source_qp, misc, source_sqn); + + source_gvmi_set = DR_STE_GET(src_gvmi_qp, bit_mask, source_gvmi); + if (source_gvmi_set) { + vport_cap = dr_get_vport_cap(sb->caps, misc->source_port); + if (!vport_cap) + return errno; + + if (vport_cap->gvmi) + DR_STE_SET(src_gvmi_qp, tag, source_gvmi, vport_cap->gvmi); + + misc->source_port = 0; + } + + return 0; +} + +static void dr_ste_v0_build_src_gvmi_qpn_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v0_build_src_gvmi_qpn_bit_mask(mask, sb->bit_mask); + + sb->lu_type = DR_STE_V0_LU_TYPE_SRC_GVMI_AND_QP; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v0_build_src_gvmi_qpn_tag; +} + +static struct dr_ste_ctx ste_ctx_v0 = { + /* Builders */ + .build_eth_l2_src_dst_init = &dr_ste_v0_build_eth_l2_src_dst_init, + .build_eth_l3_ipv6_src_init = &dr_ste_v0_build_eth_l3_ipv6_src_init, + .build_eth_l3_ipv6_dst_init = &dr_ste_v0_build_eth_l3_ipv6_dst_init, + .build_eth_l3_ipv4_5_tuple_init = &dr_ste_v0_build_eth_l3_ipv4_5_tuple_init, + .build_eth_l2_src_init = &dr_ste_v0_build_eth_l2_src_init, + .build_eth_l2_dst_init = &dr_ste_v0_build_eth_l2_dst_init, + .build_eth_l2_tnl_init = &dr_ste_v0_build_eth_l2_tnl_init, + .build_eth_l3_ipv4_misc_init = &dr_ste_v0_build_eth_l3_ipv4_misc_init, + .build_eth_ipv6_l3_l4_init = &dr_ste_v0_build_eth_ipv6_l3_l4_init, + .build_mpls_init = &dr_ste_v0_build_mpls_init, + .build_tnl_gre_init = &dr_ste_v0_build_tnl_gre_init, + .build_tnl_mpls_init = &dr_ste_v0_build_tnl_mpls_init, + .build_icmp_init = &dr_ste_v0_build_icmp_init, + .build_general_purpose_init = &dr_ste_v0_build_general_purpose_init, + .build_eth_l4_misc_init = &dr_ste_v0_build_eth_l4_misc_init, + .build_tnl_vxlan_gpe_init = &dr_ste_v0_build_flex_parser_tnl_vxlan_gpe_init, + .build_tnl_geneve_init = 
&dr_ste_v0_build_flex_parser_tnl_geneve_init, + .build_tnl_gtpu_init = &dr_ste_v0_build_flex_parser_tnl_gtpu_init, + .build_register_0_init = &dr_ste_v0_build_register_0_init, + .build_register_1_init = &dr_ste_v0_build_register_1_init, + .build_src_gvmi_qpn_init = &dr_ste_v0_build_src_gvmi_qpn_init, + /* Getters and Setters */ + .ste_init = &dr_ste_v0_init, + .set_next_lu_type = &dr_ste_v0_set_next_lu_type, + .get_next_lu_type = &dr_ste_v0_get_next_lu_type, + .set_miss_addr = &dr_ste_v0_set_miss_addr, + .get_miss_addr = &dr_ste_v0_get_miss_addr, + .set_hit_addr = &dr_ste_v0_set_hit_addr, + .set_byte_mask = &dr_ste_v0_set_byte_mask, + .get_byte_mask = &dr_ste_v0_get_byte_mask, + /* Actions */ + .set_actions_rx = &dr_ste_v0_set_actions_rx, + .set_actions_tx = &dr_ste_v0_set_actions_tx, + .modify_field_arr_sz = ARRAY_SIZE(dr_ste_v0_action_modify_field_arr), + .modify_field_arr = dr_ste_v0_action_modify_field_arr, + .set_action_set = &dr_ste_v0_set_action_set, + .set_action_add = &dr_ste_v0_set_action_add, + .set_action_copy = &dr_ste_v0_set_action_copy, + .set_action_decap_l3_list = &dr_ste_v0_set_action_decap_l3_list, +}; + +struct dr_ste_ctx *dr_ste_get_ctx_v0(void) +{ + return &ste_ctx_v0; +} diff --git a/providers/mlx5/dr_ste_v1.c b/providers/mlx5/dr_ste_v1.c new file mode 100644 index 0000000..68e22c1 --- /dev/null +++ b/providers/mlx5/dr_ste_v1.c @@ -0,0 +1,1585 @@ +/* + * Copyright (c) 2020, Mellanox Technologies. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "dr_ste.h" + +#define DR_STE_CALC_DFNR_TYPE(lookup_type, inner) \ + ((inner) ? 
DR_STE_V1_LU_TYPE_##lookup_type##_I : \ + DR_STE_V1_LU_TYPE_##lookup_type##_O) + +enum dr_ste_v1_entry_format { + DR_STE_V1_TYPE_BWC_BYTE = 0x0, + DR_STE_V1_TYPE_BWC_DW = 0x1, + DR_STE_V1_TYPE_MATCH = 0x2, +}; + +/* + * Lookup type is built from 2B: [ Definer mode 1B ][ Definer index 1B ] + */ +enum dr_ste_v1_lu_type { + DR_STE_V1_LU_TYPE_NOP = 0x0000, + DR_STE_V1_LU_TYPE_ETHL2_TNL = 0x0002, + DR_STE_V1_LU_TYPE_IBL3_EXT = 0x0102, + DR_STE_V1_LU_TYPE_ETHL2_O = 0x0003, + DR_STE_V1_LU_TYPE_IBL4 = 0x0103, + DR_STE_V1_LU_TYPE_ETHL2_I = 0x0004, + DR_STE_V1_LU_TYPE_SRC_QP_GVMI = 0x0104, + DR_STE_V1_LU_TYPE_ETHL2_SRC_O = 0x0005, + DR_STE_V1_LU_TYPE_ETHL2_HEADERS_O = 0x0105, + DR_STE_V1_LU_TYPE_ETHL2_SRC_I = 0x0006, + DR_STE_V1_LU_TYPE_ETHL2_HEADERS_I = 0x0106, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_5_TUPLE_O = 0x0007, + DR_STE_V1_LU_TYPE_IPV6_DES_O = 0x0107, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_5_TUPLE_I = 0x0008, + DR_STE_V1_LU_TYPE_IPV6_DES_I = 0x0108, + DR_STE_V1_LU_TYPE_ETHL4_O = 0x0009, + DR_STE_V1_LU_TYPE_IPV6_SRC_O = 0x0109, + DR_STE_V1_LU_TYPE_ETHL4_I = 0x000a, + DR_STE_V1_LU_TYPE_IPV6_SRC_I = 0x010a, + DR_STE_V1_LU_TYPE_ETHL2_SRC_DST_O = 0x000b, + DR_STE_V1_LU_TYPE_MPLS_O = 0x010b, + DR_STE_V1_LU_TYPE_ETHL2_SRC_DST_I = 0x000c, + DR_STE_V1_LU_TYPE_MPLS_I = 0x010c, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_MISC_O = 0x000d, + DR_STE_V1_LU_TYPE_GRE = 0x010d, + DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER = 0x000e, + DR_STE_V1_LU_TYPE_GENERAL_PURPOSE = 0x010e, + DR_STE_V1_LU_TYPE_ETHL3_IPV4_MISC_I = 0x000f, + DR_STE_V1_LU_TYPE_STEERING_REGISTERS_0 = 0x010f, + DR_STE_V1_LU_TYPE_STEERING_REGISTERS_1 = 0x0110, + DR_STE_V1_LU_TYPE_FLEX_PARSER_0 = 0x0111, + DR_STE_V1_LU_TYPE_FLEX_PARSER_1 = 0x0112, + DR_STE_V1_LU_TYPE_ETHL4_MISC_O = 0x0113, + DR_STE_V1_LU_TYPE_ETHL4_MISC_I = 0x0114, + DR_STE_V1_LU_TYPE_INVALID = 0x00ff, + DR_STE_V1_LU_TYPE_DONT_CARE = DR_STE_LU_TYPE_DONT_CARE, +}; + +enum dr_ste_v1_header_anchors { + DR_STE_HEADER_ANCHOR_START_OUTER = 0x00, + DR_STE_HEADER_ANCHOR_1ST_VLAN = 0x02, + DR_STE_HEADER_ANCHOR_IPV6_IPV4 = 0x07, + DR_STE_HEADER_ANCHOR_INNER_MAC = 0x13, + DR_STE_HEADER_ANCHOR_INNER_IPV6_IPV4 = 0x19, +}; + +enum dr_ste_v1_action_size { + DR_STE_ACTION_SINGLE_SZ = 4, + DR_STE_ACTION_DOUBLE_SZ = 8, +}; + +enum dr_ste_v1_action_insert_ptr_attr { + DR_STE_V1_ACTION_INSERT_PTR_ATTR_NONE = 0, /* Regular push header (e.g. 
push vlan) */ + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ENCAP = 1, /* Encapsulation / Tunneling */ + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ESP = 2, /* IPsec */ +}; + +enum dr_ste_v1_action_id { + DR_STE_V1_ACTION_ID_NOP = 0x00, + DR_STE_V1_ACTION_ID_COPY = 0x05, + DR_STE_V1_ACTION_ID_SET = 0x06, + DR_STE_V1_ACTION_ID_ADD = 0x07, + DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE = 0x08, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER = 0x09, + DR_STE_V1_ACTION_ID_INSERT_INLINE = 0x0a, + DR_STE_V1_ACTION_ID_INSERT_POINTER = 0x0b, + DR_STE_V1_ACTION_ID_FLOW_TAG = 0x0c, + DR_STE_V1_ACTION_ID_QUEUE_ID_SEL = 0x0d, + DR_STE_V1_ACTION_ID_ACCELERATED_LIST = 0x0e, + DR_STE_V1_ACTION_ID_MODIFY_LIST = 0x0f, + DR_STE_V1_ACTION_ID_TRAILER = 0x13, + DR_STE_V1_ACTION_ID_COUNTER_ID = 0x14, + DR_STE_V1_ACTION_ID_MAX = 0x21, + /* use for special cases */ + DR_STE_V1_ACTION_ID_SPECIAL_ENCAP_L3 = 0x22, +}; + +enum { + DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_0 = 0x00, + DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_1 = 0x01, + DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_2 = 0x02, + DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_0 = 0x08, + DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_1 = 0x09, + DR_STE_V1_ACTION_MDFY_FLD_L3_OUT_0 = 0x0e, + DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0 = 0x18, + DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_1 = 0x19, + DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_0 = 0x40, + DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_1 = 0x41, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_0 = 0x44, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_1 = 0x45, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_2 = 0x46, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_3 = 0x47, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_0 = 0x4c, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_1 = 0x4d, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_2 = 0x4e, + DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_3 = 0x4f, + DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_0 = 0x5e, + DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_1 = 0x5f, + DR_STE_V1_ACTION_MDFY_FLD_METADATA_2_CQE = 0x7b, + DR_STE_V1_ACTION_MDFY_FLD_GNRL_PURPOSE = 0x7c, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_2 = 0x8c, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_3 = 0x8d, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_4 = 0x8e, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_5 = 0x8f, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_6 = 0x90, + DR_STE_V1_ACTION_MDFY_FLD_REGISTER_7 = 0x91, +}; + +static const struct dr_ste_action_modify_field dr_ste_v1_action_modify_field_arr[] = { + [MLX5_ACTION_IN_FIELD_OUT_SMAC_47_16] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_SMAC_15_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_SRC_L2_OUT_1, .start = 16, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_ETHERTYPE] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_1, .start = 0, .end = 15, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_47_16] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_0, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_DMAC_15_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_1, .start = 16, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_IP_DSCP] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L3_OUT_0, .start = 18, .end = 23, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_FLAGS] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_1, .start = 16, .end = 24, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SPORT] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_DPORT] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_TCP, 
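+		/* l4_type/l3_type mark rewrite fields that exist only for one transport/IP protocol */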
+ }, + [MLX5_ACTION_IN_FIELD_OUT_IP_TTL] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_IPV6_HOPLIMIT] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L3_OUT_0, .start = 8, .end = 15, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_SPORT] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0, .start = 16, .end = 31, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_UDP_DPORT] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L4_OUT_0, .start = 0, .end = 15, + .l4_type = DR_STE_ACTION_MDFY_TYPE_L4_UDP, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_127_96] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_95_64] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_63_32] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV6_31_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_SRC_OUT_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_127_96] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_95_64] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_63_32] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_2, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV6_31_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV6_DST_OUT_3, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV6, + }, + [MLX5_ACTION_IN_FIELD_OUT_SIPV4] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_0, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_DIPV4] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_IPV4_OUT_1, .start = 0, .end = 31, + .l3_type = DR_STE_ACTION_MDFY_TYPE_L3_IPV4, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGA] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_GNRL_PURPOSE, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGB] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_METADATA_2_CQE, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_0] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_6, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_1] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_7, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_2] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_4, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_3] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_5, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_4] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_2, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_METADATA_REGC_5] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_REGISTER_3, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_SEQ_NUM] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_0, .start = 0, .end = 
31, + }, + [MLX5_ACTION_IN_FIELD_OUT_TCP_ACK_NUM] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_TCP_MISC_1, .start = 0, .end = 31, + }, + [MLX5_ACTION_IN_FIELD_OUT_FIRST_VID] = { + .hw_field = DR_STE_V1_ACTION_MDFY_FLD_L2_OUT_2, .start = 0, .end = 15, + }, +}; + +static void dr_ste_v1_set_entry_type(uint8_t *hw_ste_p, uint8_t entry_type) +{ + DR_STE_SET(match_bwc_v1, hw_ste_p, entry_format, entry_type); +} + +static void dr_ste_v1_set_miss_addr(uint8_t *hw_ste_p, uint64_t miss_addr) +{ + uint64_t index = miss_addr >> 6; + + DR_STE_SET(match_bwc_v1, hw_ste_p, miss_address_39_32, index >> 26); + DR_STE_SET(match_bwc_v1, hw_ste_p, miss_address_31_6, index); +} + +static uint64_t dr_ste_v1_get_miss_addr(uint8_t *hw_ste_p) +{ + uint64_t index = + (DR_STE_GET(match_bwc_v1, hw_ste_p, miss_address_31_6) | + DR_STE_GET(match_bwc_v1, hw_ste_p, miss_address_39_32) << 26); + + return index << 6; +} + +static void dr_ste_v1_set_byte_mask(uint8_t *hw_ste_p, uint16_t byte_mask) +{ + DR_STE_SET(match_bwc_v1, hw_ste_p, byte_mask, byte_mask); +} + +static uint16_t dr_ste_v1_get_byte_mask(uint8_t *hw_ste_p) +{ + return DR_STE_GET(match_bwc_v1, hw_ste_p, byte_mask); +} + +static void dr_ste_v1_set_lu_type(uint8_t *hw_ste_p, uint16_t lu_type) +{ + DR_STE_SET(match_bwc_v1, hw_ste_p, entry_format, lu_type >> 8); + DR_STE_SET(match_bwc_v1, hw_ste_p, match_definer_ctx_idx, lu_type & 0xFF); +} + +static void dr_ste_v1_set_next_lu_type(uint8_t *hw_ste_p, uint16_t lu_type) +{ + DR_STE_SET(match_bwc_v1, hw_ste_p, next_entry_format, lu_type >> 8); + DR_STE_SET(match_bwc_v1, hw_ste_p, hash_definer_ctx_idx, lu_type & 0xFF); +} + +static uint16_t dr_ste_v1_get_next_lu_type(uint8_t *hw_ste_p) +{ + uint8_t mode = DR_STE_GET(match_bwc_v1, hw_ste_p, next_entry_format); + uint8_t index = DR_STE_GET(match_bwc_v1, hw_ste_p, hash_definer_ctx_idx); + + return (mode << 8 | index); +} + +static void dr_ste_v1_set_hit_addr(uint8_t *hw_ste_p, uint64_t icm_addr, uint32_t ht_size) +{ + uint64_t index = (icm_addr >> 5) | ht_size; + + DR_STE_SET(match_bwc_v1, hw_ste_p, next_table_base_39_32_size, index >> 27); + DR_STE_SET(match_bwc_v1, hw_ste_p, next_table_base_31_5_size, index); +} + +static void dr_ste_v1_init(uint8_t *hw_ste_p, uint16_t lu_type, + uint8_t entry_type, uint16_t gvmi) +{ + dr_ste_v1_set_lu_type(hw_ste_p, lu_type); + dr_ste_v1_set_next_lu_type(hw_ste_p, DR_STE_LU_TYPE_DONT_CARE); + + DR_STE_SET(match_bwc_v1, hw_ste_p, gvmi, gvmi); + DR_STE_SET(match_bwc_v1, hw_ste_p, next_table_base_63_48, gvmi); + DR_STE_SET(match_bwc_v1, hw_ste_p, miss_address_63_48, gvmi); +} + +static void dr_ste_v1_prepare_for_postsend(uint8_t *hw_ste_p, + uint32_t ste_size) +{ + uint8_t *tag = hw_ste_p + DR_STE_SIZE_CTRL; + uint8_t *mask = tag + DR_STE_SIZE_TAG; + uint8_t tmp_tag[DR_STE_SIZE_TAG] = {}; + + if (ste_size == DR_STE_SIZE_CTRL) + return; + + if (ste_size != DR_STE_SIZE) + assert(false); + + /* Backup tag */ + memcpy(tmp_tag, tag, DR_STE_SIZE_TAG); + + /* Swap mask and tag both are the same size */ + memcpy(tag, mask, DR_STE_SIZE_MASK); + memcpy(mask, tmp_tag, DR_STE_SIZE_TAG); +} + +static void dr_ste_v1_set_rx_flow_tag(uint8_t *s_action, uint32_t flow_tag) +{ + DR_STE_SET(single_action_flow_tag_v1, s_action, action_id, + DR_STE_V1_ACTION_ID_FLOW_TAG); + DR_STE_SET(single_action_flow_tag_v1, s_action, flow_tag, flow_tag); +} + +static void dr_ste_v1_set_counter_id(uint8_t *hw_ste_p, uint32_t ctr_id) +{ + DR_STE_SET(match_bwc_v1, hw_ste_p, counter_id, ctr_id); +} + +static void dr_ste_v1_set_reparse(uint8_t *hw_ste_p) +{ + 
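+	/*
+	 * The reparse bit tells the device to re-parse the packet after the
+	 * actions of this STE have run, so subsequent lookups match the
+	 * rewritten headers; every packet-modifying action below sets it.
+	 */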
DR_STE_SET(match_bwc_v1, hw_ste_p, reparse, 1); +} + +static void dr_ste_v1_set_tx_encap(uint8_t *hw_ste_p, uint8_t *d_action, + uint32_t reformat_id, int size) +{ + DR_STE_SET(double_action_insert_with_ptr_v1, d_action, action_id, + DR_STE_V1_ACTION_ID_INSERT_POINTER); + /* The hardware expects here size in words (2 bytes) */ + DR_STE_SET(double_action_insert_with_ptr_v1, d_action, size, size / 2); + DR_STE_SET(double_action_insert_with_ptr_v1, d_action, pointer, reformat_id); + DR_STE_SET(double_action_insert_with_ptr_v1, d_action, attributes, + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ENCAP); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_tx_encap_l3(uint8_t *hw_ste_p, + uint8_t *frst_s_action, + uint8_t *scnd_d_action, + uint32_t reformat_id, + int size) +{ + /* Remove L2 headers */ + DR_STE_SET(single_action_remove_header_v1, frst_s_action, action_id, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER); + DR_STE_SET(single_action_remove_header_v1, frst_s_action, end_anchor, + DR_STE_HEADER_ANCHOR_IPV6_IPV4); + + /* Encapsulate with given reformat ID */ + DR_STE_SET(double_action_insert_with_ptr_v1, scnd_d_action, action_id, + DR_STE_V1_ACTION_ID_INSERT_POINTER); + /* The hardware expects here size in words (2 bytes) */ + DR_STE_SET(double_action_insert_with_ptr_v1, scnd_d_action, size, size / 2); + DR_STE_SET(double_action_insert_with_ptr_v1, scnd_d_action, pointer, reformat_id); + DR_STE_SET(double_action_insert_with_ptr_v1, scnd_d_action, attributes, + DR_STE_V1_ACTION_INSERT_PTR_ATTR_ENCAP); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_rx_decap(uint8_t *hw_ste_p, uint8_t *s_action) +{ + DR_STE_SET(single_action_remove_header_v1, s_action, action_id, + DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER); + DR_STE_SET(single_action_remove_header_v1, s_action, decap, 1); + DR_STE_SET(single_action_remove_header_v1, s_action, vni_to_cqe, 1); + DR_STE_SET(single_action_remove_header_v1, s_action, end_anchor, + DR_STE_HEADER_ANCHOR_INNER_MAC); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_rx_decap_l3(uint8_t *hw_ste_p, + uint8_t *s_action, + uint16_t decap_actions, + uint32_t decap_index) +{ + DR_STE_SET(single_action_modify_list_v1, s_action, action_id, + DR_STE_V1_ACTION_ID_MODIFY_LIST); + DR_STE_SET(single_action_modify_list_v1, s_action, num_of_modify_actions, + decap_actions); + DR_STE_SET(single_action_modify_list_v1, s_action, modify_actions_ptr, + decap_index); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static void dr_ste_v1_set_rewrite_actions(uint8_t *hw_ste_p, + uint8_t *s_action, + uint16_t num_of_actions, + uint32_t re_write_index) +{ + DR_STE_SET(single_action_modify_list_v1, s_action, action_id, + DR_STE_V1_ACTION_ID_MODIFY_LIST); + DR_STE_SET(single_action_modify_list_v1, s_action, num_of_modify_actions, + num_of_actions); + DR_STE_SET(single_action_modify_list_v1, s_action, modify_actions_ptr, + re_write_index); + + dr_ste_v1_set_reparse(hw_ste_p); +} + +static inline void dr_ste_v1_arr_init_next_match(uint8_t **last_ste, + uint32_t *added_stes, + uint16_t gvmi) +{ + uint8_t *action; + + (*added_stes)++; + *last_ste += DR_STE_SIZE; + dr_ste_v1_init(*last_ste, DR_STE_LU_TYPE_DONT_CARE, 0, gvmi); + dr_ste_v1_set_entry_type(*last_ste, DR_STE_V1_TYPE_MATCH); + + action = DEVX_ADDR_OF(ste_mask_and_match_v1, *last_ste, action); + memset(action, 0, DEVX_FLD_SZ_BYTES(ste_mask_and_match_v1, action)); +} + +static void dr_ste_v1_set_actions_tx(uint8_t *action_type_set, + uint8_t *last_ste, + struct dr_ste_actions_attr *attr, + 
				     uint32_t *added_stes)
+{
+	uint8_t *action = DEVX_ADDR_OF(ste_match_bwc_v1, last_ste, action);
+
+	if (action_type_set[DR_ACTION_TYP_CTR])
+		dr_ste_v1_set_counter_id(last_ste, attr->ctr_id);
+
+	if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) {
+		dr_ste_v1_set_rewrite_actions(last_ste, action,
+					      attr->modify_actions,
+					      attr->modify_index);
+		action += DR_STE_ACTION_DOUBLE_SZ;
+	}
+
+	if (action_type_set[DR_ACTION_TYP_L2_TO_TNL_L2]) {
+		if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) {
+			dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi);
+			action = DEVX_ADDR_OF(ste_mask_and_match_v1, last_ste, action);
+		}
+		dr_ste_v1_set_tx_encap(last_ste, action,
+				       attr->reformat_id,
+				       attr->reformat_size);
+		action += DR_STE_ACTION_DOUBLE_SZ;
+	} else if (action_type_set[DR_ACTION_TYP_L2_TO_TNL_L3]) {
+		uint8_t *d_action;
+
+		dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi);
+		action = DEVX_ADDR_OF(ste_mask_and_match_v1, last_ste, action);
+		d_action = action + DR_STE_ACTION_SINGLE_SZ;
+
+		dr_ste_v1_set_tx_encap_l3(last_ste,
+					  action, d_action,
+					  attr->reformat_id,
+					  attr->reformat_size);
+		action += DR_STE_ACTION_SINGLE_SZ + DR_STE_ACTION_DOUBLE_SZ;
+	}
+
+	dr_ste_v1_set_hit_addr(last_ste, attr->final_icm_addr, 1);
+}
+
+static void dr_ste_v1_set_actions_rx(uint8_t *action_type_set,
+				     uint8_t *last_ste,
+				     struct dr_ste_actions_attr *attr,
+				     uint32_t *added_stes)
+{
+	uint8_t *action = DEVX_ADDR_OF(ste_match_bwc_v1, last_ste, action);
+	bool new_ste = false;
+	bool decap;
+
+	decap = action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2] ||
+		action_type_set[DR_ACTION_TYP_TNL_L2_TO_L2];
+
+	if (action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) {
+		dr_ste_v1_set_rx_decap_l3(last_ste, action,
+					  attr->decap_actions,
+					  attr->decap_index);
+		action += DR_STE_ACTION_DOUBLE_SZ;
+	} else if (action_type_set[DR_ACTION_TYP_TNL_L2_TO_L2]) {
+		dr_ste_v1_set_rx_decap(last_ste, action);
+		action += DR_STE_ACTION_SINGLE_SZ;
+	}
+
+	if (action_type_set[DR_ACTION_TYP_TAG]) {
+		if (action_type_set[DR_ACTION_TYP_TNL_L3_TO_L2]) {
+			dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi);
+			action = DEVX_ADDR_OF(ste_mask_and_match_v1, last_ste, action);
+			new_ste = true;
+		}
+		dr_ste_v1_set_rx_flow_tag(action, attr->flow_tag);
+		action += DR_STE_ACTION_SINGLE_SZ;
+	}
+
+	if (action_type_set[DR_ACTION_TYP_MODIFY_HDR]) {
+		/* Modify header and decapsulation must use different STEs */
+		if (decap && !new_ste) {
+			dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi);
+			action = DEVX_ADDR_OF(ste_mask_and_match_v1, last_ste, action);
+			new_ste = true;
+		}
+		dr_ste_v1_set_rewrite_actions(last_ste, action,
+					      attr->modify_actions,
+					      attr->modify_index);
+		action += DR_STE_ACTION_DOUBLE_SZ;
+	}
+
+	if (action_type_set[DR_ACTION_TYP_CTR]) {
+		/* Set the counter action after decap to exclude the decapped header */
+		if (decap && !new_ste) {
+			dr_ste_v1_arr_init_next_match(&last_ste, added_stes, attr->gvmi);
+			action = DEVX_ADDR_OF(ste_mask_and_match_v1, last_ste, action);
+			new_ste = true;
+		}
+		dr_ste_v1_set_counter_id(last_ste, attr->ctr_id);
+	}
+
+	dr_ste_v1_set_hit_addr(last_ste, attr->final_icm_addr, 1);
+}
+
+static void dr_ste_v1_set_action_set(uint8_t *d_action,
+				     uint8_t hw_field,
+				     uint8_t shifter,
+				     uint8_t length,
+				     uint32_t data)
+{
+	shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET;
+	DR_STE_SET(double_action_set_v1, d_action, action_id, DR_STE_V1_ACTION_ID_SET);
+	DR_STE_SET(double_action_set_v1, d_action, destination_dw_offset, hw_field);
+	DR_STE_SET(double_action_set_v1, d_action, destination_left_shifter, shifter);
+	DR_STE_SET(double_action_set_v1, d_action, destination_length, length);
+	DR_STE_SET(double_action_set_v1, d_action, inline_data, data);
+}
+
+static void dr_ste_v1_set_action_add(uint8_t *d_action,
+				     uint8_t hw_field,
+				     uint8_t shifter,
+				     uint8_t length,
+				     uint32_t data)
+{
+	shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET;
+	DR_STE_SET(double_action_add_v1, d_action, action_id, DR_STE_V1_ACTION_ID_ADD);
+	DR_STE_SET(double_action_add_v1, d_action, destination_dw_offset, hw_field);
+	DR_STE_SET(double_action_add_v1, d_action, destination_left_shifter, shifter);
+	DR_STE_SET(double_action_add_v1, d_action, destination_length, length);
+	DR_STE_SET(double_action_add_v1, d_action, add_value, data);
+}
+
+static void dr_ste_v1_set_action_copy(uint8_t *d_action,
+				      uint8_t dst_hw_field,
+				      uint8_t dst_shifter,
+				      uint8_t dst_len,
+				      uint8_t src_hw_field,
+				      uint8_t src_shifter)
+{
+	dst_shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET;
+	src_shifter += MLX5_MODIFY_HEADER_V1_QW_OFFSET;
+	DR_STE_SET(double_action_copy_v1, d_action, action_id, DR_STE_V1_ACTION_ID_COPY);
+	DR_STE_SET(double_action_copy_v1, d_action, destination_dw_offset, dst_hw_field);
+	DR_STE_SET(double_action_copy_v1, d_action, destination_left_shifter, dst_shifter);
+	DR_STE_SET(double_action_copy_v1, d_action, destination_length, dst_len);
+	DR_STE_SET(double_action_copy_v1, d_action, source_dw_offset, src_hw_field);
+	DR_STE_SET(double_action_copy_v1, d_action, source_right_shifter, src_shifter);
+}
+
+#define DR_STE_DECAP_L3_ACTION_NUM 8
+#define DR_STE_L2_HDR_MAX_SZ 20
+
+static int
+dr_ste_v1_set_action_decap_l3_list(void *data, uint32_t data_sz,
+				   uint8_t *hw_action, uint32_t hw_action_sz,
+				   uint16_t *used_hw_action_num)
+{
+	uint8_t padded_data[DR_STE_L2_HDR_MAX_SZ] = {};
+	void *data_ptr = padded_data;
+	uint16_t used_actions = 0;
+	uint32_t inline_data_sz;
+	uint32_t i;
+
+	if (hw_action_sz / DR_STE_ACTION_DOUBLE_SZ < DR_STE_DECAP_L3_ACTION_NUM) {
+		errno = EINVAL;
+		return errno;
+	}
+
+	memcpy(padded_data, data, data_sz);
+
+	/* Remove the outer L2/L3 headers */
+	DR_STE_SET(single_action_remove_header_v1, hw_action, action_id,
+		   DR_STE_V1_ACTION_ID_REMOVE_HEADER_TO_HEADER);
+	DR_STE_SET(single_action_remove_header_v1, hw_action, decap, 1);
+	DR_STE_SET(single_action_remove_header_v1, hw_action, vni_to_cqe, 1);
+	DR_STE_SET(single_action_remove_header_v1, hw_action, end_anchor,
+		   DR_STE_HEADER_ANCHOR_INNER_IPV6_IPV4);
+	hw_action += DR_STE_ACTION_DOUBLE_SZ;
+	used_actions++;
+
+	inline_data_sz =
+		DEVX_FLD_SZ_BYTES(ste_double_action_insert_with_inline_v1, inline_data);
+
+	/* Add the new header inline + 2 extra bytes */
+	for (i = 0; i < data_sz / inline_data_sz + 1; i++) {
+		void *addr_inline;
+
+		DR_STE_SET(double_action_insert_with_inline_v1, hw_action, action_id,
+			   DR_STE_V1_ACTION_ID_INSERT_INLINE);
+		/* The hardware expects here the offset in words (2 bytes) */
+		DR_STE_SET(double_action_insert_with_inline_v1, hw_action, start_offset,
+			   i * 2);
+
+		/* Copy byte by byte to avoid endianness problems */
+		addr_inline = DEVX_ADDR_OF(ste_double_action_insert_with_inline_v1,
+					   hw_action, inline_data);
+		memcpy(addr_inline, data_ptr, inline_data_sz);
+		hw_action += DR_STE_ACTION_DOUBLE_SZ;
+		data_ptr += inline_data_sz;
+		used_actions++;
+	}
+
+	/* Remove 2 extra bytes */
+	DR_STE_SET(single_action_remove_header_size_v1, hw_action, action_id,
+		   DR_STE_V1_ACTION_ID_REMOVE_BY_SIZE);
+	DR_STE_SET(single_action_remove_header_size_v1, hw_action, start_offset, data_sz / 2);
+	/* The hardware expects here
size in words (2 bytes) */ + DR_STE_SET(single_action_remove_header_size_v1, hw_action, remove_size, 1); + used_actions++; + + *used_hw_action_num = used_actions; + + return 0; +} + +static void dr_ste_v1_build_eth_l2_src_dst_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, dmac_15_0, mask, dmac_15_0); + + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, smac_47_16, mask, smac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, smac_15_0, mask, smac_15_0); + + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_src_dst_v1, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_ONES(eth_l2_src_dst_v1, bit_mask, l3_type, mask, ip_version); + + if (mask->cvlan_tag) { + DR_STE_SET(eth_l2_src_dst_v1, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + } else if (mask->svlan_tag) { + DR_STE_SET(eth_l2_src_dst_v1, bit_mask, first_vlan_qualifier, -1); + mask->svlan_tag = 0; + } +} + +static int dr_ste_v1_build_eth_l2_src_dst_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, dmac_15_0, spec, dmac_15_0); + + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, smac_47_16, spec, smac_47_16); + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, smac_15_0, spec, smac_15_0); + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + DR_STE_SET(eth_l2_src_dst_v1, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + DR_STE_SET(eth_l2_src_dst_v1, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + errno = EINVAL; + return errno; + } + } + + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_src_dst_v1, tag, first_priority, spec, first_prio); + + if (spec->cvlan_tag) { + DR_STE_SET(eth_l2_src_dst_v1, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + DR_STE_SET(eth_l2_src_dst_v1, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + return 0; +} + +static void dr_ste_v1_build_eth_l2_src_dst_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_eth_l2_src_dst_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL2_SRC_DST, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l2_src_dst_tag; +} + +static int dr_ste_v1_build_eth_l3_ipv6_dst_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_127_96, spec, dst_ip_127_96); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_95_64, spec, dst_ip_95_64); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_63_32, spec, dst_ip_63_32); + DR_STE_SET_TAG(eth_l3_ipv6_dst, tag, dst_ip_31_0, spec, dst_ip_31_0); + + return 0; +} + +static void dr_ste_v1_build_eth_l3_ipv6_dst_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_eth_l3_ipv6_dst_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(IPV6_DES, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l3_ipv6_dst_tag; +} + +static int dr_ste_v1_build_eth_l3_ipv6_src_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_127_96, spec, src_ip_127_96); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_95_64, spec, src_ip_95_64); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_63_32, spec, src_ip_63_32); + DR_STE_SET_TAG(eth_l3_ipv6_src, tag, src_ip_31_0, spec, src_ip_31_0); + + return 0; +} + +static void dr_ste_v1_build_eth_l3_ipv6_src_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_eth_l3_ipv6_src_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(IPV6_SRC, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l3_ipv6_src_tag; +} + +static int dr_ste_v1_build_eth_l3_ipv4_5_tuple_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, destination_address, spec, dst_ip_31_0); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, source_address, spec, src_ip_31_0); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, destination_port, spec, tcp_dport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, destination_port, spec, udp_dport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, source_port, spec, tcp_sport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, source_port, spec, udp_sport); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, protocol, spec, ip_protocol); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, fragmented, spec, frag); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, dscp, spec, ip_dscp); + DR_STE_SET_TAG(eth_l3_ipv4_5_tuple_v1, tag, ecn, spec, ip_ecn); + + if (spec->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l3_ipv4_5_tuple_v1, tag, spec); + spec->tcp_flags = 0; + } + + return 0; +} + +static void dr_ste_v1_build_eth_l3_ipv4_5_tuple_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_eth_l3_ipv4_5_tuple_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL3_IPV4_5_TUPLE, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l3_ipv4_5_tuple_tag; +} + +static void dr_ste_v1_build_eth_l2_src_or_dst_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, ip_fragmented, mask, frag); // ? + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, l3_ethertype, mask, ethertype); // ? + DR_STE_SET_ONES(eth_l2_src_v1, bit_mask, l3_type, mask, ip_version); + + if (mask->svlan_tag || mask->cvlan_tag) { + DR_STE_SET(eth_l2_src_v1, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + mask->svlan_tag = 0; + } + + if (inner) { + if (misc_mask->inner_second_cvlan_tag || + misc_mask->inner_second_svlan_tag) { + DR_STE_SET(eth_l2_src_v1, bit_mask, second_vlan_qualifier, -1); + misc_mask->inner_second_cvlan_tag = 0; + misc_mask->inner_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, second_vlan_id, misc_mask, inner_second_vid); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, second_cfi, misc_mask, inner_second_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, second_priority, misc_mask, inner_second_prio); + } else { + if (misc_mask->outer_second_cvlan_tag || + misc_mask->outer_second_svlan_tag) { + DR_STE_SET(eth_l2_src_v1, bit_mask, second_vlan_qualifier, -1); + misc_mask->outer_second_cvlan_tag = 0; + misc_mask->outer_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, second_vlan_id, misc_mask, outer_second_vid); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, second_cfi, misc_mask, outer_second_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, second_priority, misc_mask, outer_second_prio); + } +} + +static int dr_ste_v1_build_eth_l2_src_or_dst_tag(struct dr_match_param *value, + bool inner, uint8_t *tag) +{ + struct dr_match_spec *spec = inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc_spec = &value->misc; + + DR_STE_SET_TAG(eth_l2_src_v1, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_src_v1, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, tag, first_priority, spec, first_prio); + DR_STE_SET_TAG(eth_l2_src_v1, tag, ip_fragmented, spec, frag); + DR_STE_SET_TAG(eth_l2_src_v1, tag, l3_ethertype, spec, ethertype); + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + DR_STE_SET(eth_l2_src_v1, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + DR_STE_SET(eth_l2_src_v1, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + errno = EINVAL; + return errno; + } + } + + if (spec->cvlan_tag) { + DR_STE_SET(eth_l2_src_v1, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + DR_STE_SET(eth_l2_src_v1, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + + if (inner) { + if (misc_spec->inner_second_cvlan_tag) { + DR_STE_SET(eth_l2_src_v1, tag, second_vlan_qualifier, DR_STE_CVLAN); + misc_spec->inner_second_cvlan_tag = 0; + } else if (misc_spec->inner_second_svlan_tag) { + DR_STE_SET(eth_l2_src_v1, tag, second_vlan_qualifier, DR_STE_SVLAN); + misc_spec->inner_second_svlan_tag = 0; + } + + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_vlan_id, misc_spec, inner_second_vid); + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_cfi, misc_spec, inner_second_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_priority, misc_spec, inner_second_prio); + } else { + if (misc_spec->outer_second_cvlan_tag) { + DR_STE_SET(eth_l2_src_v1, tag, second_vlan_qualifier, DR_STE_CVLAN); + misc_spec->outer_second_cvlan_tag = 0; + } else if (misc_spec->outer_second_svlan_tag) { + DR_STE_SET(eth_l2_src_v1, tag, second_vlan_qualifier, DR_STE_SVLAN); + misc_spec->outer_second_svlan_tag = 0; + } + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_vlan_id, misc_spec, outer_second_vid); + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_cfi, misc_spec, outer_second_cfi); + DR_STE_SET_TAG(eth_l2_src_v1, tag, second_priority, misc_spec, outer_second_prio); + } + + return 0; +} + +static void dr_ste_v1_build_eth_l2_src_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, smac_47_16, mask, smac_47_16); + DR_STE_SET_TAG(eth_l2_src_v1, bit_mask, smac_15_0, mask, smac_15_0); + + dr_ste_v1_build_eth_l2_src_or_dst_bit_mask(value, inner, bit_mask); +} + +static int dr_ste_v1_build_eth_l2_src_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_src_v1, tag, smac_47_16, spec, smac_47_16); + DR_STE_SET_TAG(eth_l2_src_v1, tag, smac_15_0, spec, smac_15_0); + + return dr_ste_v1_build_eth_l2_src_or_dst_tag(value, sb->inner, tag); +} + +static void dr_ste_v1_build_eth_l2_src_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_eth_l2_src_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL2_SRC, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l2_src_tag; +} + +static void dr_ste_v1_build_eth_l2_dst_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? 
&value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_dst_v1, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_dst_v1, bit_mask, dmac_15_0, mask, dmac_15_0); + + dr_ste_v1_build_eth_l2_src_or_dst_bit_mask(value, inner, bit_mask); +} + +static int dr_ste_v1_build_eth_l2_dst_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l2_dst_v1, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_dst_v1, tag, dmac_15_0, spec, dmac_15_0); + + return dr_ste_v1_build_eth_l2_src_or_dst_tag(value, sb->inner, tag); +} + +static void dr_ste_v1_build_eth_l2_dst_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_eth_l2_dst_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL2, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l2_dst_tag; +} + +static void dr_ste_v1_build_eth_l2_tnl_bit_mask(struct dr_match_param *value, + bool inner, uint8_t *bit_mask) +{ + struct dr_match_spec *mask = inner ? &value->inner : &value->outer; + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, dmac_47_16, mask, dmac_47_16); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, dmac_15_0, mask, dmac_15_0); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, first_vlan_id, mask, first_vid); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, first_cfi, mask, first_cfi); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, first_priority, mask, first_prio); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, ip_fragmented, mask, frag); + DR_STE_SET_TAG(eth_l2_tnl_v1, bit_mask, l3_ethertype, mask, ethertype); + DR_STE_SET_ONES(eth_l2_tnl_v1, bit_mask, l3_type, mask, ip_version); + + if (misc->vxlan_vni) { + DR_STE_SET(eth_l2_tnl_v1, bit_mask, l2_tunneling_network_id, (misc->vxlan_vni << 8)); + misc->vxlan_vni = 0; + } + + if (mask->svlan_tag || mask->cvlan_tag) { + DR_STE_SET(eth_l2_tnl_v1, bit_mask, first_vlan_qualifier, -1); + mask->cvlan_tag = 0; + mask->svlan_tag = 0; + } +} + +static int dr_ste_v1_build_eth_l2_tnl_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, dmac_47_16, spec, dmac_47_16); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, dmac_15_0, spec, dmac_15_0); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, first_vlan_id, spec, first_vid); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, first_cfi, spec, first_cfi); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, ip_fragmented, spec, frag); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, first_priority, spec, first_prio); + DR_STE_SET_TAG(eth_l2_tnl_v1, tag, l3_ethertype, spec, ethertype); + + if (misc->vxlan_vni) { + DR_STE_SET(eth_l2_tnl_v1, tag, l2_tunneling_network_id, + (misc->vxlan_vni << 8)); + misc->vxlan_vni = 0; + } + + if (spec->cvlan_tag) { + DR_STE_SET(eth_l2_tnl_v1, tag, first_vlan_qualifier, DR_STE_CVLAN); + spec->cvlan_tag = 0; + } else if (spec->svlan_tag) { + DR_STE_SET(eth_l2_tnl_v1, tag, first_vlan_qualifier, DR_STE_SVLAN); + spec->svlan_tag = 0; + } + + if (spec->ip_version) { + if (spec->ip_version == IP_VERSION_IPV4) { + DR_STE_SET(eth_l2_tnl_v1, tag, l3_type, STE_IPV4); + spec->ip_version = 0; + } else if (spec->ip_version == IP_VERSION_IPV6) { + DR_STE_SET(eth_l2_tnl_v1, tag, l3_type, STE_IPV6); + spec->ip_version = 0; + } else { + errno = EINVAL; + return errno; + } + } + + return 0; +} + +static void dr_ste_v1_build_eth_l2_tnl_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_eth_l2_tnl_bit_mask(mask, sb->inner, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_ETHL2_TNL; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l2_tnl_tag; +} + +static int dr_ste_v1_build_eth_l3_ipv4_misc_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? &value->inner : &value->outer; + + DR_STE_SET_TAG(eth_l3_ipv4_misc_v1, tag, time_to_live, spec, ip_ttl_hoplimit); + + return 0; +} + +static void dr_ste_v1_build_eth_l3_ipv4_misc_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_eth_l3_ipv4_misc_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL3_IPV4_MISC, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l3_ipv4_misc_tag; +} + +static int dr_ste_v1_build_eth_ipv6_l3_l4_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_spec *spec = sb->inner ? 
&value->inner : &value->outer; + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(eth_l4_v1, tag, dst_port, spec, tcp_dport); + DR_STE_SET_TAG(eth_l4_v1, tag, src_port, spec, tcp_sport); + DR_STE_SET_TAG(eth_l4_v1, tag, dst_port, spec, udp_dport); + DR_STE_SET_TAG(eth_l4_v1, tag, src_port, spec, udp_sport); + DR_STE_SET_TAG(eth_l4_v1, tag, protocol, spec, ip_protocol); + DR_STE_SET_TAG(eth_l4_v1, tag, fragmented, spec, frag); + DR_STE_SET_TAG(eth_l4_v1, tag, dscp, spec, ip_dscp); + DR_STE_SET_TAG(eth_l4_v1, tag, ecn, spec, ip_ecn); + DR_STE_SET_TAG(eth_l4_v1, tag, ipv6_hop_limit, spec, ip_ttl_hoplimit); + + if (sb->inner) + DR_STE_SET_TAG(eth_l4_v1, tag, flow_label, misc, inner_ipv6_flow_label); + else + DR_STE_SET_TAG(eth_l4_v1, tag, flow_label, misc, outer_ipv6_flow_label); + + if (spec->tcp_flags) { + DR_STE_SET_TCP_FLAGS(eth_l4_v1, tag, spec); + spec->tcp_flags = 0; + } + + return 0; +} + +static void dr_ste_v1_build_eth_ipv6_l3_l4_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_eth_ipv6_l3_l4_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(ETHL4, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_ipv6_l3_l4_tag; +} + +static int dr_ste_v1_build_mpls_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc2 *misc2 = &value->misc2; + + if (sb->inner) + DR_STE_SET_MPLS(mpls_v1, misc2, inner, tag); + else + DR_STE_SET_MPLS(mpls_v1, misc2, outer, tag); + + return 0; +} + +static void dr_ste_v1_build_mpls_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_mpls_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_CALC_DFNR_TYPE(MPLS, sb->inner); + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_mpls_tag; +} + +static int dr_ste_v1_build_tnl_gre_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(gre_v1, tag, gre_protocol, misc, gre_protocol); + DR_STE_SET_TAG(gre_v1, tag, gre_k_present, misc, gre_k_present); + DR_STE_SET_TAG(gre_v1, tag, gre_key_h, misc, gre_key_h); + DR_STE_SET_TAG(gre_v1, tag, gre_key_l, misc, gre_key_l); + + DR_STE_SET_TAG(gre_v1, tag, gre_c_present, misc, gre_c_present); + DR_STE_SET_TAG(gre_v1, tag, gre_s_present, misc, gre_s_present); + + return 0; +} + +static void dr_ste_v1_build_tnl_gre_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_tnl_gre_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_GRE; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_tnl_gre_tag; +} + +static int dr_ste_v1_build_tnl_mpls_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc2 *misc2 = &value->misc2; + + if (DR_STE_IS_OUTER_MPLS_OVER_GRE_SET(misc2)) { + DR_STE_SET_TAG(mpls_v1, tag, mpls0_label, + misc2, outer_first_mpls_over_gre_label); + + DR_STE_SET_TAG(mpls_v1, tag, mpls0_exp, + misc2, outer_first_mpls_over_gre_exp); + + DR_STE_SET_TAG(mpls_v1, tag, mpls0_s_bos, + misc2, outer_first_mpls_over_gre_s_bos); + + DR_STE_SET_TAG(mpls_v1, tag, mpls0_ttl, + misc2, outer_first_mpls_over_gre_ttl); + } else { + DR_STE_SET_TAG(mpls_v1, tag, mpls0_label, + misc2, outer_first_mpls_over_udp_label); + + DR_STE_SET_TAG(mpls_v1, tag, mpls0_exp, + misc2, outer_first_mpls_over_udp_exp); + + 
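+		/* and the bottom-of-stack bit and TTL of the same label */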
DR_STE_SET_TAG(mpls_v1, tag, mpls0_s_bos, + misc2, outer_first_mpls_over_udp_s_bos); + + DR_STE_SET_TAG(mpls_v1, tag, mpls0_ttl, + misc2, outer_first_mpls_over_udp_ttl); + } + + return 0; +} + +static void dr_ste_v1_build_tnl_mpls_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_tnl_mpls_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_MPLS_I; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_tnl_mpls_tag; +} + +static int dr_ste_v1_build_icmp_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc3 *misc3 = &value->misc3; + bool is_ipv4 = DR_MASK_IS_ICMPV4_SET(misc3); + uint32_t *icmp_header_data; + uint8_t *icmp_type; + uint8_t *icmp_code; + + if (is_ipv4) { + icmp_header_data = &misc3->icmpv4_header_data; + icmp_type = &misc3->icmpv4_type; + icmp_code = &misc3->icmpv4_code; + } else { + icmp_header_data = &misc3->icmpv6_header_data; + icmp_type = &misc3->icmpv6_type; + icmp_code = &misc3->icmpv6_code; + } + + DR_STE_SET(icmp_v1, tag, icmp_header_data, *icmp_header_data); + DR_STE_SET(icmp_v1, tag, icmp_type, *icmp_type); + DR_STE_SET(icmp_v1, tag, icmp_code, *icmp_code); + + *icmp_header_data = 0; + *icmp_type = 0; + *icmp_code = 0; + + return 0; +} + +static int dr_ste_v1_build_icmp_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_icmp_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_ETHL4_MISC_O; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_icmp_tag; + + return 0; +} + +static int dr_ste_v1_build_general_purpose_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(general_purpose, tag, general_purpose_lookup_field, + misc2, metadata_reg_a); + + return 0; +} + +static void dr_ste_v1_build_general_purpose_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_general_purpose_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_GENERAL_PURPOSE; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_general_purpose_tag; +} + +static int dr_ste_v1_build_eth_l4_misc_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc3 *misc3 = &value->misc3; + + if (sb->inner) { + DR_STE_SET_TAG(eth_l4_misc_v1, tag, seq_num, misc3, inner_tcp_seq_num); + DR_STE_SET_TAG(eth_l4_misc_v1, tag, ack_num, misc3, inner_tcp_ack_num); + } else { + DR_STE_SET_TAG(eth_l4_misc_v1, tag, seq_num, misc3, outer_tcp_seq_num); + DR_STE_SET_TAG(eth_l4_misc_v1, tag, ack_num, misc3, outer_tcp_ack_num); + } + + return 0; +} + +static void dr_ste_v1_build_eth_l4_misc_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_eth_l4_misc_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_ETHL4_MISC_O; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_eth_l4_misc_tag; +} + +static int +dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc3 *misc3 = &value->misc3; + + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_flags, misc3, + outer_vxlan_gpe_flags); + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_next_protocol, misc3, + 
outer_vxlan_gpe_next_protocol); + DR_STE_SET_TAG(flex_parser_tnl_vxlan_gpe, tag, + outer_vxlan_gpe_vni, misc3, + outer_vxlan_gpe_vni); + + return 0; +} + +static void +dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_tag; +} + +static int +dr_ste_v1_build_flex_parser_tnl_geneve_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc *misc = &value->misc; + + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_protocol_type, misc, geneve_protocol_type); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_oam, misc, geneve_oam); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_opt_len, misc, geneve_opt_len); + DR_STE_SET_TAG(flex_parser_tnl_geneve, tag, + geneve_vni, misc, geneve_vni); + + return 0; +} + +static void +dr_ste_v1_build_flex_parser_tnl_geneve_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_flex_parser_tnl_geneve_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_flex_parser_tnl_geneve_tag; +} + +static int dr_ste_v1_build_flex_parser_tnl_gtpu_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc3 *misc3 = &value->misc3; + + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_flags, misc3, + gtpu_flags); + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_msg_type, misc3, + gtpu_msg_type); + DR_STE_SET_TAG(flex_parser_tnl_gtpu, tag, + gtpu_teid, misc3, + gtpu_teid); + + return 0; +} + +static void dr_ste_v1_build_flex_parser_tnl_gtpu_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_flex_parser_tnl_gtpu_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_FLEX_PARSER_TNL_HEADER; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_flex_parser_tnl_gtpu_tag; +} + +static int dr_ste_v1_build_register_0_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(register_0, tag, register_0_h, misc2, metadata_reg_c_0); + DR_STE_SET_TAG(register_0, tag, register_0_l, misc2, metadata_reg_c_1); + DR_STE_SET_TAG(register_0, tag, register_1_h, misc2, metadata_reg_c_2); + DR_STE_SET_TAG(register_0, tag, register_1_l, misc2, metadata_reg_c_3); + + return 0; +} + +static void dr_ste_v1_build_register_0_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_register_0_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_STEERING_REGISTERS_0; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_register_0_tag; +} + +static int dr_ste_v1_build_register_1_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc2 *misc2 = &value->misc2; + + DR_STE_SET_TAG(register_1, tag, register_2_h, misc2, metadata_reg_c_4); + DR_STE_SET_TAG(register_1, tag, register_2_l, misc2, metadata_reg_c_5); + DR_STE_SET_TAG(register_1, tag, register_3_h, misc2, metadata_reg_c_6); + DR_STE_SET_TAG(register_1, tag, 
register_3_l, misc2, metadata_reg_c_7); + + return 0; +} + +static void dr_ste_v1_build_register_1_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_register_1_tag(mask, sb, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_STEERING_REGISTERS_1; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_register_1_tag; +} + +static void dr_ste_v1_build_src_gvmi_qpn_bit_mask(struct dr_match_param *value, + uint8_t *bit_mask) +{ + struct dr_match_misc *misc_mask = &value->misc; + + DR_STE_SET_ONES(src_gvmi_qp_v1, bit_mask, source_gvmi, misc_mask, source_port); + DR_STE_SET_ONES(src_gvmi_qp_v1, bit_mask, source_qp, misc_mask, source_sqn); +} + +static int dr_ste_v1_build_src_gvmi_qpn_tag(struct dr_match_param *value, + struct dr_ste_build *sb, + uint8_t *tag) +{ + struct dr_match_misc *misc = &value->misc; + struct dr_devx_vport_cap *vport_cap; + uint8_t *bit_mask = sb->bit_mask; + bool source_gvmi_set; + + DR_STE_SET_TAG(src_gvmi_qp_v1, tag, source_qp, misc, source_sqn); + + source_gvmi_set = DR_STE_GET(src_gvmi_qp_v1, bit_mask, source_gvmi); + if (source_gvmi_set) { + vport_cap = dr_get_vport_cap(sb->caps, misc->source_port); + if (!vport_cap) + return errno; + + if (vport_cap->gvmi) + DR_STE_SET(src_gvmi_qp_v1, tag, source_gvmi, vport_cap->gvmi); + + misc->source_port = 0; + } + + return 0; +} + +static void dr_ste_v1_build_src_gvmi_qpn_init(struct dr_ste_build *sb, + struct dr_match_param *mask) +{ + dr_ste_v1_build_src_gvmi_qpn_bit_mask(mask, sb->bit_mask); + + sb->lu_type = DR_STE_V1_LU_TYPE_SRC_QP_GVMI; + sb->byte_mask = dr_ste_conv_bit_to_byte_mask(sb->bit_mask); + sb->ste_build_tag_func = &dr_ste_v1_build_src_gvmi_qpn_tag; +} + +static struct dr_ste_ctx ste_ctx_v1 = { + /* Builders */ + .build_eth_l2_src_dst_init = &dr_ste_v1_build_eth_l2_src_dst_init, + .build_eth_l3_ipv6_src_init = &dr_ste_v1_build_eth_l3_ipv6_src_init, + .build_eth_l3_ipv6_dst_init = &dr_ste_v1_build_eth_l3_ipv6_dst_init, + .build_eth_l3_ipv4_5_tuple_init = &dr_ste_v1_build_eth_l3_ipv4_5_tuple_init, + .build_eth_l2_src_init = &dr_ste_v1_build_eth_l2_src_init, + .build_eth_l2_dst_init = &dr_ste_v1_build_eth_l2_dst_init, + .build_eth_l2_tnl_init = &dr_ste_v1_build_eth_l2_tnl_init, + .build_eth_l3_ipv4_misc_init = &dr_ste_v1_build_eth_l3_ipv4_misc_init, + .build_eth_ipv6_l3_l4_init = &dr_ste_v1_build_eth_ipv6_l3_l4_init, + .build_mpls_init = &dr_ste_v1_build_mpls_init, + .build_tnl_gre_init = &dr_ste_v1_build_tnl_gre_init, + .build_tnl_mpls_init = &dr_ste_v1_build_tnl_mpls_init, + .build_icmp_init = &dr_ste_v1_build_icmp_init, + .build_general_purpose_init = &dr_ste_v1_build_general_purpose_init, + .build_eth_l4_misc_init = &dr_ste_v1_build_eth_l4_misc_init, + .build_tnl_vxlan_gpe_init = &dr_ste_v1_build_flex_parser_tnl_vxlan_gpe_init, + .build_tnl_geneve_init = &dr_ste_v1_build_flex_parser_tnl_geneve_init, + .build_tnl_gtpu_init = &dr_ste_v1_build_flex_parser_tnl_gtpu_init, + .build_register_0_init = &dr_ste_v1_build_register_0_init, + .build_register_1_init = &dr_ste_v1_build_register_1_init, + .build_src_gvmi_qpn_init = &dr_ste_v1_build_src_gvmi_qpn_init, + /* Getters and Setters */ + .ste_init = &dr_ste_v1_init, + .set_next_lu_type = &dr_ste_v1_set_next_lu_type, + .get_next_lu_type = &dr_ste_v1_get_next_lu_type, + .set_miss_addr = &dr_ste_v1_set_miss_addr, + .get_miss_addr = &dr_ste_v1_get_miss_addr, + .set_hit_addr = &dr_ste_v1_set_hit_addr, + .set_byte_mask = &dr_ste_v1_set_byte_mask, + .get_byte_mask = &dr_ste_v1_get_byte_mask, + 
/* Actions */ + .set_actions_rx = &dr_ste_v1_set_actions_rx, + .set_actions_tx = &dr_ste_v1_set_actions_tx, + .modify_field_arr_sz = ARRAY_SIZE(dr_ste_v1_action_modify_field_arr), + .modify_field_arr = dr_ste_v1_action_modify_field_arr, + .set_action_set = &dr_ste_v1_set_action_set, + .set_action_add = &dr_ste_v1_set_action_add, + .set_action_copy = &dr_ste_v1_set_action_copy, + .set_action_decap_l3_list = &dr_ste_v1_set_action_decap_l3_list, + /* Send */ + .prepare_for_postsend = &dr_ste_v1_prepare_for_postsend, +}; + +struct dr_ste_ctx *dr_ste_get_ctx_v1(void) +{ + return &ste_ctx_v1; +} diff --git a/providers/mlx5/dr_table.c b/providers/mlx5/dr_table.c index 3f3a065..0e135b7 100644 --- a/providers/mlx5/dr_table.c +++ b/providers/mlx5/dr_table.c @@ -151,20 +151,19 @@ static int dr_table_init(struct mlx5dv_dr_table *tbl) static int dr_table_create_devx_tbl(struct mlx5dv_dr_table *tbl) { - uint64_t icm_addr_rx = 0; - uint64_t icm_addr_tx = 0; + struct dr_devx_flow_table_attr ft_attr = {}; + + ft_attr.type = tbl->table_type; + ft_attr.level = tbl->dmn->info.caps.max_ft_level - 1; + ft_attr.sw_owner = true; if (tbl->rx.s_anchor) - icm_addr_rx = tbl->rx.s_anchor->chunk->icm_addr; + ft_attr.icm_addr_rx = tbl->rx.s_anchor->chunk->icm_addr; if (tbl->tx.s_anchor) - icm_addr_tx = tbl->tx.s_anchor->chunk->icm_addr; + ft_attr.icm_addr_tx = tbl->tx.s_anchor->chunk->icm_addr; - tbl->devx_obj = dr_devx_create_flow_table(tbl->dmn->ctx, - tbl->table_type, - icm_addr_rx, - icm_addr_tx, - tbl->dmn->info.caps.max_ft_level - 1); + tbl->devx_obj = dr_devx_create_flow_table(tbl->dmn->ctx, &ft_attr); if (!tbl->devx_obj) return errno; diff --git a/providers/mlx5/libmlx5.map b/providers/mlx5/libmlx5.map index ef5930c..7fd90ec 100644 --- a/providers/mlx5/libmlx5.map +++ b/providers/mlx5/libmlx5.map @@ -133,3 +133,22 @@ MLX5_1.13 { mlx5dv_pp_alloc; mlx5dv_pp_free; } MLX5_1.12; + +MLX5_1.14 { + global: + mlx5dv_dr_action_create_default_miss; + mlx5dv_dr_domain_set_reclaim_device_memory; + mlx5dv_modify_qp_lag_port; + mlx5dv_query_qp_lag_port; +} MLX5_1.13; + +MLX5_1.15 { + global: + mlx5dv_dr_action_create_dest_devx_tir; +} MLX5_1.14; + +MLX5_1.16 { + global: + mlx5dv_dr_action_create_dest_array; + mlx5dv_dr_action_create_flow_sampler; +} MLX5_1.15; diff --git a/providers/mlx5/man/CMakeLists.txt b/providers/mlx5/man/CMakeLists.txt index d5f8b86..b20d4de 100644 --- a/providers/mlx5/man/CMakeLists.txt +++ b/providers/mlx5/man/CMakeLists.txt @@ -23,9 +23,11 @@ rdma_man_pages( mlx5dv_get_clock_info.3 mlx5dv_init_obj.3 mlx5dv_is_supported.3.md + mlx5dv_modify_qp_lag_port.3.md mlx5dv_open_device.3.md mlx5dv_pp_alloc.3.md mlx5dv_query_device.3 + mlx5dv_query_qp_lag_port.3.md mlx5dv_ts_to_ns.3 mlx5dv_wr_post.3.md mlx5dv.7 @@ -55,9 +57,13 @@ rdma_alias_man_pages( mlx5dv_devx_umem_reg.3 mlx5dv_devx_umem_dereg.3 mlx5dv_dr_flow.3 mlx5dv_dr_action_create_dest_table.3 mlx5dv_dr_flow.3 mlx5dv_dr_action_create_dest_ibv_qp.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_dest_devx_tir.3 mlx5dv_dr_flow.3 mlx5dv_dr_action_create_dest_vport.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_dest_array.3 mlx5dv_dr_flow.3 mlx5dv_dr_action_create_flow_counter.3 mlx5dv_dr_flow.3 mlx5dv_dr_action_create_drop.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_default_miss.3 + mlx5dv_dr_flow.3 mlx5dv_dr_action_create_flow_sampler.3 mlx5dv_dr_flow.3 mlx5dv_dr_action_create_flow_meter.3 mlx5dv_dr_flow.3 mlx5dv_dr_action_create_modify_header.3 mlx5dv_dr_flow.3 mlx5dv_dr_action_create_packet_reformat.3 @@ -67,6 +73,7 @@ rdma_alias_man_pages( mlx5dv_dr_flow.3 
mlx5dv_dr_domain_create.3 mlx5dv_dr_flow.3 mlx5dv_dr_domain_destroy.3 mlx5dv_dr_flow.3 mlx5dv_dr_domain_sync.3 + mlx5dv_dr_flow.3 mlx5dv_dr_domain_set_reclaim_device_memory.3 mlx5dv_dr_flow.3 mlx5dv_dr_matcher_create.3 mlx5dv_dr_flow.3 mlx5dv_dr_matcher_destroy.3 mlx5dv_dr_flow.3 mlx5dv_dr_rule_create.3 diff --git a/providers/mlx5/man/mlx5dv_create_flow.3.md b/providers/mlx5/man/mlx5dv_create_flow.3.md index bc423a8..1b7622a 100644 --- a/providers/mlx5/man/mlx5dv_create_flow.3.md +++ b/providers/mlx5/man/mlx5dv_create_flow.3.md @@ -61,6 +61,10 @@ struct mlx5dv_flow_action_attr { The DEVX destination object for the matched packets. MLX5DV_FLOW_ACTION_COUNTERS_DEVX The DEVX counter object for the matched packets. + MLX5DV_FLOW_ACTION_DEFAULT_MISS + Steer the packet to the default miss destination. + MLX5DV_FLOW_ACTION_DROP + Drop the matched packet. *qp* : QP passed, to be used with *type* *MLX5DV_FLOW_ACTION_DEST_IBV_QP*. diff --git a/providers/mlx5/man/mlx5dv_create_qp.3.md b/providers/mlx5/man/mlx5dv_create_qp.3.md index 856c69a..8f8c254 100644 --- a/providers/mlx5/man/mlx5dv_create_qp.3.md +++ b/providers/mlx5/man/mlx5dv_create_qp.3.md @@ -112,6 +112,13 @@ struct mlx5dv_dc_init_attr { **mlx5dv_qp_ex_from_ibv_qp_ex()** is used to get *struct mlx5dv_qp_ex* for accessing the send ops interfaces when IBV_QP_INIT_ATTR_SEND_OPS_FLAGS is used. +The MLX5DV_QP_CREATE_DISABLE_SCATTER_TO_CQE flag should be set in cases where the IOVA doesn't +match the process's VA and the message payload size is small enough to trigger the scatter to CQE +feature. + +When device memory is used, IBV_SEND_INLINE and scatter to CQE should not be used, as the memcpy +is not possible. + # RETURN VALUE **mlx5dv_create_qp()** diff --git a/providers/mlx5/man/mlx5dv_dr_flow.3.md b/providers/mlx5/man/mlx5dv_dr_flow.3.md index 6bba15d..b48a038 100644 --- a/providers/mlx5/man/mlx5dv_dr_flow.3.md +++ b/providers/mlx5/man/mlx5dv_dr_flow.3.md @@ -10,7 +10,7 @@ footer: mlx5 # NAME -mlx5dv_dr_domain_create, mlx5dv_dr_domain_sync, mlx5dv_dr_domain_destroy - Manage flow domains +mlx5dv_dr_domain_create, mlx5dv_dr_domain_sync, mlx5dv_dr_domain_destroy, mlx5dv_dr_domain_set_reclaim_device_memory - Manage flow domains mlx5dv_dr_table_create, mlx5dv_dr_table_destroy - Manage flow tables @@ -20,9 +20,13 @@ mlx5dv_dr_rule_create, mlx5dv_dr_rule_destroy - Manage flow rules mlx5dv_dr_action_create_drop - Create drop action +mlx5dv_dr_action_create_default_miss - Create default miss action + mlx5dv_dr_action_create_tag - Create tag actions -mlx5dv_dr_action_create_dest_ibv_qp, mlx5dv_dr_action_create_dest_table, mlx5dv_dr_action_create_dest_vport - Create packet destination actions +mlx5dv_dr_action_create_dest_ibv_qp, mlx5dv_dr_action_create_dest_table, mlx5dv_dr_action_create_dest_vport, mlx5dv_dr_action_create_dest_devx_tir - Create packet destination actions + +mlx5dv_dr_action_create_dest_array - Create destination array action mlx5dv_dr_action_create_packet_reformat - Create packet reformat actions @@ -32,6 +36,8 @@ mlx5dv_dr_action_create_flow_counter - Create devx flow counter actions mlx5dv_dr_action_create_flow_meter, mlx5dv_dr_action_modify_flow_meter - Create and modify meter action +mlx5dv_dr_action_create_flow_sampler - Create flow sampler action + mlx5dv_dr_action_destroy - Destroy actions # SYNOPSIS @@ -49,6 +55,10 @@ int mlx5dv_dr_domain_sync( int mlx5dv_dr_domain_destroy(struct mlx5dv_dr_domain *domain); +void mlx5dv_dr_domain_set_reclaim_device_memory( + struct mlx5dv_dr_domain *dmn, + bool enable); + struct 
mlx5dv_dr_table *mlx5dv_dr_table_create( struct mlx5dv_dr_domain *domain, uint32_t level); @@ -73,6 +83,8 @@ void mlx5dv_dr_rule_destroy(struct mlx5dv_dr_rule *rule); struct mlx5dv_dr_action *mlx5dv_dr_action_create_drop(void); +struct mlx5dv_dr_action *mlx5dv_dr_action_create_default_miss(void); + struct mlx5dv_dr_action *mlx5dv_dr_action_create_tag( uint32_t tag_value); @@ -86,6 +98,9 @@ struct mlx5dv_dr_action *mlx5dv_dr_action_create_dest_vport( struct mlx5dv_dr_domain *domain, uint32_t vport); +struct mlx5dv_dr_action *mlx5dv_dr_action_create_dest_devx_tir( + struct mlx5dv_devx_obj *devx_obj); + struct mlx5dv_dr_action *mlx5dv_dr_action_create_packet_reformat( struct mlx5dv_dr_domain *domain, uint32_t flags, @@ -109,6 +124,14 @@ int mlx5dv_dr_action_modify_flow_meter(struct mlx5dv_dr_action *action, struct mlx5dv_dr_flow_meter_attr *attr, __be64 modify_field_select); +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_flow_sampler(struct mlx5dv_dr_flow_sampler_attr *attr); + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_dest_array(struct mlx5dv_dr_domain *domain, + size_t num_dest, + struct mlx5dv_dr_action_dest_attr *dests[]); + int mlx5dv_dr_action_destroy(struct mlx5dv_dr_action *action); ``` @@ -145,6 +168,11 @@ Default behavior: Forward packet to eSwitch manager vport. **MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW**: clear the steering HW cache to enforce next packet hits the latest rules, in addition to the SW SYNC handling. +**MLX5DV_DR_DOMAIN_SYNC_FLAGS_MEM**: sync device memory to free cached memory. + + +*mlx5dv_dr_domain_set_reclaim_device_memory()* is used to enable the reclaiming of device memory back to the system when it is not in use; by default this feature is disabled. + ## Table *mlx5dv_dr_table_create()* creates a DR table in the **domain**, at the appropriate **level**, and can be used with *mlx5dv_dr_matcher_create()* and *mlx5dv_dr_action_create_dest_table()*. All packets start traversing the steering domain tree at table **level** zero (0). @@ -166,6 +194,9 @@ When an action handle is reused for multiple rules, the same action will be exec Action: Drop *mlx5dv_dr_action_create_drop* create a terminating action which drops packets. Can not be mixed with Destination actions. +Action: Default miss +*mlx5dv_dr_action_create_default_miss* creates a terminating action that executes the default behavior based on the domain type. + Action: Tag *mlx5dv_dr_action_create_tag* creates a non-terminating action which tags packets with **tag_value**. The **tag_value** is available in the CQE of the packet received. Valid only on domain type NIC_RX. @@ -173,6 +204,11 @@ Action: Destination *mlx5dv_dr_action_create_dest_ibv_qp* creates a terminating action delivering the packet to a QP, defined by **ibqp**. Valid only on domain type NIC_RX. *mlx5dv_dr_action_create_dest_table* creates a forwarding action to another flow table, defined by **table**. The destination **table** must be from the same domain with a level higher than zero. *mlx5dv_dr_action_create_dest_vport* creates a forwarding action to a **vport** on the same **domain**. Valid only on domain type FDB. +*mlx5dv_dr_action_create_dest_devx_tir* creates a terminating action delivering the packet to a TIR, defined by **devx_obj**. Valid only on domain type NIC_RX. + +Action: Array +*mlx5dv_dr_action_create_dest_array* creates an action which replicates a packet to multiple destinations. **num_dest** defines the number of replication destinations. +Each **dests** destination array entry can be of a different **type**. 
Use type MLX5DV_DR_ACTION_DEST for direct forwarding to an action destination. Use type MLX5DV_DR_ACTION_DEST_REFORMAT when a reformat action should be performed on the packet before it is forwarded to the destination action. Action: Packet Reformat *mlx5dv_dr_action_create_packet_reformat* create a packet reformat context and action in the **domain**. The **reformat_type**, **data_sz** and **data** are defined in *man mlx5dv_create_flow_action_packet_reformat*. @@ -187,6 +223,12 @@ Action: Meter *mlx5dv_dr_action_create_flow_meter* creates a meter action based on the flow meter parameters. The parameters are according to the device specification. *mlx5dv_dr_action_modify_flow_meter* modifies existing flow meter **action** based on **modify_field_select**. **modify_field_select** is according to the device specification. +Action: Sampler +*mlx5dv_dr_action_create_flow_sampler* creates a sampler action, which duplicates and samples a portion of the traffic. +Packets steered to the sampler action will be sampled with an approximate probability of 1/sample_ratio provided in **attr**, and the sample_actions provided in **attr** will be executed over them. +All original packets will be steered to default_next_table in **attr**. +Modify-header SET_ACTION data can be provided in the action field of **attr**; it is executed on packets before they go to the default flow table. On some devices, this is required to set a register value. + Action Flags: action **flags** can be set to one of the types of *enum mlx5dv_dr_action_flags*: **MLX5DV_DR_ACTION_FLAGS_ROOT_LEVEL**: is used to indicate the action is targeted for flow table in level=0 (ROOT) of the specific domain. diff --git a/providers/mlx5/man/mlx5dv_modify_qp_lag_port.3.md b/providers/mlx5/man/mlx5dv_modify_qp_lag_port.3.md new file mode 100644 index 0000000..e42a979 --- /dev/null +++ b/providers/mlx5/man/mlx5dv_modify_qp_lag_port.3.md @@ -0,0 +1,45 @@ +--- +layout: page +title: mlx5dv_modify_qp_lag_port +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_modify_qp_lag_port - Modify the lag port information of a given QP + +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +int mlx5dv_modify_qp_lag_port(struct ibv_qp *qp, uint8_t port_num); +``` + +# DESCRIPTION + +This API enables modifying the configured port num of a given QP. + +If the QP state is modified later, the port num may be implicitly re-configured. + +Use *mlx5dv_query_qp_lag_port(3)* to check the configured and active port num values. + +# ARGUMENTS + +*qp* +: The ibv_qp object to issue the action on. + +*port_num* +: The port_num to set for the QP. + +# RETURN VALUE +0 on success; EOPNOTSUPP if not in LAG mode, or another errno value on other failures. 
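As a usage illustration (a minimal sketch, not part of this patch), the two LAG-port calls introduced here combine as follows. It assumes *qp* belongs to an mlx5 device configured in LAG mode; the helper name pin_qp_lag_port is ours:

```c
#include <stdio.h>
#include <infiniband/mlx5dv.h>

/* Sketch: pin a QP's egress to one LAG port, then read back both the
 * configured and the currently active port. */
static int pin_qp_lag_port(struct ibv_qp *qp, uint8_t port)
{
	uint8_t configured, active;
	int ret;

	ret = mlx5dv_modify_qp_lag_port(qp, port);
	if (ret)
		return ret; /* e.g. EOPNOTSUPP when the device is not in LAG mode */

	ret = mlx5dv_query_qp_lag_port(qp, &configured, &active);
	if (!ret)
		printf("configured port %u, active port %u\n", configured, active);
	return ret;
}
```

Even after a successful modify, the active port may differ from the configured one when the bond fails over, as described in *mlx5dv_query_qp_lag_port(3)*.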
+ +# SEE ALSO + +*mlx5dv_query_qp_lag_port(3)* + +# AUTHOR + +Aharon Landau diff --git a/providers/mlx5/man/mlx5dv_query_device.3 b/providers/mlx5/man/mlx5dv_query_device.3 index d005552..26a1b8a 100644 --- a/providers/mlx5/man/mlx5dv_query_device.3 +++ b/providers/mlx5/man/mlx5dv_query_device.3 @@ -84,6 +84,7 @@ MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE = 1 << 5, MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS = 1 << 6, MLX5DV_CONTEXT_MASK_DC_ODP_CAPS = 1 << 7, MLX5DV_CONTEXT_MASK_HCA_CORE_CLOCK = 1 << 8, +MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS = 1 << 9, .in -8 }; diff --git a/providers/mlx5/man/mlx5dv_query_qp_lag_port.3.md b/providers/mlx5/man/mlx5dv_query_qp_lag_port.3.md new file mode 100644 index 0000000..10fb7f9 --- /dev/null +++ b/providers/mlx5/man/mlx5dv_query_qp_lag_port.3.md @@ -0,0 +1,49 @@ +--- +layout: page +title: mlx5dv_query_qp_lag_port +section: 3 +tagline: Verbs +--- + +# NAME + +mlx5dv_query_qp_lag_port - Query the lag port information of a given QP + +# SYNOPSIS + +```c +#include <infiniband/mlx5dv.h> + +int mlx5dv_query_qp_lag_port(struct ibv_qp *qp, uint8_t *port_num, + uint8_t *active_port_num); +``` + +# DESCRIPTION + +This API returns the configured and active port num of a given QP in mlx5 devices. + +The active port num indicates which port the QP sends traffic out of in a LAG configuration. + +A num_lag_ports field of struct mlx5dv_context greater than 1 means LAG is supported on this device. + +# ARGUMENTS + +*qp* +: The ibv_qp object to issue the action on. + +*port_num* +: The configured port num of the QP. + +*active_port_num* +: The current port num of the QP, which may differ from the configured value depending on the bonding status. + +# RETURN VALUE +0 on success; EOPNOTSUPP if not in LAG mode, or another errno value on other failures. + +# SEE ALSO + +*mlx5dv_modify_qp_lag_port(3)* + +# AUTHOR + +Aharon Landau diff --git a/providers/mlx5/mlx5-abi.h b/providers/mlx5/mlx5-abi.h index 2b66e82..f0c45eb 100644 --- a/providers/mlx5/mlx5-abi.h +++ b/providers/mlx5/mlx5-abi.h @@ -90,7 +90,7 @@ struct mlx5_modify_qp { struct ibv_modify_qp_ex ibv_cmd; __u32 comp_mask; struct mlx5_ib_burst_info burst_info; - __u32 reserved; + __u32 ece_options; }; #endif /* MLX5_ABI_H */ diff --git a/providers/mlx5/mlx5.c b/providers/mlx5/mlx5.c index 85ad9af..0a091f5 100644 --- a/providers/mlx5/mlx5.c +++ b/providers/mlx5/mlx5.c @@ -43,6 +43,7 @@ #include #include +#include #include "mlx5.h" #include "mlx5-abi.h" @@ -51,10 +52,6 @@ static void mlx5_free_context(struct ibv_context *ibctx); -#ifndef PCI_VENDOR_ID_MELLANOX -#define PCI_VENDOR_ID_MELLANOX 0x15b3 -#endif - #ifndef CPU_OR #define CPU_OR(x, y, z) do {} while (0) #endif @@ -147,6 +144,8 @@ static const struct verbs_context_ops mlx5_ctx_common_ops = { .destroy_wq = mlx5_destroy_wq, .free_dm = mlx5_free_dm, .get_srq_num = mlx5_get_srq_num, + .import_mr = mlx5_import_mr, + .import_pd = mlx5_import_pd, .modify_cq = mlx5_modify_cq, .modify_flow_action_esp = mlx5_modify_flow_action_esp, .modify_qp_rate_limit = mlx5_modify_qp_rate_limit, @@ -155,11 +154,15 @@ static const struct verbs_context_ops mlx5_ctx_common_ops = { .open_xrcd = mlx5_open_xrcd, .post_srq_ops = mlx5_post_srq_ops, .query_device_ex = mlx5_query_device_ex, + .query_ece = mlx5_query_ece, .query_rt_values = mlx5_query_rt_values, .read_counters = mlx5_read_counters, .reg_dm_mr = mlx5_reg_dm_mr, .alloc_null_mr = mlx5_alloc_null_mr, .free_context = mlx5_free_context, + .set_ece = mlx5_set_ece, + .unimport_mr = mlx5_unimport_mr, + .unimport_pd = mlx5_unimport_pd, }; static const struct 
verbs_context_ops mlx5_ctx_cqev1_ops = { @@ -797,6 +800,13 @@ int mlx5dv_query_device(struct ibv_context *ctx_in, } } + if (attrs_out->comp_mask & MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS) { + if (mctx->lag_caps.num_lag_ports) { + attrs_out->num_lag_ports = mctx->lag_caps.num_lag_ports; + comp_mask_out |= MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS; + } + } + attrs_out->comp_mask = comp_mask_out; return 0; @@ -866,7 +876,7 @@ static int mlx5dv_get_cq(struct ibv_cq *cq_in, cq_out->comp_mask = 0; cq_out->cqn = mcq->cqn; - cq_out->cqe_cnt = mcq->ibv_cq.cqe + 1; + cq_out->cqe_cnt = mcq->verbs_cq.cq.cqe + 1; cq_out->cqe_size = mcq->cqe_sz; cq_out->buf = mcq->active_buf->buf; cq_out->dbrec = mcq->dbrec; @@ -970,6 +980,151 @@ static int mlx5dv_get_pd(struct ibv_pd *pd_in, return 0; } +static int query_lag(struct ibv_context *ctx, uint8_t *lag_state, + uint8_t *tx_remap_affinity_1, + uint8_t *tx_remap_affinity_2) +{ + uint32_t out_lag[DEVX_ST_SZ_DW(query_lag_out)] = {}; + uint32_t in_lag[DEVX_ST_SZ_DW(query_lag_in)] = {}; + int ret; + + DEVX_SET(query_lag_in, in_lag, opcode, MLX5_CMD_OP_QUERY_LAG); + ret = mlx5dv_devx_general_cmd(ctx, in_lag, sizeof(in_lag), out_lag, + sizeof(out_lag)); + if (ret) + return ret; + + *lag_state = DEVX_GET(query_lag_out, out_lag, ctx.lag_state); + if (tx_remap_affinity_1) + *tx_remap_affinity_1 = DEVX_GET(query_lag_out, out_lag, + ctx.tx_remap_affinity_1); + if (tx_remap_affinity_2) + *tx_remap_affinity_2 = DEVX_GET(query_lag_out, out_lag, + ctx.tx_remap_affinity_2); + + return 0; +} + +static bool lag_operation_supported(struct ibv_qp *qp) +{ + struct mlx5_context *mctx = to_mctx(qp->context); + struct mlx5_qp *mqp = to_mqp(qp); + + if (!is_mlx5_dev(qp->context->device) || + (mctx->lag_caps.num_lag_ports <= 1)) + return false; + + if ((qp->qp_type == IBV_QPT_RC) || + (qp->qp_type == IBV_QPT_UD) || + (qp->qp_type == IBV_QPT_UC) || + (qp->qp_type == IBV_QPT_RAW_PACKET) || + (qp->qp_type == IBV_QPT_XRC_SEND) || + ((qp->qp_type == IBV_QPT_DRIVER) && + (mqp->dc_type == MLX5DV_DCTYPE_DCI))) + return true; + + return false; +} + + +int mlx5dv_query_qp_lag_port(struct ibv_qp *qp, uint8_t *port_num, + uint8_t *active_port_num) +{ + uint8_t lag_state, tx_remap_affinity_1, tx_remap_affinity_2; + uint32_t in_tis[DEVX_ST_SZ_DW(query_tis_in)] = {}; + uint32_t out_tis[DEVX_ST_SZ_DW(query_tis_out)] = {}; + uint32_t in_qp[DEVX_ST_SZ_DW(query_qp_in)] = {}; + uint32_t out_qp[DEVX_ST_SZ_DW(query_qp_out)] = {}; + struct mlx5_context *mctx = to_mctx(qp->context); + struct mlx5_qp *mqp = to_mqp(qp); + int ret; + + if (!lag_operation_supported(qp)) + return EOPNOTSUPP; + + ret = query_lag(qp->context, &lag_state, + &tx_remap_affinity_1, &tx_remap_affinity_2); + if (ret) + return ret; + + if (!lag_state && !mctx->lag_caps.lag_tx_port_affinity) + return EOPNOTSUPP; + + switch (qp->qp_type) { + case IBV_QPT_RAW_PACKET: + DEVX_SET(query_tis_in, in_tis, opcode, MLX5_CMD_OP_QUERY_TIS); + DEVX_SET(query_tis_in, in_tis, tisn, mqp->tisn); + ret = mlx5dv_devx_qp_query(qp, in_tis, sizeof(in_tis), out_tis, + sizeof(out_tis)); + if (ret) + return ret; + + *port_num = DEVX_GET(query_tis_out, out_tis, + tis_context.lag_tx_port_affinity); + break; + + default: + DEVX_SET(query_qp_in, in_qp, opcode, MLX5_CMD_OP_QUERY_QP); + DEVX_SET(query_qp_in, in_qp, qpn, qp->qp_num); + ret = mlx5dv_devx_qp_query(qp, in_qp, sizeof(in_qp), out_qp, + sizeof(out_qp)); + if (ret) + return ret; + + *port_num = DEVX_GET(query_qp_out, out_qp, + qpc.lag_tx_port_affinity); + break; + } + + switch (*port_num) { + case 1: + *active_port_num = 
tx_remap_affinity_1; + break; + + case 2: + *active_port_num = tx_remap_affinity_2; + break; + + default: + return EOPNOTSUPP; + } + + return 0; +} + +int mlx5dv_modify_qp_lag_port(struct ibv_qp *qp, uint8_t port_num) +{ + uint32_t in[DEVX_ST_SZ_DW(modify_tis_in)] = {}; + uint32_t out[DEVX_ST_SZ_DW(modify_tis_out)] = {}; + uint8_t curr_configured, curr_active; + struct mlx5_qp *mqp = to_mqp(qp); + int ret; + + /* Query lag port to see if we are at all in lag mode, otherwise FW + * might return success and ignore the modification. + */ + ret = mlx5dv_query_qp_lag_port(qp, &curr_configured, &curr_active); + if (ret) + return ret; + + switch (qp->qp_type) { + case IBV_QPT_RAW_PACKET: + DEVX_SET(modify_tis_in, in, opcode, MLX5_CMD_OP_MODIFY_TIS); + DEVX_SET(modify_tis_in, in, tisn, mqp->tisn); + DEVX_SET(modify_tis_in, in, bitmask.lag_tx_port_affinity, 1); + DEVX_SET(modify_tis_in, in, ctx.lag_tx_port_affinity, port_num); + ret = mlx5dv_devx_qp_modify(qp, in, sizeof(in), out, + sizeof(out)); + break; + + default: + ret = EOPNOTSUPP; + break; + } + + return ret; +} + LATEST_SYMVER_FUNC(mlx5dv_init_obj, 1_2, "MLX5_1.2", int, struct mlx5dv_obj *obj, uint64_t obj_type) @@ -1118,17 +1273,17 @@ repeat: static void adjust_uar_info(struct mlx5_device *mdev, struct mlx5_context *context, - struct mlx5_alloc_ucontext_resp resp) + struct mlx5_ib_alloc_ucontext_resp *resp) { - if (!resp.log_uar_size && !resp.num_uars_per_page) { + if (!resp->log_uar_size && !resp->num_uars_per_page) { /* old kernel */ context->uar_size = mdev->page_size; context->num_uars_per_page = 1; return; } - context->uar_size = 1 << resp.log_uar_size; - context->num_uars_per_page = resp.num_uars_per_page; + context->uar_size = 1 << resp->log_uar_size; + context->num_uars_per_page = resp->num_uars_per_page; } bool mlx5dv_is_supported(struct ibv_device *device) @@ -1147,117 +1302,109 @@ mlx5dv_open_device(struct ibv_device *device, struct mlx5dv_context_attr *attr) return verbs_open_device(device, attr); } -static struct verbs_context *mlx5_alloc_context(struct ibv_device *ibdev, +static int get_uar_info(struct mlx5_device *mdev, + int *tot_uuars, int *low_lat_uuars) +{ + *tot_uuars = get_total_uuars(mdev->page_size); + if (*tot_uuars < 0) { + errno = -*tot_uuars; + return -1; + } + + *low_lat_uuars = get_num_low_lat_uuars(*tot_uuars); + if (*low_lat_uuars < 0) { + errno = -*low_lat_uuars; + return -1; + } + + if (*low_lat_uuars > *tot_uuars - 1) { + errno = ENOMEM; + return -1; + } + + return 0; +} + +static void mlx5_uninit_context(struct mlx5_context *context) +{ + close_debug_file(context); + + verbs_uninit_context(&context->ibv_ctx); + free(context); +} + +static struct mlx5_context *mlx5_init_context(struct ibv_device *ibdev, int cmd_fd, void *private_data) { - struct mlx5_context *context; - struct mlx5_alloc_ucontext req; - struct mlx5_alloc_ucontext_resp resp; - int i; - int page_size; - int tot_uuars; - int low_lat_uuars; - int gross_uuars; - int j; - struct mlx5_device *mdev = to_mdev(ibdev); - struct verbs_context *v_ctx; - struct ibv_port_attr port_attr; - struct ibv_device_attr_ex device_attr; - int k; - int bfi; - int num_sys_page_map; - struct mlx5dv_context_attr *ctx_attr = private_data; - bool always_devx = false; + struct mlx5dv_context_attr *ctx_attr = private_data; + struct mlx5_device *mdev = to_mdev(ibdev); + struct mlx5_context *context; + int low_lat_uuars; + int tot_uuars; + int ret; if (ctx_attr && ctx_attr->comp_mask) { errno = EINVAL; return NULL; } + ret = get_uar_info(mdev, &tot_uuars, &low_lat_uuars); + 
if (ret) + return NULL; + context = verbs_init_and_alloc_context(ibdev, cmd_fd, context, ibv_ctx, RDMA_DRIVER_MLX5); if (!context) return NULL; - v_ctx = &context->ibv_ctx; - page_size = mdev->page_size; - mlx5_single_threaded = single_threaded_app(); - open_debug_file(context); set_debug_mask(); set_freeze_on_error(); if (gethostname(context->hostname, sizeof(context->hostname))) strcpy(context->hostname, "host_unknown"); - tot_uuars = get_total_uuars(page_size); - if (tot_uuars < 0) { - errno = -tot_uuars; - goto err_free; - } - - low_lat_uuars = get_num_low_lat_uuars(tot_uuars); - if (low_lat_uuars < 0) { - errno = -low_lat_uuars; - goto err_free; - } - - if (low_lat_uuars > tot_uuars - 1) { - errno = ENOMEM; - goto err_free; - } - - memset(&req, 0, sizeof(req)); - memset(&resp, 0, sizeof(resp)); - - req.total_num_bfregs = tot_uuars; - req.num_low_latency_bfregs = low_lat_uuars; - req.max_cqe_version = MLX5_CQE_VERSION_V1; - req.lib_caps |= (MLX5_LIB_CAP_4K_UAR | MLX5_LIB_CAP_DYN_UAR); - if (ctx_attr && ctx_attr->flags) { - - if (!check_comp_mask(ctx_attr->flags, - MLX5DV_CONTEXT_FLAGS_DEVX)) { - errno = EINVAL; - goto err_free; - } - - req.flags = MLX5_IB_ALLOC_UCTX_DEVX; - } else { - req.flags = MLX5_IB_ALLOC_UCTX_DEVX; - always_devx = true; - } + mlx5_single_threaded = single_threaded_app(); + context->tot_uuars = tot_uuars; + context->low_lat_uuars = low_lat_uuars; -retry_open: - if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp, - sizeof(resp))) { - if (always_devx) { - req.flags &= ~MLX5_IB_ALLOC_UCTX_DEVX; - always_devx = false; - memset(&resp, 0, sizeof(resp)); - goto retry_open; - } else { - goto err_free; - } - } + return context; +} - context->max_num_qps = resp.qp_tab_size; - context->bf_reg_size = resp.bf_reg_size; - context->tot_uuars = resp.tot_bfregs; - context->low_lat_uuars = low_lat_uuars; - context->cache_line_size = resp.cache_line_size; - context->max_sq_desc_sz = resp.max_sq_desc_sz; - context->max_rq_desc_sz = resp.max_rq_desc_sz; - context->max_send_wqebb = resp.max_send_wqebb; - context->num_ports = resp.num_ports; - context->max_recv_wr = resp.max_recv_wr; - context->max_srq_recv_wr = resp.max_srq_recv_wr; - context->num_dyn_bfregs = resp.num_dyn_bfregs; - - if (resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY) { - context->dump_fill_mkey = resp.dump_fill_mkey; +static int mlx5_set_context(struct mlx5_context *context, + struct mlx5_ib_alloc_ucontext_resp *resp, + bool is_import) +{ + struct verbs_context *v_ctx = &context->ibv_ctx; + struct ibv_port_attr port_attr = {}; + struct ibv_device_attr_ex device_attr = {}; + int cmd_fd = v_ctx->context.cmd_fd; + struct mlx5_device *mdev = to_mdev(v_ctx->context.device); + struct ibv_device *ibdev = v_ctx->context.device; + int page_size = mdev->page_size; + int num_sys_page_map; + int gross_uuars; + int bfi; + int i, k, j; + + context->max_num_qps = resp->qp_tab_size; + context->bf_reg_size = resp->bf_reg_size; + context->cache_line_size = resp->cache_line_size; + context->max_sq_desc_sz = resp->max_sq_desc_sz; + context->max_rq_desc_sz = resp->max_rq_desc_sz; + context->max_send_wqebb = resp->max_send_wqebb; + context->num_ports = resp->num_ports; + context->max_recv_wr = resp->max_recv_wr; + context->max_srq_recv_wr = resp->max_srq_recv_wr; + context->num_dyn_bfregs = resp->num_dyn_bfregs; + + if (resp->comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_ECE) + context->flags |= MLX5_CTX_FLAGS_ECE_SUPPORTED; + + if (resp->comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_DUMP_FILL_MKEY) { + 
context->dump_fill_mkey = resp->dump_fill_mkey; /* Have the BE value ready to be used in data path */ - context->dump_fill_mkey_be = htobe32(resp.dump_fill_mkey); + context->dump_fill_mkey_be = htobe32(resp->dump_fill_mkey); } else { /* kernel driver will never return MLX5_INVALID_LKEY for * dump_fill_mkey @@ -1266,19 +1413,18 @@ retry_open: context->dump_fill_mkey_be = htobe32(MLX5_INVALID_LKEY); } - context->cqe_version = resp.cqe_version; - + context->cqe_version = resp->cqe_version; adjust_uar_info(mdev, context, resp); - context->cmds_supp_uhw = resp.cmds_supp_uhw; + context->cmds_supp_uhw = resp->cmds_supp_uhw; context->vendor_cap_flags = 0; list_head_init(&context->dyn_uar_bf_list); list_head_init(&context->dyn_uar_nc_list); list_head_init(&context->dyn_uar_qp_shared_list); list_head_init(&context->dyn_uar_qp_dedicated_list); - if (resp.eth_min_inline) - context->eth_min_inline_size = (resp.eth_min_inline == MLX5_USER_INLINE_MODE_NONE) ? + if (resp->eth_min_inline) + context->eth_min_inline_size = (resp->eth_min_inline == MLX5_USER_INLINE_MODE_NONE) ? 0 : MLX5_ETH_L2_INLINE_HEADER_SIZE; else context->eth_min_inline_size = MLX5_ETH_L2_INLINE_HEADER_SIZE; @@ -1300,7 +1446,12 @@ retry_open: context->prefer_bf = get_always_bf(); context->shut_up_bf = get_shut_up_bf(); - if (context->tot_uuars) { + if (resp->tot_bfregs) { + if (is_import) { + errno = EINVAL; + return EINVAL; + } + context->tot_uuars = resp->tot_bfregs; gross_uuars = context->tot_uuars / MLX5_NUM_NON_FP_BFREGS_PER_UAR * NUM_BFREGS_PER_UAR; context->bfs = calloc(gross_uuars, sizeof(*context->bfs)); if (!context->bfs) { @@ -1309,8 +1460,8 @@ retry_open: } context->flags |= MLX5_CTX_FLAGS_NO_KERN_DYN_UAR; } else { - context->qp_max_dedicated_uuars = low_lat_uuars; - context->qp_max_shared_uuars = tot_uuars - low_lat_uuars; + context->qp_max_dedicated_uuars = context->low_lat_uuars; + context->qp_max_shared_uuars = context->tot_uuars - context->low_lat_uuars; goto bf_done; } @@ -1338,9 +1489,9 @@ retry_open: if (bfi) context->bfs[bfi].buf_size = context->bf_reg_size / 2; context->bfs[bfi].uuarn = bfi; - context->bfs[bfi].uar_mmap_offset = get_uar_mmap_offset(i, - page_size, - uar_type_to_cmd(context->uar[i].type)); + context->bfs[bfi].uar_mmap_offset = + get_uar_mmap_offset(i, page_size, + uar_type_to_cmd(context->uar[i].type)); } } } @@ -1348,23 +1499,16 @@ retry_open: bf_done: context->hca_core_clock = NULL; - if (resp.response_length + sizeof(resp.ibv_resp) >= - offsetof(struct mlx5_alloc_ucontext_resp, hca_core_clock_offset) + - sizeof(resp.hca_core_clock_offset) && - resp.comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) { - context->core_clock.offset = resp.hca_core_clock_offset; + if (resp->comp_mask & MLX5_IB_ALLOC_UCONTEXT_RESP_MASK_CORE_CLOCK_OFFSET) { + context->core_clock.offset = resp->hca_core_clock_offset; mlx5_map_internal_clock(mdev, &v_ctx->context); } context->clock_info_page = NULL; - if (resp.response_length + sizeof(resp.ibv_resp) >= - offsetof(struct mlx5_alloc_ucontext_resp, clock_info_versions) + - sizeof(resp.clock_info_versions) && - (resp.clock_info_versions & (1 << MLX5_IB_CLOCK_INFO_V1))) { + if ((resp->clock_info_versions & (1 << MLX5_IB_CLOCK_INFO_V1))) mlx5_map_clock_info(mdev, &v_ctx->context); - } - context->flow_action_flags = resp.flow_action_flags; + context->flow_action_flags = resp->flow_action_flags; mlx5_read_env(ibdev, context); @@ -1379,7 +1523,6 @@ bf_done: goto err_free; } - memset(&device_attr, 0, sizeof(device_attr)); if (!mlx5_query_device_ex(&v_ctx->context, 
NULL, &device_attr, sizeof(struct ibv_device_attr_ex))) { context->cached_device_cap_flags = @@ -1401,7 +1544,7 @@ bf_done: MLX5_IB_UAPI_UAR_ALLOC_TYPE_NC); context->cq_uar_reg = context->cq_uar ? context->cq_uar->uar : context->uar[0].reg; - return v_ctx; + return 0; err_free_bf: free(context->bfs); @@ -1411,10 +1554,98 @@ err_free: if (context->uar[i].reg) munmap(context->uar[i].reg, page_size); } - close_debug_file(context); - verbs_uninit_context(&context->ibv_ctx); - free(context); + return -1; +} + +static struct verbs_context *mlx5_alloc_context(struct ibv_device *ibdev, + int cmd_fd, + void *private_data) +{ + struct mlx5_context *context; + struct mlx5_alloc_ucontext req = {}; + struct mlx5_alloc_ucontext_resp resp = {}; + struct mlx5dv_context_attr *ctx_attr = private_data; + bool always_devx = false; + int ret; + + context = mlx5_init_context(ibdev, cmd_fd, NULL); + if (!context) + return NULL; + + req.total_num_bfregs = context->tot_uuars; + req.num_low_latency_bfregs = context->low_lat_uuars; + req.max_cqe_version = MLX5_CQE_VERSION_V1; + req.lib_caps |= (MLX5_LIB_CAP_4K_UAR | MLX5_LIB_CAP_DYN_UAR); + if (ctx_attr && ctx_attr->flags) { + + if (!check_comp_mask(ctx_attr->flags, + MLX5DV_CONTEXT_FLAGS_DEVX)) { + errno = EINVAL; + goto err; + } + + req.flags = MLX5_IB_ALLOC_UCTX_DEVX; + } else { + req.flags = MLX5_IB_ALLOC_UCTX_DEVX; + always_devx = true; + } + +retry_open: + if (mlx5_cmd_get_context(context, &req, sizeof(req), &resp, + sizeof(resp))) { + if (always_devx) { + req.flags &= ~MLX5_IB_ALLOC_UCTX_DEVX; + always_devx = false; + memset(&resp, 0, sizeof(resp)); + goto retry_open; + } else { + goto err; + } + } + + ret = mlx5_set_context(context, &resp.drv_payload, false); + if (ret) + goto err; + + return &context->ibv_ctx; + +err: + mlx5_uninit_context(context); + return NULL; +} + +static struct verbs_context *mlx5_import_context(struct ibv_device *ibdev, + int cmd_fd) + +{ + struct mlx5_ib_alloc_ucontext_resp resp = {}; + DECLARE_COMMAND_BUFFER_LINK(driver_attr, UVERBS_OBJECT_DEVICE, + UVERBS_METHOD_QUERY_CONTEXT, 1, + NULL); + struct ibv_context *context; + struct mlx5_context *mctx; + int ret; + + mctx = mlx5_init_context(ibdev, cmd_fd, NULL); + if (!mctx) + return NULL; + + context = &mctx->ibv_ctx.context; + + fill_attr_out_ptr(driver_attr, MLX5_IB_ATTR_QUERY_CONTEXT_RESP_UCTX, &resp); + ret = ibv_cmd_query_context(context, driver_attr); + if (ret) + goto err; + + ret = mlx5_set_context(mctx, &resp, true); + if (ret) + goto err; + + return &mctx->ibv_ctx; + +err: + mlx5_uninit_context(mctx); return NULL; } @@ -1470,6 +1701,7 @@ static const struct verbs_device_ops mlx5_dev_ops = { .alloc_device = mlx5_device_alloc, .uninit_device = mlx5_uninit_device, .alloc_context = mlx5_alloc_context, + .import_context = mlx5_import_context, }; bool is_mlx5_dev(struct ibv_device *device) diff --git a/providers/mlx5/mlx5.h b/providers/mlx5/mlx5.h index 54a9e1c..faa15b1 100644 --- a/providers/mlx5/mlx5.h +++ b/providers/mlx5/mlx5.h @@ -51,6 +51,10 @@ #define PFX "mlx5: " +#ifndef PCI_VENDOR_ID_MELLANOX +#define PCI_VENDOR_ID_MELLANOX 0x15b3 +#endif + typedef _Atomic(uint32_t) atomic_uint32_t; enum { @@ -233,6 +237,12 @@ struct mlx5_uar_info { enum mlx5_ctx_flags { MLX5_CTX_FLAGS_FATAL_STATE = 1 << 0, MLX5_CTX_FLAGS_NO_KERN_DYN_UAR = 1 << 1, + MLX5_CTX_FLAGS_ECE_SUPPORTED = 1 << 2, +}; + +struct mlx5_lag_caps { + uint8_t num_lag_ports; + uint8_t lag_tx_port_affinity; }; struct mlx5_context { @@ -303,6 +313,7 @@ struct mlx5_context { struct mlx5dv_striding_rq_caps striding_rq_caps; 
uint32_t tunnel_offloads_caps; struct mlx5_packet_pacing_caps packet_pacing_caps; + struct mlx5_lag_caps lag_caps; pthread_mutex_t dyn_bfregs_mutex; /* protects the dynamic bfregs allocation */ uint32_t num_dyn_bfregs; uint32_t max_num_legacy_dyn_uar_sys_page; @@ -391,8 +402,7 @@ enum { }; struct mlx5_cq { - /* ibv_cq should always be subset of ibv_cq_ex */ - struct ibv_cq_ex ibv_cq; + struct verbs_cq verbs_cq; struct mlx5_buf buf_a; struct mlx5_buf buf_b; struct mlx5_buf *active_buf; @@ -538,7 +548,6 @@ struct mlx5_dm { struct mlx5_mr { struct verbs_mr vmr; - struct mlx5_buf buf; uint32_t alloc_flags; }; @@ -593,6 +602,19 @@ struct mlx5_qp { uint32_t rqn; uint32_t sqn; uint64_t tir_icm_addr; + /* + * ECE configuration is done in create/modify QP stages, + * so this value is cached version of the requested ECE prior + * to its execution. This field will be cleared after successful + * call to relevant "executor". + */ + uint32_t set_ece; + /* + * This field indicates returned ECE options from the device + * as were received from the HW in previous stage. Every + * write to the set_ece will clear this field. + */ + uint32_t get_ece; }; struct mlx5_ah { @@ -645,6 +667,10 @@ enum mlx5_devx_obj_type { MLX5_DEVX_FLOW_METER = 3, MLX5_DEVX_QP = 4, MLX5_DEVX_PKT_REFORMAT_CTX = 5, + MLX5_DEVX_TIR = 6, + MLX5_DEVX_FLOW_GROUP = 7, + MLX5_DEVX_FLOW_TABLE_ENTRY = 8, + MLX5_DEVX_FLOW_SAMPLER = 9, }; struct mlx5dv_devx_obj { @@ -652,6 +678,7 @@ struct mlx5dv_devx_obj { uint32_t handle; enum mlx5_devx_obj_type type; uint32_t object_id; + uint64_t rx_icm_addr; }; struct mlx5_var_obj { @@ -670,6 +697,8 @@ struct mlx5_devx_umem { struct mlx5dv_devx_umem dv_devx_umem; struct ibv_context *context; uint32_t handle; + void *addr; + size_t size; }; struct mlx5_mkey { @@ -743,7 +772,7 @@ static inline struct mlx5_parent_domain *to_mparent_domain(struct ibv_pd *ibpd) static inline struct mlx5_cq *to_mcq(struct ibv_cq *ibcq) { - return container_of((struct ibv_cq_ex *)ibcq, struct mlx5_cq, ibv_cq); + return container_of(ibcq, struct mlx5_cq, verbs_cq.cq); } static inline struct mlx5_srq *to_msrq(struct ibv_srq *ibsrq) @@ -1018,6 +1047,12 @@ int mlx5_advise_mr(struct ibv_pd *pd, uint32_t flags, struct ibv_sge *sg_list, uint32_t num_sges); +struct ibv_mr *mlx5_import_mr(struct ibv_pd *pd, + uint32_t mr_handle); +void mlx5_unimport_mr(struct ibv_mr *mr); +struct ibv_pd *mlx5_import_pd(struct ibv_context *context, + uint32_t pd_handle); +void mlx5_unimport_pd(struct ibv_pd *pd); int mlx5_qp_fill_wr_pfns(struct mlx5_qp *mqp, const struct ibv_qp_init_attr_ex *attr, const struct mlx5dv_qp_init_attr *mlx5_attr); @@ -1025,6 +1060,9 @@ void clean_dyn_uars(struct ibv_context *context); struct mlx5_bf *mlx5_attach_dedicated_uar(struct ibv_context *context, uint32_t flags); +int mlx5_set_ece(struct ibv_qp *qp, struct ibv_ece *ece); +int mlx5_query_ece(struct ibv_qp *qp, struct ibv_ece *ece); + static inline void *mlx5_find_uidx(struct mlx5_context *ctx, uint32_t uidx) { int tind = uidx >> MLX5_UIDX_TABLE_SHIFT; diff --git a/providers/mlx5/mlx5_ifc.h b/providers/mlx5/mlx5_ifc.h index 79acde9..dc83777 100644 --- a/providers/mlx5/mlx5_ifc.h +++ b/providers/mlx5/mlx5_ifc.h @@ -47,9 +47,17 @@ enum { MLX5_CMD_OP_INIT2RTR_QP = 0x503, MLX5_CMD_OP_RTR2RTS_QP = 0x504, MLX5_CMD_OP_RTS2RTS_QP = 0x505, + MLX5_CMD_OP_QUERY_QP = 0x50b, MLX5_CMD_OP_QUERY_ESW_VPORT_CONTEXT = 0x752, MLX5_CMD_OP_QUERY_ROCE_ADDRESS = 0x760, + MLX5_CMD_OP_QUERY_LAG = 0x842, + MLX5_CMD_OP_CREATE_TIR = 0x900, + MLX5_CMD_OP_MODIFY_TIS = 0x913, + MLX5_CMD_OP_QUERY_TIS = 
0x915, MLX5_CMD_OP_CREATE_FLOW_TABLE = 0x930, + MLX5_CMD_OP_QUERY_FLOW_TABLE = 0x932, + MLX5_CMD_OP_CREATE_FLOW_GROUP = 0x933, + MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY = 0x936, MLX5_CMD_OP_CREATE_FLOW_COUNTER = 0x939, MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT = 0x93d, MLX5_CMD_OP_DEALLOC_PACKET_REFORMAT_CONTEXT = 0x93e, @@ -88,11 +96,23 @@ struct mlx5_ifc_atomic_caps_bits { u8 reserved_at_2b0[0x550]; }; +struct mlx5_ifc_roce_cap_bits { + u8 reserved_0[0x6]; + u8 fl_rc_qp_when_roce_enabled[0x1]; + u8 reserved_at_7[0x19]; + + u8 reserved_at_20[0x7e0]; +}; + +enum { + MLX5_MULTI_PATH_FT_MAX_LEVEL = 64, +}; + struct mlx5_ifc_flow_table_context_bits { u8 reformat_en[0x1]; u8 decap_en[0x1]; u8 sw_owner[0x1]; - u8 reserved_at_3[0x1]; + u8 termination_table[0x1]; u8 table_miss_action[0x4]; u8 level[0x8]; u8 reserved_at_10[0x8]; @@ -144,6 +164,35 @@ struct mlx5_ifc_create_flow_table_out_bits { u8 icm_address_31_0[0x20]; }; +struct mlx5_ifc_query_flow_table_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x140]; +}; + +struct mlx5_ifc_query_flow_table_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x80]; + + struct mlx5_ifc_flow_table_context_bits flow_table_context; +}; + struct mlx5_ifc_sync_steering_in_bits { u8 opcode[0x10]; u8 uid[0x10]; @@ -180,7 +229,9 @@ struct mlx5_ifc_device_mem_cap_bits { u8 steering_sw_icm_start_address[0x40]; - u8 reserved_at_100[0x12]; + u8 reserved_at_100[0x8]; + u8 log_header_modify_sw_icm_size[0x8]; + u8 reserved_at_110[0x2]; u8 log_sw_icm_alloc_granularity[0x6]; u8 log_steering_sw_icm_size[0x8]; @@ -489,7 +540,9 @@ struct mlx5_ifc_flow_table_prop_layout_bits { u8 reformat_l3_tunnel_to_l2[0x1]; u8 reformat_l2_to_l3_tunnel[0x1]; u8 reformat_and_modify_action[0x1]; - u8 reserved_at_15[0xb]; + u8 reserved_at_15[0x9]; + u8 sw_owner_v2[0x1]; + u8 reserved_at_1f[0x1]; u8 reserved_at_20[0x2]; u8 log_max_ft_size[0x6]; @@ -526,6 +579,15 @@ enum { MLX5_FLEX_PARSER_GTPU_ENABLED = 1 << 11, }; +enum mlx5_ifc_steering_format_version { + MLX5_HW_CONNECTX_5 = 0x0, + MLX5_HW_CONNECTX_6DX = 0x1, +}; + +enum mlx5_ifc_ste_v1_modify_hdr_offset { + MLX5_MODIFY_HEADER_V1_QW_OFFSET = 0x20, +}; + struct mlx5_ifc_cmd_hca_cap_bits { u8 access_other_hca_roce[0x1]; u8 reserved_at_1[0x1e]; @@ -746,7 +808,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 log_bf_reg_size[0x5]; u8 reserved_at_270[0x6]; u8 lag_dct[0x2]; - u8 reserved_at_278[0x3]; + u8 lag_tx_port_affinity[0x1]; + u8 reserved_at_279[0x2]; u8 lag_master[0x1]; u8 num_lag_ports[0x4]; @@ -837,7 +900,8 @@ struct mlx5_ifc_cmd_hca_cap_bits { u8 general_obj_types[0x40]; - u8 reserved_at_440[0x8]; + u8 reserved_at_440[0x4]; + u8 steering_format_version[0x4]; u8 create_qp_start_hint[0x18]; u8 reserved_at_460[0x10]; @@ -1029,6 +1093,7 @@ union mlx5_ifc_hca_cap_union_bits { struct mlx5_ifc_flow_table_eswitch_cap_bits flow_table_eswitch_cap; struct mlx5_ifc_device_mem_cap_bits device_mem_cap; struct mlx5_ifc_odp_cap_bits odp_cap; + struct mlx5_ifc_roce_cap_bits roce_caps; u8 reserved_at_0[0x8000]; }; @@ -1064,6 +1129,7 @@ enum mlx5_cap_type { enum { MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE = 0x0 << 1, + MLX5_SET_HCA_CAP_OP_MOD_ROCE = 0x4 << 1, MLX5_SET_HCA_CAP_OP_MOD_NIC_FLOW_TABLE = 0x7 << 1, MLX5_SET_HCA_CAP_OP_MOD_ESW_FLOW_TABLE = 0x8 << 1, MLX5_SET_HCA_CAP_OP_MOD_DEVICE_MEMORY = 0xf << 1, @@ -1314,6 
+1380,163 @@ struct mlx5_ifc_ste_modify_packet_bits { u8 miss_rank[0x2]; }; +struct mlx5_ifc_ste_single_action_flow_tag_v1_bits { + u8 action_id[0x8]; + u8 flow_tag[0x18]; +}; + +struct mlx5_ifc_ste_single_action_modify_list_v1_bits { + u8 action_id[0x8]; + u8 num_of_modify_actions[0x8]; + u8 modify_actions_ptr[0x10]; +}; + +struct mlx5_ifc_ste_single_action_remove_header_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 reserved_at_10[0x2]; + u8 end_anchor[0x6]; + u8 reserved_at_18[0x4]; + u8 decap[0x1]; + u8 vni_to_cqe[0x1]; + u8 qos_profile[0x2]; +}; + +struct mlx5_ifc_ste_single_action_remove_header_size_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 outer_l4_remove[0x1]; + u8 reserved_at_11[0x1]; + u8 start_offset[0x7]; + u8 reserved_at_18[0x1]; + u8 remove_size[0x6]; +}; + +struct mlx5_ifc_ste_double_action_copy_v1_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 reserved_at_20[0x8]; + u8 source_dw_offset[0x8]; + u8 reserved_at_30[0x2]; + u8 source_right_shifter[0x6]; + u8 reserved_at_38[0x8]; +}; + +struct mlx5_ifc_ste_double_action_set_v1_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_ste_double_action_add_v1_bits { + u8 action_id[0x8]; + u8 destination_dw_offset[0x8]; + u8 reserved_at_10[0x2]; + u8 destination_left_shifter[0x6]; + u8 reserved_at_18[0x2]; + u8 destination_length[0x6]; + + u8 add_value[0x20]; +}; + +struct mlx5_ifc_ste_double_action_insert_with_inline_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 start_offset[0x7]; + u8 reserved_at_17[0x9]; + + u8 inline_data[0x20]; +}; + +struct mlx5_ifc_ste_double_action_insert_with_ptr_v1_bits { + u8 action_id[0x8]; + u8 reserved_at_8[0x2]; + u8 start_anchor[0x6]; + u8 start_offset[0x7]; + u8 size[0x6]; + u8 attributes[0x3]; + + u8 pointer[0x20]; +}; + +struct mlx5_ifc_ste_double_action_modify_action_list_v1_bits { + u8 action_id[0x8]; + u8 modify_actions_pattern_pointer[0x18]; + + u8 number_of_modify_actions[0x8]; + u8 modify_actions_argument_pointer[0x18]; +}; + +struct mlx5_ifc_ste_match_bwc_v1_bits { + u8 entry_format[0x8]; + u8 counter_id[0x18]; + + u8 miss_address_63_48[0x10]; + u8 match_definer_ctx_idx[0x8]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 reserved_at_5a[0x1]; + u8 match_polarity[0x1]; + u8 reparse[0x1]; + u8 reserved_at_5d[0x3]; + + u8 next_table_base_63_48[0x10]; + u8 hash_definer_ctx_idx[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 hash_type[0x2]; + u8 hash_after_actions[0x1]; + u8 reserved_at_9e[0x2]; + + u8 byte_mask[0x10]; + u8 next_entry_format[0x1]; + u8 mask_mode[0x1]; + u8 gvmi[0xe]; + + u8 action[0x40]; +}; + +struct mlx5_ifc_ste_mask_and_match_v1_bits { + u8 entry_format[0x8]; + u8 counter_id[0x18]; + + u8 miss_address_63_48[0x10]; + u8 match_definer_ctx_idx[0x8]; + u8 miss_address_39_32[0x8]; + + u8 miss_address_31_6[0x1a]; + u8 reserved_at_5a[0x1]; + u8 match_polarity[0x1]; + u8 reparse[0x1]; + u8 reserved_at_5d[0x3]; + + u8 next_table_base_63_48[0x10]; + u8 hash_definer_ctx_idx[0x8]; + u8 next_table_base_39_32_size[0x8]; + + u8 next_table_base_31_5_size[0x1b]; + u8 hash_type[0x2]; + u8 
hash_after_actions[0x1]; + u8 reserved_at_9e[0x2]; + + u8 action[0x60]; +}; + struct mlx5_ifc_ste_eth_l2_src_bits { u8 smac_47_16[0x20]; @@ -1347,6 +1570,39 @@ struct mlx5_ifc_ste_eth_l2_src_bits { u8 second_vlan_id[0xc]; }; +struct mlx5_ifc_ste_eth_l2_src_v1_bits { + u8 reserved_at_0[0x1]; + u8 sx_sniffer[0x1]; + u8 functional_loopback[0x1]; + u8 ip_fragmented[0x1]; + u8 qp_type[0x2]; + u8 encapsulation_type[0x2]; + u8 port[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + + u8 smac_47_16[0x20]; + + u8 smac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 reserved_at_60[0x6]; + u8 tcp_syn[0x1]; + u8 reserved_at_67[0x3]; + u8 force_loopback[0x1]; + u8 l2_ok[0x1]; + u8 l3_ok[0x1]; + u8 l4_ok[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_id[0xc]; +}; + struct mlx5_ifc_ste_eth_l2_dst_bits { u8 dmac_47_16[0x20]; @@ -1380,6 +1636,39 @@ struct mlx5_ifc_ste_eth_l2_dst_bits { u8 second_vlan_id[0xc]; }; +struct mlx5_ifc_ste_eth_l2_dst_v1_bits { + u8 reserved_at_0[0x1]; + u8 sx_sniffer[0x1]; + u8 functional_lb[0x1]; + u8 ip_fragmented[0x1]; + u8 qp_type[0x2]; + u8 encapsulation_type[0x2]; + u8 port[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 reserved_at_60[0x6]; + u8 tcp_syn[0x1]; + u8 reserved_at_67[0x3]; + u8 force_lb[0x1]; + u8 l2_ok[0x1]; + u8 l3_ok[0x1]; + u8 l4_ok[0x1]; + u8 second_vlan_qualifier[0x2]; + u8 second_priority[0x3]; + u8 second_cfi[0x1]; + u8 second_vlan_id[0xc]; +}; + struct mlx5_ifc_ste_eth_l2_src_dst_bits { u8 dmac_47_16[0x20]; @@ -1401,6 +1690,26 @@ struct mlx5_ifc_ste_eth_l2_src_dst_bits { u8 first_vlan_id[0xc]; }; +struct mlx5_ifc_ste_eth_l2_src_dst_v1_bits { + u8 dmac_47_16[0x20]; + + u8 smac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 reserved_at_50[0x2]; + u8 functional_lb[0x1]; + u8 reserved_at_53[0x5]; + u8 port[0x2]; + u8 l3_type[0x2]; + u8 reserved_at_5c[0x2]; + u8 first_vlan_qualifier[0x2]; + + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; + u8 smac_15_0[0x10]; +}; + struct mlx5_ifc_ste_eth_l3_ipv4_5_tuple_bits { u8 destination_address[0x20]; @@ -1428,6 +1737,32 @@ struct mlx5_ifc_ste_eth_l3_ipv4_5_tuple_bits { u8 protocol[0x8]; }; +struct mlx5_ifc_ste_eth_l3_ipv4_5_tuple_v1_bits { + u8 source_address[0x20]; + + u8 destination_address[0x20]; + + u8 source_port[0x10]; + u8 destination_port[0x10]; + + u8 reserved_at_60[0x4]; + u8 l4_ok[0x1]; + u8 l3_ok[0x1]; + u8 fragmented[0x1]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 dscp[0x6]; + u8 ecn[0x2]; + u8 protocol[0x8]; +}; + struct mlx5_ifc_ste_eth_l3_ipv6_dst_bits { u8 dst_ip_127_96[0x20]; @@ -1460,6 +1795,27 @@ struct mlx5_ifc_ste_eth_l2_tnl_bits { u8 first_vlan_id[0xc]; }; +struct mlx5_ifc_ste_eth_l2_tnl_v1_bits { + u8 l2_tunneling_network_id[0x20]; + + u8 dmac_47_16[0x20]; + + u8 dmac_15_0[0x10]; + u8 l3_ethertype[0x10]; + + u8 reserved_at_60[0x3]; + u8 ip_fragmented[0x1]; + u8 reserved_at_64[0x2]; + u8 encp_type[0x2]; + u8 reserved_at_68[0x2]; + u8 l3_type[0x2]; + u8 l4_type[0x2]; + u8 first_vlan_qualifier[0x2]; + u8 first_priority[0x3]; + u8 first_cfi[0x1]; + u8 first_vlan_id[0xc]; +}; + struct mlx5_ifc_ste_eth_l3_ipv6_src_bits { 
u8 src_ip_127_96[0x20]; @@ -1487,6 +1843,23 @@ struct mlx5_ifc_ste_eth_l3_ipv4_misc_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_ste_eth_l3_ipv4_misc_v1_bits { + u8 identification[0x10]; + u8 flags[0x3]; + u8 fragment_offset[0xd]; + + u8 total_length[0x10]; + u8 checksum[0x10]; + + u8 version[0x4]; + u8 ihl[0x4]; + u8 time_to_live[0x8]; + u8 reserved_at_50[0x10]; + + u8 reserved_at_60[0x1c]; + u8 voq_internal_prio[0x4]; +}; + struct mlx5_ifc_ste_eth_l4_bits { u8 fragmented[0x1]; u8 first_fragment[0x1]; @@ -1518,6 +1891,37 @@ struct mlx5_ifc_ste_eth_l4_bits { u8 flow_label[0x14]; }; +struct mlx5_ifc_ste_eth_l4_v1_bits { + u8 ipv6_version[0x4]; + u8 reserved_at_4[0x4]; + u8 dscp[0x6]; + u8 ecn[0x2]; + u8 ipv6_hop_limit[0x8]; + u8 protocol[0x8]; + + u8 src_port[0x10]; + u8 dst_port[0x10]; + + u8 first_fragment[0x1]; + u8 reserved_at_41[0xb]; + u8 flow_label[0x14]; + + u8 tcp_data_offset[0x4]; + u8 l4_ok[0x1]; + u8 l3_ok[0x1]; + u8 fragmented[0x1]; + u8 tcp_ns[0x1]; + u8 tcp_cwr[0x1]; + u8 tcp_ece[0x1]; + u8 tcp_urg[0x1]; + u8 tcp_ack[0x1]; + u8 tcp_psh[0x1]; + u8 tcp_rst[0x1]; + u8 tcp_syn[0x1]; + u8 tcp_fin[0x1]; + u8 ipv6_paylen[0x10]; +}; + struct mlx5_ifc_ste_eth_l4_misc_bits { u8 checksum[0x10]; u8 length[0x10]; @@ -1530,6 +1934,18 @@ struct mlx5_ifc_ste_eth_l4_misc_bits { u8 window_size[0x10]; }; +struct mlx5_ifc_ste_eth_l4_misc_v1_bits { + u8 window_size[0x10]; + u8 urgent_pointer[0x10]; + + u8 ack_num[0x20]; + + u8 seq_num[0x20]; + + u8 length[0x10]; + u8 checksum[0x10]; +}; + struct mlx5_ifc_ste_mpls_bits { u8 mpls0_label[0x14]; u8 mpls0_exp[0x3]; @@ -1553,6 +1969,30 @@ struct mlx5_ifc_ste_mpls_bits { u8 mpls0_qualifier[0x1]; }; +struct mlx5_ifc_ste_mpls_v1_bits { + u8 reserved_at_0[0x15]; + u8 mpls_ok[0x1]; + u8 mpls4_s_bit[0x1]; + u8 mpls4_qualifier[0x1]; + u8 mpls3_s_bit[0x1]; + u8 mpls3_qualifier[0x1]; + u8 mpls2_s_bit[0x1]; + u8 mpls2_qualifier[0x1]; + u8 mpls1_s_bit[0x1]; + u8 mpls1_qualifier[0x1]; + u8 mpls0_s_bit[0x1]; + u8 mpls0_qualifier[0x1]; + + u8 mpls0_label[0x14]; + u8 mpls0_exp[0x3]; + u8 mpls0_s_bos[0x1]; + u8 mpls0_ttl[0x8]; + + u8 mpls1_label[0x20]; + + u8 mpls2_label[0x20]; +}; + struct mlx5_ifc_ste_register_0_bits { u8 register_0_h[0x20]; @@ -1593,6 +2033,25 @@ struct mlx5_ifc_ste_gre_bits { u8 seq_num[0x20]; }; +struct mlx5_ifc_ste_gre_v1_bits { + u8 gre_c_present[0x1]; + u8 reserved_at_1[0x1]; + u8 gre_k_present[0x1]; + u8 gre_s_present[0x1]; + u8 strict_src_route[0x1]; + u8 recur[0x3]; + u8 flags[0x5]; + u8 version[0x3]; + u8 gre_protocol[0x10]; + + u8 reserved_at_20[0x20]; + + u8 gre_key_h[0x18]; + u8 gre_key_l[0x8]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_ste_flex_parser_0_bits { u8 parser_3_label[0x14]; u8 parser_3_exp[0x3]; @@ -1677,6 +2136,35 @@ struct mlx5_ifc_ste_src_gvmi_qp_bits { u8 reserved_at_60[0x20]; }; +struct mlx5_ifc_ste_src_gvmi_qp_v1_bits { + u8 loopback_synd[0x8]; + u8 reserved_at_8[0x7]; + u8 functional_lb[0x1]; + u8 source_gvmi[0x10]; + + u8 force_lb[0x1]; + u8 reserved_at_21[0x1]; + u8 source_is_requestor[0x1]; + u8 reserved_at_23[0x5]; + u8 source_qp[0x18]; + + u8 reserved_at_40[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_ste_icmp_v1_bits { + u8 icmp_payload_data[0x20]; + + u8 icmp_header_data[0x20]; + + u8 icmp_type[0x8]; + u8 icmp_code[0x8]; + u8 reserved_at_50[0x10]; + + u8 reserved_at_60[0x20]; +}; + struct mlx5_ifc_set_action_in_bits { u8 action_type[0x4]; u8 field[0xc]; @@ -1841,6 +2329,7 @@ struct mlx5_ifc_alloc_flow_counter_out_bits { enum { MLX5_OBJ_TYPE_FLOW_METER = 0x000a, + 
MLX5_OBJ_TYPE_FLOW_SAMPLER = 0x0020, }; struct mlx5_ifc_general_obj_in_cmd_hdr_bits { @@ -1898,6 +2387,36 @@ struct mlx5_ifc_query_flow_meter_out_bits { struct mlx5_ifc_flow_meter_bits obj; }; +struct mlx5_ifc_flow_sampler_bits { + u8 modify_field_select[0x40]; + + u8 table_type[0x8]; + u8 level[0x8]; + u8 reserved_at_50[0xf]; + u8 ignore_flow_level[0x1]; + + u8 sample_ratio[0x20]; + + u8 reserved_at_80[0x8]; + u8 sample_table_id[0x18]; + + u8 reserved_at_a0[0x8]; + u8 default_table_id[0x18]; + + u8 sw_steering_icm_address_rx[0x40]; + u8 sw_steering_icm_address_tx[0x40]; +}; + +struct mlx5_ifc_create_flow_sampler_in_bits { + struct mlx5_ifc_general_obj_in_cmd_hdr_bits hdr; + struct mlx5_ifc_flow_sampler_bits sampler; +}; + +struct mlx5_ifc_query_flow_sampler_out_bits { + struct mlx5_ifc_general_obj_out_cmd_hdr_bits hdr; + struct mlx5_ifc_flow_sampler_bits obj; +}; + struct mlx5_ifc_esw_vport_context_bits { u8 reserved_at_0[0x3]; u8 vport_svlan_strip[0x1]; @@ -2131,6 +2650,18 @@ struct mlx5_ifc_qpc_bits { u8 dbr_umem_id[0x20]; }; +struct mlx5_ifc_create_tir_out_bits { + u8 status[0x8]; + u8 icm_address_63_40[0x18]; + + u8 syndrome[0x20]; + + u8 icm_address_39_32[0x8]; + u8 tirn[0x18]; + + u8 icm_address_31_0[0x20]; +}; + struct mlx5_ifc_create_qp_out_bits { u8 status[0x8]; u8 reserved_at_8[0x18]; @@ -2258,6 +2789,152 @@ struct mlx5_ifc_rst2init_qp_in_bits { u8 reserved_at_800[0x80]; }; +struct mlx5_ifc_query_qp_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + u8 opt_param_mask[0x20]; + + u8 reserved_at_a0[0x20]; + + struct mlx5_ifc_qpc_bits qpc; + + u8 reserved_at_800[0x80]; + + u8 pas[0][0x40]; +}; + +struct mlx5_ifc_query_qp_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 qpn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_tisc_bits { + u8 strict_lag_tx_port_affinity[0x1]; + u8 tls_en[0x1]; + u8 reserved_at_2[0x2]; + u8 lag_tx_port_affinity[0x04]; + + u8 reserved_at_8[0x4]; + u8 prio[0x4]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x100]; + + u8 reserved_at_120[0x8]; + u8 transport_domain[0x18]; + + u8 reserved_at_140[0x8]; + u8 underlay_qpn[0x18]; + + u8 reserved_at_160[0x8]; + u8 pd[0x18]; + + u8 reserved_at_180[0x380]; +}; + +struct mlx5_ifc_query_tis_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; + + struct mlx5_ifc_tisc_bits tis_context; +}; + +struct mlx5_ifc_query_tis_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 tisn[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_lagc_bits { + u8 reserved_at_0[0x1d]; + u8 lag_state[0x3]; + + u8 reserved_at_20[0x14]; + u8 tx_remap_affinity_2[0x4]; + u8 reserved_at_38[0x4]; + u8 tx_remap_affinity_1[0x4]; +}; + +struct mlx5_ifc_query_lag_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + struct mlx5_ifc_lagc_bits ctx; +}; + +struct mlx5_ifc_query_lag_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_tis_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +struct mlx5_ifc_modify_tis_bitmask_bits { + u8 reserved_at_0[0x20]; + + u8 reserved_at_20[0x1d]; + u8 lag_tx_port_affinity[0x1]; + u8 strict_lag_tx_port_affinity[0x1]; + 
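
The flow-sampler layouts above are consumed through the general-object command path. A hedged sketch of roughly what the new dr_devx_create_flow_sampler() helper (declared later in mlx5dv_dr.h) issues; DEVX_SET()/DEVX_ADDR_OF() are the provider-internal packing macros and `attr` stands for a pre-filled struct dr_devx_flow_sampler_attr:

```c
/* Hedged sketch only: provider-internal DEVX macros and a filled
 * 'attr' (struct dr_devx_flow_sampler_attr) are assumed. */
uint32_t in[DEVX_ST_SZ_DW(create_flow_sampler_in)] = {};
uint32_t out[DEVX_ST_SZ_DW(general_obj_out_cmd_hdr)] = {};
void *sampler;

DEVX_SET(general_obj_in_cmd_hdr, in, opcode,
	 MLX5_CMD_OP_CREATE_GENERAL_OBJECT);
DEVX_SET(general_obj_in_cmd_hdr, in, obj_type, MLX5_OBJ_TYPE_FLOW_SAMPLER);

sampler = DEVX_ADDR_OF(create_flow_sampler_in, in, sampler);
DEVX_SET(flow_sampler, sampler, table_type, attr->table_type);
DEVX_SET(flow_sampler, sampler, level, attr->level);
DEVX_SET(flow_sampler, sampler, sample_ratio, attr->sample_ratio);
DEVX_SET(flow_sampler, sampler, sample_table_id, attr->sample_table_id);
DEVX_SET(flow_sampler, sampler, default_table_id,
	 attr->default_next_table_id);

struct mlx5dv_devx_obj *obj =
	mlx5dv_devx_obj_create(ctx, in, sizeof(in), out, sizeof(out));
```
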
u8 prio[0x1]; +}; + +struct mlx5_ifc_modify_tis_in_bits { + u8 opcode[0x10]; + u8 uid[0x10]; + + u8 reserved_at_20[0x10]; + u8 op_mod[0x10]; + + u8 reserved_at_40[0x8]; + u8 tisn[0x18]; + + u8 reserved_at_60[0x20]; + + struct mlx5_ifc_modify_tis_bitmask_bits bitmask; + + u8 reserved_at_c0[0x40]; + + struct mlx5_ifc_tisc_bits ctx; +}; + enum roce_version { MLX5_ROCE_VERSION_1 = 0, MLX5_ROCE_VERSION_2 = 2, @@ -2332,48 +3009,126 @@ struct mlx5_ifc_dr_action_hw_copy_bits { u8 reserved_at_38[0x8]; }; -enum { - MLX5_DR_ACTION_MDFY_HW_FLD_L2_0 = 0, - MLX5_DR_ACTION_MDFY_HW_FLD_L2_1 = 1, - MLX5_DR_ACTION_MDFY_HW_FLD_L2_2 = 2, - MLX5_DR_ACTION_MDFY_HW_FLD_L3_0 = 3, - MLX5_DR_ACTION_MDFY_HW_FLD_L3_1 = 4, - MLX5_DR_ACTION_MDFY_HW_FLD_L3_2 = 5, - MLX5_DR_ACTION_MDFY_HW_FLD_L3_3 = 6, - MLX5_DR_ACTION_MDFY_HW_FLD_L3_4 = 7, - MLX5_DR_ACTION_MDFY_HW_FLD_L4_0 = 8, - MLX5_DR_ACTION_MDFY_HW_FLD_L4_1 = 9, - MLX5_DR_ACTION_MDFY_HW_FLD_MPLS = 10, - MLX5_DR_ACTION_MDFY_HW_FLD_L2_TNL_0 = 11, - MLX5_DR_ACTION_MDFY_HW_FLD_REG_0 = 12, - MLX5_DR_ACTION_MDFY_HW_FLD_REG_1 = 13, - MLX5_DR_ACTION_MDFY_HW_FLD_REG_2 = 14, - MLX5_DR_ACTION_MDFY_HW_FLD_REG_3 = 15, - MLX5_DR_ACTION_MDFY_HW_FLD_L4_2 = 16, - MLX5_DR_ACTION_MDFY_HW_FLD_FLEX_0 = 17, - MLX5_DR_ACTION_MDFY_HW_FLD_FLEX_1 = 18, - MLX5_DR_ACTION_MDFY_HW_FLD_FLEX_2 = 19, - MLX5_DR_ACTION_MDFY_HW_FLD_FLEX_3 = 20, - MLX5_DR_ACTION_MDFY_HW_FLD_L2_TNL_1 = 21, - MLX5_DR_ACTION_MDFY_HW_FLD_METADATA = 22, - MLX5_DR_ACTION_MDFY_HW_FLD_RESERVED = 23, +struct mlx5_ifc_create_flow_group_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x60]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x1f40]; }; -enum { - MLX5_DR_ACTION_MDFY_HW_OP_COPY = 0x1, - MLX5_DR_ACTION_MDFY_HW_OP_SET = 0x2, - MLX5_DR_ACTION_MDFY_HW_OP_ADD = 0x3, +struct mlx5_ifc_create_flow_group_out_bits { + u8 status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x8]; + u8 group_id[0x18]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_dest_format_bits { + u8 destination_type[0x8]; + u8 destination_id[0x18]; + + u8 reserved_at_20[0x1]; + u8 packet_reformat[0x1]; + u8 reserved_at_22[0x1e]; +}; + +struct mlx5_ifc_extended_dest_format_bits { + struct mlx5_ifc_dest_format_bits destination_entry; + + u8 packet_reformat_id[0x20]; + + u8 reserved_at_60[0x20]; +}; + +struct mlx5_ifc_flow_counter_list_bits { + u8 flow_counter_id[0x20]; + + u8 reserved_at_20[0x20]; +}; + +union mlx5_ifc_dest_format_flow_counter_list_auto_bits { + struct mlx5_ifc_dest_format_bits dest_format; + struct mlx5_ifc_flow_counter_list_bits flow_counter_list; + u8 reserved_at_0[0x40]; +}; + +struct mlx5_ifc_flow_context_bits { + u8 reserved_at_00[0x20]; + + u8 group_id[0x20]; + + u8 reserved_at_40[0x8]; + u8 flow_tag[0x18]; + + u8 reserved_at_60[0x10]; + u8 action[0x10]; + + u8 extended_destination[0x1]; + u8 reserved_at_81[0x7]; + u8 destination_list_size[0x18]; + + u8 reserved_at_a0[0x8]; + u8 flow_counter_list_size[0x18]; + + u8 reserved_at_c0[0x1740]; + + union mlx5_ifc_dest_format_flow_counter_list_auto_bits destination[0]; +}; + +struct mlx5_ifc_set_fte_in_bits { + u8 opcode[0x10]; + u8 reserved_at_10[0x10]; + + u8 reserved_at_20[0x60]; + + u8 table_type[0x8]; + u8 reserved_at_88[0x18]; + + u8 reserved_at_a0[0x8]; + u8 table_id[0x18]; + + u8 reserved_at_c0[0x40]; + u8 flow_index[0x20]; + + u8 reserved_at_120[0xe0]; + struct mlx5_ifc_flow_context_bits flow_context; +}; + +struct mlx5_ifc_set_fte_out_bits { + u8 
status[0x8]; + u8 reserved_at_8[0x18]; + + u8 syndrome[0x20]; + + u8 reserved_at_40[0x40]; +}; + +enum dr_devx_flow_dest_type { + MLX5_FLOW_DEST_TYPE_VPORT = 0x0, + MLX5_FLOW_DEST_TYPE_TIR = 0x2, + + MLX5_FLOW_DEST_TYPE_COUNTER = 0x100, }; enum { - MLX5_DR_ACTION_MDFY_HW_HDR_L3_NONE = 0x0, - MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV4 = 0x1, - MLX5_DR_ACTION_MDFY_HW_HDR_L3_IPV6 = 0x2, + MLX5_FLOW_CONTEXT_ACTION_FWD_DEST = 0x4, + MLX5_FLOW_CONTEXT_ACTION_COUNT = 0x8, }; enum { - MLX5_DR_ACTION_MDFY_HW_HDR_L4_NONE = 0x0, - MLX5_DR_ACTION_MDFY_HW_HDR_L4_TCP = 0x1, - MLX5_DR_ACTION_MDFY_HW_HDR_L4_UDP = 0x2, + MLX5_QPC_PAGE_OFFSET_QUANTA = 64, }; #endif /* MLX5_IFC_H */ diff --git a/providers/mlx5/mlx5dv.h b/providers/mlx5/mlx5dv.h index 27a7170..07d3c3d 100644 --- a/providers/mlx5/mlx5dv.h +++ b/providers/mlx5/mlx5dv.h @@ -81,6 +81,7 @@ enum mlx5dv_context_comp_mask { MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS = 1 << 6, MLX5DV_CONTEXT_MASK_DC_ODP_CAPS = 1 << 7, MLX5DV_CONTEXT_MASK_HCA_CORE_CLOCK = 1 << 8, + MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS = 1 << 9, }; struct mlx5dv_cqe_comp_caps { @@ -133,6 +134,7 @@ struct mlx5dv_context { uint32_t flow_action_flags; /* use enum mlx5dv_flow_action_cap_flags */ uint32_t dc_odp_caps; /* use enum ibv_odp_transport_cap_bits */ void *hca_core_clock; + uint8_t num_lag_ports; }; enum mlx5dv_context_flags { @@ -335,6 +337,7 @@ enum mlx5dv_flow_action_type { MLX5DV_FLOW_ACTION_TAG, MLX5DV_FLOW_ACTION_DEST_DEVX, MLX5DV_FLOW_ACTION_COUNTERS_DEVX, + MLX5DV_FLOW_ACTION_DEFAULT_MISS, }; struct mlx5dv_flow_action_attr { @@ -1438,6 +1441,7 @@ enum mlx5dv_dr_domain_type { enum mlx5dv_dr_domain_sync_flags { MLX5DV_DR_DOMAIN_SYNC_FLAGS_SW = 1 << 0, MLX5DV_DR_DOMAIN_SYNC_FLAGS_HW = 1 << 1, + MLX5DV_DR_DOMAIN_SYNC_FLAGS_MEM = 1 << 2, }; struct mlx5dv_dr_flow_meter_attr { @@ -1448,6 +1452,14 @@ struct mlx5dv_dr_flow_meter_attr { void *flow_meter_parameter; }; +struct mlx5dv_dr_flow_sampler_attr { + uint32_t sample_ratio; + struct mlx5dv_dr_table *default_next_table; + uint32_t num_sample_actions; + struct mlx5dv_dr_action **sample_actions; + __be64 action; +}; + struct mlx5dv_dr_domain * mlx5dv_dr_domain_create(struct ibv_context *ctx, enum mlx5dv_dr_domain_type type); @@ -1456,6 +1468,9 @@ int mlx5dv_dr_domain_destroy(struct mlx5dv_dr_domain *domain); int mlx5dv_dr_domain_sync(struct mlx5dv_dr_domain *domain, uint32_t flags); +void mlx5dv_dr_domain_set_reclaim_device_memory(struct mlx5dv_dr_domain *dmn, + bool enable); + struct mlx5dv_dr_table * mlx5dv_dr_table_create(struct mlx5dv_dr_domain *domain, uint32_t level); @@ -1491,8 +1506,36 @@ struct mlx5dv_dr_action * mlx5dv_dr_action_create_dest_vport(struct mlx5dv_dr_domain *domain, uint32_t vport); +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_dest_devx_tir(struct mlx5dv_devx_obj *devx_obj); + +enum mlx5dv_dr_action_dest_type { + MLX5DV_DR_ACTION_DEST, + MLX5DV_DR_ACTION_DEST_REFORMAT, +}; + +struct mlx5dv_dr_action_dest_reformat { + struct mlx5dv_dr_action *reformat; + struct mlx5dv_dr_action *dest; +}; + +struct mlx5dv_dr_action_dest_attr { + enum mlx5dv_dr_action_dest_type type; + union { + struct mlx5dv_dr_action *dest; + struct mlx5dv_dr_action_dest_reformat *dest_reformat; + }; +}; + +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_dest_array(struct mlx5dv_dr_domain *domain, + size_t num_dest, + struct mlx5dv_dr_action_dest_attr *dests[]); + struct mlx5dv_dr_action *mlx5dv_dr_action_create_drop(void); +struct mlx5dv_dr_action *mlx5dv_dr_action_create_default_miss(void); + struct mlx5dv_dr_action 
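
The new dest-array action packs several destinations, optionally each with its own packet reformat, into a single steering action. A minimal hedged usage sketch; `domain`, `ft_action`, `tir_action` and `reformat` are assumed to have been created earlier with the existing mlx5dv_dr_action_create_* calls:

```c
/* Fan a packet out to a flow table and to a TIR, reformatting only
 * the TIR copy. */
struct mlx5dv_dr_action_dest_reformat tir_reformat = {
	.reformat = reformat,
	.dest = tir_action,
};
struct mlx5dv_dr_action_dest_attr to_ft = {
	.type = MLX5DV_DR_ACTION_DEST,
	.dest = ft_action,
};
struct mlx5dv_dr_action_dest_attr to_tir = {
	.type = MLX5DV_DR_ACTION_DEST_REFORMAT,
	.dest_reformat = &tir_reformat,
};
struct mlx5dv_dr_action_dest_attr *dests[] = { &to_ft, &to_tir };

struct mlx5dv_dr_action *multi_dest =
	mlx5dv_dr_action_create_dest_array(domain, 2, dests);
```
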
*mlx5dv_dr_action_create_tag(uint32_t tag_value); struct mlx5dv_dr_action * @@ -1518,6 +1561,9 @@ int mlx5dv_dr_action_modify_flow_meter(struct mlx5dv_dr_action *action, struct mlx5dv_dr_flow_meter_attr *attr, __be64 modify_field_select); +struct mlx5dv_dr_action * +mlx5dv_dr_action_create_flow_sampler(struct mlx5dv_dr_flow_sampler_attr *attr); + int mlx5dv_dr_action_destroy(struct mlx5dv_dr_action *action); int mlx5dv_dump_dr_domain(FILE *fout, struct mlx5dv_dr_domain *domain); @@ -1536,6 +1582,12 @@ struct mlx5dv_pp *mlx5dv_pp_alloc(struct ibv_context *context, void mlx5dv_pp_free(struct mlx5dv_pp *pp); +int mlx5dv_query_qp_lag_port(struct ibv_qp *qp, + uint8_t *port_num, + uint8_t *active_port_num); + +int mlx5dv_modify_qp_lag_port(struct ibv_qp *qp, uint8_t port_num); + #ifdef __cplusplus } #endif diff --git a/providers/mlx5/mlx5dv_dr.h b/providers/mlx5/mlx5dv_dr.h index dc99075..091a797 100644 --- a/providers/mlx5/mlx5dv_dr.h +++ b/providers/mlx5/mlx5dv_dr.h @@ -35,6 +35,7 @@ #include #include +#include #include #include "mlx5dv.h" #include "mlx5_ifc.h" @@ -94,47 +95,7 @@ dr_icm_next_higher_chunk(enum dr_icm_chunk_size chunk) } enum dr_ste_lu_type { - DR_STE_LU_TYPE_NOP = 0x00, - DR_STE_LU_TYPE_SRC_GVMI_AND_QP = 0x05, - DR_STE_LU_TYPE_ETHL2_TUNNELING_I = 0x0a, - DR_STE_LU_TYPE_ETHL2_DST_O = 0x06, - DR_STE_LU_TYPE_ETHL2_DST_I = 0x07, - DR_STE_LU_TYPE_ETHL2_DST_D = 0x1b, - DR_STE_LU_TYPE_ETHL2_SRC_O = 0x08, - DR_STE_LU_TYPE_ETHL2_SRC_I = 0x09, - DR_STE_LU_TYPE_ETHL2_SRC_D = 0x1c, - DR_STE_LU_TYPE_ETHL2_SRC_DST_O = 0x36, - DR_STE_LU_TYPE_ETHL2_SRC_DST_I = 0x37, - DR_STE_LU_TYPE_ETHL2_SRC_DST_D = 0x38, - DR_STE_LU_TYPE_ETHL3_IPV6_DST_O = 0x0d, - DR_STE_LU_TYPE_ETHL3_IPV6_DST_I = 0x0e, - DR_STE_LU_TYPE_ETHL3_IPV6_DST_D = 0x1e, - DR_STE_LU_TYPE_ETHL3_IPV6_SRC_O = 0x0f, - DR_STE_LU_TYPE_ETHL3_IPV6_SRC_I = 0x10, - DR_STE_LU_TYPE_ETHL3_IPV6_SRC_D = 0x1f, - DR_STE_LU_TYPE_ETHL3_IPV4_5_TUPLE_O = 0x11, - DR_STE_LU_TYPE_ETHL3_IPV4_5_TUPLE_I = 0x12, - DR_STE_LU_TYPE_ETHL3_IPV4_5_TUPLE_D = 0x20, - DR_STE_LU_TYPE_ETHL3_IPV4_MISC_O = 0x29, - DR_STE_LU_TYPE_ETHL3_IPV4_MISC_I = 0x2a, - DR_STE_LU_TYPE_ETHL3_IPV4_MISC_D = 0x2b, - DR_STE_LU_TYPE_ETHL4_O = 0x13, - DR_STE_LU_TYPE_ETHL4_I = 0x14, - DR_STE_LU_TYPE_ETHL4_D = 0x21, - DR_STE_LU_TYPE_ETHL4_MISC_O = 0x2c, - DR_STE_LU_TYPE_ETHL4_MISC_I = 0x2d, - DR_STE_LU_TYPE_ETHL4_MISC_D = 0x2e, - DR_STE_LU_TYPE_MPLS_FIRST_O = 0x15, - DR_STE_LU_TYPE_MPLS_FIRST_I = 0x24, - DR_STE_LU_TYPE_MPLS_FIRST_D = 0x25, - DR_STE_LU_TYPE_GRE = 0x16, - DR_STE_LU_TYPE_FLEX_PARSER_0 = 0x22, - DR_STE_LU_TYPE_FLEX_PARSER_1 = 0x23, - DR_STE_LU_TYPE_FLEX_PARSER_TNL_HEADER = 0x19, - DR_STE_LU_TYPE_GENERAL_PURPOSE = 0x18, - DR_STE_LU_TYPE_STEERING_REGISTERS_0 = 0x2f, - DR_STE_LU_TYPE_STEERING_REGISTERS_1 = 0x30, - DR_STE_LU_TYPE_DONT_CARE = 0x0f, + DR_STE_LU_TYPE_DONT_CARE = 0x0f, }; enum dr_ste_entry_type { @@ -148,6 +109,7 @@ enum { DR_STE_SIZE_CTRL = 32, DR_STE_SIZE_TAG = 16, DR_STE_SIZE_MASK = 16, + DR_STE_LOG_SIZE = 6, }; enum { @@ -156,6 +118,7 @@ enum { enum { DR_MODIFY_ACTION_SIZE = 8, + DR_MODIFY_ACTION_LOG_SIZE = 3, }; enum dr_matcher_criteria { @@ -181,16 +144,20 @@ enum dr_action_type { DR_ACTION_TYP_MODIFY_HDR, DR_ACTION_TYP_VPORT, DR_ACTION_TYP_METER, + DR_ACTION_TYP_MISS, + DR_ACTION_TYP_SAMPLER, + DR_ACTION_TYP_DEST_ARRAY, DR_ACTION_TYP_MAX, }; struct dr_icm_pool; struct dr_icm_chunk; -struct dr_icm_bucket; +struct dr_icm_buddy_mem; struct dr_ste_htbl; struct dr_match_param; struct dr_devx_caps; struct dr_matcher_rx_tx; +struct dr_ste_ctx; struct dr_data_seg { uint64_t 
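
A minimal sketch of the per-QP LAG port controls declared above; `qp` is assumed to live on a two-port LAG device (mlx5dv_query_device() with MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS reports the port count):

```c
/* Query the configured and currently active LAG ports, then flip the
 * affinity to the other port. */
uint8_t configured, active;

if (!mlx5dv_query_qp_lag_port(qp, &configured, &active)) {
	/* 'active' may differ from 'configured' while a link is down */
	mlx5dv_modify_qp_lag_port(qp, configured == 1 ? 2 : 1);
}
```
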
addr; @@ -239,7 +206,7 @@ struct dr_ste_htbl_ctrl { }; struct dr_ste_htbl { - uint8_t lu_type; + uint16_t lu_type; uint16_t byte_mask; atomic_int refcount; struct dr_icm_chunk *chunk; @@ -273,17 +240,17 @@ struct dr_ste_build { bool inner; bool rx; struct dr_devx_caps *caps; - uint8_t lu_type; + uint16_t lu_type; uint16_t byte_mask; uint8_t bit_mask[DR_STE_SIZE_MASK]; int (*ste_build_tag_func)(struct dr_match_param *spec, struct dr_ste_build *sb, - uint8_t *hw_ste_p); + uint8_t *tag); }; struct dr_ste_htbl *dr_ste_htbl_alloc(struct dr_icm_pool *pool, enum dr_icm_chunk_size chunk_size, - uint8_t lu_type, uint16_t byte_mask); + uint16_t lu_type, uint16_t byte_mask); int dr_ste_htbl_free(struct dr_ste_htbl *htbl); static inline void dr_htbl_put(struct dr_ste_htbl *htbl) @@ -299,29 +266,72 @@ static inline void dr_htbl_get(struct dr_ste_htbl *htbl) /* STE utils */ uint32_t dr_ste_calc_hash_index(uint8_t *hw_ste_p, struct dr_ste_htbl *htbl); -void dr_ste_init(uint8_t *hw_ste_p, uint8_t lu_type, uint8_t entry_type, uint16_t gvmi); -void dr_ste_always_hit_htbl(struct dr_ste *ste, struct dr_ste_htbl *next_htbl); -void dr_ste_set_miss_addr(uint8_t *hw_ste, uint64_t miss_addr); -uint64_t dr_ste_get_miss_addr(uint8_t *hw_ste); -void dr_ste_set_hit_addr(uint8_t *hw_ste, uint64_t icm_addr, uint32_t ht_size); -void dr_ste_always_miss_addr(struct dr_ste *ste, uint64_t miss_addr); +void dr_ste_set_miss_addr(struct dr_ste_ctx *ste_ctx, uint8_t *hw_ste_p, + uint64_t miss_addr); +void dr_ste_set_hit_addr_by_next_htbl(struct dr_ste_ctx *ste_ctx, + uint8_t *hw_ste, + struct dr_ste_htbl *next_htbl); +void dr_ste_set_hit_addr(struct dr_ste_ctx *ste_ctx, uint8_t *hw_ste_p, + uint64_t icm_addr, uint32_t ht_size); void dr_ste_set_bit_mask(uint8_t *hw_ste_p, uint8_t *bit_mask); -bool dr_ste_not_used_ste(struct dr_ste *ste); bool dr_ste_is_last_in_rule(struct dr_matcher_rx_tx *nic_matcher, uint8_t ste_location); -void dr_ste_rx_set_flow_tag(uint8_t *hw_ste_p, uint32_t flow_tag); -void dr_ste_set_counter_id(uint8_t *hw_ste_p, uint32_t ctr_id); -void dr_ste_set_tx_encap(void *hw_ste_p, uint32_t reformat_id, int size, bool encap_l3); -void dr_ste_set_rx_decap(uint8_t *hw_ste_p); -void dr_ste_set_rx_decap_l3(uint8_t *hw_ste_p, bool vlan); -void dr_ste_set_entry_type(uint8_t *hw_ste_p, uint8_t entry_type); -uint8_t dr_ste_get_entry_type(uint8_t *hw_ste_p); -void dr_ste_set_rewrite_actions(uint8_t *hw_ste_p, uint16_t num_of_actions, - uint32_t re_write_index); uint64_t dr_ste_get_icm_addr(struct dr_ste *ste); uint64_t dr_ste_get_mr_addr(struct dr_ste *ste); struct list_head *dr_ste_get_miss_list(struct dr_ste *ste); +struct dr_ste_actions_attr { + uint32_t modify_index; + uint16_t modify_actions; + uint32_t decap_index; + uint16_t decap_actions; + bool decap_with_vlan; + uint64_t final_icm_addr; + uint32_t flow_tag; + uint32_t ctr_id; + uint16_t gvmi; + uint16_t hit_gvmi; + uint32_t reformat_id; + uint32_t reformat_size; +}; + +void dr_ste_set_actions_rx(struct dr_ste_ctx *ste_ctx, + uint8_t *action_type_set, + uint8_t *last_ste, + struct dr_ste_actions_attr *attr, + uint32_t *added_stes); +void dr_ste_set_actions_tx(struct dr_ste_ctx *ste_ctx, + uint8_t *action_type_set, + uint8_t *last_ste, + struct dr_ste_actions_attr *attr, + uint32_t *added_stes); +void dr_ste_set_action_set(struct dr_ste_ctx *ste_ctx, + __be64 *hw_action, + uint8_t hw_field, + uint8_t shifter, + uint8_t length, + uint32_t data); +void dr_ste_set_action_add(struct dr_ste_ctx *ste_ctx, + __be64 *hw_action, + uint8_t hw_field, + uint8_t shifter, + 
uint8_t length, + uint32_t data); +void dr_ste_set_action_copy(struct dr_ste_ctx *ste_ctx, + __be64 *hw_action, + uint8_t dst_hw_field, + uint8_t dst_shifter, + uint8_t dst_len, + uint8_t src_hw_field, + uint8_t src_shifter); +int dr_ste_set_action_decap_l3_list(struct dr_ste_ctx *ste_ctx, + void *data, uint32_t data_sz, + uint8_t *hw_action, uint32_t hw_action_sz, + uint16_t *used_hw_action_num); +const struct dr_ste_action_modify_field * +dr_ste_conv_modify_hdr_sw_field(struct dr_ste_ctx *ste_ctx, uint16_t sw_field); + +struct dr_ste_ctx *dr_ste_get_ctx(uint8_t version); void dr_ste_free(struct dr_ste *ste, struct mlx5dv_dr_matcher *matcher, struct dr_matcher_rx_tx *nic_matcher); @@ -339,8 +349,11 @@ static inline void dr_ste_get(struct dr_ste *ste) atomic_fetch_add(&ste->refcount, 1); } -void dr_ste_set_hit_addr_by_next_htbl(uint8_t *hw_ste, - struct dr_ste_htbl *next_htbl); +static inline bool dr_ste_is_not_used(struct dr_ste *ste) +{ + return !atomic_load(&ste->refcount); +} + bool dr_ste_equal_tag(void *src, void *dst); int dr_ste_create_next_htbl(struct mlx5dv_dr_matcher *matcher, struct dr_matcher_rx_tx *nic_matcher, @@ -357,71 +370,92 @@ int dr_ste_build_ste_arr(struct mlx5dv_dr_matcher *matcher, struct dr_matcher_rx_tx *nic_matcher, struct dr_match_param *value, uint8_t *ste_arr); -int dr_ste_build_eth_l2_src_des(struct dr_ste_build *builder, - struct dr_match_param *mask, - bool inner, bool rx); -void dr_ste_build_eth_l3_ipv4_5_tuple(struct dr_ste_build *sb, +void dr_ste_build_eth_l2_src_dst(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_eth_l3_ipv4_5_tuple(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_eth_l3_ipv4_misc(struct dr_ste_build *sb, +void dr_ste_build_eth_l3_ipv4_misc(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_eth_l3_ipv6_dst(struct dr_ste_build *sb, +void dr_ste_build_eth_l3_ipv6_dst(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_eth_l3_ipv6_src(struct dr_ste_build *sb, +void dr_ste_build_eth_l3_ipv6_src(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_eth_l2_src(struct dr_ste_build *sb, - struct dr_match_param *mask, - bool inner, bool rx); -void dr_ste_build_eth_l2_dst(struct dr_ste_build *sb, +void dr_ste_build_eth_l2_src(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_eth_l2_tnl(struct dr_ste_build *sb, +void dr_ste_build_eth_l2_dst(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_ipv6_l3_l4(struct dr_ste_build *sb, +void dr_ste_build_eth_l2_tnl(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_eth_l4_misc(struct dr_ste_build *sb, +void dr_ste_build_eth_ipv6_l3_l4(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_eth_l4_misc(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_gre(struct dr_ste_build *sb, - struct dr_match_param *mask, - bool inner, bool rx); -void dr_ste_build_mpls(struct dr_ste_build 
*sb, +void dr_ste_build_tnl_gre(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_mpls(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_flex_parser_0(struct dr_ste_build *sb, +void dr_ste_build_tnl_mpls(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +int dr_ste_build_icmp(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + struct dr_devx_caps *caps, + bool inner, bool rx); +void dr_ste_build_tnl_vxlan_gpe(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -int dr_ste_build_flex_parser_1(struct dr_ste_build *sb, - struct dr_match_param *mask, - struct dr_devx_caps *caps, - bool inner, bool rx); -void dr_ste_build_flex_parser_tnl_vxlan_gpe(struct dr_ste_build *sb, - struct dr_match_param *mask, - bool inner, bool rx); -void dr_ste_build_flex_parser_tnl_geneve(struct dr_ste_build *sb, - struct dr_match_param *mask, - bool inner, bool rx); -void dr_ste_build_flex_parser_tnl_gtpu(struct dr_ste_build *sb, - struct dr_match_param *mask, - bool inner, bool rx); -void dr_ste_build_general_purpose(struct dr_ste_build *sb, +void dr_ste_build_tnl_geneve(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_tnl_gtpu(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + bool inner, bool rx); +void dr_ste_build_general_purpose(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_register_0(struct dr_ste_build *sb, +void dr_ste_build_register_0(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -void dr_ste_build_register_1(struct dr_ste_build *sb, +void dr_ste_build_register_1(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, struct dr_match_param *mask, bool inner, bool rx); -int dr_ste_build_src_gvmi_qpn(struct dr_ste_build *sb, - struct dr_match_param *mask, - struct dr_devx_caps *caps, - bool inner, bool rx); +void dr_ste_build_src_gvmi_qpn(struct dr_ste_ctx *ste_ctx, + struct dr_ste_build *sb, + struct dr_match_param *mask, + struct dr_devx_caps *caps, + bool inner, bool rx); void dr_ste_build_empty_always_hit(struct dr_ste_build *sb, bool rx); /* Actions utils */ @@ -538,10 +572,10 @@ struct dr_match_misc3 { uint32_t outer_vxlan_gpe_next_protocol:8; uint32_t icmpv4_header_data; uint32_t icmpv6_header_data; - uint32_t icmpv6_code:8; - uint32_t icmpv6_type:8; - uint32_t icmpv4_code:8; - uint32_t icmpv4_type:8; + uint8_t icmpv6_code; + uint8_t icmpv6_type; + uint8_t icmpv4_code; + uint8_t icmpv4_type; uint32_t gtpu_teid; uint32_t gtpu_msg_type:8; uint32_t gtpu_flags:3; @@ -555,9 +589,9 @@ struct dr_match_param { struct dr_match_misc3 misc3; }; -#define DR_MASK_IS_FLEX_PARSER_ICMPV4_SET(_misc3) (_misc3->icmpv4_type || \ - _misc3->icmpv4_code || \ - _misc3->icmpv4_header_data) +#define DR_MASK_IS_ICMPV4_SET(_misc3) ((_misc3)->icmpv4_type || \ + (_misc3)->icmpv4_code || \ + (_misc3)->icmpv4_header_data) struct dr_esw_caps { uint64_t drop_icm_address_rx; @@ -565,6 +599,7 @@ struct dr_esw_caps { uint64_t uplink_icm_address_rx; uint64_t uplink_icm_address_tx; bool sw_owner; + bool sw_owner_v2; }; struct dr_devx_vport_cap { @@ -573,6 +608,10 @@ struct dr_devx_vport_cap 
{ uint64_t icm_address_tx; }; +struct dr_devx_roce_cap { + bool fl_rc_qp_when_roce_enabled; +}; + struct dr_devx_caps { uint16_t gvmi; uint64_t nic_rx_drop_address; @@ -581,6 +620,7 @@ struct dr_devx_caps { uint64_t esw_rx_drop_address; uint64_t esw_tx_drop_address; uint32_t log_icm_size; + uint8_t log_modify_hdr_icm_size; uint64_t hdr_modify_icm_addr; uint32_t flex_protocols; uint8_t flex_parser_id_icmp_dw0; @@ -588,12 +628,71 @@ struct dr_devx_caps { uint8_t flex_parser_id_icmpv6_dw0; uint8_t flex_parser_id_icmpv6_dw1; uint8_t max_ft_level; + uint8_t sw_format_ver; bool eswitch_manager; bool rx_sw_owner; bool tx_sw_owner; bool fdb_sw_owner; + bool rx_sw_owner_v2; + bool tx_sw_owner_v2; + bool fdb_sw_owner_v2; uint32_t num_vports; struct dr_devx_vport_cap *vports_caps; + struct dr_devx_roce_cap roce_caps; +}; + +struct dr_devx_flow_table_attr { + uint8_t type; + uint8_t level; + bool sw_owner; + bool term_tbl; + bool reformat_en; + uint64_t icm_addr_rx; + uint64_t icm_addr_tx; +}; + +struct dr_devx_flow_group_attr { + uint32_t table_id; + uint32_t table_type; +}; + +struct dr_devx_flow_dest_info { + enum dr_devx_flow_dest_type type; + union { + uint32_t vport_num; + uint32_t tir_num; + uint32_t counter_id; + }; + bool has_reformat; + uint32_t reformat_id; +}; + +struct dr_devx_flow_fte_attr { + uint32_t table_id; + uint32_t table_type; + uint32_t group_id; + uint32_t flow_tag; + uint32_t action; + uint32_t dest_size; + struct dr_devx_flow_dest_info *dest_arr; + bool extended_dest; +}; + +struct dr_devx_tbl { + uint8_t type; + uint8_t level; + struct mlx5dv_devx_obj *ft_dvo; + struct mlx5dv_devx_obj *fg_dvo; + struct mlx5dv_devx_obj *fte_dvo; +}; + +struct dr_devx_flow_sampler_attr { + uint8_t table_type; + uint8_t level; + uint8_t ignore_flow_level; + uint32_t sample_ratio; + uint32_t default_next_table_id; + uint32_t sample_table_id; }; struct dr_domain_rx_tx { @@ -614,8 +713,13 @@ struct dr_domain_info { struct dr_devx_caps caps; }; +enum dr_domain_flags { + DR_DOMAIN_FLAG_MEMORY_RECLAIM = 1 << 0, +}; + struct mlx5dv_dr_domain { struct ibv_context *ctx; + struct dr_ste_ctx *ste_ctx; struct ibv_pd *pd; struct mlx5dv_devx_uar *uar; enum mlx5dv_dr_domain_type type; @@ -626,6 +730,7 @@ struct mlx5dv_dr_domain { struct dr_send_ring *send_ring; struct dr_domain_info info; struct list_head tbl_list; + uint32_t flags; }; struct dr_table_rx_tx { @@ -675,6 +780,35 @@ struct dr_rule_member { struct list_node use_ste_list; }; +struct dr_ste_action_modify_field { + uint16_t hw_field; + uint8_t start; + uint8_t end; + uint8_t l3_type; + uint8_t l4_type; +}; + +struct dr_devx_tbl_with_refs { + uint16_t ref_actions_num; + struct mlx5dv_dr_action **ref_actions; + struct dr_devx_tbl *devx_tbl; +}; + +struct dr_flow_sampler { + struct mlx5dv_devx_obj *devx_obj; + uint64_t rx_icm_addr; + uint64_t tx_icm_addr; + struct mlx5dv_dr_table *next_ft; +}; + +struct dr_flow_sampler_restore_tbl { + struct mlx5dv_dr_table *tbl; + struct mlx5dv_dr_matcher *matcher; + struct mlx5dv_dr_rule *rule; + struct mlx5dv_dr_action **actions; + uint16_t num_of_actions; +}; + struct mlx5dv_dr_action { enum dr_action_type action_type; atomic_int refcount; @@ -712,8 +846,22 @@ struct mlx5dv_dr_action { uint64_t rx_icm_addr; uint64_t tx_icm_addr; } meter; + struct { + struct mlx5dv_dr_domain *dmn; + struct dr_devx_tbl_with_refs *term_tbl; + struct dr_flow_sampler *sampler_default; + struct dr_flow_sampler_restore_tbl *restore_tbl; + struct dr_flow_sampler *sampler_restore; + } sampler; struct mlx5dv_dr_table *dest_tbl; struct { + 
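
The dr_ste_ctx indirection threaded through the STE builders lets one code path drive both STE formats. A hedged sketch of the dispatch, assuming the usual domain initialization flow:

```c
/* Sketch: resolve the STE format context once per domain and hand it
 * to every builder.  sw_format_ver 0 is the legacy CX-5 layout; 1
 * selects the *_v1 layouts added in this series. */
dmn->ste_ctx = dr_ste_get_ctx(dmn->info.caps.sw_format_ver);
if (!dmn->ste_ctx) {
	errno = EOPNOTSUPP;		/* unknown STE format version */
	goto err;			/* hypothetical error path */
}

dr_ste_build_eth_l2_src_dst(dmn->ste_ctx, &sb[idx++], &mask, inner, rx);
```
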
struct mlx5dv_dr_domain *dmn; + struct list_head actions_list; + struct dr_devx_tbl *devx_tbl; + uint64_t rx_icm_addr; + uint64_t tx_icm_addr; + } dest_array; + struct { struct mlx5dv_devx_obj *devx_obj; uint32_t offset; } ctr; @@ -722,7 +870,13 @@ struct mlx5dv_dr_action { struct dr_devx_vport_cap *caps; uint32_t num; } vport; - struct ibv_qp *qp; + struct { + bool is_qp; + union { + struct mlx5dv_devx_obj *devx_tir; + struct ibv_qp *qp; + }; + } dest_qp; struct mlx5dv_devx_obj *devx_obj; uint32_t flow_tag; }; @@ -768,13 +922,17 @@ struct mlx5dv_dr_rule { void dr_rule_update_rule_member(struct dr_ste *new_ste, struct dr_ste *ste); struct dr_icm_chunk { - struct dr_icm_bucket *bucket; + struct dr_icm_buddy_mem *buddy_mem; struct list_node chunk_list; uint32_t rkey; uint32_t num_of_entries; uint32_t byte_size; uint64_t icm_addr; uint64_t mr_addr; + /* indicates the index of this chunk in the whole memory, + * used for deleting the chunk from the buddy + */ + uint32_t seg; /* Memory optimisation */ struct dr_ste *ste_arr; @@ -782,14 +940,13 @@ struct dr_icm_chunk { struct list_head *miss_list; }; -static inline int dr_matcher_supp_flex_parser_icmp_v4(struct dr_devx_caps *caps) +static inline int +dr_icm_pool_dm_type_to_entry_size(enum dr_icm_type icm_type) { - return caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V4_ENABLED; -} + if (icm_type == DR_ICM_TYPE_STE) + return DR_STE_SIZE; -static inline int dr_matcher_supp_flex_parser_icmp_v6(struct dr_devx_caps *caps) -{ - return caps->flex_protocols & MLX5_FLEX_PARSER_ICMP_V6_ENABLED; + return DR_MODIFY_ACTION_SIZE; } static inline uint32_t @@ -805,10 +962,7 @@ dr_icm_pool_chunk_size_to_byte(enum dr_icm_chunk_size chunk_size, int num_of_entries; int entry_size; - if (icm_type == DR_ICM_TYPE_STE) - entry_size = DR_STE_SIZE; - else - entry_size = DR_MODIFY_ACTION_SIZE; + entry_size = dr_icm_pool_dm_type_to_entry_size(icm_type); num_of_entries = dr_icm_pool_chunk_size_to_entries(chunk_size); @@ -838,11 +992,22 @@ int dr_devx_query_gvmi(struct ibv_context *ctx, int dr_devx_query_esw_caps(struct ibv_context *ctx, struct dr_esw_caps *caps); int dr_devx_sync_steering(struct ibv_context *ctx); -struct mlx5dv_devx_obj *dr_devx_create_flow_table(struct ibv_context *ctx, - uint32_t table_type, - uint64_t icm_addr_rx, - uint64_t icm_addr_tx, - u8 level); +struct mlx5dv_devx_obj * +dr_devx_create_flow_table(struct ibv_context *ctx, + struct dr_devx_flow_table_attr *table_attr); +int dr_devx_query_flow_table(struct mlx5dv_devx_obj *obj, uint32_t type, + uint64_t *rx_icm_addr, uint64_t *tx_icm_addr); +struct dr_devx_tbl * +dr_devx_create_always_hit_ft(struct ibv_context *ctx, + struct dr_devx_flow_table_attr *ft_attr, + struct dr_devx_flow_group_attr *fg_attr, + struct dr_devx_flow_fte_attr *fte_attr); +void dr_devx_destroy_always_hit_ft(struct dr_devx_tbl *devx_tbl); +struct mlx5dv_devx_obj * +dr_devx_create_flow_sampler(struct ibv_context *ctx, + struct dr_devx_flow_sampler_attr *sampler_attr); +int dr_devx_query_flow_sampler(struct mlx5dv_devx_obj *obj, + uint64_t *rx_icm_addr, uint64_t *tx_icm_addr); struct mlx5dv_devx_obj *dr_devx_create_reformat_ctx(struct ibv_context *ctx, enum reformat_type rt, size_t reformat_size, @@ -896,6 +1061,7 @@ struct dr_devx_qp_rtr_attr { uint16_t port_num; uint8_t min_rnr_timer; uint8_t sgid_index; + bool fl; }; int dr_devx_modify_qp_init2rtr(struct ibv_context *ctx, @@ -922,17 +1088,20 @@ static inline bool dr_is_root_table(struct mlx5dv_dr_table *tbl) struct dr_icm_pool *dr_icm_pool_create(struct mlx5dv_dr_domain *dmn, enum 
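
A worked example of the consolidated entry-size helper above (DR_STE_SIZE is 64 bytes, matching the new DR_STE_LOG_SIZE of 6; DR_MODIFY_ACTION_SIZE is 8 bytes, log 3):

```c
/* A chunk of 2^10 entries occupies 64 KiB of STE ICM but only 8 KiB
 * of modify-header ICM; 'chunk_size' is an enum dr_icm_chunk_size. */
int entry_size = dr_icm_pool_dm_type_to_entry_size(DR_ICM_TYPE_STE);
int chunk_bytes = dr_icm_pool_chunk_size_to_entries(chunk_size) * entry_size;
```
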
dr_icm_type icm_type); void dr_icm_pool_destroy(struct dr_icm_pool *pool); +int dr_icm_pool_sync_pool(struct dr_icm_pool *pool); struct dr_icm_chunk *dr_icm_alloc_chunk(struct dr_icm_pool *pool, enum dr_icm_chunk_size chunk_size); void dr_icm_free_chunk(struct dr_icm_chunk *chunk); -bool dr_ste_is_not_valid_entry(uint8_t *p_hw_ste); +void dr_ste_prepare_for_postsend(struct dr_ste_ctx *ste_ctx, + uint8_t *hw_ste_p, uint32_t ste_size); int dr_ste_htbl_init_and_postsend(struct mlx5dv_dr_domain *dmn, struct dr_domain_rx_tx *nic_dmn, struct dr_ste_htbl *htbl, struct dr_htbl_connect_info *connect_info, bool update_hw_ste); -void dr_ste_set_formated_ste(uint16_t gvmi, +void dr_ste_set_formated_ste(struct dr_ste_ctx *ste_ctx, + uint16_t gvmi, struct dr_domain_rx_tx *nic_dmn, struct dr_ste_htbl *htbl, uint8_t *formated_ste, @@ -1017,4 +1186,31 @@ int dr_send_postsend_formated_htbl(struct mlx5dv_dr_domain *dmn, bool update_hw_ste); int dr_send_postsend_action(struct mlx5dv_dr_domain *dmn, struct mlx5dv_dr_action *action); +/* buddy functions & structure */ +struct dr_icm_mr; + +struct dr_icm_buddy_mem { + bitmap **bits; + unsigned int *num_free; + bitmap **set_bit; + uint32_t max_order; + struct list_node list_node; + struct dr_icm_mr *icm_mr; + struct dr_icm_pool *pool; + + /* This is the list of used chunks. HW may be accessing this memory */ + struct list_head used_list; + size_t used_memory; + + /* hardware may be accessing this memory but at some future, + * undetermined time, it might cease to do so. + * sync_ste command sets them free. + */ + struct list_head hot_list; +}; + +int dr_buddy_init(struct dr_icm_buddy_mem *buddy, uint32_t max_order); +void dr_buddy_cleanup(struct dr_icm_buddy_mem *buddy); +int dr_buddy_alloc_mem(struct dr_icm_buddy_mem *buddy, int order); +void dr_buddy_free_mem(struct dr_icm_buddy_mem *buddy, uint32_t seg, int order); #endif diff --git a/providers/mlx5/qp.c b/providers/mlx5/qp.c index 1e65d8b..077346d 100644 --- a/providers/mlx5/qp.c +++ b/providers/mlx5/qp.c @@ -2107,7 +2107,7 @@ static void umr_strided_seg_create(struct mlx5_qp *qp, { struct mlx5_wqe_umr_repeat_block_seg *rb = seg; struct mlx5_wqe_umr_repeat_ent_seg *eb; - int byte_count = 0; + uint64_t byte_count = 0; int tmp; int i; diff --git a/providers/mlx5/verbs.c b/providers/mlx5/verbs.c index 47e8380..7907218 100644 --- a/providers/mlx5/verbs.c +++ b/providers/mlx5/verbs.c @@ -48,6 +48,7 @@ #include #include +#include #include #include #include @@ -323,7 +324,7 @@ static void mlx5_insert_dyn_uuars(struct mlx5_context *ctx, list_node_init(&bf->uar_entry); list_add_tail(head, &bf->uar_entry); if (!bf_uar->dyn_alloc_uar) - bf->bfreg_dyn_index = (ctx->curr_legacy_dyn_sys_uar_page - 1) * num_bfregs_per_page; + bf->bfreg_dyn_index = (ctx->curr_legacy_dyn_sys_uar_page - 1) * num_bfregs_per_page + j; bf->dyn_alloc_uar = bf_uar->dyn_alloc_uar; bf->need_lock = bf_uar->qp_shared; mlx5_spinlock_init(&bf->lock, bf->need_lock); @@ -556,26 +557,39 @@ static int mlx5_dealloc_parent_domain(struct mlx5_parent_domain *mparent_domain) return 0; } -int mlx5_free_pd(struct ibv_pd *pd) +static int _mlx5_free_pd(struct ibv_pd *pd, bool unimport) { int ret; struct mlx5_parent_domain *mparent_domain = to_mparent_domain(pd); struct mlx5_pd *mpd = to_mpd(pd); - if (mparent_domain) + if (mparent_domain) { + if (unimport) + return EINVAL; + return mlx5_dealloc_parent_domain(mparent_domain); + } if (atomic_load(&mpd->refcount) > 1) return EBUSY; + if (unimport) + goto end; + ret = ibv_cmd_dealloc_pd(pd); if (ret) return ret; +end: 
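
Illustrative-only use of the buddy interface (the real callers are the ICM pool internals); the returned segment index is what dr_icm_chunk.seg records for the eventual free:

```c
/* Carve a 2^4-entry segment out of a 2^10-entry buddy arena and
 * return it; sizes here are arbitrary. */
static int buddy_demo(void)
{
	struct dr_icm_buddy_mem buddy = {};
	int seg;

	if (dr_buddy_init(&buddy, 10))		/* max_order 10 -> 1024 entries */
		return ENOMEM;

	seg = dr_buddy_alloc_mem(&buddy, 4);	/* order 4 -> 16 entries */
	if (seg >= 0)
		dr_buddy_free_mem(&buddy, seg, 4); /* free with the same order */

	dr_buddy_cleanup(&buddy);
	return 0;
}
```
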
free(mpd); return 0; } +int mlx5_free_pd(struct ibv_pd *pd) +{ + return _mlx5_free_pd(pd, false); +} + struct ibv_mr *mlx5_reg_mr(struct ibv_pd *pd, void *addr, size_t length, uint64_t hca_va, int acc) { @@ -592,7 +606,6 @@ struct ibv_mr *mlx5_reg_mr(struct ibv_pd *pd, void *addr, size_t length, ret = ibv_cmd_reg_mr(pd, addr, length, hca_va, access, &mr->vmr, &cmd, sizeof(cmd), &resp, sizeof(resp)); if (ret) { - mlx5_free_buf(&(mr->buf)); free(mr); return NULL; } @@ -674,9 +687,6 @@ int mlx5_rereg_mr(struct verbs_mr *vmr, int flags, struct ibv_pd *pd, struct ibv_rereg_mr cmd; struct ib_uverbs_rereg_mr_resp resp; - if (flags & IBV_REREG_MR_KEEP_VALID) - return ENOTSUP; - return ibv_cmd_rereg_mr(vmr, flags, addr, length, (uintptr_t)addr, access, pd, &cmd, sizeof(cmd), &resp, sizeof(resp)); @@ -707,6 +717,67 @@ int mlx5_advise_mr(struct ibv_pd *pd, return ibv_cmd_advise_mr(pd, advice, flags, sg_list, num_sge); } +struct ibv_pd *mlx5_import_pd(struct ibv_context *context, + uint32_t pd_handle) +{ + DECLARE_COMMAND_BUFFER(cmd, + UVERBS_OBJECT_PD, + MLX5_IB_METHOD_PD_QUERY, + 2); + + struct mlx5_pd *pd; + int ret; + + pd = calloc(1, sizeof *pd); + if (!pd) + return NULL; + + fill_attr_in_obj(cmd, MLX5_IB_ATTR_QUERY_PD_HANDLE, pd_handle); + fill_attr_out_ptr(cmd, MLX5_IB_ATTR_QUERY_PD_RESP_PDN, &pd->pdn); + + ret = execute_ioctl(context, cmd); + if (ret) { + free(pd); + return NULL; + } + + pd->ibv_pd.context = context; + pd->ibv_pd.handle = pd_handle; + atomic_init(&pd->refcount, 1); + + return &pd->ibv_pd; +} + +void mlx5_unimport_pd(struct ibv_pd *pd) +{ + if (_mlx5_free_pd(pd, true)) + assert(false); +} + +struct ibv_mr *mlx5_import_mr(struct ibv_pd *pd, + uint32_t mr_handle) +{ + struct mlx5_mr *mr; + int ret; + + mr = calloc(1, sizeof *mr); + if (!mr) + return NULL; + + ret = ibv_cmd_query_mr(pd, &mr->vmr, mr_handle); + if (ret) { + free(mr); + return NULL; + } + + return &mr->vmr.ibv_mr; +} + +void mlx5_unimport_mr(struct ibv_mr *ibmr) +{ + free(to_mmr(ibmr)); +} + struct ibv_mw *mlx5_alloc_mw(struct ibv_pd *pd, enum ibv_mw_type type) { struct ibv_mw *mw; @@ -1001,13 +1072,13 @@ static struct ibv_cq_ex *create_cq(struct ibv_context *context, struct ibv_cq_init_attr_ex cq_attr_ex = *cq_attr; cq_attr_ex.cqe = ncqe - 1; - ret = ibv_cmd_create_cq_ex(context, &cq_attr_ex, &cq->ibv_cq, + ret = ibv_cmd_create_cq_ex(context, &cq_attr_ex, &cq->verbs_cq, &cmd_ex.ibv_cmd, sizeof(cmd_ex), &resp_ex.ibv_resp, sizeof(resp_ex)); } else { ret = ibv_cmd_create_cq(context, ncqe - 1, cq_attr->channel, cq_attr->comp_vector, - ibv_cq_ex_to_cq(&cq->ibv_cq), + &cq->verbs_cq.cq, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); } @@ -1027,7 +1098,7 @@ static struct ibv_cq_ex *create_cq(struct ibv_context *context, cq->stall_adaptive_enable = to_mctx(context)->stall_adaptive_enable; cq->stall_cycles = to_mctx(context)->stall_cycles; - return &cq->ibv_cq; + return &cq->verbs_cq.cq_ex; err_db: mlx5_free_db(to_mctx(context), cq->dbrec, cq->parent_domain, cq->custom_db); @@ -1108,7 +1179,7 @@ int mlx5_resize_cq(struct ibv_cq *ibcq, int cqe) return EINVAL; mlx5_spin_lock(&cq->lock); - cq->active_cqes = cq->ibv_cq.cqe; + cq->active_cqes = cq->verbs_cq.cq.cqe; if (cq->active_buf == &cq->buf_a) cq->resize_buf = &cq->buf_b; else @@ -1142,7 +1213,7 @@ int mlx5_resize_cq(struct ibv_cq *ibcq, int cqe) mlx5_cq_resize_copy_cqes(cq); mlx5_free_cq_buf(mctx, cq->active_buf); cq->active_buf = cq->resize_buf; - cq->ibv_cq.cqe = cqe - 1; + cq->verbs_cq.cq.cqe = cqe - 1; mlx5_spin_unlock(&cq->lock); cq->resize_buf = NULL; 
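
These provider hooks back the generic ibv_import_*/ibv_unimport_* verbs. A hedged two-process sketch; passing `cmd_fd` and the object handles between the processes (e.g. over a unix socket) is elided:

```c
/* Process B mirrors objects owned by process A. */
struct ibv_context *ctx = ibv_import_device(cmd_fd);
struct ibv_pd *pd = ibv_import_pd(ctx, pd_handle);
struct ibv_mr *mr = ibv_import_mr(pd, mr_handle);

/* ... post WRs using mr->lkey / mr->rkey ... */

ibv_unimport_mr(mr);	/* frees only the local mirror */
ibv_unimport_pd(pd);	/* the owning process still destroys the real PD */
```
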
return 0; @@ -1708,6 +1779,26 @@ static const char *qptype2key(enum ibv_qp_type type) } } +static size_t mlx5_set_custom_qp_alignment(struct ibv_context *context, + struct mlx5_qp *qp) +{ + uint32_t max_stride; + uint32_t buf_page; + + /* The main QP buffer alignment requirement is QP_PAGE_SIZE / + * MLX5_QPC_PAGE_OFFSET_QUANTA. In case the buffer is contig, then + * QP_PAGE_SIZE is the buffer size align to system page_size roundup to + * the next pow of two. + */ + buf_page = roundup_pow_of_two(align(qp->buf_size, + to_mdev(context->device)->page_size)); + /* Another QP buffer alignment requirement is to consider send wqe and + * receive wqe strides. + */ + max_stride = max((1 << qp->sq.wqe_shift), (1 << qp->rq.wqe_shift)); + return max(max_stride, buf_page / MLX5_QPC_PAGE_OFFSET_QUANTA); +} + static int mlx5_alloc_qp_buf(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr, struct mlx5_qp *qp, @@ -1717,6 +1808,7 @@ static int mlx5_alloc_qp_buf(struct ibv_context *context, enum mlx5_alloc_type alloc_type; enum mlx5_alloc_type default_alloc_type = MLX5_ALLOC_TYPE_ANON; const char *qp_huge_key; + size_t req_align = to_mdev(context->device)->page_size; if (qp->sq.wqe_cnt) { qp->sq.wrid = malloc(qp->sq.wqe_cnt * sizeof(*qp->sq.wrid)); @@ -1760,13 +1852,15 @@ static int mlx5_alloc_qp_buf(struct ibv_context *context, if (alloc_type == MLX5_ALLOC_TYPE_CUSTOM) { qp->buf.mparent_domain = to_mparent_domain(attr->pd); - qp->buf.req_alignment = to_mdev(context->device)->page_size; + if (attr->qp_type != IBV_QPT_RAW_PACKET && + !(qp->flags & MLX5_QP_FLAGS_USE_UNDERLAY)) + req_align = mlx5_set_custom_qp_alignment(context, qp); + qp->buf.req_alignment = req_align; qp->buf.resource_type = MLX5DV_RES_TYPE_QP; } err = mlx5_alloc_prefered_buf(to_mctx(context), &qp->buf, - align(qp->buf_size, to_mdev - (context->device)->page_size), + align(qp->buf_size, req_align), to_mdev(context->device)->page_size, alloc_type, MLX5_QP_PREFIX); @@ -1843,6 +1937,42 @@ static void mlx5_free_qp_buf(struct mlx5_context *ctx, struct mlx5_qp *qp) free(qp->sq.wr_data); } +int mlx5_set_ece(struct ibv_qp *qp, struct ibv_ece *ece) +{ + struct mlx5_context *context = to_mctx(qp->context); + struct mlx5_qp *mqp = to_mqp(qp); + + if (ece->comp_mask) { + errno = EINVAL; + return errno; + } + + if (ece->vendor_id != PCI_VENDOR_ID_MELLANOX) { + errno = EINVAL; + return errno; + } + + if (!(context->flags & MLX5_CTX_FLAGS_ECE_SUPPORTED)) { + errno = EOPNOTSUPP; + return errno; + } + + mqp->set_ece = ece->options; + /* Clean previously returned ECE options */ + mqp->get_ece = 0; + return 0; +} + +int mlx5_query_ece(struct ibv_qp *qp, struct ibv_ece *ece) +{ + struct mlx5_qp *mqp = to_mqp(qp); + + ece->vendor_id = PCI_VENDOR_ID_MELLANOX; + ece->options = mqp->get_ece; + ece->comp_mask = 0; + return 0; +} + static int mlx5_cmd_create_rss_qp(struct ibv_context *context, struct ibv_qp_init_attr_ex *attr, struct mlx5_qp *qp, @@ -1866,7 +1996,7 @@ static int mlx5_cmd_create_rss_qp(struct ibv_context *context, attr->rx_hash_conf.rx_hash_key_len); ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, - sizeof(qp->verbs_qp), attr, + attr, &cmd_ex_rss.ibv_cmd, sizeof(cmd_ex_rss), &resp.ibv_resp, sizeof(resp)); if (ret) @@ -1899,7 +2029,7 @@ static int mlx5_cmd_create_qp_ex(struct ibv_context *context, cmd_ex.drv_payload = cmd->drv_payload; ret = ibv_cmd_create_qp_ex2(context, &qp->verbs_qp, - sizeof(qp->verbs_qp), attr, &cmd_ex.ibv_cmd, + attr, &cmd_ex.ibv_cmd, sizeof(cmd_ex), &resp->ibv_resp, sizeof(*resp)); @@ -1989,8 +2119,11 @@ static int 
create_dct(struct ibv_context *context, } } cmd.uidx = usr_idx; + if (ctx->flags & MLX5_CTX_FLAGS_ECE_SUPPORTED) + /* Create QP should start from ECE version 1 as a trigger */ + cmd.ece_options = 0x10000000; - ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, sizeof(qp->verbs_qp), + ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, attr, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (ret) { @@ -2000,6 +2133,7 @@ static int create_dct(struct ibv_context *context, return ret; } + qp->get_ece = resp.ece_options; qp->dc_type = MLX5DV_DCTYPE_DCT; qp->rsc.type = MLX5_RSC_TYPE_QP; if (ctx->cqe_version) @@ -2287,10 +2421,14 @@ static struct ibv_qp *create_qp(struct ibv_context *context, } } + if (ctx->flags & MLX5_CTX_FLAGS_ECE_SUPPORTED) + /* Create QP should start from ECE version 1 as a trigger */ + cmd.ece_options = 0x10000000; + if (attr->comp_mask & MLX5_CREATE_QP_EX2_COMP_MASK) ret = mlx5_cmd_create_qp_ex(context, attr, &cmd, qp, &resp_ex); else - ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, sizeof(qp->verbs_qp), + ret = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, attr, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); if (ret) { @@ -2312,6 +2450,7 @@ static struct ibv_qp *create_qp(struct ibv_context *context, pthread_mutex_unlock(&ctx->qp_table_mutex); } + qp->get_ece = resp_drv->ece_options; map_uuar(context, qp, resp_drv->bfreg_index, bf); qp->rq.max_post = qp->rq.wqe_cnt; @@ -2529,7 +2668,7 @@ enum { static int modify_dct(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { - struct ibv_modify_qp_ex cmd_ex = {}; + struct mlx5_modify_qp cmd_ex = {}; struct mlx5_modify_qp_ex_resp resp = {}; struct mlx5_qp *mqp = to_mqp(qp); struct mlx5_context *context = to_mctx(qp->context); @@ -2537,8 +2676,10 @@ static int modify_dct(struct ibv_qp *qp, struct ibv_qp_attr *attr, bool dct_create; int ret; - ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex, sizeof(cmd_ex), - &resp.ibv_resp, sizeof(resp)); + cmd_ex.ece_options = mqp->set_ece; + ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex.ibv_cmd, + sizeof(cmd_ex), &resp.ibv_resp, + sizeof(resp)); if (ret) return ret; @@ -2564,6 +2705,10 @@ static int modify_dct(struct ibv_qp *qp, struct ibv_qp_attr *attr, } qp->qp_num = resp.dctn; + if (mqp->set_ece) { + mqp->set_ece = 0; + mqp->get_ece = resp.ece_options; + } if (!context->cqe_version) { pthread_mutex_lock(&context->qp_table_mutex); @@ -2582,8 +2727,8 @@ int mlx5_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, int attr_mask) { struct ibv_modify_qp cmd = {}; - struct ibv_modify_qp_ex cmd_ex = {}; - struct ib_uverbs_ex_modify_qp_resp resp = {}; + struct mlx5_modify_qp cmd_ex = {}; + struct mlx5_modify_qp_ex_resp resp = {}; struct mlx5_qp *mqp = to_mqp(qp); struct mlx5_context *context = to_mctx(qp->context); int ret; @@ -2628,12 +2773,20 @@ int mlx5_modify_qp(struct ibv_qp *qp, struct ibv_qp_attr *attr, } } - if (attr_mask & MLX5_MODIFY_QP_EX_ATTR_MASK) - ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex, - sizeof(cmd_ex), &resp, sizeof(resp)); - else + if (attr_mask & MLX5_MODIFY_QP_EX_ATTR_MASK || mqp->set_ece) { + cmd_ex.ece_options = mqp->set_ece; + ret = ibv_cmd_modify_qp_ex(qp, attr, attr_mask, &cmd_ex.ibv_cmd, + sizeof(cmd_ex), &resp.ibv_resp, + sizeof(resp)); + } else { ret = ibv_cmd_modify_qp(qp, attr, attr_mask, &cmd, sizeof(cmd)); + } + + if (!ret && mqp->set_ece) { + mqp->set_ece = 0; + mqp->get_ece = resp.ece_options; + } if (!ret && (attr_mask & IBV_QP_STATE) && @@ -2750,8 +2903,8 @@ static void 
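
A hedged sketch of the ECE handshake the plumbing above enables, through the generic verbs entry points; how the blob reaches the peer (librdmacm forwards it during connection establishment) is elided:

```c
/* 'remote_options', 'attr' and 'attr_mask' are assumed; mlx5 rejects
 * a vendor_id other than PCI_VENDOR_ID_MELLANOX, which the initial
 * query fills in for us. */
struct ibv_ece ece = {};

ibv_query_ece(qp, &ece);		/* vendor defaults after create */
ece.options &= remote_options;		/* intersect with the peer's offer */
ibv_set_ece(qp, &ece);			/* staged for the next modify_qp */
ibv_modify_qp(qp, &attr, attr_mask);	/* ECE rides on the modify */
ibv_query_ece(qp, &ece);		/* what was actually granted */
```
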
mlx5_ah_set_udp_sport(struct mlx5_ah *ah, if (fl) sport = ibv_flow_label_to_udp_sport(fl); else - sport = rand() % (IB_ROCE_UDP_ENCAP_VALID_PORT_MAX + 1 - - IB_ROCE_UDP_ENCAP_VALID_PORT_MIN) + sport = get_random() % (IB_ROCE_UDP_ENCAP_VALID_PORT_MAX + 1 + - IB_ROCE_UDP_ENCAP_VALID_PORT_MIN) + IB_ROCE_UDP_ENCAP_VALID_PORT_MIN; ah->av.rlid = htobe16(sport); @@ -2800,7 +2953,7 @@ struct ibv_ah *mlx5_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) attr->grh.sgid_index, &gid_type)) goto err; - if (gid_type == IBV_GID_TYPE_ROCE_V2) + if (gid_type == IBV_GID_TYPE_SYSFS_ROCE_V2) mlx5_ah_set_udp_sport(ah, attr); /* Since RoCE packets must contain GRH, this bit is reserved @@ -2898,8 +3051,12 @@ int mlx5_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num) { struct mlx5_srq *msrq = to_msrq(srq); + /* May be used by DC users in addition to XRC ones, as there is no + * indication on the SRQ for DC usage we can't force the above check. + * Even DC users are encouraged to use mlx5dv_init_obj() to get + * the SRQN. + */ *srq_num = msrq->srqn; - return 0; } @@ -3156,7 +3313,7 @@ struct ibv_srq *mlx5_create_srq_ex(struct ibv_context *context, */ attr->attr.max_wr = msrq->max - 1; - err = ibv_cmd_create_srq_ex(context, &msrq->vsrq, sizeof(msrq->vsrq), + err = ibv_cmd_create_srq_ex(context, &msrq->vsrq, attr, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp)); @@ -3260,6 +3417,31 @@ static void get_pci_atomic_caps(struct ibv_context *context, } } +static void get_lag_caps(struct ibv_context *ctx) +{ + uint16_t opmod = MLX5_SET_HCA_CAP_OP_MOD_GENERAL_DEVICE | + HCA_CAP_OPMOD_GET_CUR; + uint32_t out[DEVX_ST_SZ_DW(query_hca_cap_out)] = {}; + uint32_t in[DEVX_ST_SZ_DW(query_hca_cap_in)] = {}; + struct mlx5_context *mctx = to_mctx(ctx); + int ret; + + DEVX_SET(query_hca_cap_in, in, opcode, MLX5_CMD_OP_QUERY_HCA_CAP); + DEVX_SET(query_hca_cap_in, in, op_mod, opmod); + + ret = mlx5dv_devx_general_cmd(ctx, in, sizeof(in), out, sizeof(out)); + if (ret) + return; + + mctx->lag_caps.num_lag_ports = + DEVX_GET(query_hca_cap_out, out, + capability.cmd_hca_cap.num_lag_ports); + + mctx->lag_caps.lag_tx_port_affinity = + DEVX_GET(query_hca_cap_out, out, + capability.cmd_hca_cap.lag_tx_port_affinity); +} + int mlx5_query_device_ex(struct ibv_context *context, const struct ibv_query_device_ex_input *input, struct ibv_device_attr_ex *attr, @@ -3345,6 +3527,8 @@ int mlx5_query_device_ex(struct ibv_context *context, sizeof(attr->pci_atomic_caps)) get_pci_atomic_caps(context, attr); + get_lag_caps(context); + return 0; } @@ -4355,6 +4539,8 @@ __mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, bool have_dest_devx = false; bool have_flow_tag = false; bool have_counter = false; + bool have_default = false; + bool have_drop = false; int ret; int i; DECLARE_COMMAND_BUFFER(cmd, UVERBS_OBJECT_FLOW, @@ -4379,7 +4565,8 @@ __mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, type = actions_attr[i].type; switch (type) { case MLX5DV_FLOW_ACTION_DEST_IBV_QP: - if (have_qp || have_dest_devx) { + if (have_qp || have_dest_devx || have_default || + have_drop) { errno = EOPNOTSUPP; goto err; } @@ -4401,7 +4588,8 @@ __mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, num_flow_actions++; break; case MLX5DV_FLOW_ACTION_DEST_DEVX: - if (have_dest_devx || have_qp) { + if (have_dest_devx || have_qp || have_default || + have_drop) { errno = EOPNOTSUPP; goto err; } @@ -4436,6 +4624,28 @@ __mlx5dv_create_flow(struct mlx5dv_flow_matcher *flow_matcher, have_counter = true; break; + case 
MLX5DV_FLOW_ACTION_DEFAULT_MISS: + if (have_qp || have_dest_devx || have_default || + have_drop) { + errno = EOPNOTSUPP; + goto err; + } + fill_attr_in_uint32(cmd, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DEFAULT_MISS); + have_default = true; + break; + case MLX5DV_FLOW_ACTION_DROP: + if (have_qp || have_dest_devx || have_default || + have_drop) { + errno = EOPNOTSUPP; + goto err; + } + fill_attr_in_uint32(cmd, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS, + MLX5_IB_ATTR_CREATE_FLOW_FLAGS_DROP); + have_drop = true; + break; default: errno = EOPNOTSUPP; goto err; @@ -4489,6 +4699,9 @@ mlx5dv_devx_umem_reg(struct ibv_context *context, void *addr, size_t size, uint3 return NULL; } + if (ibv_dontfork_range(addr, size)) + goto err; + fill_attr_in_uint64(cmd, MLX5_IB_ATTR_DEVX_UMEM_REG_ADDR, (intptr_t)addr); fill_attr_in_uint64(cmd, MLX5_IB_ATTR_DEVX_UMEM_REG_LEN, size); fill_attr_in_uint32(cmd, MLX5_IB_ATTR_DEVX_UMEM_REG_ACCESS, access); @@ -4499,12 +4712,17 @@ mlx5dv_devx_umem_reg(struct ibv_context *context, void *addr, size_t size, uint3 ret = execute_ioctl(context, cmd); if (ret) - goto err; + goto err_umem_reg_cmd; umem->handle = read_attr_obj(MLX5_IB_ATTR_DEVX_UMEM_REG_HANDLE, handle); umem->context = context; + umem->addr = addr; + umem->size = size; return &umem->dv_devx_umem; + +err_umem_reg_cmd: + ibv_dofork_range(addr, size); err: free(umem); return NULL; @@ -4525,6 +4743,7 @@ int mlx5dv_devx_umem_dereg(struct mlx5dv_devx_umem *dv_devx_umem) if (ret) return ret; + ibv_dofork_range(umem->addr, umem->size); free(umem); return 0; } @@ -4542,6 +4761,14 @@ static void set_devx_obj_info(const void *in, const void *out, obj->type = MLX5_DEVX_FLOW_TABLE; obj->object_id = DEVX_GET(create_flow_table_out, out, table_id); break; + case MLX5_CMD_OP_CREATE_FLOW_GROUP: + obj->type = MLX5_DEVX_FLOW_GROUP; + obj->object_id = DEVX_GET(create_flow_group_out, out, group_id); + break; + case MLX5_CMD_OP_SET_FLOW_TABLE_ENTRY: + obj->type = MLX5_DEVX_FLOW_TABLE_ENTRY; + obj->object_id = DEVX_GET(set_fte_in, in, flow_index); + break; case MLX5_CMD_OP_CREATE_FLOW_COUNTER: obj->type = MLX5_DEVX_FLOW_COUNTER; obj->object_id = DEVX_GET(alloc_flow_counter_out, out, flow_counter_id); @@ -4550,6 +4777,8 @@ static void set_devx_obj_info(const void *in, const void *out, obj_type = DEVX_GET(general_obj_in_cmd_hdr, in, obj_type); if (obj_type == MLX5_OBJ_TYPE_FLOW_METER) obj->type = MLX5_DEVX_FLOW_METER; + else if (obj_type == MLX5_OBJ_TYPE_FLOW_SAMPLER) + obj->type = MLX5_DEVX_FLOW_SAMPLER; obj->object_id = DEVX_GET(general_obj_out_cmd_hdr, out, obj_id); break; @@ -4557,6 +4786,13 @@ static void set_devx_obj_info(const void *in, const void *out, obj->type = MLX5_DEVX_QP; obj->object_id = DEVX_GET(create_qp_out, out, qpn); break; + case MLX5_CMD_OP_CREATE_TIR: + obj->type = MLX5_DEVX_TIR; + obj->object_id = DEVX_GET(create_tir_out, out, tirn); + obj->rx_icm_addr = DEVX_GET(create_tir_out, out, icm_address_31_0); + obj->rx_icm_addr |= (uint64_t)DEVX_GET(create_tir_out, out, icm_address_39_32) << 32; + obj->rx_icm_addr |= (uint64_t)DEVX_GET(create_tir_out, out, icm_address_63_40) << 40; + break; case MLX5_CMD_OP_ALLOC_PACKET_REFORMAT_CONTEXT: obj->type = MLX5_DEVX_PKT_REFORMAT_CTX; obj->object_id = DEVX_GET(alloc_packet_reformat_context_out, diff --git a/providers/qedr/common_hsi.h b/providers/qedr/common_hsi.h index 791006b..9feefaf 100644 --- a/providers/qedr/common_hsi.h +++ b/providers/qedr/common_hsi.h @@ -1010,12 +1010,14 @@ struct db_roce_dpm_params #define DB_ROCE_DPM_PARAMS_WQE_SIZE_SHIFT 16 
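
The MLX5DV_FLOW_ACTION_DEFAULT_MISS handling added above is driven from mlx5dv_create_flow(). A minimal hedged sketch; `matcher` and `match_value` are assumed to come from mlx5dv_create_flow_matcher(), and matching packets are sent to the domain's default-miss destination:

```c
struct mlx5dv_flow_action_attr action = {
	.type = MLX5DV_FLOW_ACTION_DEFAULT_MISS,
};

struct ibv_flow *flow =
	mlx5dv_create_flow(matcher, match_value, 1, &action);
```
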
#define DB_ROCE_DPM_PARAMS_RESERVED0_MASK 0x1 #define DB_ROCE_DPM_PARAMS_RESERVED0_SHIFT 27 -#define DB_ROCE_DPM_PARAMS_COMPLETION_FLG_MASK 0x1 /* RoCE completion flag */ -#define DB_ROCE_DPM_PARAMS_COMPLETION_FLG_SHIFT 28 +#define DB_ROCE_DPM_PARAMS_ACK_REQUEST_MASK 0x1 /* RoCE ack request (will be set to 1) */ +#define DB_ROCE_DPM_PARAMS_ACK_REQUEST_SHIFT 28 #define DB_ROCE_DPM_PARAMS_S_FLG_MASK 0x1 /* RoCE S flag */ #define DB_ROCE_DPM_PARAMS_S_FLG_SHIFT 29 -#define DB_ROCE_DPM_PARAMS_RESERVED1_MASK 0x3 -#define DB_ROCE_DPM_PARAMS_RESERVED1_SHIFT 30 +#define DB_ROCE_DPM_PARAMS_COMPLETION_FLG_MASK 0x1 /* RoCE completion flag for FW use */ +#define DB_ROCE_DPM_PARAMS_COMPLETION_FLG_SHIFT 30 +#define DB_ROCE_DPM_PARAMS_RESERVED1_MASK 0x1 +#define DB_ROCE_DPM_PARAMS_RESERVED1_SHIFT 31 }; /* diff --git a/providers/qedr/qelr.h b/providers/qedr/qelr.h index ac522cb..23f3184 100644 --- a/providers/qedr/qelr.h +++ b/providers/qedr/qelr.h @@ -123,10 +123,13 @@ struct qelr_device { }; enum qelr_dpm_flags { - QELR_DPM_FLAGS_ENHANCED = (1 << 0), - QELR_DPM_FLAGS_LEGACY = (1 << 1), + QELR_DPM_FLAGS_ENHANCED = (1 << 0), + QELR_DPM_FLAGS_LEGACY = (1 << 1), + QELR_DPM_FLAGS_EDPM_MODE = (1 << 2), }; +#define QELR_MAX_SRQ_ID 4096 + struct qelr_devctx { struct verbs_context ibv_ctx; FILE *dbg_fp; @@ -137,6 +140,7 @@ struct qelr_devctx { enum qelr_dpm_flags dpm_flags; uint32_t kernel_page_size; uint16_t ldpm_limit_size; + uint16_t edpm_limit_size; uint8_t edpm_trans_size; uint32_t max_send_wr; @@ -145,6 +149,7 @@ struct qelr_devctx { uint32_t sges_per_send_wr; uint32_t sges_per_recv_wr; uint32_t sges_per_srq_wr; + struct qelr_srq **srq_table; int max_cqes; }; @@ -225,6 +230,10 @@ struct qelr_rdma_ext { __be32 dma_length; }; +struct qelr_xrceth { + __be32 xrc_srq; +}; + /* rdma extension, invalidate / immediate data + padding, inline data... 
*/ #define QELR_MAX_DPM_PAYLOAD (sizeof(struct qelr_rdma_ext) + sizeof(uint64_t) +\ ROCE_REQ_MAX_INLINE_DATA_SIZE) @@ -257,16 +266,24 @@ struct qelr_srq_hwq_info { }; struct qelr_srq { - struct ibv_srq ibv_srq; + struct verbs_srq verbs_srq; struct qelr_srq_hwq_info hw_srq; uint16_t srq_id; pthread_spinlock_t lock; + bool is_xrc; +}; + +enum qelr_qp_flags { + QELR_QP_FLAG_SQ = 1 << 0, + QELR_QP_FLAG_RQ = 1 << 1, }; struct qelr_qp { - struct ibv_qp ibv_qp; + struct verbs_qp verbs_qp; + struct ibv_qp *ibv_qp; pthread_spinlock_t q_lock; enum qelr_qp_state state; /* QP state */ + uint8_t flags; struct qelr_qp_hwq_info sq; struct qelr_qp_hwq_info rq; @@ -289,6 +306,7 @@ struct qelr_qp { int sq_sig_all; int atomic_supported; uint8_t edpm_disabled; + uint8_t edpm_mode; struct qelr_srq *srq; }; @@ -302,9 +320,16 @@ static inline struct qelr_device *get_qelr_dev(struct ibv_device *ibdev) return container_of(ibdev, struct qelr_device, ibv_dev.device); } +static inline struct ibv_qp *get_ibv_qp(struct qelr_qp *qp) +{ + return &qp->verbs_qp.qp; +} + static inline struct qelr_qp *get_qelr_qp(struct ibv_qp *ibqp) { - return container_of(ibqp, struct qelr_qp, ibv_qp); + struct verbs_qp *vqp = (struct verbs_qp *)ibqp; + + return container_of(vqp, struct qelr_qp, verbs_qp); } static inline struct qelr_pd *get_qelr_pd(struct ibv_pd *ibpd) @@ -319,7 +344,14 @@ static inline struct qelr_cq *get_qelr_cq(struct ibv_cq *ibcq) static inline struct qelr_srq *get_qelr_srq(struct ibv_srq *ibsrq) { - return container_of(ibsrq, struct qelr_srq, ibv_srq); + struct verbs_srq *vsrq = (struct verbs_srq *)ibsrq; + + return container_of(vsrq, struct qelr_srq, verbs_srq); +} + +static inline struct ibv_srq *get_ibv_srq(struct qelr_srq *srq) +{ + return &srq->verbs_srq.srq; } #define SET_FIELD(value, name, flag) \ diff --git a/providers/qedr/qelr_abi.h b/providers/qedr/qelr_abi.h index 345872c..6af9da0 100644 --- a/providers/qedr/qelr_abi.h +++ b/providers/qedr/qelr_abi.h @@ -51,5 +51,9 @@ DECLARE_DRV_CMD(qelr_reg_mr, IB_USER_VERBS_CMD_REG_MR, empty, empty); DECLARE_DRV_CMD(qelr_create_srq, IB_USER_VERBS_CMD_CREATE_SRQ, qedr_create_srq_ureq, qedr_create_srq_uresp); +DECLARE_DRV_CMD(qelr_create_srq_ex, IB_USER_VERBS_CMD_CREATE_XSRQ, + qedr_create_srq_ureq, qedr_create_srq_uresp); +DECLARE_DRV_CMD(qelr_create_qp_ex, IB_USER_VERBS_EX_CMD_CREATE_QP, + qedr_create_qp_ureq, qedr_create_qp_uresp); #endif /* __QELR_ABI_H__ */ diff --git a/providers/qedr/qelr_hsi_rdma.h b/providers/qedr/qelr_hsi_rdma.h index ced75d4..0e97635 100644 --- a/providers/qedr/qelr_hsi_rdma.h +++ b/providers/qedr/qelr_hsi_rdma.h @@ -52,7 +52,11 @@ struct rdma_cqe_responder __le32 imm_data_or_inv_r_Key /* immediate data in case imm_flg is set, or invalidated r_key in case inv_flg is set */; __le32 length; __le32 imm_data_hi /* High bytes of immediate data in case imm_flg is set in iWARP only */; - __le16 rq_cons /* Valid only when status is WORK_REQUEST_FLUSHED_ERR. Indicates an aggregative flush on all posted RQ WQEs until the reported rq_cons. */; + __le16 rq_cons_or_srq_id;/* When type is RDMA_CQE_TYPE_RESPONDER_RQ and status is + * WORK_REQUEST_FLUSHED_ERR it indicates an aggregative + * flush on all posted RQ WQEs until the reported rq_cons. + * When type is RDMA_CQE_TYPE_RESPONDER_XRC_SRQ it is the srq_id + */ uint8_t flags; #define RDMA_CQE_RESPONDER_TOGGLE_BIT_MASK 0x1 /* indicates a valid completion written by FW. 
FW toggle this bit each time it finishes producing all PBL entries */ #define RDMA_CQE_RESPONDER_TOGGLE_BIT_SHIFT 0 @@ -133,6 +137,7 @@ enum rdma_cqe_requester_status_enum RDMA_CQE_REQ_STS_RNR_NAK_RETRY_CNT_ERR, RDMA_CQE_REQ_STS_TRANSPORT_RETRY_CNT_ERR, RDMA_CQE_REQ_STS_WORK_REQUEST_FLUSHED_ERR, + RDMA_CQE_REQ_STS_XRC_VIOLATION_ERR, MAX_RDMA_CQE_REQUESTER_STATUS_ENUM }; @@ -163,6 +168,7 @@ enum rdma_cqe_type RDMA_CQE_TYPE_REQUESTER, RDMA_CQE_TYPE_RESPONDER_RQ, RDMA_CQE_TYPE_RESPONDER_SRQ, + RDMA_CQE_TYPE_RESPONDER_XRC_SRQ, RDMA_CQE_TYPE_INVALID, MAX_RDMA_CQE_TYPE }; diff --git a/providers/qedr/qelr_main.c b/providers/qedr/qelr_main.c index e7045ca..bdfaa93 100644 --- a/providers/qedr/qelr_main.c +++ b/providers/qedr/qelr_main.c @@ -41,7 +41,7 @@ #include #include "qelr.h" -#include "qelr_main.h" +#include "qelr_verbs.h" #include "qelr_chain.h" #include @@ -61,6 +61,8 @@ static void qelr_free_context(struct ibv_context *ibctx); #define PCI_DEVICE_ID_QLOGIC_57980S_IOV (0x1664) #define PCI_DEVICE_ID_QLOGIC_AH (0x8070) #define PCI_DEVICE_ID_QLOGIC_AH_IOV (0x8090) +#define PCI_DEVICE_ID_QLOGIC_AHP (0x8170) +#define PCI_DEVICE_ID_QLOGIC_AHP_IOV (0x8190) uint32_t qelr_dp_level; uint32_t qelr_dp_module; @@ -79,6 +81,8 @@ static const struct verbs_match_ent hca_table[] = { QHCA(57980S_IOV), QHCA(AH), QHCA(AH_IOV), + QHCA(AHP), + QHCA(AHP_IOV), {} }; @@ -167,8 +171,9 @@ static struct verbs_context *qelr_alloc_context(struct ibv_device *ibdev, int cmd_fd, void *private_data) { + struct verbs_context *v_ctx; struct qelr_devctx *ctx; - struct qelr_alloc_context cmd; + struct qelr_alloc_context cmd = {}; struct qelr_alloc_context_resp resp; ctx = verbs_init_and_alloc_context(ibdev, cmd_fd, ctx, ibv_ctx, @@ -176,18 +181,26 @@ static struct verbs_context *qelr_alloc_context(struct ibv_device *ibdev, if (!ctx) return NULL; + v_ctx = &ctx->ibv_ctx; memset(&resp, 0, sizeof(resp)); qelr_open_debug_file(ctx); qelr_set_debug_mask(); - cmd.context_flags = QEDR_ALLOC_UCTX_DB_REC; + cmd.context_flags = QEDR_ALLOC_UCTX_DB_REC | QEDR_SUPPORT_DPM_SIZES; + cmd.context_flags |= QEDR_ALLOC_UCTX_EDPM_MODE; if (ibv_cmd_get_context(&ctx->ibv_ctx, &cmd.ibv_cmd, sizeof(cmd), &resp.ibv_resp, sizeof(resp))) goto cmd_err; verbs_set_ops(&ctx->ibv_ctx, &qelr_ctx_ops); + ctx->srq_table = calloc(QELR_MAX_SRQ_ID, sizeof(*ctx->srq_table)); + if (!ctx->srq_table) { + DP_ERR(ctx->dbg_fp, "failed to allocate srq_table\n"); + return NULL; + } + ctx->kernel_page_size = sysconf(_SC_PAGESIZE); ctx->db_pa = resp.db_pa; ctx->db_size = resp.db_size; @@ -199,6 +212,9 @@ static struct verbs_context *qelr_alloc_context(struct ibv_device *ibdev, if (resp.dpm_flags & QEDR_DPM_TYPE_ROCE_LEGACY) ctx->dpm_flags |= QELR_DPM_FLAGS_LEGACY; + + if (resp.dpm_flags & QEDR_DPM_TYPE_ROCE_EDPM_MODE) + ctx->dpm_flags |= QELR_DPM_FLAGS_EDPM_MODE; } else { if (resp.dpm_flags & QEDR_DPM_TYPE_IWARP_LEGACY) ctx->dpm_flags = QELR_DPM_FLAGS_LEGACY; @@ -208,9 +224,12 @@ static struct verbs_context *qelr_alloc_context(struct ibv_device *ibdev, if (resp.dpm_flags & QEDR_DPM_SIZES_SET) { ctx->ldpm_limit_size = resp.ldpm_limit_size; ctx->edpm_trans_size = resp.edpm_trans_size; + ctx->edpm_limit_size = resp.edpm_limit_size ? 
+ resp.edpm_limit_size : QEDR_EDPM_MAX_SIZE; } else { ctx->ldpm_limit_size = QEDR_LDPM_MAX_SIZE; ctx->edpm_trans_size = QEDR_EDPM_TRANS_SIZE; + ctx->edpm_limit_size = QEDR_EDPM_MAX_SIZE; } ctx->max_send_wr = resp.max_send_wr; @@ -233,6 +252,12 @@ static struct verbs_context *qelr_alloc_context(struct ibv_device *ibdev, goto cmd_err; } + v_ctx->create_qp_ex = qelr_create_qp_ex; + v_ctx->open_xrcd = qelr_open_xrcd; + v_ctx->close_xrcd = qelr_close_xrcd; + v_ctx->create_srq_ex = qelr_create_srq_ex; + v_ctx->get_srq_num = qelr_get_srq_num; + return &ctx->ibv_ctx; cmd_err: @@ -250,6 +275,7 @@ static void qelr_free_context(struct ibv_context *ibctx) if (ctx->db_addr) munmap(ctx->db_addr, ctx->db_size); + free(ctx->srq_table); qelr_close_debug_file(ctx); verbs_uninit_context(&ctx->ibv_ctx); free(ctx); diff --git a/providers/qedr/qelr_verbs.c b/providers/qedr/qelr_verbs.c index 45b6e78..4e77a19 100644 --- a/providers/qedr/qelr_verbs.c +++ b/providers/qedr/qelr_verbs.c @@ -331,6 +331,16 @@ int qelr_destroy_cq(struct ibv_cq *ibv_cq) return 0; } +static struct qelr_srq *qelr_get_srq(struct qelr_devctx *cxt, uint32_t srq_id) +{ + if (unlikely(srq_id >= QELR_MAX_SRQ_ID)) { + DP_ERR(cxt->dbg_fp, "invalid srq_id %u\n", srq_id); + return NULL; + } + + return cxt->srq_table[srq_id]; +} + int qelr_query_srq(struct ibv_srq *ibv_srq, struct ibv_srq_attr *attr) { struct ibv_query_srq cmd; @@ -364,6 +374,7 @@ static void qelr_destroy_srq_buffers(struct ibv_srq *ibv_srq) int qelr_destroy_srq(struct ibv_srq *ibv_srq) { + struct qelr_devctx *cxt = get_qelr_ctx(ibv_srq->context); struct qelr_srq *srq = get_qelr_srq(ibv_srq); int ret; @@ -371,6 +382,9 @@ int qelr_destroy_srq(struct ibv_srq *ibv_srq) if (ret) return ret; + if (srq->is_xrc) + cxt->srq_table[srq->srq_id] = NULL; + qelr_destroy_srq_buffers(ibv_srq); free(srq); @@ -385,16 +399,23 @@ static void qelr_create_srq_configure_req(struct qelr_srq *srq, req->prod_pair_addr = (uintptr_t)srq->hw_srq.virt_prod_pair_addr; } +static inline void +qelr_create_srq_configure_req_ex(struct qelr_srq *srq, + struct qelr_create_srq_ex *req) +{ + req->srq_addr = (uintptr_t)srq->hw_srq.chain.first_addr; + req->srq_len = srq->hw_srq.chain.size; + req->prod_pair_addr = (uintptr_t)srq->hw_srq.virt_prod_pair_addr; +} + static int qelr_create_srq_buffers(struct qelr_devctx *cxt, - struct qelr_srq *srq, - struct ibv_srq_init_attr *attrs) + struct qelr_srq *srq, uint32_t max_wr) { - uint32_t max_wr, max_sges; + uint32_t max_sges; int chain_size, prod_size; void *addr; int rc; - max_wr = attrs->attr.max_wr; if (!max_wr) return -EINVAL; @@ -441,6 +462,7 @@ struct ibv_srq *qelr_create_srq(struct ibv_pd *pd, struct qelr_devctx *cxt = get_qelr_ctx(pd->context); struct qelr_create_srq req; struct qelr_create_srq_resp resp; + struct ibv_srq *ibv_srq; struct qelr_srq *srq; int ret; @@ -448,7 +470,9 @@ struct ibv_srq *qelr_create_srq(struct ibv_pd *pd, if (!srq) return NULL; - ret = qelr_create_srq_buffers(cxt, srq, init_attr); + ibv_srq = &srq->verbs_srq.srq; + + ret = qelr_create_srq_buffers(cxt, srq, init_attr->attr.max_wr); if (ret) { free(srq); return NULL; @@ -456,15 +480,15 @@ struct ibv_srq *qelr_create_srq(struct ibv_pd *pd, pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE); qelr_create_srq_configure_req(srq, &req); - ret = ibv_cmd_create_srq(pd, &srq->ibv_srq, init_attr, &req.ibv_cmd, + ret = ibv_cmd_create_srq(pd, ibv_srq, init_attr, &req.ibv_cmd, sizeof(req), &resp.ibv_resp, sizeof(resp)); if (ret) { - qelr_destroy_srq_buffers(&srq->ibv_srq); + 
qelr_destroy_srq_buffers(ibv_srq); free(srq); return NULL; } - return &srq->ibv_srq; + return ibv_srq; } static void qelr_free_rq(struct qelr_qp *qp) @@ -487,16 +511,26 @@ static void qelr_chain_free_rq(struct qelr_qp *qp) qelr_chain_free(&qp->rq.chain); } +static inline bool qelr_qp_has_rq(struct qelr_qp *qp) +{ + return !!(qp->flags & QELR_QP_FLAG_RQ); +} + +static inline bool qelr_qp_has_sq(struct qelr_qp *qp) +{ + return !!(qp->flags & QELR_QP_FLAG_SQ); +} + static inline int qelr_create_qp_buffers_sq(struct qelr_devctx *cxt, struct qelr_qp *qp, - struct ibv_qp_init_attr *attrs) + struct ibv_qp_init_attr_ex *attrx) { uint32_t max_send_wr, max_send_sges, max_send_buf; int chain_size; int rc; /* SQ */ - max_send_wr = attrs->cap.max_send_wr; + max_send_wr = attrx->cap.max_send_wr; max_send_wr = max_t(uint32_t, max_send_wr, 1); max_send_wr = min_t(uint32_t, max_send_wr, cxt->max_send_wr); max_send_sges = max_send_wr * cxt->sges_per_send_wr; @@ -516,14 +550,14 @@ static inline int qelr_create_qp_buffers_sq(struct qelr_devctx *cxt, static inline int qelr_create_qp_buffers_rq(struct qelr_devctx *cxt, struct qelr_qp *qp, - struct ibv_qp_init_attr *attrs) + struct ibv_qp_init_attr_ex *attrx) { uint32_t max_recv_wr, max_recv_sges, max_recv_buf; int chain_size; int rc; /* RQ */ - max_recv_wr = attrs->cap.max_recv_wr; + max_recv_wr = attrx->cap.max_recv_wr; max_recv_wr = max_t(uint32_t, max_recv_wr, 1); max_recv_wr = min_t(uint32_t, max_recv_wr, cxt->max_recv_wr); max_recv_sges = max_recv_wr * cxt->sges_per_recv_wr; @@ -543,20 +577,25 @@ static inline int qelr_create_qp_buffers_rq(struct qelr_devctx *cxt, static inline int qelr_create_qp_buffers(struct qelr_devctx *cxt, struct qelr_qp *qp, - struct ibv_qp_init_attr *attrs) + struct ibv_qp_init_attr_ex *attrx) { int rc; - rc = qelr_create_qp_buffers_sq(cxt, qp, attrs); - if (rc) - return rc; + if (qelr_qp_has_sq(qp)) { + rc = qelr_create_qp_buffers_sq(cxt, qp, attrx); + if (rc) + return rc; + } - rc = qelr_create_qp_buffers_rq(cxt, qp, attrs); - if (rc) { - qelr_chain_free_sq(qp); - if (qp->sq.db_rec_map) - munmap(qp->sq.db_rec_map, cxt->kernel_page_size); - return rc; + if (qelr_qp_has_rq(qp)) { + rc = qelr_create_qp_buffers_rq(cxt, qp, attrx); + if (rc && qelr_qp_has_sq(qp)) { + qelr_chain_free_sq(qp); + if (qp->sq.db_rec_map) + munmap(qp->sq.db_rec_map, + cxt->kernel_page_size); + return rc; + } } return 0; @@ -564,7 +603,7 @@ static inline int qelr_create_qp_buffers(struct qelr_devctx *cxt, static inline int qelr_configure_qp_sq(struct qelr_devctx *cxt, struct qelr_qp *qp, - struct ibv_qp_init_attr *attrs, + struct ibv_qp_init_attr_ex *attrx, struct qelr_create_qp_resp *resp) { qp->sq.icid = resp->sq_icid; @@ -608,7 +647,6 @@ static inline int qelr_configure_qp_sq(struct qelr_devctx *cxt, static inline int qelr_configure_qp_rq(struct qelr_devctx *cxt, struct qelr_qp *qp, - struct ibv_qp_init_attr *attrs, struct qelr_create_qp_resp *resp) { /* RQ */ @@ -655,7 +693,7 @@ static inline int qelr_configure_qp_rq(struct qelr_devctx *cxt, } static inline int qelr_configure_qp(struct qelr_devctx *cxt, struct qelr_qp *qp, - struct ibv_qp_init_attr *attrs, + struct ibv_qp_init_attr_ex *attrx, struct qelr_create_qp_resp *resp) { int rc; @@ -664,29 +702,35 @@ static inline int qelr_configure_qp(struct qelr_devctx *cxt, struct qelr_qp *qp, pthread_spin_init(&qp->q_lock, PTHREAD_PROCESS_PRIVATE); qp->qp_id = resp->qp_id; qp->state = QELR_QPS_RST; - qp->sq_sig_all = attrs->sq_sig_all; + qp->sq_sig_all = attrx->sq_sig_all; qp->atomic_supported = 
resp->atomic_supported; + if (cxt->dpm_flags & QELR_DPM_FLAGS_EDPM_MODE) + qp->edpm_mode = 1; - rc = qelr_configure_qp_sq(cxt, qp, attrs, resp); - if (rc) - return rc; - rc = qelr_configure_qp_rq(cxt, qp, attrs, resp); - if (rc) - qelr_free_sq(qp); + if (qelr_qp_has_sq(qp)) { + rc = qelr_configure_qp_sq(cxt, qp, attrx, resp); + if (rc) + return rc; + } + + if (qelr_qp_has_rq(qp)) { + rc = qelr_configure_qp_rq(cxt, qp, resp); + if (rc && qelr_qp_has_sq(qp)) + qelr_free_sq(qp); + } return rc; } -static inline void qelr_print_qp_init_attr( - struct qelr_devctx *cxt, - struct ibv_qp_init_attr *attr) +static inline void qelr_print_qp_init_attr(struct qelr_devctx *cxt, + struct ibv_qp_init_attr_ex *attrx) { DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, "create qp: send_cq=%p, recv_cq=%p, srq=%p, max_inline_data=%d, max_recv_sge=%d, max_recv_wr=%d, max_send_sge=%d, max_send_wr=%d, qp_type=%d, sq_sig_all=%d\n", - attr->send_cq, attr->recv_cq, attr->srq, - attr->cap.max_inline_data, attr->cap.max_recv_sge, - attr->cap.max_recv_wr, attr->cap.max_send_sge, - attr->cap.max_send_wr, attr->qp_type, attr->sq_sig_all); + attrx->send_cq, attrx->recv_cq, attrx->srq, + attrx->cap.max_inline_data, attrx->cap.max_recv_sge, + attrx->cap.max_recv_wr, attrx->cap.max_send_sge, + attrx->cap.max_send_wr, attrx->qp_type, attrx->sq_sig_all); } static inline void @@ -712,63 +756,23 @@ qelr_create_qp_configure_req(struct qelr_qp *qp, memset(req, 0, sizeof(*req)); req->qp_handle_hi = U64_HI(qp); req->qp_handle_lo = U64_LO(qp); - qelr_create_qp_configure_sq_req(qp, req); - qelr_create_qp_configure_rq_req(qp, req); + if (qelr_qp_has_sq(qp)) + qelr_create_qp_configure_sq_req(qp, req); + if (qelr_qp_has_rq(qp)) + qelr_create_qp_configure_rq_req(qp, req); } -struct ibv_qp *qelr_create_qp(struct ibv_pd *pd, - struct ibv_qp_init_attr *attrs) +static inline void qelr_basic_qp_config(struct qelr_qp *qp, + struct ibv_qp_init_attr_ex *attrx) { - struct qelr_devctx *cxt = get_qelr_ctx(pd->context); - struct qelr_create_qp_resp resp = {}; - struct qelr_create_qp req; - struct qelr_qp *qp; - int rc; - - qelr_print_qp_init_attr(cxt, attrs); - - qp = calloc(1, sizeof(*qp)); - if (!qp) - return NULL; - - if (attrs->srq) - qp->srq = get_qelr_srq(attrs->srq); - - rc = qelr_create_qp_buffers(cxt, qp, attrs); - if (rc) - goto err0; - - qelr_create_qp_configure_req(qp, &req); - - rc = ibv_cmd_create_qp(pd, &qp->ibv_qp, attrs, &req.ibv_cmd, - sizeof(req), &resp.ibv_resp, sizeof(resp)); - if (rc) { - DP_ERR(cxt->dbg_fp, - "create qp: failed on ibv_cmd_create_qp with %d\n", rc); - goto err1; - } - - rc = qelr_configure_qp(cxt, qp, attrs, &resp); - if (rc) - goto err2; + if (attrx->srq) + qp->srq = get_qelr_srq(attrx->srq); - DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, - "create qp: successfully created %p. handle_hi=%x handle_lo=%x\n", - qp, req.qp_handle_hi, req.qp_handle_lo); - - return &qp->ibv_qp; - -err2: - rc = ibv_cmd_destroy_qp(&qp->ibv_qp); - if (rc) - DP_ERR(cxt->dbg_fp, "create qp: fatal fault. rc=%d\n", rc); -err1: - qelr_chain_free_sq(qp); - qelr_chain_free_rq(qp); -err0: - free(qp); + if (attrx->qp_type == IBV_QPT_RC || attrx->qp_type == IBV_QPT_XRC_SEND) + qp->flags |= QELR_QP_FLAG_SQ; - return NULL; + if (attrx->qp_type == IBV_QPT_RC && !qp->srq) + qp->flags |= QELR_QP_FLAG_RQ; } static void qelr_print_ah_attr(struct qelr_devctx *cxt, struct ibv_ah_attr *attr) @@ -860,7 +864,7 @@ static int qelr_update_qp_state(struct qelr_qp *qp, /* iWARP states are updated implicitely by driver and don't have a * real purpose in user-lib. 
*/ - if (IS_IWARP(qp->ibv_qp.context->device)) + if (IS_IWARP(qp->ibv_qp->context->device)) return 0; new_state = get_qelr_qp_state(new_ib_state); @@ -892,7 +896,8 @@ static int qelr_update_qp_state(struct qelr_qp *qp, /* Update doorbell (in case post_recv was done before * move to RTR) */ - if (IS_ROCE(qp->ibv_qp.context->device)) { + if (IS_ROCE(qp->ibv_qp->context->device) && + (qelr_qp_has_rq(qp))) { mmio_wc_start(); writel(qp->rq.db_data.raw, qp->rq.db); mmio_flush_writes(); @@ -1100,7 +1105,8 @@ static inline void qelr_init_dpm_info(struct qelr_devctx *cxt, /* Check if edpm can be used */ if (wr->send_flags & IBV_SEND_INLINE && !qp->edpm_disabled && - cxt->dpm_flags & QELR_DPM_FLAGS_ENHANCED) { + cxt->dpm_flags & QELR_DPM_FLAGS_ENHANCED && + data_size <= cxt->edpm_limit_size) { memset(dpm, 0, sizeof(*dpm)); dpm->rdma_ext = (struct qelr_rdma_ext *)&dpm->payload; dpm->is_edpm = 1; @@ -1134,11 +1140,17 @@ static inline void qelr_edpm_set_msg_data(struct qelr_qp *qp, uint8_t comp) { uint32_t wqe_size, dpm_size, params; + /* edpm mode - 0 : ack field is treated by old FW as "completion" + * edpm mode - 1 : ack field is treated by new FW as ack which is + * always required. + */ + uint8_t ack = (qp->edpm_mode) ? 1 : comp; params = 0; wqe_size = length + (QELR_IS_IMM_OR_INV(opcode) ? sizeof(uint32_t) : 0); dpm_size = wqe_size + sizeof(struct db_roce_dpm_data); + SET_FIELD(params, DB_ROCE_DPM_PARAMS_ACK_REQUEST, ack); SET_FIELD(params, DB_ROCE_DPM_PARAMS_DPM_TYPE, DPM_ROCE); SET_FIELD(params, DB_ROCE_DPM_PARAMS_OPCODE, opcode); SET_FIELD(params, DB_ROCE_DPM_PARAMS_WQE_SIZE, wqe_size); @@ -1522,6 +1534,28 @@ static inline int qelr_can_post_send(struct qelr_devctx *cxt, return 0; } +static void qelr_configure_xrc_srq(struct ibv_send_wr *wr, + struct rdma_sq_common_wqe *wqe, + struct qelr_dpm *dpm) +{ + struct rdma_sq_send_wqe_1st *xrc_wqe; + + /* xrc_srq location is the same for all relevant wqes */ + xrc_wqe = (struct rdma_sq_send_wqe_1st *)wqe; + xrc_wqe->xrc_srq = htole32(wr->qp_type.xrc.remote_srqn); + + if (dpm->is_edpm) { + struct qelr_xrceth *xrceth; + + xrceth = (struct qelr_xrceth *) + &dpm->payload[dpm->payload_offset]; + xrceth->xrc_srq = htobe32(wr->qp_type.xrc.remote_srqn); + dpm->payload_offset += sizeof(*xrceth); + dpm->payload_size += sizeof(*xrceth); + dpm->rdma_ext = (struct qelr_rdma_ext *)&dpm->payload_offset; + } +} + static int __qelr_post_send(struct qelr_devctx *cxt, struct qelr_qp *qp, struct ibv_send_wr *wr, int data_size, int *normal_db_required) @@ -1559,6 +1593,8 @@ static int __qelr_post_send(struct qelr_devctx *cxt, struct qelr_qp *qp, wqe->prev_wqe_size = qp->prev_wqe_size; qp->wqe_wr_id[qp->sq.prod].opcode = qelr_ibv_to_wc_opcode(wr->opcode); + if (get_ibv_qp(qp)->qp_type == IBV_QPT_XRC_SEND) + qelr_configure_xrc_srq(wr, wqe, &dpm); switch (wr->opcode) { case IBV_WR_SEND_WITH_IMM: @@ -2034,7 +2070,7 @@ static int process_req(struct qelr_qp *qp, struct qelr_cq *cq, int num_entries, struct ibv_wc *wc, uint16_t hw_cons, enum ibv_wc_status status, int force) { - struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context); + struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp->context); uint16_t cnt = 0; while (num_entries && qp->sq.wqe_cons != hw_cons) { @@ -2090,7 +2126,7 @@ static int qelr_poll_cq_req(struct qelr_qp *qp, struct qelr_cq *cq, int num_entries, struct ibv_wc *wc, struct rdma_cqe_requester *req) { - struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context); + struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp->context); uint16_t sq_cons = 
le16toh(req->sq_cons); int cnt = 0; @@ -2192,11 +2228,11 @@ static int qelr_poll_cq_req(struct qelr_qp *qp, struct qelr_cq *cq, return cnt; } -static void __process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq, +static void __process_resp_one(struct qelr_devctx *cxt, struct qelr_cq *cq, struct ibv_wc *wc, - struct rdma_cqe_responder *resp, uint64_t wr_id) + struct rdma_cqe_responder *resp, uint64_t wr_id, + uint32_t qp_id) { - struct qelr_devctx *cxt = get_qelr_ctx(qp->ibv_qp.context); enum ibv_wc_status wc_status = IBV_WC_SUCCESS; uint8_t flags; @@ -2225,6 +2261,9 @@ static void __process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq, case RDMA_CQE_RESP_STS_OK: wc_status = IBV_WC_SUCCESS; wc->byte_len = le32toh(resp->length); + if (GET_FIELD(resp->flags, RDMA_CQE_REQUESTER_TYPE) == + RDMA_CQE_TYPE_RESPONDER_XRC_SRQ) + wc->src_qp = le16toh(resp->rq_cons_or_srq_id); flags = resp->flags & QELR_RESP_RDMA_IMM; @@ -2257,14 +2296,14 @@ static void __process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq, /* fill WC */ wc->status = wc_status; - wc->qp_num = qp->qp_id; + wc->qp_num = qp_id; } -static int process_resp_one_srq(struct qelr_qp *qp, struct qelr_cq *cq, +static int process_resp_one_srq(struct qelr_srq *srq, struct qelr_cq *cq, struct ibv_wc *wc, - struct rdma_cqe_responder *resp) + struct rdma_cqe_responder *resp, uint32_t qp_id) { - struct qelr_srq_hwq_info *hw_srq = &qp->srq->hw_srq; + struct qelr_srq_hwq_info *hw_srq = &srq->hw_srq; uint64_t wr_id; wr_id = (((uint64_t)(le32toh(resp->srq_wr_id.hi))) << 32) + @@ -2273,10 +2312,11 @@ static int process_resp_one_srq(struct qelr_qp *qp, struct qelr_cq *cq, if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) { wc->byte_len = 0; wc->status = IBV_WC_WR_FLUSH_ERR; - wc->qp_num = qp->qp_id; + wc->qp_num = qp_id; wc->wr_id = wr_id; } else { - __process_resp_one(qp, cq, wc, resp, wr_id); + __process_resp_one(get_qelr_ctx(srq->verbs_srq.srq.context), + cq, wc, resp, wr_id, qp_id); } hw_srq->wr_cons_cnt++; @@ -2289,7 +2329,8 @@ static int process_resp_one(struct qelr_qp *qp, struct qelr_cq *cq, { uint64_t wr_id = qp->rqe_wr_id[qp->rq.cons].wr_id; - __process_resp_one(qp, cq, wc, resp, wr_id); + __process_resp_one(get_qelr_ctx(qp->ibv_qp->context), cq, wc, resp, + wr_id, qp->qp_id); while (qp->rqe_wr_id[qp->rq.cons].wqe_size--) qelr_chain_consume(&qp->rq.chain); @@ -2349,13 +2390,14 @@ static void try_consume_resp_cqe(struct qelr_cq *cq, struct qelr_qp *qp, } } -static int qelr_poll_cq_resp_srq(struct qelr_qp *qp, struct qelr_cq *cq, +static int qelr_poll_cq_resp_srq(struct qelr_srq *srq, struct qelr_cq *cq, int num_entries, struct ibv_wc *wc, - struct rdma_cqe_responder *resp, int *update) + struct rdma_cqe_responder *resp, int *update, + uint32_t qp_id) { int cnt; - cnt = process_resp_one_srq(qp, cq, wc, resp); + cnt = process_resp_one_srq(srq, cq, wc, resp, qp_id); consume_cqe(cq); *update |= 1; @@ -2366,7 +2408,7 @@ static int qelr_poll_cq_resp(struct qelr_qp *qp, struct qelr_cq *cq, int num_entries, struct ibv_wc *wc, struct rdma_cqe_responder *resp, int *update) { - uint16_t rq_cons = le16toh(resp->rq_cons); + uint16_t rq_cons = le16toh(resp->rq_cons_or_srq_id); int cnt; if (resp->status == RDMA_CQE_RESP_STS_WORK_REQUEST_FLUSHED_ERR) { @@ -2393,13 +2435,35 @@ static void doorbell_cq(struct qelr_cq *cq, uint32_t cons, uint8_t flags) mmio_flush_writes(); } +static struct qelr_srq *qelr_get_xrc_srq_from_cqe(struct qelr_cq *cq, + union rdma_cqe *cqe, + struct qelr_qp *qp) +{ + struct qelr_devctx *cxt; + struct qelr_srq *srq; + 
uint16_t srq_id; + + srq_id = le16toh(cqe->resp.rq_cons_or_srq_id); + cxt = get_qelr_ctx(cq->ibv_cq.context); + srq = qelr_get_srq(cxt, srq_id); + if (unlikely(!srq)) { + DP_ERR(cxt->dbg_fp, "srq handle is null\n"); + return NULL; + } + + return srq; +} + int qelr_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) { struct qelr_cq *cq = get_qelr_cq(ibcq); int done = 0; union rdma_cqe *cqe = get_cqe(cq); + struct qelr_srq *srq; + struct regpair *qph; int update = 0; uint32_t db_cons; + uint32_t qp_id; while (num_entries && is_valid_cqe(cq, cqe)) { int cnt = 0; @@ -2409,7 +2473,8 @@ int qelr_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) udma_from_device_barrier(); qp = cqe_get_qp(cqe); - if (!qp) { + if (!qp && + cqe_get_type(cqe) != RDMA_CQE_TYPE_RESPONDER_XRC_SRQ) { DP_ERR(stderr, "Error: CQE QP pointer is NULL. CQE=%p\n", cqe); break; @@ -2425,9 +2490,23 @@ int qelr_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) cnt = qelr_poll_cq_resp(qp, cq, num_entries, wc, &cqe->resp, &update); break; + case RDMA_CQE_TYPE_RESPONDER_XRC_SRQ: + qph = &cqe->req.qp_handle; + srq = qelr_get_xrc_srq_from_cqe(cq, cqe, qp); + if (unlikely(!srq)) { + consume_cqe(cq); + cqe = get_cqe(cq); + update |= 1; + continue; + } + qp_id = le32toh(qph->lo); + cnt = qelr_poll_cq_resp_srq(srq, cq, num_entries, wc, + &cqe->resp, &update, qp_id); + break; case RDMA_CQE_TYPE_RESPONDER_SRQ: - cnt = qelr_poll_cq_resp_srq(qp, cq, num_entries, wc, - &cqe->resp, &update); + cnt = qelr_poll_cq_resp_srq(qp->srq, cq, num_entries, + wc, &cqe->resp, &update, + qp->qp_id); break; case RDMA_CQE_TYPE_INVALID: default: @@ -2509,3 +2588,208 @@ void qelr_async_event(struct ibv_context *context, fprintf(stderr, "qelr_async_event not implemented yet cq=%p qp=%p\n", cq, qp); } + +struct ibv_xrcd *qelr_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *init_attr) +{ + struct qelr_devctx *cxt = get_qelr_ctx(context); + struct ib_uverbs_open_xrcd_resp resp; + struct ibv_open_xrcd cmd; + struct verbs_xrcd *xrcd; + int rc; + + xrcd = calloc(1, sizeof(*xrcd)); + if (!xrcd) + return NULL; + + rc = ibv_cmd_open_xrcd(context, xrcd, sizeof(*xrcd), init_attr, &cmd, + sizeof(cmd), &resp, sizeof(resp)); + if (rc) { + DP_ERR(cxt->dbg_fp, "open xrcd: failed with rc=%d.\n", rc); + free(xrcd); + return NULL; + } + + return &xrcd->xrcd; +} + +int qelr_close_xrcd(struct ibv_xrcd *ibxrcd) +{ + struct verbs_xrcd *xrcd = container_of(ibxrcd, struct verbs_xrcd, xrcd); + struct qelr_devctx *cxt = get_qelr_ctx(ibxrcd->context); + int rc; + + rc = ibv_cmd_close_xrcd(xrcd); + if (rc) { + DP_ERR(cxt->dbg_fp, "close xrcd: failed with rc=%d.\n", rc); + free(xrcd); + } + + return rc; +} + +static struct ibv_srq * +qelr_create_xrc_srq(struct ibv_context *context, + struct ibv_srq_init_attr_ex *init_attr) +{ + struct qelr_devctx *cxt = get_qelr_ctx(context); + struct qelr_create_srq_ex req; + struct qelr_create_srq_resp resp; + struct ibv_srq *ibv_srq; + struct qelr_srq *srq; + int rc = 0; + + srq = calloc(1, sizeof(*srq)); + if (!srq) + goto err0; + ibv_srq = &srq->verbs_srq.srq; + + rc = qelr_create_srq_buffers(cxt, srq, init_attr->attr.max_wr); + if (rc) + goto err1; + + pthread_spin_init(&srq->lock, PTHREAD_PROCESS_PRIVATE); + qelr_create_srq_configure_req_ex(srq, &req); + + rc = ibv_cmd_create_srq_ex(context, + &srq->verbs_srq, + init_attr, &req.ibv_cmd, sizeof(req), + &resp.ibv_resp, sizeof(resp)); + if (rc) + goto err1; + + if (unlikely(resp.srq_id >= QELR_MAX_SRQ_ID)) { + rc = -EINVAL; + goto err1; + } 
+ + srq->srq_id = resp.srq_id; + srq->is_xrc = 1; + + cxt->srq_table[resp.srq_id] = srq; + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_SRQ, + "create srq_ex: successfully created %p.\n", srq); + + return ibv_srq; + +err1: + qelr_destroy_srq_buffers(ibv_srq); + free(srq); +err0: + DP_ERR(cxt->dbg_fp, + "create srq: failed to create %p. rc=%d\n", srq, rc); + return NULL; +} + +int qelr_get_srq_num(struct ibv_srq *ibv_srq, uint32_t *srq_num) +{ + struct qelr_srq *srq = get_qelr_srq(ibv_srq); + + *srq_num = srq->srq_id; + + return 0; +} + +struct ibv_srq *qelr_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *init_attr) +{ + struct qelr_devctx *cxt = get_qelr_ctx(context); + + if (init_attr->srq_type == IBV_SRQT_BASIC) + return qelr_create_srq(init_attr->pd, + (struct ibv_srq_init_attr *)init_attr); + + if (init_attr->srq_type == IBV_SRQT_XRC) + return qelr_create_xrc_srq(context, init_attr); + + DP_ERR(cxt->dbg_fp, "failed to create srq type %d\n", + init_attr->srq_type); + + return NULL; +} + +static struct ibv_qp *create_qp(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attrx) +{ + struct qelr_devctx *cxt = get_qelr_ctx(context); + struct qelr_create_qp_resp resp = {}; + struct qelr_create_qp req; + struct ibv_qp *ibqp; + struct qelr_qp *qp; + int rc; + + qelr_print_qp_init_attr(cxt, attrx); + + qp = calloc(1, sizeof(*qp)); + if (!qp) + return NULL; + + qelr_basic_qp_config(qp, attrx); + + rc = qelr_create_qp_buffers(cxt, qp, attrx); + if (rc) + goto err0; + + qelr_create_qp_configure_req(qp, &req); + + rc = ibv_cmd_create_qp_ex(context, &qp->verbs_qp, + attrx, &req.ibv_cmd, sizeof(req), + &resp.ibv_resp, sizeof(resp)); + if (rc) { + DP_ERR(cxt->dbg_fp, + "create qp: failed on ibv_cmd_create_qp with %d\n", rc); + goto err1; + } + + rc = qelr_configure_qp(cxt, qp, attrx, &resp); + if (rc) + goto err2; + + DP_VERBOSE(cxt->dbg_fp, QELR_MSG_QP, + "create qp: successfully created %p. handle_hi=%x handle_lo=%x\n", + qp, req.qp_handle_hi, req.qp_handle_lo); + + ibqp = (struct ibv_qp *)&qp->verbs_qp; + qp->ibv_qp = ibqp; + + return get_ibv_qp(qp); + +err2: + rc = ibv_cmd_destroy_qp(get_ibv_qp(qp)); + if (rc) + DP_ERR(cxt->dbg_fp, "create qp: fatal fault. 
rc=%d\n", rc); +err1: + if (qelr_qp_has_sq(qp)) + qelr_chain_free(&qp->sq.chain); + + if (qelr_qp_has_rq(qp)) + qelr_chain_free(&qp->rq.chain); +err0: + free(qp); + + return NULL; +} + +struct ibv_qp *qelr_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attr) +{ + return create_qp(context, attr); + +} + +struct ibv_qp *qelr_create_qp(struct ibv_pd *pd, struct ibv_qp_init_attr *attr) +{ + struct ibv_qp *qp; + struct ibv_qp_init_attr_ex attrx = {}; + + memcpy(&attrx, attr, sizeof(*attr)); + attrx.comp_mask = IBV_QP_INIT_ATTR_PD; + attrx.pd = pd; + + qp = create_qp(pd->context, &attrx); + if (qp) + memcpy(attr, &attrx, sizeof(*attr)); + + return qp; +} diff --git a/providers/qedr/qelr_verbs.h b/providers/qedr/qelr_verbs.h index d0eacbf..bbfd490 100644 --- a/providers/qedr/qelr_verbs.h +++ b/providers/qedr/qelr_verbs.h @@ -84,4 +84,12 @@ int qelr_post_srq_recv(struct ibv_srq *ibsrq, struct ibv_recv_wr *wr, void qelr_async_event(struct ibv_context *context, struct ibv_async_event *event); +struct ibv_xrcd *qelr_open_xrcd(struct ibv_context *context, + struct ibv_xrcd_init_attr *init_attr); +int qelr_close_xrcd(struct ibv_xrcd *ibxrcd); +struct ibv_srq *qelr_create_srq_ex(struct ibv_context *context, + struct ibv_srq_init_attr_ex *init_attr); +struct ibv_qp *qelr_create_qp_ex(struct ibv_context *context, + struct ibv_qp_init_attr_ex *attrx); +int qelr_get_srq_num(struct ibv_srq *srq, uint32_t *srq_num); #endif /* __QELR_VERBS_H__ */ diff --git a/providers/siw/siw.c b/providers/siw/siw.c index 9530833..0f94e61 100644 --- a/providers/siw/siw.c +++ b/providers/siw/siw.c @@ -184,11 +184,6 @@ fail: return NULL; } -static int siw_resize_cq(struct ibv_cq *base_cq, int num_cqe) -{ - return -EOPNOTSUPP; -} - static int siw_destroy_cq(struct ibv_cq *base_cq) { struct siw_cq *cq = cq_base2siw(base_cq); @@ -433,16 +428,6 @@ static int siw_destroy_qp(struct ibv_qp *base_qp) return 0; } -static struct ibv_ah *siw_create_ah(struct ibv_pd *pd, struct ibv_ah_attr *attr) -{ - return NULL; -} - -static int siw_destroy_ah(struct ibv_ah *ah) -{ - return -EOPNOTSUPP; -} - static void siw_async_event(struct ibv_context *ctx, struct ibv_async_event *event) { @@ -832,13 +817,11 @@ static int siw_poll_cq(struct ibv_cq *ibcq, int num_entries, struct ibv_wc *wc) static const struct verbs_context_ops siw_context_ops = { .alloc_pd = siw_alloc_pd, .async_event = siw_async_event, - .create_ah = siw_create_ah, .create_cq = siw_create_cq, .create_qp = siw_create_qp, .create_srq = siw_create_srq, .dealloc_pd = siw_free_pd, .dereg_mr = siw_dereg_mr, - .destroy_ah = siw_destroy_ah, .destroy_cq = siw_destroy_cq, .destroy_qp = siw_destroy_qp, .destroy_srq = siw_destroy_srq, @@ -854,7 +837,6 @@ static const struct verbs_context_ops siw_context_ops = { .query_qp = siw_query_qp, .reg_mr = siw_reg_mr, .req_notify_cq = siw_notify_cq, - .resize_cq = siw_resize_cq, }; static struct verbs_context *siw_alloc_context(struct ibv_device *base_dev, diff --git a/pyverbs/CMakeLists.txt b/pyverbs/CMakeLists.txt index 8603e9d..9542c4b 100755 --- a/pyverbs/CMakeLists.txt +++ b/pyverbs/CMakeLists.txt @@ -24,7 +24,8 @@ rdma_python_module(pyverbs utils.py ) -# mlx5 provider is not built without coherent DMA, e.g. ARM32 build. +# mlx5 and efa providers are not built without coherent DMA, e.g. ARM32 build. 
if (HAVE_COHERENT_DMA) add_subdirectory(providers/mlx5) +add_subdirectory(providers/efa) endif() diff --git a/pyverbs/addr.pxd b/pyverbs/addr.pxd index e7322e8..768367d 100644 --- a/pyverbs/addr.pxd +++ b/pyverbs/addr.pxd @@ -5,6 +5,7 @@ from .base cimport PyverbsObject, PyverbsCM from pyverbs cimport libibverbs as v +from .cmid cimport UDParam cdef class GID(PyverbsObject): @@ -18,6 +19,7 @@ cdef class GlobalRoute(PyverbsObject): cdef class AHAttr(PyverbsObject): cdef v.ibv_ah_attr ah_attr + cdef init_from_ud_param(self, UDParam udparam) cdef class AH(PyverbsCM): cdef v.ibv_ah *ah diff --git a/pyverbs/addr.pyx b/pyverbs/addr.pyx index c36b1b4..cae7340 100644 --- a/pyverbs/addr.pyx +++ b/pyverbs/addr.pyx @@ -1,11 +1,12 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) # Copyright (c) 2018, Mellanox Technologies. All rights reserved. See COPYING file -from libc.stdint cimport uint8_t +from libc.stdint cimport uint8_t, uintptr_t from .pyverbs_error import PyverbsUserError, PyverbsRDMAError from pyverbs.utils import gid_str_to_array, gid_str from pyverbs.base import PyverbsRDMAErrno +from pyverbs.cmid cimport UDParam cimport pyverbs.libibverbs as v from pyverbs.pd cimport PD from pyverbs.cq cimport WC @@ -262,6 +263,14 @@ cdef class AHAttr(PyverbsObject): self.ah_attr.grh.hop_limit = gr.hop_limit self.ah_attr.grh.traffic_class = gr.traffic_class + cdef init_from_ud_param(self, UDParam udparam): + """ + Initialize the AHAttr from UDParam's ah_attr. + :param udparam: UDParam that contains the AHAttr. + :return: None + """ + self.ah_attr = udparam.ud_param.ah_attr + @property def port_num(self): return self.ah_attr.port_num @@ -380,7 +389,8 @@ cdef class AH(PyverbsCM): * *wc* A WC object to use for AH initialization * *grh* - A GRH object to use for AH initialization (when using wc) + Pointer to GRH object to use for AH initialization (when using + wc) * *port_num* Port number to be used for this AH (when using wc) :return: An AH object on success @@ -393,9 +403,9 @@ cdef class AH(PyverbsCM): else: # Create AH from WC wc = kwargs['wc'] - grh = kwargs['grh'] + grh = <v.ibv_grh *><uintptr_t>kwargs['grh'] port_num = kwargs['port_num'] - self.ah = v.ibv_create_ah_from_wc(pd.pd, &wc.wc, &grh.grh, port_num) + self.ah = v.ibv_create_ah_from_wc(pd.pd, &wc.wc, grh, port_num) if self.ah == NULL: raise PyverbsRDMAErrno('Failed to create AH') pd.add_ref(self) diff --git a/pyverbs/cmid.pxd b/pyverbs/cmid.pxd index df3c920..2880c56 100755 --- a/pyverbs/cmid.pxd +++ b/pyverbs/cmid.pxd @@ -12,6 +12,8 @@ cdef class CMID(PyverbsCM): cdef object event_channel cdef object ctx cdef object pd + cdef object mrs + cdef add_ref(self, obj) cpdef close(self) @@ -32,3 +34,10 @@ cdef class AddrInfo(PyverbsObject): cdef class ConnParam(PyverbsObject): cdef cm.rdma_conn_param conn_param + + +cdef class UDParam(PyverbsObject): + cdef cm.rdma_ud_param ud_param + +cdef class JoinMCAttrEx(PyverbsObject): + cdef cm.rdma_cm_join_mc_attr_ex join_mc_attr_ex diff --git a/pyverbs/cmid.pyx b/pyverbs/cmid.pyx index 66d7326..b75bc26 100755 --- a/pyverbs/cmid.pyx +++ b/pyverbs/cmid.pyx @@ -1,10 +1,14 @@ +from libc.stdint cimport uintptr_t from libc.string cimport memset +import weakref -from pyverbs.pyverbs_error import PyverbsUserError -from pyverbs.qp cimport QPInitAttr, QPAttr +from pyverbs.pyverbs_error import PyverbsUserError, PyverbsError +from pyverbs.qp cimport QPInitAttr, QPAttr, ECE from pyverbs.base import PyverbsRDMAErrno +from pyverbs.base cimport close_weakrefs cimport pyverbs.libibverbs_enums as e cimport pyverbs.librdmacm_enums as ce +from
pyverbs.addr cimport AH, AHAttr from pyverbs.device cimport Context cimport pyverbs.libibverbs as v cimport pyverbs.librdmacm as cm @@ -69,21 +73,87 @@ cdef class ConnParam(PyverbsObject): print_format.format('qp number', self.conn_param.qp_num) +cdef class JoinMCAttrEx(PyverbsObject): + + def __init__(self, AddrInfo addr not None, comp_mask=0, join_flags=0): + """ + Initialize a JoinMCAttrEx object over an underlying + rdma_cm_join_mc_attr_ex C object which contains the extended join + multicast attributes. + :param addr: Multicast address identifying the group to join. + :param comp_mask: Bitwise OR of "rdma_cm_join_mc_attr_mask" enum values. + :param join_flags: Single flag from "rdma_cm_mc_join_flags" enum. + Indicates the type of the join request. + """ + super().__init__() + self.join_mc_attr_ex.addr = addr.addr_info.ai_src_addr + self.join_mc_attr_ex.comp_mask = comp_mask + self.join_mc_attr_ex.join_flags = join_flags + + @property + def join_flags(self): + return self.join_mc_attr_ex.join_flags + @join_flags.setter + def join_flags(self, val): + self.join_mc_attr_ex.join_flags = val + + @property + def comp_mask(self): + return self.join_mc_attr_ex.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.join_mc_attr_ex.comp_mask = val + + +cdef class UDParam(PyverbsObject): + + def __init__(self, CMEvent cm_event not None): + """ + Initialize a UDParam object over an underlying rdma_ud_param + C object which contains UD connection parameters. + :param cm_event: The creator of UDParam. When the active side gets a + connection establishment event, the event contains + the UDParam with the passive CMID's details. + :return: UDParam object + """ + super().__init__() + memset(&self.ud_param, 0, sizeof(cm.rdma_ud_param)) + self.ud_param = (<CMEvent>cm_event).event.param.ud + + @property + def qp_num(self): + return self.ud_param.qp_num + + @property + def qkey(self): + return self.ud_param.qkey + + @property + def ah_attr(self): + ah_attr = AHAttr() + ah_attr.init_from_ud_param(self) + return ah_attr + + cdef class AddrInfo(PyverbsObject): - def __init__(self, src=None, dst=None, service=None, port_space=0, - flags=0): + def __init__(self, src=None, dst=None, src_service=None, dst_service=None, + port_space=0, flags=0): """ Initialize an AddrInfo object over an underlying rdma_addrinfo C object. :param src: Name, dotted-decimal IPv4 or IPv6 hex address to bind to. :param dst: Name, dotted-decimal IPv4 or IPv6 hex address to connect to. - :param service: The service name or port number of the address. + :param src_service: The service name or port number of the source + address. + :param dst_service: The service name or port number of the destination + address. :param port_space: RDMA port space used (RDMA_PS_UDP or RDMA_PS_TCP). :param flags: Hint flags which control the operation. :return: An AddrInfo object which contains information needed to establish communication.
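The UDParam class above is what the active side consumes after a UD connection is established: it carries the peer's QP number, QKey and address vector. A minimal sketch of turning it into an AH, assuming event is the CMEvent received on establishment and pd is an allocated PD (both are placeholders):
```python
from pyverbs.cmid import UDParam
from pyverbs.addr import AH

ud_param = UDParam(event)            # event: CMEvent carrying the UD params
ah = AH(pd, attr=ud_param.ah_attr)   # AHAttr built via init_from_ud_param
remote_qpn, qkey = ud_param.qp_num, ud_param.qkey
```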
""" - cdef char* srvc = NULL + cdef char* src_srvc = NULL + cdef char* dst_srvc = NULL cdef char* src_addr = NULL cdef char* dst_addr = NULL cdef cm.rdma_addrinfo hints @@ -99,28 +169,32 @@ cdef class AddrInfo(PyverbsObject): if isinstance(dst, str): dst = dst.encode('utf-8') dst_addr = dst - if service is not None: - if isinstance(service, str): - service = service.encode('utf-8') - srvc = service + if src_service is not None: + if isinstance(src_service, str): + src_service = src_service.encode('utf-8') + src_srvc = src_service + if dst_service is not None: + if isinstance(dst_service, str): + dst_service = dst_service.encode('utf-8') + dst_srvc = dst_service hints_ptr = &hints memset(hints_ptr, 0, sizeof(cm.rdma_addrinfo)) hints.ai_port_space = port_space hints.ai_flags = flags if flags & ce.RAI_PASSIVE: - ret = cm.rdma_getaddrinfo(src_addr, srvc, hints_ptr, + ret = cm.rdma_getaddrinfo(src_addr, src_srvc, hints_ptr, &self.addr_info) else: if src: hints.ai_flags |= ce.RAI_PASSIVE - ret = cm.rdma_getaddrinfo(src_addr, NULL, hints_ptr, &res) + ret = cm.rdma_getaddrinfo(src_addr, src_srvc, hints_ptr, &res) if ret != 0: raise PyverbsRDMAErrno('Failed to get Address Info') hints.ai_src_addr = res.ai_src_addr hints.ai_src_len = res.ai_src_len hints.ai_flags &= ~ce.RAI_PASSIVE - ret = cm.rdma_getaddrinfo(dst_addr, srvc, hints_ptr, + ret = cm.rdma_getaddrinfo(dst_addr, dst_srvc, hints_ptr, &self.addr_info) if src: cm.rdma_freeaddrinfo(res) @@ -234,6 +308,7 @@ cdef class CMID(PyverbsCM): self.pd = None self.ctx = None self.event_channel = None + self.mrs = weakref.WeakSet() if creator is None: return elif isinstance(creator, AddrInfo): @@ -266,6 +341,12 @@ cdef class CMID(PyverbsCM): raise PyverbsRDMAErrno('Cannot create CM ID from {obj}' .format(obj=type(creator))) + cdef add_ref(self, obj): + if isinstance(obj, MR): + self.mrs.add(obj) + else: + raise PyverbsError('Unrecognized object type') + @property def event_channel(self): return self.event_channel @@ -278,6 +359,12 @@ cdef class CMID(PyverbsCM): def pd(self): return self.pd + @property + def qpn(self): + if self.id.qp: + return self.id.qp.qp_num + return None + def __dealloc__(self): self.close() @@ -296,6 +383,7 @@ cdef class CMID(PyverbsCM): (self.ctx).context = NULL if self.pd: (self.pd).pd = NULL + close_weakrefs([self.mrs]) self.id = NULL def get_request(self): @@ -347,6 +435,40 @@ cdef class CMID(PyverbsCM): if ret != 0: raise PyverbsRDMAErrno('Failed to Resolve Address') + def join_multicast(self, AddrInfo addr=None, JoinMCAttrEx mc_attr=None, + context=0): + """ + Joins a multicast group and attaches an associated QP to the group. + :param addr: Multicast address identifying the group to join. + :param mc_attr: JoinMCAttrEx object is requierd to use + rdma_join_multicast_ex. This object contains the join + flags and the AddrInfo to join. + :param context: User-defined context associated with the join request. + :return: None + """ + cdef cm.rdma_cm_join_mc_attr_ex *mc_join_attr = NULL + if not addr and not mc_attr: + raise PyverbsUserError('Join to multicast must have AddrInfo or JoinMCAttrEx arguments') + if not mc_attr: + ret = cm.rdma_join_multicast(self.id, addr.addr_info.ai_src_addr, + context) + else: + ret = cm.rdma_join_multicast_ex(self.id, &mc_attr.join_mc_attr_ex, + context) + if ret != 0: + raise PyverbsRDMAErrno('Failed to Join multicast') + + def leave_multicast(self, AddrInfo addr not None): + """ + Leaves a multicast group and detaches an associated QP from the group. 
+ :param addr: AddrInfo object, representing the multicast address that + identifies the group to leave. + :return: None + """ + ret = cm.rdma_leave_multicast(self.id, addr.addr_info.ai_src_addr) + if ret != 0: + raise PyverbsRDMAErrno('Failed to leave multicast') + def resolve_route(self, timeout_ms=2000): """ Resolve an RDMA route to the destination address in order to establish @@ -424,6 +546,27 @@ cdef class CMID(PyverbsCM): if ret != 0: raise PyverbsRDMAErrno('Failed to Complete an active connection request') + def set_local_ece(self, ECE ece): + """ + Set local ECE parameters to be used for REQ/REP communication. + :param ece: ECE object with the requested configuration + :return: None + """ + rc = cm.rdma_set_local_ece(self.id, &ece.ece) + if rc != 0: + raise PyverbsRDMAErrno('Failed to set local ECE') + + def get_remote_ece(self): + """ + Get the ECE parameters as received from the communication peer. + :return: ECE object with the ece configuration + """ + ece = ECE() + rc = cm.rdma_get_remote_ece(self.id, &ece.ece) + if rc != 0: + raise PyverbsRDMAErrno('Failed to get remote ECE') + return ece + def create_qp(self, QPInitAttr qp_init not None): """ Create a QP, which is associated with CMID. @@ -478,32 +621,106 @@ cdef class CMID(PyverbsCM): :param size: The total length of the memory to register :return: registered MR """ - return MR(self.pd, size, e.IBV_ACCESS_LOCAL_WRITE) + return MR(self, size, e.IBV_ACCESS_LOCAL_WRITE) - def post_recv(self, MR mr not None): + def reg_read(self, size=0): + """ + Registers a memory region for sending or receiving messages or for + remote read operations. + :param size: The total length of the memory to register + :return: registered MR + """ + return MR(self, size, e.IBV_ACCESS_REMOTE_READ) + + def reg_write(self, size=0): + """ + Registers a memory region for sending or receiving messages or for + remote write operations. + :param size: The total length of the memory to register + :return: registered MR + """ + return MR(self, size, e.IBV_ACCESS_REMOTE_WRITE) + + def post_recv(self, MR mr not None, length=None): """ Posts a recv_wr via QP associated with CMID. Context param of rdma_post_recv C function currently not supported. :param mr: A valid MR object. + :param length: length of buffer to recv (default: mr length). :return: None """ - ret = cm.rdma_post_recv(self.id, NULL, mr.buf, mr.mr.length, mr.mr) + if not length: + length = mr.mr.length + ret = cm.rdma_post_recv(self.id, NULL, mr.buf, length, mr.mr) if ret != 0: raise PyverbsRDMAErrno('Failed to Post Receive') - def post_send(self, MR mr not None, flags=v.IBV_SEND_SIGNALED): + def post_send(self, MR mr not None, flags=v.IBV_SEND_SIGNALED, length=None): """ Posts a message via QP associated with CMID. Context param of rdma_post_send C function currently not supported. :param mr: A valid MR object which contains message to send. :param flags: flags for send work request. + :param length: length of buffer to send (default: mr length). :return: None """ - ret = cm.rdma_post_send(self.id, NULL, mr.buf, mr.mr.length, mr.mr, + if not length: + length = mr.mr.length + ret = cm.rdma_post_send(self.id, NULL, mr.buf, length, mr.mr, flags) if ret != 0: raise PyverbsRDMAErrno('Failed to Post Send') + def post_read(self, MR mr not None, length, remote_addr, rkey, + flags=0): + """ + Post read WR using the CMID's internal QP. + :param mr: A valid MR object. + :param length: length of buffer to read. + :param remote_addr: The remote MR address. + :param rkey: The remote MR rkey.
+ :param flags: flags for send work request. + :return: None + """ + ret = cm.rdma_post_read(self.id, NULL, mr.buf, length, mr.mr, + flags, remote_addr, rkey) + if ret != 0: + raise PyverbsRDMAErrno('Failed to Post Read') + + def post_write(self, MR mr not None, length, remote_addr, rkey, + flags=0): + """ + Post write WR using the CMID's internal QP. + :param mr: A valid MR object. + :param length: length of buffer to write. + :param remote_addr: The remote MR address. + :param rkey: The remote MR rkey. + :param flags: flags for send work request. + :return: None + """ + ret = cm.rdma_post_write(self.id, NULL, mr.buf, length, mr.mr, + flags, remote_addr, rkey) + if ret != 0: + raise PyverbsRDMAErrno('Failed to Post Write') + + def post_ud_send(self, MR mr not None, AH ah not None, rqpn=0, + flags=v.IBV_SEND_SIGNALED, length=None): + """ + Posts a message via UD QP associated with CMID to another UD QP. + :param mr: A valid MR object which contains message to send. + :param ah: The destination AH. + :param rqpn: The remote QP number. + :param flags: flags for send work request. + :param length: length of buffer to send. + :return: None + """ + if not length: + length = mr.mr.length + ret = cm.rdma_post_ud_send(self.id, NULL, mr.buf, length, mr.mr, + flags, ah.ah, rqpn) + if ret != 0: + raise PyverbsRDMAErrno('Failed to Post Send') + def get_recv_comp(self): """ Polls the receive CQ associated with CMID for a work completion. diff --git a/pyverbs/cq.pyx b/pyverbs/cq.pyx index 7eef890..8e1ac77 100755 --- a/pyverbs/cq.pyx +++ b/pyverbs/cq.pyx @@ -165,8 +165,8 @@ cdef class CQ(PyverbsCM): """ rc = v.ibv_req_notify_cq(self.cq, solicited_only) if rc != 0: - raise PyverbsRDMAErrno('Request notify CQ returned {rc}'. - format(rc=rc)) + raise PyverbsRDMAError('Request notify CQ returned {rc}'. + format(rc=rc), rc) def ack_events(self, num_events): """ diff --git a/pyverbs/device.pxd b/pyverbs/device.pxd index 99edf4b..0519c4b 100755 --- a/pyverbs/device.pxd +++ b/pyverbs/device.pxd @@ -19,6 +19,8 @@ cdef class Context(PyverbsCM): cdef object qps cdef object xrcds cdef object vars + cdef object uars + cdef object pps cdef class DeviceAttr(PyverbsObject): cdef v.ibv_device_attr dev_attr @@ -63,6 +65,5 @@ cdef class DM(PyverbsCM): cdef class PortAttr(PyverbsObject): cdef v.ibv_port_attr attr -cdef class VAR(PyverbsObject): - cdef object context - cpdef close(self) +cdef class GIDEntry(PyverbsObject): + cdef v.ibv_gid_entry entry diff --git a/pyverbs/device.pyx b/pyverbs/device.pyx index e939c0b..b16d6d0 100755 --- a/pyverbs/device.pyx +++ b/pyverbs/device.pyx @@ -15,6 +15,7 @@ from pyverbs.base import PyverbsRDMAErrno from pyverbs.base cimport close_weakrefs cimport pyverbs.libibverbs_enums as e cimport pyverbs.libibverbs as v +cimport pyverbs.librdmacm as cm from pyverbs.cmid cimport CMID from pyverbs.xrcd cimport XRCD from pyverbs.addr cimport GID @@ -24,6 +25,9 @@ from pyverbs.qp cimport QP from libc.stdlib cimport free, malloc from libc.string cimport memset from libc.stdint cimport uint64_t +from libc.stdint cimport uint16_t +from libc.stdint cimport uint32_t +from pyverbs.utils import gid_str cdef extern from 'endian.h': unsigned long be64toh(unsigned long host_64bits); @@ -35,11 +39,12 @@ class Device(PyverbsObject): It is not a part of objects creation order - there's no need for the user to create it for such purposes.
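The post_read/post_write helpers above map to rdma_post_read/rdma_post_write and make one-sided transfers available directly on a CMID. A connected-mode sketch, assuming cmid is an established CMID and peer_addr/peer_rkey were exchanged out of band (all three names are placeholders; the pyverbs.enums module name is also an assumption):
```python
import pyverbs.enums as e

lmr = cmid.reg_msgs(64)                      # local 64-byte buffer
cmid.post_write(lmr, 64, peer_addr, peer_rkey,
                flags=e.IBV_SEND_SIGNALED)   # one-sided RDMA write
wc = cmid.get_send_comp()                    # reap the completion
```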
""" - def __init__(self, name, guid, node_type, transport_type): + def __init__(self, name, guid, node_type, transport_type, index): self._node_type = node_type self._transport_type = transport_type self._name = name self._guid = guid + self._index = index @property def name(self): @@ -57,12 +62,16 @@ class Device(PyverbsObject): def guid(self): return self._guid + @property + def index(self): + return self._index + def __str__(self): return 'Device {dev}, node type {ntype}, transport type {ttype},' \ - ' guid {guid}'.format(dev=self.name.decode(), + ' guid {guid}, index {index}'.format(dev=self.name.decode(), ntype=translate_node_type(self.node_type), ttype=translate_transport_type(self.transport_type), - guid=guid_to_hex(self.guid)) + guid=guid_to_hex(self.guid), index=self._index) cdef class Context(PyverbsCM): @@ -87,6 +96,9 @@ cdef class Context(PyverbsCM): * *cmid* A CMID object. If not None, it means that the device was already opened by a CMID class, and only a pointer assignment is missing. + * *cmd_fd* + A command FD. If passed, the device will be imported from the + given cmd_fd using ibv_import_device. :return: None """ cdef int count @@ -101,14 +113,22 @@ cdef class Context(PyverbsCM): self.qps = weakref.WeakSet() self.xrcds = weakref.WeakSet() self.vars = weakref.WeakSet() + self.uars = weakref.WeakSet() + self.pps = weakref.WeakSet() self.name = kwargs.get('name') provider_attr = kwargs.get('attr') cmid = kwargs.get('cmid') + cmd_fd = kwargs.get('cmd_fd') if cmid is not None: self.context = cmid.id.verbs cmid.ctx = self return + if cmd_fd is not None: + self.context = v.ibv_import_device(cmd_fd) + if self.context == NULL: + raise PyverbsRDMAErrno('Failed to import device') + return if self.name is None: raise PyverbsUserError('Device name must be provided') @@ -150,8 +170,7 @@ cdef class Context(PyverbsCM): self.xrcds, self.vars]) rc = v.ibv_close_device(self.context) if rc != 0: - raise PyverbsRDMAErrno('Failed to close device {dev}'. - format(dev=self.device.name)) + raise PyverbsRDMAErrno(f'Failed to close device {self.name}') self.context = NULL @property @@ -187,6 +206,13 @@ cdef class Context(PyverbsCM): format(name=self.name), rc) return dev_attr_ex + def query_pkey(self, unsigned int port_num, int index): + cdef uint16_t pkey + rc = v.ibv_query_pkey(self.context, port_num, index, &pkey) + if rc != 0: + raise PyverbsRDMAError(f'Failed to query pkey {index} of port {port_num}') + return pkey + def query_gid(self, unsigned int port_num, int index): gid = GID() rc = v.ibv_query_gid(self.context, port_num, index, &gid.gid) @@ -196,7 +222,7 @@ cdef class Context(PyverbsCM): return gid def query_gid_type(self, unsigned int port_num, unsigned int index): - cdef v.ibv_gid_type gid_type + cdef v.ibv_gid_type_sysfs gid_type rc = v.ibv_query_gid_type(self.context, port_num, index, &gid_type) if rc != 0: raise PyverbsRDMAErrno('Failed to query gid type of port {p} and gid index {g}' @@ -216,6 +242,53 @@ cdef class Context(PyverbsCM): format(p=port_num), rc) return port_attrs + def query_gid_table(self, size_t max_entries, uint32_t flags=0): + """ + Queries the GID tables of the device for at most entries + and returns them. 
+ :param max_entries: Maximum number of GID entries to retrieve + :param flags: Specifies new extra members of struct ibv_gid_entry to + query + :return: List of GIDEntry objects on success + """ + cdef v.ibv_gid_entry *entries + cdef v.ibv_gid_entry entry + + entries = <v.ibv_gid_entry *>malloc(max_entries * + sizeof(v.ibv_gid_entry)) + rc = v.ibv_query_gid_table(self.context, entries, max_entries, flags) + if rc < 0: + raise PyverbsRDMAError('Failed to query gid tables of the device', + rc) + gid_entries = [] + for i in range(rc): + entry = entries[i] + gid_entries.append(GIDEntry(entry.gid._global.subnet_prefix, + entry.gid._global.interface_id, entry.gid_index, + entry.port_num, entry.gid_type, + entry.ndev_ifindex)) + free(entries) + return gid_entries + + def query_gid_ex(self, uint32_t port_num, uint32_t gid_index, + uint32_t flags=0): + """ + Queries the GID table of port port_num in index gid_index, and + returns the GID entry. + :param port_num: The port number to query + :param gid_index: The index in the GID table to query + :param flags: Specifies new extra members of struct ibv_gid_entry to + query + :return: GIDEntry object on success + """ + entry = GIDEntry() + rc = v.ibv_query_gid_ex(self.context, port_num, gid_index, + &entry.entry, flags) + if rc != 0: + raise PyverbsRDMAError(f'Failed to query gid table of port '\ + f'{port_num} in index {gid_index}', rc) + return entry + cdef add_ref(self, obj): if isinstance(obj, PD): self.pds.add(obj) @@ -229,8 +302,6 @@ self.qps.add(obj) elif isinstance(obj, XRCD): self.xrcds.add(obj) - elif isinstance(obj, VAR): - self.vars.add(obj) else: raise PyverbsError('Unrecognized object type') @@ -238,6 +309,10 @@ def cmd_fd(self): return self.context.cmd_fd + @property + def name(self): + return self.name + cdef class DeviceAttr(PyverbsObject): """ @@ -790,6 +865,63 @@ cdef class PortAttr(PyverbsObject): print_format.format('Flags', self.attr.flags) +cdef class GIDEntry(PyverbsObject): + def __init__(self, subnet_prefix=0, interface_id=0, gid_index=0, + port_num=0, gid_type=0, ndev_ifindex=0): + super().__init__() + self.entry.gid._global.subnet_prefix = subnet_prefix + self.entry.gid._global.interface_id = interface_id + self.entry.gid_index = gid_index + self.entry.port_num = port_num + self.entry.gid_type = gid_type + self.entry.ndev_ifindex = ndev_ifindex + + @property + def gid_subnet_prefix(self): + return self.entry.gid._global.subnet_prefix + + @property + def gid_interface_id(self): + return self.entry.gid._global.interface_id + + @property + def gid_index(self): + return self.entry.gid_index + + @property + def port_num(self): + return self.entry.port_num + + @property + def gid_type(self): + return self.entry.gid_type + + @property + def ndev_ifindex(self): + return self.entry.ndev_ifindex + + def gid_str(self): + return gid_str(self.gid_subnet_prefix, self.gid_interface_id) + + def __str__(self): + print_format = '{:<24}: {:<20}\n' + return print_format.format('GID', self.gid_str()) +\ + print_format.format('GID Index', self.gid_index) +\ + print_format.format('Port number', self.port_num) +\ + print_format.format('GID type', translate_gid_type( + self.gid_type)) +\ + print_format.format('Ndev ifindex', self.ndev_ifindex) + + +def translate_gid_type(gid_type): + types = {e.IBV_GID_TYPE_IB: 'IB', e.IBV_GID_TYPE_ROCE_V1: 'RoCEv1', + e.IBV_GID_TYPE_ROCE_V2: 'RoCEv2'} + try: + return types[gid_type] + except KeyError: + return f'Unknown gid_type ({gid_type})' + + def guid_format(num): """ Get GUID
representation of the given number, including change of endianness. @@ -958,6 +1090,7 @@ def get_device_list(): device node type device transport type device guid + device index """ cdef int count = 0; cdef v.ibv_device **dev_list; @@ -971,23 +1104,30 @@ def get_device_list(): node = dev_list[i].node_type transport = dev_list[i].transport_type guid = be64toh(v.ibv_get_device_guid(dev_list[i])) - devices.append(Device(name, guid, node, transport)) + index = v.ibv_get_device_index(dev_list[i]) + devices.append(Device(name, guid, node, transport, index)) finally: v.ibv_free_device_list(dev_list) return devices -cdef class VAR(PyverbsObject): +def rdma_get_devices(): """ - This is an abstract class of Virtio Access Region (VAR). - Each device specific VAR implementation should inherit this class - and initialize it according to the device attributes. + Get the RDMA devices. + :return: list of Device objects. """ - def __init__(self, Context context not None, **kwargs): - self.context = context - - def __dealloc__(self): - self.close() - - cpdef close(self): - pass + cdef int count + cdef v.ibv_context **ctx_list + ctx_list = cm.rdma_get_devices(&count) + if ctx_list == NULL: + raise PyverbsRDMAErrno('Failed to get device list') + devices = [] + for i in range(count): + name = ctx_list[i].device.name + node = ctx_list[i].device.node_type + transport = ctx_list[i].device.transport_type + guid = be64toh(v.ibv_get_device_guid(ctx_list[i].device)) + index = v.ibv_get_device_index(ctx_list[i].device) + devices.append(Device(name, guid, node, transport, index)) + cm.rdma_free_devices(ctx_list) + return devices diff --git a/pyverbs/libibverbs.pxd b/pyverbs/libibverbs.pxd index 6ffa303..6fbba54 100755 --- a/pyverbs/libibverbs.pxd +++ b/pyverbs/libibverbs.pxd @@ -146,7 +146,7 @@ cdef extern from 'infiniband/verbs.h': ibv_pd *pd unsigned int rkey unsigned int handle - ibv_mw_type mw_type + ibv_mw_type type cdef struct ibv_alloc_dm_attr: size_t length @@ -303,6 +303,11 @@ cdef extern from 'infiniband/verbs.h': unsigned long length unsigned int mw_access_flags + cdef struct ibv_mw_bind: + uint64_t wr_id + unsigned int send_flags + ibv_mw_bind_info bind_info + cdef struct bind_mw: ibv_mw *mw unsigned int rkey @@ -313,10 +318,6 @@ cdef extern from 'infiniband/verbs.h': unsigned short hdr_sz unsigned short mss - cdef union unnamed: - bind_mw bind_mw - tso tso - cdef struct xrc: unsigned int remote_srqn @@ -329,10 +330,12 @@ cdef extern from 'infiniband/verbs.h': ibv_sge *sg_list int num_sge ibv_wr_opcode opcode + uint32_t imm_data unsigned int send_flags wr wr qp_type qp_type - unnamed unnamed + bind_mw bind_mw + tso tso cdef struct ibv_qp_cap: unsigned int max_send_wr @@ -475,7 +478,20 @@ cdef extern from 'infiniband/verbs.h': uint64_t wr_id unsigned int wr_flags + cdef struct ibv_ece: + uint32_t vendor_id + uint32_t options + uint32_t comp_mask + + cdef struct ibv_gid_entry: + ibv_gid gid + uint32_t gid_index + uint32_t port_num + uint32_t gid_type + uint32_t ndev_ifindex + ibv_device **ibv_get_device_list(int *n) + int ibv_get_device_index(ibv_device *device); void ibv_free_device_list(ibv_device **list) ibv_context *ibv_open_device(ibv_device *device) int ibv_close_device(ibv_context *context) @@ -486,10 +502,14 @@ cdef extern from 'infiniband/verbs.h': unsigned long ibv_get_device_guid(ibv_device *device) int ibv_query_gid(ibv_context *context, unsigned int port_num, int index, ibv_gid *gid) + int ibv_query_pkey(ibv_context *context, unsigned int port_num, + int index, uint16_t *pkey) ibv_pd 
*ibv_alloc_pd(ibv_context *context) int ibv_dealloc_pd(ibv_pd *pd) ibv_mr *ibv_reg_mr(ibv_pd *pd, void *addr, size_t length, int access) int ibv_dereg_mr(ibv_mr *mr) + int ibv_advise_mr(ibv_pd *pd, uint32_t advice, uint32_t flags, + ibv_sge *sg_list, uint32_t num_sge) ibv_mw *ibv_alloc_mw(ibv_pd *pd, ibv_mw_type type) int ibv_dealloc_mw(ibv_mw *mw) ibv_dm *ibv_alloc_dm(ibv_context *context, ibv_alloc_dm_attr *attr) @@ -549,6 +569,7 @@ cdef extern from 'infiniband/verbs.h': int ibv_destroy_qp(ibv_qp *qp) int ibv_post_recv(ibv_qp *qp, ibv_recv_wr *wr, ibv_recv_wr **bad_wr) int ibv_post_send(ibv_qp *qp, ibv_send_wr *wr, ibv_send_wr **bad_wr) + int ibv_bind_mw(ibv_qp *qp, ibv_mw *mw, ibv_mw_bind *mw_bind) ibv_xrcd *ibv_open_xrcd(ibv_context *context, ibv_xrcd_init_attr *xrcd_init_attr) int ibv_close_xrcd(ibv_xrcd *xrcd) @@ -594,8 +615,21 @@ cdef extern from 'infiniband/verbs.h': void ibv_wr_start(ibv_qp_ex *qp) int ibv_wr_complete(ibv_qp_ex *qp) void ibv_wr_abort(ibv_qp_ex *qp) + ibv_context *ibv_import_device(int cmd_fd) + ibv_mr *ibv_import_mr(ibv_pd *pd, uint32_t handle) + void ibv_unimport_mr(ibv_mr *mr) + ibv_pd *ibv_import_pd(ibv_context *context, uint32_t handle) + void ibv_unimport_pd(ibv_pd *pd) + int ibv_query_gid_ex(ibv_context *context, uint32_t port_num, + uint32_t gid_index, ibv_gid_entry *entry, + uint32_t flags) + ssize_t ibv_query_gid_table(ibv_context *context, + ibv_gid_entry *entries, size_t max_entries, + uint32_t flags) cdef extern from 'infiniband/driver.h': int ibv_query_gid_type(ibv_context *context, uint8_t port_num, - unsigned int index, ibv_gid_type *type) + unsigned int index, ibv_gid_type_sysfs *type) + int ibv_set_ece(ibv_qp *qp, ibv_ece *ece) + int ibv_query_ece(ibv_qp *qp, ibv_ece *ece) diff --git a/pyverbs/libibverbs_enums.pxd b/pyverbs/libibverbs_enums.pxd index 7f61be2..94be7f1 100755 --- a/pyverbs/libibverbs_enums.pxd +++ b/pyverbs/libibverbs_enums.pxd @@ -21,6 +21,7 @@ cdef extern from '': IBV_NODE_RNIC IBV_NODE_USNIC IBV_NODE_USNIC_UDP + IBV_NODE_UNSPECIFIED cpdef enum: IBV_LINK_LAYER_UNSPECIFIED @@ -426,13 +427,29 @@ cdef extern from '': cdef void *IBV_ALLOCATOR_USE_DEFAULT + cpdef enum ibv_gid_type: + IBV_GID_TYPE_IB + IBV_GID_TYPE_ROCE_V1 + IBV_GID_TYPE_ROCE_V2 + + +cdef extern from "": + cdef unsigned long long IBV_ADVISE_MR_ADVICE_PREFETCH + cdef unsigned long long IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE + cdef unsigned long long IBV_ADVISE_MR_FLAG_FLUSH + cdef unsigned long long IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT + _IBV_DEVICE_RAW_SCATTER_FCS = IBV_DEVICE_RAW_SCATTER_FCS _IBV_DEVICE_PCI_WRITE_END_PADDING = IBV_DEVICE_PCI_WRITE_END_PADDING _IBV_ALLOCATOR_USE_DEFAULT = IBV_ALLOCATOR_USE_DEFAULT +_IBV_ADVISE_MR_ADVICE_PREFETCH = IBV_ADVISE_MR_ADVICE_PREFETCH +_IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE = IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE +_IBV_ADVISE_MR_FLAG_FLUSH = IBV_ADVISE_MR_FLAG_FLUSH +_IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT = IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT cdef extern from '': - cpdef enum ibv_gid_type: - IBV_GID_TYPE_IB_ROCE_V1 - IBV_GID_TYPE_ROCE_V2 + cpdef enum ibv_gid_type_sysfs: + IBV_GID_TYPE_SYSFS_IB_ROCE_V1 + IBV_GID_TYPE_SYSFS_ROCE_V2 diff --git a/pyverbs/librdmacm.pxd b/pyverbs/librdmacm.pxd index 03c0cdd..c56dc99 100755 --- a/pyverbs/librdmacm.pxd +++ b/pyverbs/librdmacm.pxd @@ -72,6 +72,11 @@ cdef extern from '': void *ai_connect rdma_addrinfo *ai_next + cdef struct rdma_cm_join_mc_attr_ex: + uint32_t comp_mask + uint32_t join_flags + sockaddr *addr + # These non rdmacm structs defined in one of rdma_cma.h's included header files 
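The ibv_gid_entry declarations above back the Context.query_gid_table() and query_gid_ex() wrappers added earlier in device.pyx. A minimal usage sketch, assuming a device named 'mlx5_0' is present (adjust the name to your setup):

```python
import pyverbs.device as d

with d.Context(name='mlx5_0') as ctx:
    # Query up to 8 entries across the device's GID tables
    for gid_entry in ctx.query_gid_table(max_entries=8):
        print(gid_entry)
    # Query a single entry: port 1, GID index 0
    print(ctx.query_gid_ex(port_num=1, gid_index=0).gid_str())
```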
cdef struct sockaddr: unsigned short sa_family @@ -88,6 +93,8 @@ cdef extern from '': rdma_event_channel *rdma_create_event_channel() void rdma_destroy_event_channel(rdma_event_channel *channel) + ibv_context **rdma_get_devices(int *num_devices) + void rdma_free_devices (ibv_context **list); int rdma_get_cm_event(rdma_event_channel *channel, rdma_cm_event **event) int rdma_ack_cm_event(rdma_cm_event *event) char *rdma_event_str(rdma_cm_event_type event) @@ -97,11 +104,17 @@ cdef extern from '': int rdma_create_id(rdma_event_channel *channel, rdma_cm_id **id, void *context, rdma_port_space ps) int rdma_destroy_id(rdma_cm_id *id) + int rdma_get_remote_ece(rdma_cm_id *id, ibv_ece *ece) + int rdma_set_local_ece(rdma_cm_id *id, ibv_ece *ece) int rdma_get_request(rdma_cm_id *listen, rdma_cm_id **id) int rdma_bind_addr(rdma_cm_id *id, sockaddr *addr) int rdma_resolve_addr(rdma_cm_id *id, sockaddr *src_addr, sockaddr *dst_addr, int timeout_ms) int rdma_resolve_route(rdma_cm_id *id, int timeout_ms) + int rdma_join_multicast(rdma_cm_id *id, sockaddr *addr, void *context) + int rdma_join_multicast_ex(rdma_cm_id *id, rdma_cm_join_mc_attr_ex *mc_join_attr, + void *context) + int rdma_leave_multicast(rdma_cm_id *id, sockaddr *addr) int rdma_connect(rdma_cm_id *id, rdma_conn_param *conn_param) int rdma_disconnect(rdma_cm_id *id) int rdma_listen(rdma_cm_id *id, int backlog) @@ -121,7 +134,18 @@ cdef extern from '': size_t length, ibv_mr *mr) int rdma_post_send(rdma_cm_id *id, void *context, void *addr, size_t length, ibv_mr *mr, int flags) + int rdma_post_ud_send(rdma_cm_id *id, void *context, void *addr, + size_t length, ibv_mr *mr, int flags, ibv_ah *ah, + uint32_t remote_qpn) + int rdma_post_read(rdma_cm_id *id, void *context, void *addr, + size_t length, ibv_mr *mr, int flags, + uint64_t remote_addr, uint32_t rkey) + int rdma_post_write(rdma_cm_id *id, void *context, void *addr, + size_t length, ibv_mr *mr, int flags, + uint64_t remote_addr, uint32_t rkey) int rdma_get_send_comp(rdma_cm_id *id, ibv_wc *wc) int rdma_get_recv_comp(rdma_cm_id *id, ibv_wc *wc) ibv_mr *rdma_reg_msgs(rdma_cm_id *id, void *addr, size_t length) + ibv_mr *rdma_reg_read(rdma_cm_id *id, void *addr, size_t length) + ibv_mr *rdma_reg_write(rdma_cm_id *id, void *addr, size_t length) int rdma_dereg_mr(ibv_mr *mr) diff --git a/pyverbs/librdmacm_enums.pxd b/pyverbs/librdmacm_enums.pxd index a47e484..edf11e6 100755 --- a/pyverbs/librdmacm_enums.pxd +++ b/pyverbs/librdmacm_enums.pxd @@ -36,3 +36,11 @@ cdef extern from '': RAI_NUMERICHOST RAI_NOROUTE RAI_FAMILY + + cpdef enum rdma_cm_join_mc_attr_mask: + RDMA_CM_JOIN_MC_ATTR_ADDRESS + RDMA_CM_JOIN_MC_ATTR_JOIN_FLAGS + + cpdef enum rdma_cm_mc_join_flags: + RDMA_MC_JOIN_FLAG_FULLMEMBER + RDMA_MC_JOIN_FLAG_SENDONLY_FULLMEMBER diff --git a/pyverbs/mem_alloc.pyx b/pyverbs/mem_alloc.pyx index 3be1031..24be4f1 100644 --- a/pyverbs/mem_alloc.pyx +++ b/pyverbs/mem_alloc.pyx @@ -7,6 +7,7 @@ from posix.stdlib cimport posix_memalign as c_posix_memalign from libc.stdlib cimport malloc as c_malloc, free as c_free from posix.mman cimport mmap as c_mmap, munmap as c_munmap from libc.stdint cimport uintptr_t +from libc.string cimport memset cimport posix.mman as mm cdef extern from 'sys/mman.h': @@ -58,7 +59,8 @@ def malloc(size): def posix_memalign(size, alignment=8): """ - Python wrapper for the stdlib posix_memalign function + Python wrapper for the stdlib posix_memalign function. + The function calls posix_memalign and memsets the memory to 0. 
:param size: The size of the memory block in bytes :param alignment: Alignment of the allocated memory, must be a power of two :return: The address of the allocated memory, which is a multiple of @@ -68,6 +70,7 @@ def posix_memalign(size, alignment=8): ret = c_posix_memalign(&ptr, alignment, size) if ret: raise MemoryError('Failed to allocate memory ({err})'.format(err=ret)) + memset(ptr, 0, size) return ptr diff --git a/pyverbs/mr.pxd b/pyverbs/mr.pxd index 82ae79f..ebe8ada 100644 --- a/pyverbs/mr.pxd +++ b/pyverbs/mr.pxd @@ -4,22 +4,29 @@ #cython: language_level=3 from pyverbs.base cimport PyverbsCM +cimport pyverbs.librdmacm as cm from . cimport libibverbs as v cdef class MR(PyverbsCM): cdef object pd + cdef object cmid cdef v.ibv_mr *mr cdef int mmap_length cdef object is_huge cdef object is_user_addr cdef void *buf + cdef object _is_imported cpdef read(self, length, offset) cdef class MWBindInfo(PyverbsCM): cdef v.ibv_mw_bind_info info cdef object mr +cdef class MWBind(PyverbsCM): + cdef v.ibv_mw_bind mw_bind + cdef object mr + cdef class MW(PyverbsCM): cdef object pd cdef v.ibv_mw *mw diff --git a/pyverbs/mr.pyx b/pyverbs/mr.pyx index b7b2196..7011da1 100644 --- a/pyverbs/mr.pyx +++ b/pyverbs/mr.pyx @@ -6,14 +6,16 @@ import logging from posix.mman cimport mmap, munmap, MAP_PRIVATE, PROT_READ, PROT_WRITE, \ MAP_ANONYMOUS, MAP_HUGETLB -from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError +from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError, \ + PyverbsUserError +from libc.stdint cimport uintptr_t, SIZE_MAX from pyverbs.base import PyverbsRDMAErrno from posix.stdlib cimport posix_memalign from libc.string cimport memcpy, memset cimport pyverbs.libibverbs_enums as e -from libc.stdint cimport uintptr_t from pyverbs.device cimport DM from libc.stdlib cimport free +from .cmid cimport CMID from .pd cimport PD cdef extern from 'sys/mman.h': @@ -27,32 +29,52 @@ cdef class MR(PyverbsCM): MR class represents ibv_mr. Buffer allocation is done in the c'tor. Freeing it is done in close(). """ - def __init__(self, PD pd not None, length, access, address=None): + def __init__(self, creator not None, length=0, access=0, address=None, + implicit=False, **kwargs): """ Allocate a user-level buffer of length <length> and register a Memory Region of the given length and access flags. - :param pd: A PD object - :param length: Length in bytes + :param creator: A PD/CMID object. If a CMID is passed, the MR will + be registered using rdma_reg_msgs/write/read, according + to the passed access flag (local_write, remote_write or + remote_read respectively). + :param length: Length (in bytes) of the MR's buffer. :param access: Access flags, see ibv_access_flags enum :param address: Memory address to register (Optional). If it's not provided, a buffer will be allocated in the class initialization. + :param implicit: If True, register an implicit MR that covers the + entire address space. + :param kwargs: Arguments: + * *handle* + A valid kernel handle for an MR object in the given PD (creator). + If passed, the MR will be imported and associated with the + context that is associated with the given PD using ibv_import_mr. :return: The newly created MR on success """ super().__init__() if self.mr != NULL: return self.is_huge = True if access & e.IBV_ACCESS_HUGETLB else False - # We want to enable registering an MR of size 0 but this fails with a - # buffer of size 0, so in this case lets increase the buffer - if length == 0: - length = 10 if address: self.is_user_addr = True # uintptr_t is guaranteed to be large enough to hold any pointer.
# In order to safely cast addr to void*, it is firstly cast to uintptr_t. self.buf = <void*><uintptr_t>address - else: + + mr_handle = kwargs.get('handle') + # If an MR handle is passed, import the MR and finish + if mr_handle is not None: + pd = creator + self.mr = v.ibv_import_mr(pd.pd, mr_handle) + if self.mr == NULL: + raise PyverbsRDMAErrno('Failed to import MR') + self._is_imported = True + self.pd = pd + pd.add_ref(self) + return + + # Allocate a buffer + if not address and length > 0: if self.is_huge: # Rounding up to multiple of HUGE_PAGE_SIZE self.mmap_length = length + (HUGE_PAGE_SIZE - length % HUGE_PAGE_SIZE) \ @@ -68,15 +90,34 @@ cdef class MR(PyverbsCM): raise PyverbsError('Failed to allocate MR buffer of size {l}'. format(l=length)) memset(self.buf, 0, length) - self.mr = v.ibv_reg_mr(pd.pd, self.buf, length, access) + if isinstance(creator, PD): + pd = creator + if implicit: + self.mr = v.ibv_reg_mr(pd.pd, NULL, SIZE_MAX, access) + else: + self.mr = v.ibv_reg_mr(pd.pd, self.buf, length, access) + self.pd = pd + pd.add_ref(self) + elif isinstance(creator, CMID): + cmid = creator + if access == e.IBV_ACCESS_LOCAL_WRITE: + self.mr = cm.rdma_reg_msgs(cmid.id, self.buf, length) + elif access == e.IBV_ACCESS_REMOTE_WRITE: + self.mr = cm.rdma_reg_write(cmid.id, self.buf, length) + elif access == e.IBV_ACCESS_REMOTE_READ: + self.mr = cm.rdma_reg_read(cmid.id, self.buf, length) + self.cmid = cmid + cmid.add_ref(self) if self.mr == NULL: raise PyverbsRDMAErrno('Failed to register a MR. length: {l}, access flags: {a}'. format(l=length, a=access)) - self.pd = pd - pd.add_ref(self) self.logger.debug('Registered ibv_mr. Length: {l}, access flags {a}'. format(l=length, a=access)) + def unimport(self): + v.ibv_unimport_mr(self.mr) + self.close() + def __dealloc__(self): self.close() @@ -86,34 +127,43 @@ cdef class MR(PyverbsCM): MR may be deleted directly or indirectly by closing its context, which leaves the Python MR object without the underlying C object, so during destruction, need to check whether or not the C object exists. + In case of an imported MR no deregistration will be done, it's left + to the original MR, in order to prevent a double dereg by the GC. :return: None """ if self.mr != NULL: self.logger.debug('Closing MR') - rc = v.ibv_dereg_mr(self.mr) - if rc != 0: - raise PyverbsRDMAError('Failed to dereg MR', rc) + if not self._is_imported: + rc = v.ibv_dereg_mr(self.mr) + if rc != 0: + raise PyverbsRDMAError('Failed to dereg MR', rc) + if not self.is_user_addr: + if self.is_huge: + munmap(self.buf, self.mmap_length) + else: + free(self.buf) self.mr = NULL self.pd = None - if not self.is_user_addr: - if self.is_huge: - munmap(self.buf, self.mmap_length) - else: - free(self.buf) - self.buf = NULL + self.buf = NULL + self.cmid = None - def write(self, data, length): + def write(self, data, length, offset=0): """ Write user data to the MR's buffer using memcpy :param data: User data to write :param length: Length of the data to write + :param offset: Writing offset :return: None """ + if not self.buf or length < 0: + raise PyverbsUserError('The MR buffer isn\'t allocated or length' + f' {length} is invalid') # If data is a string, cast it to bytes as Python3 doesn't # automatically convert it.
+ cdef int off = offset if isinstance(data, str): data = data.encode() - memcpy(self.buf, data, length) + memcpy(<char *>(self.buf + off), data, length) cpdef read(self, length, offset): """ @@ -125,6 +175,11 @@ cdef class MR(PyverbsCM): cdef char *data cdef int off = offset # we can't use offset in the next line, as it is # a Python object and not C + if offset < 0: + raise PyverbsUserError(f'Invalid offset {offset}') + if not self.buf or length < 0: + raise PyverbsUserError('The MR buffer isn\'t allocated or length' + f' {length} is invalid') data = <char *>(self.buf + off) return data[:length] @@ -144,6 +199,19 @@ def length(self): return self.mr.length + @property + def handle(self): + return self.mr.handle + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return 'MR\n' + \ + print_format.format('lkey', self.lkey) + \ + print_format.format('rkey', self.rkey) + \ + print_format.format('length', self.length) + \ + print_format.format('buf', <uintptr_t>self.buf) + \ + print_format.format('handle', self.handle) + cdef class MWBindInfo(PyverbsCM): def __init__(self, MR mr not None, addr, length, mw_access_flags): @@ -154,6 +222,39 @@ self.info.length = length self.info.mw_access_flags = mw_access_flags + @property + def mw_access_flags(self): + return self.info.mw_access_flags + + @property + def length(self): + return self.info.length + + @property + def addr(self): + return self.info.addr + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return 'MWBindInfo:\n' +\ + print_format.format('Addr', self.info.addr) +\ + print_format.format('Length', self.info.length) +\ + print_format.format('MW access flags', self.info.mw_access_flags) + + +cdef class MWBind(PyverbsCM): + def __init__(self, MWBindInfo info not None, send_flags, wr_id=0): + super().__init__() + self.mw_bind.wr_id = wr_id + self.mw_bind.send_flags = send_flags + self.mw_bind.bind_info = info.info + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return 'MWBind:\n' +\ + print_format.format('WR id', self.mw_bind.wr_id) +\ + print_format.format('Send flags', self.mw_bind.send_flags) + cdef class MW(PyverbsCM): def __init__(self, PD pd not None, v.ibv_mw_type mw_type): @@ -192,6 +293,25 @@ cdef class MW(PyverbsCM): self.mw = NULL self.pd = None + @property + def handle(self): + return self.mw.handle + + @property + def rkey(self): + return self.mw.rkey + + @property + def type(self): + return self.mw.type + + def __str__(self): + print_format = '{:22}: {:<20}\n' + return 'MW:\n' +\ + print_format.format('Rkey', self.mw.rkey) +\ + print_format.format('Handle', self.mw.handle) +\ + print_format.format('MW Type', mwtype2str(self.mw.type)) + cdef class DMMR(MR): def __init__(self, PD pd not None, length, access, DM dm, offset): diff --git a/pyverbs/pd.pxd b/pyverbs/pd.pxd index ae4324a..94d453e 100644 --- a/pyverbs/pd.pxd +++ b/pyverbs/pd.pxd @@ -19,6 +19,7 @@ cdef class PD(PyverbsCM): cdef object ahs cdef object qps cdef object parent_domains + cdef object _is_imported cdef class ParentDomainInitAttr(PyverbsObject): cdef v.ibv_parent_domain_init_attr init_attr diff --git a/pyverbs/pd.pyx b/pyverbs/pd.pyx index 9e8395f..2a35d11 100755 --- a/pyverbs/pd.pyx +++ b/pyverbs/pd.pyx @@ -1,5 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) # Copyright (c) 2019, Mellanox Technologies. All rights reserved.
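The new MWBind/MWBindInfo classes above pair with QP.bind_mw() (added to qp.pyx later in this patch) to bind a type 1 memory window. A minimal sketch, assuming pd, a connected RC qp, a type 1 mw allocated from pd, and an mr registered with e.IBV_ACCESS_MW_BIND already exist:

```python
from pyverbs.mr import MWBindInfo, MWBind
import pyverbs.enums as e

# Describe the MR range the window should expose to remote peers
info = MWBindInfo(mr, mr.buf, mr.length, e.IBV_ACCESS_REMOTE_WRITE)
bind = MWBind(info, e.IBV_SEND_SIGNALED)
qp.bind_mw(mw, bind)  # poll the send CQ to consume the bind completion
# The peer may now target mw.rkey with RDMA writes into mr's buffer.
```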
+from libc.stdint cimport uintptr_t, uint32_t +from libc.stdlib cimport malloc import weakref import logging @@ -7,8 +9,8 @@ from pyverbs.pyverbs_error import PyverbsUserError, PyverbsError, \ PyverbsRDMAError from pyverbs.base import PyverbsRDMAErrno from pyverbs.base cimport close_weakrefs +from pyverbs.wr cimport copy_sg_array from pyverbs.device cimport Context -from libc.stdint cimport uintptr_t from pyverbs.cmid cimport CMID from .mr cimport MR, MW, DMMR from pyverbs.srq cimport SRQ @@ -18,19 +20,31 @@ from pyverbs.qp cimport QP cdef class PD(PyverbsCM): - def __init__(self, object creator not None): + def __init__(self, object creator not None, **kwargs): """ Initializes a PD object. A reference for the creating Context is kept so that Python's GC will destroy the objects in the right order. :param creator: The Context/CMID object creating the PD + :param kwargs: Arguments: + * *handle* + A valid kernel handle for a PD object in the given creator + (Context). If passed, the PD will be imported and associated + with the given handle in the given context using ibv_import_pd. """ super().__init__() + pd_handle = kwargs.get('handle') if issubclass(type(creator), Context): # Check if the ibv_pd* was initialized by an inheriting class if self.pd == NULL: - self.pd = v.ibv_alloc_pd((<Context>creator).context) + if pd_handle is not None: + self.pd = v.ibv_import_pd((<Context>creator).context, pd_handle) + self._is_imported = True + err_str = 'Failed to import PD' + else: + self.pd = v.ibv_alloc_pd((<Context>creator).context) + err_str = 'Failed to allocate PD' if self.pd == NULL: - raise PyverbsRDMAErrno('Failed to allocate PD') + raise PyverbsRDMAErrno(err_str) self.ctx = creator elif issubclass(type(creator), CMID): cmid = <CMID>creator @@ -41,7 +55,7 @@ cdef class PD(PyverbsCM): raise PyverbsUserError('Cannot create PD from {type}' .format(type=type(creator))) self.ctx.add_ref(self) - self.logger.debug('PD: Allocated ibv_pd') + self.logger.debug('Created PD') self.srqs = weakref.WeakSet() self.mrs = weakref.WeakSet() self.mws = weakref.WeakSet() @@ -49,6 +63,27 @@ cdef class PD(PyverbsCM): self.qps = weakref.WeakSet() self.parent_domains = weakref.WeakSet() + def advise_mr(self, advise, uint32_t flags, sg_list not None): + """ + Give advice or directions to the kernel about an address range + belonging to an MR. + :param advise: The requested advice value + :param flags: Describes the properties of the advise operation + :param sg_list: The scatter gather list + :return: 0 on success, otherwise PyverbsRDMAError will be raised + """ + num_sges = len(sg_list) + dst_sg_list = <v.ibv_sge *>malloc(num_sges * sizeof(v.ibv_sge)) + copy_sg_array(dst_sg_list, sg_list, num_sges) + rc = v.ibv_advise_mr(self.pd, advise, flags, dst_sg_list, num_sges) + if rc: + raise PyverbsRDMAError('Failed to advise MR', rc) + return rc + + def unimport(self): + v.ibv_unimport_pd(self.pd) + self.close() + def __dealloc__(self): """ Closes the inner PD. @@ -62,15 +97,18 @@ cdef class PD(PyverbsCM): PD may be deleted directly or indirectly by closing its context, which leaves the Python PD object without the underlying C object, so during destruction, need to check whether or not the C object exists. + In case of an imported PD no deallocation will be done, it's left to + the original PD, in order to prevent a double dealloc by the GC.
:return: None """ if self.pd != NULL: self.logger.debug('Closing PD') close_weakrefs([self.parent_domains, self.qps, self.ahs, self.mws, self.mrs, self.srqs]) - rc = v.ibv_dealloc_pd(self.pd) - if rc != 0: - raise PyverbsRDMAError('Failed to dealloc PD', rc) + if not self._is_imported: + rc = v.ibv_dealloc_pd(self.pd) + if rc != 0: + raise PyverbsRDMAError('Failed to dealloc PD', rc) self.pd = NULL self.ctx = None @@ -90,6 +128,10 @@ cdef class PD(PyverbsCM): else: raise PyverbsError('Unrecognized object type') + @property + def handle(self): + return self.pd.handle + cdef void *pd_alloc(v.ibv_pd *pd, void *pd_context, size_t size, size_t alignment, v.uint64_t resource_type): diff --git a/pyverbs/providers/efa/CMakeLists.txt b/pyverbs/providers/efa/CMakeLists.txt new file mode 100644 index 0000000..7e3a882 --- /dev/null +++ b/pyverbs/providers/efa/CMakeLists.txt @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright 2020 Amazon.com, Inc. or its affiliates. All rights reserved. + +rdma_cython_module(pyverbs/providers/efa efa + efadv.pyx +) diff --git a/pyverbs/providers/efa/__init__.pxd b/pyverbs/providers/efa/__init__.pxd new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/pyverbs/providers/efa/__init__.pxd diff --git a/pyverbs/providers/efa/__init__.py b/pyverbs/providers/efa/__init__.py new file mode 100644 index 0000000..e69de29 --- /dev/null +++ b/pyverbs/providers/efa/__init__.py diff --git a/pyverbs/providers/efa/efadv.pxd b/pyverbs/providers/efa/efadv.pxd new file mode 100644 index 0000000..c4013dd --- /dev/null +++ b/pyverbs/providers/efa/efadv.pxd @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright 2020 Amazon.com, Inc. or its affiliates. All rights reserved. + +#cython: language_level=3 + +cimport pyverbs.providers.efa.libefa as dv + +from pyverbs.addr cimport AH +from pyverbs.base cimport PyverbsObject +from pyverbs.device cimport Context +from pyverbs.qp cimport QP + + +cdef class EfaContext(Context): + pass + + +cdef class EfaDVDeviceAttr(PyverbsObject): + cdef dv.efadv_device_attr dv + + +cdef class EfaAH(AH): + pass + + +cdef class EfaDVAHAttr(PyverbsObject): + cdef dv.efadv_ah_attr ah_attr + + +cdef class SRDQP(QP): + pass diff --git a/pyverbs/providers/efa/efadv.pyx b/pyverbs/providers/efa/efadv.pyx new file mode 100644 index 0000000..dfbde9e --- /dev/null +++ b/pyverbs/providers/efa/efadv.pyx @@ -0,0 +1,150 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright 2020 Amazon.com, Inc. or its affiliates. All rights reserved. + +cimport pyverbs.providers.efa.efadv_enums as dve +cimport pyverbs.providers.efa.libefa as dv + +from pyverbs.base import PyverbsRDMAErrno, PyverbsRDMAError +from pyverbs.pd cimport PD +from pyverbs.qp cimport QP, QPInitAttr + + +def dev_cap_to_str(flags): + l = { + dve.EFADV_DEVICE_ATTR_CAPS_RDMA_READ: 'RDMA Read', + dve.EFADV_DEVICE_ATTR_CAPS_RNR_RETRY: 'RNR Retry', + } + return bitmask_to_str(flags, l) + + +def bitmask_to_str(bits, values): + numeric_bits = bits + flags = [] + for k, v in sorted(values.items()): + if bits & k: + flags.append(v) + bits -= k + if bits: + flags.append(f'??({bits:x})') + if not flags: + flags.append('None') + return ', '.join(flags) + f' ({numeric_bits:x})' + + +cdef class EfaContext(Context): + """ + Represent efa context, which extends Context. 
+ """ + def __init__(self, name=''): + """ + Open an efa device + :param name: The RDMA device's name (used by parent class) + :return: None + """ + super().__init__(name=name) + + def query_efa_device(self): + """ + Queries the provider for device-specific attributes. + :return: An EfaDVDeviceAttr containing the attributes. + """ + dv_attr = EfaDVDeviceAttr() + rc = dv.efadv_query_device(self.context, &dv_attr.dv, sizeof(dv_attr.dv)) + if rc: + raise PyverbsRDMAError(f'Failed to query efa device {self.name}', rc) + return dv_attr + + +cdef class EfaDVDeviceAttr(PyverbsObject): + """ + Represents efadv_context struct, which exposes efa-specific capabilities, + reported by efadv_query_device. + """ + @property + def comp_mask(self): + return self.dv.comp_mask + + @property + def max_sq_wr(self): + return self.dv.max_sq_wr + + @property + def max_rq_wr(self): + return self.dv.max_rq_wr + + @property + def max_sq_sge(self): + return self.dv.max_sq_sge + + @property + def max_rq_sge(self): + return self.dv.max_rq_sge + + @property + def inline_buf_size(self): + return self.dv.inline_buf_size + + @property + def device_caps(self): + return self.dv.device_caps + + @property + def max_rdma_size(self): + return self.dv.max_rdma_size + + def __str__(self): + print_format = '{:20}: {:<20}\n' + return print_format.format('comp_mask', self.dv.comp_mask) + \ + print_format.format('Max SQ WR', self.dv.max_sq_wr) + \ + print_format.format('Max RQ WR', self.dv.max_rq_wr) + \ + print_format.format('Max SQ SQE', self.dv.max_sq_sge) + \ + print_format.format('Max RQ SQE', self.dv.max_rq_sge) + \ + print_format.format('Inline buffer size', self.dv.inline_buf_size) + \ + print_format.format('Device Capabilities', dev_cap_to_str(self.dv.device_caps)) + \ + print_format.format('Max RDMA Size', self.dv.max_rdma_size) + + +cdef class EfaDVAHAttr(PyverbsObject): + """ + Represents efadv_ah_attr struct + """ + @property + def comp_mask(self): + return self.ah_attr.comp_mask + + @property + def ahn(self): + return self.ah_attr.ahn + + def __str__(self): + print_format = '{:20}: {:<20}\n' + return print_format.format('comp_mask', self.ah_attr.comp_mask) + \ + print_format.format('ahn', self.ah_attr.ahn) + + +cdef class EfaAH(AH): + def query_efa_ah(self): + """ + Queries the provider for EFA specific AH attributes. + :return: An EfaDVAHAttr containing the attributes. + """ + ah_attr = EfaDVAHAttr() + err = dv.efadv_query_ah(self.ah, &ah_attr.ah_attr, sizeof(ah_attr.ah_attr)) + if err: + raise PyverbsRDMAError('Failed to query efa ah', err) + return ah_attr + + +cdef class SRDQP(QP): + """ + Initializes an SRD QP according to the user-provided data. 
+ :param pd: PD object + :param init_attr: QPInitAttr object + :return: An initialized SRDQP + """ + def __init__(self, PD pd not None, QPInitAttr init_attr not None): + pd.add_ref(self) + self.qp = dv.efadv_create_driver_qp(pd.pd, &init_attr.attr, dve.EFADV_QP_DRIVER_TYPE_SRD) + if self.qp == NULL: + raise PyverbsRDMAErrno('Failed to create SRD QP') + super().__init__(pd, init_attr) diff --git a/pyverbs/providers/efa/efadv_enums.pxd b/pyverbs/providers/efa/efadv_enums.pxd new file mode 100644 index 0000000..b678c72 --- /dev/null +++ b/pyverbs/providers/efa/efadv_enums.pxd @@ -0,0 +1,13 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) + +#cython: language_level=3 + + +cdef extern from 'infiniband/efadv.h': + + cpdef enum: + EFADV_DEVICE_ATTR_CAPS_RDMA_READ + EFADV_DEVICE_ATTR_CAPS_RNR_RETRY + + cpdef enum: + EFADV_QP_DRIVER_TYPE_SRD diff --git a/pyverbs/providers/efa/libefa.pxd b/pyverbs/providers/efa/libefa.pxd new file mode 100644 index 0000000..6ebf3a7 --- /dev/null +++ b/pyverbs/providers/efa/libefa.pxd @@ -0,0 +1,31 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright 2020 Amazon.com, Inc. or its affiliates. All rights reserved. + +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t +cimport pyverbs.libibverbs as v + + +cdef extern from 'infiniband/efadv.h': + + cdef struct efadv_device_attr: + uint64_t comp_mask; + uint32_t max_sq_wr; + uint32_t max_rq_wr; + uint16_t max_sq_sge; + uint16_t max_rq_sge; + uint16_t inline_buf_size; + uint8_t reserved[2]; + uint32_t device_caps; + uint32_t max_rdma_size; + + cdef struct efadv_ah_attr: + uint64_t comp_mask; + uint16_t ahn; + uint8_t reserved[6]; + + int efadv_query_device(v.ibv_context *ibvctx, efadv_device_attr *attrs, + uint32_t inlen) + int efadv_query_ah(v.ibv_ah *ibvah, efadv_ah_attr *attr, + uint32_t inlen) + v.ibv_qp *efadv_create_driver_qp(v.ibv_pd *ibvpd, v.ibv_qp_init_attr *attr, + uint32_t driver_qp_type) diff --git a/pyverbs/providers/mlx5/libmlx5.pxd b/pyverbs/providers/mlx5/libmlx5.pxd index b346326..51b798f 100644 --- a/pyverbs/providers/mlx5/libmlx5.pxd +++ b/pyverbs/providers/mlx5/libmlx5.pxd @@ -3,7 +3,7 @@ include 'mlx5dv_enums.pxd' -from libc.stdint cimport uint16_t, uint32_t, uint64_t +from libc.stdint cimport uint8_t, uint16_t, uint32_t, uint64_t from libcpp cimport bool cimport pyverbs.libibverbs as v @@ -42,6 +42,7 @@ cdef extern from 'infiniband/mlx5dv.h': unsigned long max_clock_info_update_nsec unsigned int flow_action_flags unsigned int dc_odp_caps + uint8_t num_lag_ports cdef struct mlx5dv_dc_init_attr: mlx5dv_dc_type dc_type @@ -68,6 +69,16 @@ cdef extern from 'infiniband/mlx5dv.h': cdef struct mlx5dv_pp: uint16_t index + cdef struct mlx5dv_devx_uar: + void *reg_addr; + void *base_addr; + uint32_t page_id; + long mmap_off; + uint64_t comp_mask; + + cdef struct mlx5dv_qp_ex: + uint64_t comp_mask + bool mlx5dv_is_supported(v.ibv_device *device) v.ibv_context* mlx5dv_open_device(v.ibv_device *device, mlx5dv_context_attr *attr) @@ -76,6 +87,9 @@ cdef extern from 'infiniband/mlx5dv.h': v.ibv_qp *mlx5dv_create_qp(v.ibv_context *context, v.ibv_qp_init_attr_ex *qp_attr, mlx5dv_qp_init_attr *mlx5_qp_attr) + int mlx5dv_query_qp_lag_port(v.ibv_qp *qp, uint8_t *port_num, + uint8_t *active_port_num) + int mlx5dv_modify_qp_lag_port(v.ibv_qp *qp, uint8_t port_num) v.ibv_cq_ex *mlx5dv_create_cq(v.ibv_context *context, v.ibv_cq_init_attr_ex *cq_attr, mlx5dv_cq_init_attr *mlx5_cq_attr) @@ -85,3 +99,9 @@ cdef extern from 'infiniband/mlx5dv.h': mlx5dv_pp *mlx5dv_pp_alloc(v.ibv_context 
*context, size_t pp_context_sz, const void *pp_context, uint32_t flags) void mlx5dv_pp_free(mlx5dv_pp *pp) + mlx5dv_devx_uar *mlx5dv_devx_alloc_uar(v.ibv_context *context, + uint32_t flags) + void mlx5dv_devx_free_uar(mlx5dv_devx_uar *devx_uar) + void mlx5dv_wr_set_dc_addr(mlx5dv_qp_ex *mqp, v.ibv_ah *ah, + uint32_t remote_dctn, uint64_t remote_dc_key) + mlx5dv_qp_ex *mlx5dv_qp_ex_from_ibv_qp_ex(v.ibv_qp_ex *qp_ex) diff --git a/pyverbs/providers/mlx5/mlx5dv.pxd b/pyverbs/providers/mlx5/mlx5dv.pxd index 23af002..111c779 100644 --- a/pyverbs/providers/mlx5/mlx5dv.pxd +++ b/pyverbs/providers/mlx5/mlx5dv.pxd @@ -4,14 +4,13 @@ #cython: language_level=3 cimport pyverbs.providers.mlx5.libmlx5 as dv -from pyverbs.device cimport Context, VAR from pyverbs.base cimport PyverbsObject +from pyverbs.device cimport Context +from pyverbs.qp cimport QP, QPEx from pyverbs.cq cimport CQEX -from pyverbs.qp cimport QP cdef class Mlx5Context(Context): - cdef object pps cpdef close(self) cdef class Mlx5DVContextAttr(PyverbsObject): @@ -26,7 +25,7 @@ cdef class Mlx5DVDCInitAttr(PyverbsObject): cdef class Mlx5DVQPInitAttr(PyverbsObject): cdef dv.mlx5dv_qp_init_attr attr -cdef class Mlx5QP(QP): +cdef class Mlx5QP(QPEx): cdef object dc_type cdef class Mlx5DVCQInitAttr(PyverbsObject): @@ -35,11 +34,17 @@ cdef class Mlx5DVCQInitAttr(PyverbsObject): cdef class Mlx5CQ(CQEX): pass -cdef class Mlx5VAR(VAR): +cdef class Mlx5VAR(PyverbsObject): cdef dv.mlx5dv_var *var + cdef object context cpdef close(self) cdef class Mlx5PP(PyverbsObject): cdef dv.mlx5dv_pp *pp cdef object context cpdef close(self) + +cdef class Mlx5UAR(PyverbsObject): + cdef dv.mlx5dv_devx_uar *uar + cdef object context + cpdef close(self) diff --git a/pyverbs/providers/mlx5/mlx5dv.pyx b/pyverbs/providers/mlx5/mlx5dv.pyx index 7ea6fbb..59d2435 100644 --- a/pyverbs/providers/mlx5/mlx5dv.pyx +++ b/pyverbs/providers/mlx5/mlx5dv.pyx @@ -1,19 +1,20 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) # Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 
See COPYING file +from libc.stdint cimport uintptr_t, uint8_t import logging -from pyverbs.pyverbs_error import PyverbsUserError +from pyverbs.pyverbs_error import PyverbsUserError, PyverbsRDMAError cimport pyverbs.providers.mlx5.mlx5dv_enums as dve cimport pyverbs.providers.mlx5.libmlx5 as dv +from pyverbs.qp cimport QPInitAttrEx, QPEx from pyverbs.base import PyverbsRDMAErrno from pyverbs.base cimport close_weakrefs cimport pyverbs.libibverbs_enums as e -from pyverbs.qp cimport QPInitAttrEx from pyverbs.cq cimport CqInitAttrEx cimport pyverbs.libibverbs as v +from pyverbs.addr cimport AH from pyverbs.pd cimport PD -import weakref cdef class Mlx5DVContextAttr(PyverbsObject): @@ -60,7 +61,6 @@ cdef class Mlx5Context(Context): super().__init__(name=name, attr=attr) if not dv.mlx5dv_is_supported(self.device): raise PyverbsUserError('This is not an MLX5 device') - self.pps = weakref.WeakSet() self.context = dv.mlx5dv_open_device(self.device, &attr.attr) if self.context == NULL: raise PyverbsRDMAErrno('Failed to open mlx5 context on {dev}' @@ -92,12 +92,6 @@ cdef class Mlx5Context(Context): format(name=self.name, rc=rc)) return dv_attr - cdef add_ref(self, obj): - if isinstance(obj, Mlx5PP): - self.pps.add(obj) - else: - super().add_ref(obj) - def __dealloc__(self): self.close() @@ -159,6 +153,10 @@ cdef class Mlx5DVContext(PyverbsObject): def dc_odp_caps(self): return self.dv.dc_odp_caps + @property + def num_lag_ports(self): + return self.dv.num_lag_ports + def __str__(self): print_format = '{:20}: {:<20}\n' ident_format = ' {:20}: {:<20}\n' @@ -197,7 +195,8 @@ cdef class Mlx5DVContext(PyverbsObject): self.dv.max_clock_info_update_nsec) +\ print_format.format('Flow action flags', self.dv.flow_action_flags) +\ - print_format.format('DC ODP caps', self.dv.dc_odp_caps) + print_format.format('DC ODP caps', self.dv.dc_odp_caps) +\ + print_format.format('Num LAG ports', self.dv.num_lag_ports) cdef class Mlx5DVDCInitAttr(PyverbsObject): @@ -310,7 +309,7 @@ cdef class Mlx5DVQPInitAttr(PyverbsObject): self.attr.dc_init_attr.dct_access_key = val -cdef class Mlx5QP(QP): +cdef class Mlx5QP(QPEx): def __init__(self, Mlx5Context context, QPInitAttrEx init_attr, Mlx5DVQPInitAttr dv_init_attr): """ @@ -358,6 +357,42 @@ cdef class Mlx5QP(QP): return super()._get_comp_mask(dst) return masks[self.dc_type][dst] | e.IBV_QP_STATE + def wr_set_dc_addr(self, AH ah, remote_dctn, remote_dc_key): + """ + Attach DC info to the last work request. + :param ah: Address Handle to the requested DCT. + :param remote_dctn: The remote DCT number. + :param remote_dc_key: The remote DC key. + """ + dv.mlx5dv_wr_set_dc_addr(dv.mlx5dv_qp_ex_from_ibv_qp_ex(self.qp_ex), + ah.ah, remote_dctn, remote_dc_key) + + @staticmethod + def query_lag_port(QP qp): + """ + Queries the port number that the QP desires to use, and the port + that is currently used by the bond for this QP. + :param qp: The QP to query. + :return: Tuple of the desired port and the actual port used by the HW. + """ + cdef uint8_t port_num + cdef uint8_t active_port_num + rc = dv.mlx5dv_query_qp_lag_port(qp.qp, &port_num, &active_port_num) + if rc != 0: + raise PyverbsRDMAError(f'Failed to query QP #{qp.qp.qp_num}', rc) + return port_num, active_port_num + + @staticmethod + def modify_lag_port(QP qp, uint8_t port_num): + """ + Modifies the LAG port number that the QP desires to use. + :param qp: The QP to modify. + :param port_num: The desired port to be used by the QP to send traffic + in a LAG configuration.
+ """ + rc = dv.mlx5dv_modify_qp_lag_port(qp.qp, port_num) + if rc != 0: + raise PyverbsRDMAError(f'Failed to modify lag of QP #{qp.qp.qp_num}', rc) cdef class Mlx5DVCQInitAttr(PyverbsObject): """ @@ -574,12 +609,13 @@ def send_ops_flags_to_str(flags): return bitmask_to_str(flags, l) -cdef class Mlx5VAR(VAR): +cdef class Mlx5VAR(PyverbsObject): def __init__(self, Context context not None, flags=0): + self.context = context self.var = dv.mlx5dv_alloc_var(context.context, flags) if self.var == NULL: raise PyverbsRDMAErrno('Failed to allocate VAR') - context.add_ref(self) + context.vars.add(self) def __dealloc__(self): self.close() @@ -632,7 +668,7 @@ cdef class Mlx5PP(PyverbsObject): pp_ctx_bytes, flags) if self.pp == NULL: raise PyverbsRDMAErrno('Failed to allocate packet pacing entry') - (context).add_ref(self) + context.pps.add(self) def __dealloc__(self): self.close() @@ -645,3 +681,47 @@ cdef class Mlx5PP(PyverbsObject): @property def index(self): return self.pp.index + + +cdef class Mlx5UAR(PyverbsObject): + def __init__(self, Context context not None, flags=0): + self.uar = dv.mlx5dv_devx_alloc_uar(context.context, flags) + if self.uar == NULL: + raise PyverbsRDMAErrno('Failed to allocate UAR') + context.uars.add(self) + + def __dealloc__(self): + self.close() + + cpdef close(self): + if self.uar != NULL: + dv.mlx5dv_devx_free_uar(self.uar) + self.uar = NULL + + def __str__(self): + print_format = '{:20}: {:<20}\n' + return print_format.format('reg addr', self.uar.reg_addr) +\ + print_format.format('base addr', self.uar.base_addr) +\ + print_format.format('page id', self.uar.page_id) +\ + print_format.format('mmap off', self.uar.mmap_off) +\ + print_format.format('comp mask', self.uar.comp_mask) + + @property + def reg_addr(self): + return self.uar.reg_addr + + @property + def base_addr(self): + return self.uar.base_addr + + @property + def page_id(self): + return self.uar.page_id + + @property + def mmap_off(self): + return self.uar.mmap_off + + @property + def comp_mask(self): + return self.uar.comp_mask diff --git a/pyverbs/providers/mlx5/mlx5dv_enums.pxd b/pyverbs/providers/mlx5/mlx5dv_enums.pxd index 2c12ddb..2349fdf 100644 --- a/pyverbs/providers/mlx5/mlx5dv_enums.pxd +++ b/pyverbs/providers/mlx5/mlx5dv_enums.pxd @@ -16,6 +16,7 @@ cdef extern from 'infiniband/mlx5dv.h': MLX5DV_CONTEXT_MASK_DYN_BFREGS = 1 << 4 MLX5DV_CONTEXT_MASK_CLOCK_INFO_UPDATE = 1 << 5 MLX5DV_CONTEXT_MASK_FLOW_ACTION_FLAGS = 1 << 6 + MLX5DV_CONTEXT_MASK_NUM_LAG_PORTS = 1 << 9 cpdef enum mlx5dv_context_flags: MLX5DV_CONTEXT_FLAGS_CQE_V1 = 1 << 0 @@ -84,6 +85,8 @@ cdef extern from 'infiniband/mlx5dv.h': cpdef unsigned long long MLX5DV_RES_TYPE_DBR cpdef unsigned long long MLX5DV_RES_TYPE_SRQ cpdef unsigned long long MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX + cpdef unsigned long long MLX5DV_UAR_ALLOC_TYPE_BF + cpdef unsigned long long MLX5DV_UAR_ALLOC_TYPE_NC _MLX5DV_RES_TYPE_QP = MLX5DV_RES_TYPE_QP @@ -91,3 +94,5 @@ _MLX5DV_RES_TYPE_RWQ = MLX5DV_RES_TYPE_RWQ _MLX5DV_RES_TYPE_DBR = MLX5DV_RES_TYPE_DBR _MLX5DV_RES_TYPE_SRQ = MLX5DV_RES_TYPE_SRQ _MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX = MLX5DV_PP_ALLOC_FLAGS_DEDICATED_INDEX +_MLX5DV_UAR_ALLOC_TYPE_BF = MLX5DV_UAR_ALLOC_TYPE_BF +_MLX5DV_UAR_ALLOC_TYPE_NC = MLX5DV_UAR_ALLOC_TYPE_NC diff --git a/pyverbs/qp.pxd b/pyverbs/qp.pxd index 209a243..01f7b38 100644 --- a/pyverbs/qp.pxd +++ b/pyverbs/qp.pxd @@ -37,9 +37,15 @@ cdef class QP(PyverbsCM): cdef update_cqs(self, init_attr) cdef object scq cdef object rcq + cdef object mws + cdef object srq + cdef add_ref(self, obj) cdef 
class DataBuffer(PyverbsCM): cdef v.ibv_data_buf data cdef class QPEx(QP): cdef v.ibv_qp_ex *qp_ex + +cdef class ECE(PyverbsCM): + cdef v.ibv_ece ece diff --git a/pyverbs/qp.pyx b/pyverbs/qp.pyx index 95ef554..24dfc66 100755 --- a/pyverbs/qp.pyx +++ b/pyverbs/qp.pyx @@ -3,15 +3,16 @@ from libc.stdlib cimport malloc, free from libc.string cimport memcpy +import weakref +from pyverbs.pyverbs_error import PyverbsUserError, PyverbsError, PyverbsRDMAError from pyverbs.utils import gid_str, qp_type_to_str, qp_state_to_str, mtu_to_str -from pyverbs.pyverbs_error import PyverbsUserError, PyverbsError, \ - PyverbsRDMAError from pyverbs.utils import access_flags_to_str, mig_state_to_str -from pyverbs.base import PyverbsRDMAErrno +from pyverbs.mr cimport MW, MWBindInfo, MWBind from pyverbs.wr cimport RecvWR, SendWR, SGE +from pyverbs.base import PyverbsRDMAErrno from pyverbs.addr cimport AHAttr, GID, AH -from pyverbs.mr cimport MW, MWBindInfo +from pyverbs.base cimport close_weakrefs cimport pyverbs.libibverbs_enums as e from pyverbs.addr cimport GlobalRoute from pyverbs.device cimport Context @@ -871,6 +872,49 @@ cdef class QPAttr(PyverbsObject): print_format.format('Rate limit', self.attr.rate_limit) +cdef class ECE(PyverbsCM): + def __init__(self, vendor_id=0, options=0, comp_mask=0): + """ + :param vendor_id: Unique identifier of the provider vendor. + :param options: Provider specific attributes which are supported or + needed to be enabled by ECE users. + :param comp_mask: A bitmask specifying which ECE options should be + valid. + """ + super().__init__() + self.ece.vendor_id = vendor_id + self.ece.options = options + self.ece.comp_mask = comp_mask + + @property + def vendor_id(self): + return self.ece.vendor_id + @vendor_id.setter + def vendor_id(self, val): + self.ece.vendor_id = val + + @property + def options(self): + return self.ece.options + @options.setter + def options(self, val): + self.ece.options = val + + @property + def comp_mask(self): + return self.ece.comp_mask + @comp_mask.setter + def comp_mask(self, val): + self.ece.comp_mask = val + + def __str__(self): + print_format = '{:22}: 0x{:<20x}\n' + return 'ECE:\n' +\ + print_format.format('Vendor ID', self.ece.vendor_id) +\ + print_format.format('Options', self.ece.options) +\ + print_format.format('Comp Mask', self.ece.comp_mask) + + cdef class QP(PyverbsCM): def __init__(self, object creator not None, object init_attr not None, QPAttr qp_attr=None): @@ -898,6 +942,7 @@ cdef class QP(PyverbsCM): cdef PD pd cdef Context ctx super().__init__() + self.mws = weakref.WeakSet() self.update_cqs(init_attr) # QP initialization was not done by the provider, we should do it here if self.qp == NULL: @@ -927,6 +972,10 @@ cdef class QP(PyverbsCM): self.pd = pd pd.add_ref(self) self.context = None + if init_attr.srq is not None: + srq = init_attr.srq + srq.add_ref(self) + self.srq = srq if qp_attr is not None: funcs = {e.IBV_QPT_RC: self.to_init, e.IBV_QPT_UC: self.to_init, @@ -964,12 +1013,19 @@ cdef class QP(PyverbsCM): def _create_qp_ex(self, Context ctx, QPInitAttrEx attr): self.qp = v.ibv_create_qp_ex(ctx.context, &attr.attr) + cdef add_ref(self, obj): + if isinstance(obj, MW): + self.mws.add(obj) + else: + raise PyverbsError('Unrecognized object type') + def __dealloc__(self): self.close() cpdef close(self): if self.qp != NULL: self.logger.debug('Closing QP') + close_weakrefs([self.mws]) rc = v.ibv_destroy_qp(self.qp) if rc: raise PyverbsRDMAError('Failed to destroy QP', rc) @@ -1133,6 +1189,41 @@ cdef class QP(PyverbsCM): 
memcpy(&bad_wr.send_wr, my_bad_wr, sizeof(bad_wr.send_wr)) raise PyverbsRDMAError('Failed to post send', rc) + def set_ece(self, ECE ece): + """ + Set ECE options and use them for QP configuration stage + :param ece: The requested ECE values. + :return: None + """ + if ece.ece.vendor_id == 0: + return + + rc = v.ibv_set_ece(self.qp, &ece.ece) + if rc != 0: + raise PyverbsRDMAError('Failed to set ECE', rc) + + def query_ece(self): + """ + Query QPs ECE options + :return: ECE object with this QP ece configuration. + """ + ece = ECE() + rc = v.ibv_query_ece(self.qp, &ece.ece) + if rc != 0: + raise PyverbsRDMAError('Failed to query ECE', rc) + return ece + + def bind_mw(self, MW mw not None, MWBind mw_bind): + """ + Bind Memory window type 1. + :param mw: The memory window to bind. + :param mw_bind: MWBind object, includes the bind attributes. + :return: None + """ + rc = v.ibv_bind_mw(self.qp, mw.mw, &mw_bind.mw_bind) + if rc != 0: + raise PyverbsRDMAError('Failed to Bind MW', rc) + @property def qp_type(self): return self.qp.qp_type @@ -1169,9 +1260,12 @@ cdef class QPEx(QP): :return: An initialized QPEx object """ super().__init__(creator, init_attr, qp_attr) - self.qp_ex = v.ibv_qp_to_qp_ex(self.qp) - if self.qp_ex == NULL: - raise PyverbsRDMAErrno('Failed to create extended QP') + if init_attr.comp_mask & v.IBV_QP_INIT_ATTR_SEND_OPS_FLAGS: + self.qp_ex = v.ibv_qp_to_qp_ex(self.qp) + if self.qp_ex == NULL: + raise PyverbsRDMAErrno('Failed to create extended QP') + else: + self.logger.debug('qp_ex is not accessible since IBV_QP_INIT_ATTR_SEND_OPS_FLAGS was not passed.') @property def comp_mask(self): @@ -1205,6 +1299,7 @@ cdef class QPEx(QP): info = &bind_info.info v.ibv_wr_bind_mw(self.qp_ex, mw.mw, rkey, info) + self.add_ref(mw) def wr_local_inv(self, invalidate_rkey): v.ibv_wr_local_inv(self.qp_ex, invalidate_rkey) diff --git a/pyverbs/srq.pxd b/pyverbs/srq.pxd index a7b7b34..89018f9 100755 --- a/pyverbs/srq.pxd +++ b/pyverbs/srq.pxd @@ -21,4 +21,6 @@ cdef class SrqInitAttrEx(PyverbsObject): cdef class SRQ(PyverbsCM): cdef v.ibv_srq *srq cdef object cq + cdef object qps + cdef add_ref(self, obj) cpdef close(self) diff --git a/pyverbs/srq.pyx b/pyverbs/srq.pyx index 826579a..baa877b 100755 --- a/pyverbs/srq.pyx +++ b/pyverbs/srq.pyx @@ -1,9 +1,12 @@ -from pyverbs.pyverbs_error import PyverbsRDMAError +import weakref +from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsError from pyverbs.base import PyverbsRDMAErrno +from pyverbs.base cimport close_weakrefs from pyverbs.device cimport Context from pyverbs.cq cimport CQEX, CQ from pyverbs.xrcd cimport XRCD from pyverbs.wr cimport RecvWR +from pyverbs.qp cimport QP from pyverbs.pd cimport PD from libc.errno cimport errno from libc.string cimport memcpy @@ -129,6 +132,7 @@ cdef class SRQ(PyverbsCM): super().__init__() self.srq = NULL self.cq = None + self.qps = weakref.WeakSet() if isinstance(creator, PD): self._create_srq(creator, attr) elif type(creator) == Context: @@ -146,14 +150,22 @@ cdef class SRQ(PyverbsCM): cpdef close(self): if self.srq != NULL: self.logger.debug('Closing SRQ') + close_weakrefs([self.qps]) rc = v.ibv_destroy_srq(self.srq) if rc != 0: raise PyverbsRDMAError('Failed to destroy SRQ', rc) self.srq = NULL self.cq =None + cdef add_ref(self, obj): + if isinstance(obj, QP): + self.qps.add(obj) + else: + raise PyverbsError('Unrecognized object type') + def _create_srq(self, PD pd, SrqInitAttr init_attr): self.srq = v.ibv_create_srq(pd.pd, &init_attr.attr) + pd.add_ref(self) def _create_srq_ex(self, Context context, 
SrqInitAttrEx init_attr_ex): self.srq = v.ibv_create_srq_ex(context.context, &init_attr_ex.attr) diff --git a/pyverbs/wr.pxd b/pyverbs/wr.pxd index ea054ff..8c37c20 100644 --- a/pyverbs/wr.pxd +++ b/pyverbs/wr.pxd @@ -17,3 +17,5 @@ cdef class RecvWR(PyverbsCM): cdef class SendWR(PyverbsCM): cdef v.ibv_send_wr send_wr cdef object ah + +cdef copy_sg_array(v.ibv_sge *dst, sg, num_sge) diff --git a/pyverbs/wr.pyx b/pyverbs/wr.pyx index 37028c5..52e9d55 100644 --- a/pyverbs/wr.pyx +++ b/pyverbs/wr.pyx @@ -2,8 +2,10 @@ # Copyright (c) 2019 Mellanox Technologies Inc. All rights reserved. See COPYING file from pyverbs.pyverbs_error import PyverbsUserError, PyverbsError -from pyverbs.base import PyverbsRDMAErrno +from pyverbs.base import PyverbsRDMAErrno, inc_rkey +from pyverbs.mr cimport MW, MR, MWBindInfo cimport pyverbs.libibverbs_enums as e +cimport pyverbs.libibverbs as v from pyverbs.addr cimport AH from libc.stdlib cimport free, malloc from libc.string cimport memcpy @@ -143,21 +145,25 @@ cdef class RecvWR(PyverbsCM): cdef class SendWR(PyverbsCM): - def __init__(self, wr_id=0, opcode=e.IBV_WR_SEND, num_sge=0, sg = None, - send_flags=e.IBV_SEND_SIGNALED, SendWR next_wr = None): + def __init__(self, wr_id=0, opcode=e.IBV_WR_SEND, num_sge=0, imm_data=0, + sg = None, send_flags=e.IBV_SEND_SIGNALED, + SendWR next_wr = None): """ Initialize a SendWR object with user-provided or default values. :param wr_id: A user-defined WR ID :param opcode: The WR's opcode :param num_sge: Number of scatter-gather elements in the WR - :param send_flags: Send flags as define in ibv_send_flags enum + :param imm_data: Immediate data :param sg: A SGE element, head of the scatter-gather list + :param send_flags: Send flags as defined in the ibv_send_flags enum :return: An initialized SendWR object """ cdef v.ibv_sge *dst super().__init__() - if num_sge < 1 or sg is None: + mw_opcodes = [e.IBV_WR_LOCAL_INV, e.IBV_WR_BIND_MW, + e.IBV_WR_SEND_WITH_INV] + if opcode not in mw_opcodes and (num_sge < 1 or sg is None): raise PyverbsUserError('A WR needs at least one SGE') self.send_wr.sg_list = <v.ibv_sge *>malloc(num_sge * sizeof(v.ibv_sge)) if self.send_wr.sg_list == NULL: @@ -170,6 +176,7 @@ cdef class SendWR(PyverbsCM): self.send_wr.next = &next_wr.send_wr self.send_wr.opcode = opcode self.send_wr.send_flags = send_flags + self.send_wr.imm_data = imm_data self.ah = None def __dealloc(self): @@ -184,7 +191,8 @@ cdef class SendWR(PyverbsCM): print_format.format('Num SGE', self.send_wr.num_sge) +\ print_format.format('Opcode', self.send_wr.opcode) +\ print_format.format('Send flags', - send_flags_to_str(self.send_wr.send_flags)) + send_flags_to_str(self.send_wr.send_flags)) +\ + print_format.format('Imm Data', self.send_wr.imm_data) @property def next_wr(self): @@ -212,6 +220,13 @@ cdef class SendWR(PyverbsCM): self.send_wr.num_sge = val @property + def imm_data(self): + return self.send_wr.imm_data + @imm_data.setter + def imm_data(self, val): + self.send_wr.imm_data = val + + @property def opcode(self): return self.send_wr.opcode @opcode.setter @@ -273,6 +288,23 @@ cdef class SendWR(PyverbsCM): self.send_wr.wr.atomic.compare_add = compare_add self.send_wr.wr.atomic.swap = swap + def set_bind_wr(self, MW mw, MWBindInfo bind_info): + """ + Set the members of the bind_mw struct in the send_wr. + :param mw: The MW to bind. + :param bind_info: MWBindInfo object, includes the bind attributes. + :return: None + """ + self.send_wr.bind_mw.mw = mw.mw + # Create the new key from the MW rkey.
+ rkey = inc_rkey(mw.rkey) + self.send_wr.bind_mw.rkey = rkey + self.send_wr.bind_mw.bind_info = bind_info.info + + @property + def rkey(self): + return self.send_wr.bind_mw.rkey + def set_qp_type_xrc(self, remote_srqn): """ Set the members of the xrc struct in the send_wr's qp_type union, used diff --git a/redhat/rdma-core.spec b/redhat/rdma-core.spec index dcd6ad3..ec0c63d 100644 --- a/redhat/rdma-core.spec +++ b/redhat/rdma-core.spec @@ -1,5 +1,5 @@ Name: rdma-core -Version: 29.0 +Version: 32.0 Release: 1%{?dist} Summary: RDMA core userspace libraries and daemons @@ -28,7 +28,7 @@ BuildRequires: valgrind-devel BuildRequires: systemd BuildRequires: systemd-devel %if 0%{?fedora} >= 32 -%define with_pyverbs %{?_with_pyverbs: 0} %{?!_with_pyverbs: 1} +%define with_pyverbs %{?_with_pyverbs: 1} %{?!_with_pyverbs: %{?!_without_pyverbs: 1} %{?_without_pyverbs: 0}} %else %define with_pyverbs %{?_with_pyverbs: 1} %{?!_with_pyverbs: 0} %endif @@ -299,7 +299,7 @@ easy, object-oriented access to IB verbs. -DCMAKE_INSTALL_SYSTEMD_SERVICEDIR:PATH=%{_unitdir} \ -DCMAKE_INSTALL_INITDDIR:PATH=%{_initrddir} \ -DCMAKE_INSTALL_RUNDIR:PATH=%{_rundir} \ - -DCMAKE_INSTALL_DOCDIR:PATH=%{_docdir}/%{name}-%{version} \ + -DCMAKE_INSTALL_DOCDIR:PATH=%{_docdir}/%{name} \ -DCMAKE_INSTALL_UDEV_RULESDIR:PATH=%{_udevrulesdir} \ -DCMAKE_INSTALL_PERLDIR:PATH=%{perl_vendorlib} \ -DENABLE_IBDIAGS_COMPAT:BOOL=True \ @@ -331,16 +331,12 @@ mkdir -p %{buildroot}%{_libexecdir} mkdir -p %{buildroot}%{_udevrulesdir} mkdir -p %{buildroot}%{dracutlibdir}/modules.d/05rdma mkdir -p %{buildroot}%{sysmodprobedir} -install -D -m0644 redhat/rdma.conf %{buildroot}/%{_sysconfdir}/rdma/rdma.conf -install -D -m0644 redhat/rdma.sriov-vfs %{buildroot}/%{_sysconfdir}/rdma/sriov-vfs install -D -m0644 redhat/rdma.mlx4.conf %{buildroot}/%{_sysconfdir}/rdma/mlx4.conf -install -D -m0644 redhat/rdma.service %{buildroot}%{_unitdir}/rdma.service install -D -m0755 redhat/rdma.modules-setup.sh %{buildroot}%{dracutlibdir}/modules.d/05rdma/module-setup.sh -install -D -m0644 redhat/rdma.udev-rules %{buildroot}%{_udevrulesdir}/98-rdma.rules install -D -m0644 redhat/rdma.mlx4.sys.modprobe %{buildroot}%{sysmodprobedir}/libmlx4.conf -install -D -m0755 redhat/rdma.kernel-init %{buildroot}%{_libexecdir}/rdma-init-kernel -install -D -m0755 redhat/rdma.sriov-init %{buildroot}%{_libexecdir}/rdma-set-sriov-vf install -D -m0755 redhat/rdma.mlx4-setup.sh %{buildroot}%{_libexecdir}/mlx4-setup.sh +rm -f %{buildroot}%{_sysconfdir}/rdma/modules/rdma.conf +install -D -m0644 redhat/rdma.conf %{buildroot}%{_sysconfdir}/rdma/modules/rdma.conf # ibacm bin/ib_acme -D . 
-O @@ -391,25 +387,22 @@ rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh %files %dir %{_sysconfdir}/rdma -%dir %{_docdir}/%{name}-%{version} -%doc %{_docdir}/%{name}-%{version}/README.md -%doc %{_docdir}/%{name}-%{version}/rxe.md -%doc %{_docdir}/%{name}-%{version}/udev.md -%doc %{_docdir}/%{name}-%{version}/tag_matching.md +%dir %{_docdir}/%{name} +%doc %{_docdir}/%{name}/README.md +%doc %{_docdir}/%{name}/rxe.md +%doc %{_docdir}/%{name}/udev.md +%doc %{_docdir}/%{name}/tag_matching.md %config(noreplace) %{_sysconfdir}/rdma/mlx4.conf %config(noreplace) %{_sysconfdir}/rdma/modules/infiniband.conf %config(noreplace) %{_sysconfdir}/rdma/modules/iwarp.conf %config(noreplace) %{_sysconfdir}/rdma/modules/opa.conf %config(noreplace) %{_sysconfdir}/rdma/modules/rdma.conf %config(noreplace) %{_sysconfdir}/rdma/modules/roce.conf -%config(noreplace) %{_sysconfdir}/rdma/rdma.conf -%config(noreplace) %{_sysconfdir}/rdma/sriov-vfs %config(noreplace) %{_sysconfdir}/udev/rules.d/* %config(noreplace) %{_sysconfdir}/modprobe.d/mlx4.conf %config(noreplace) %{_sysconfdir}/modprobe.d/truescale.conf %{_unitdir}/rdma-hw.target %{_unitdir}/rdma-load-modules@.service -%{_unitdir}/rdma.service %dir %{dracutlibdir}/modules.d/05rdma %{dracutlibdir}/modules.d/05rdma/module-setup.sh %{_udevrulesdir}/../rdma_rename @@ -419,10 +412,7 @@ rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh %{_udevrulesdir}/90-rdma-hw-modules.rules %{_udevrulesdir}/90-rdma-ulp-modules.rules %{_udevrulesdir}/90-rdma-umad.rules -%{_udevrulesdir}/98-rdma.rules %{sysmodprobedir}/libmlx4.conf -%{_libexecdir}/rdma-init-kernel -%{_libexecdir}/rdma-set-sriov-vf %{_libexecdir}/mlx4-setup.sh %{_libexecdir}/truescale-serdes.cmds %{_sbindir}/rdma-ndd @@ -432,7 +422,7 @@ rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh %license COPYING.* %files devel -%doc %{_docdir}/%{name}-%{version}/MAINTAINERS +%doc %{_docdir}/%{name}/MAINTAINERS %dir %{_includedir}/infiniband %dir %{_includedir}/rdma %{_includedir}/infiniband/* @@ -573,7 +563,7 @@ rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh %{_libdir}/libmlx5.so.* %{_libdir}/libmlx4.so.* %config(noreplace) %{_sysconfdir}/libibverbs.d/*.driver -%doc %{_docdir}/%{name}-%{version}/libibverbs.md +%doc %{_docdir}/%{name}/libibverbs.md %files -n libibverbs-utils %{_bindir}/ibv_* @@ -591,7 +581,7 @@ rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh %{_unitdir}/ibacm.socket %dir %{_libdir}/ibacm %{_libdir}/ibacm/* -%doc %{_docdir}/%{name}-%{version}/ibacm.md +%doc %{_docdir}/%{name}/ibacm.md %files -n iwpmd %{_sbindir}/iwpmd @@ -609,7 +599,7 @@ rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh %{_libdir}/librdmacm*.so.* %dir %{_libdir}/rsocket %{_libdir}/rsocket/*.so* -%doc %{_docdir}/%{name}-%{version}/librdmacm.md +%doc %{_docdir}/%{name}/librdmacm.md %{_mandir}/man7/rsocket.* %files -n librdmacm-utils @@ -654,10 +644,10 @@ rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh %{_mandir}/man5/srp_daemon_port@.service.5* %{_mandir}/man8/ibsrpdm.8* %{_mandir}/man8/srp_daemon.8* -%doc %{_docdir}/%{name}-%{version}/ibsrpdm.md +%doc %{_docdir}/%{name}/ibsrpdm.md %if %{with_pyverbs} %files -n python3-pyverbs %{python3_sitearch}/pyverbs -%{_docdir}/%{name}-%{version}/tests/*.py +%{_docdir}/%{name}/tests/*.py %endif diff --git a/redhat/rdma.conf b/redhat/rdma.conf index f5b74b2..4e2901b 100644 --- a/redhat/rdma.conf +++ b/redhat/rdma.conf @@ -1,18 +1,24 @@ -# Load IPoIB -IPOIB_LOAD=yes -# Load SRP (SCSI Remote Protocol initiator support) module -SRP_LOAD=yes -# Load SRPT (SCSI Remote Protocol target support) module -SRPT_LOAD=yes -# Load iSER (iSCSI 
over RDMA initiator support) module -ISER_LOAD=yes -# Load iSERT (iSCSI over RDMA target support) module -ISERT_LOAD=yes -# Load RDS (Reliable Datagram Service) network protocol -RDS_LOAD=no -# Load NFSoRDMA client transport module -XPRTRDMA_LOAD=yes -# Load NFSoRDMA server transport module -SVCRDMA_LOAD=no -# Load Tech Preview device driver modules -TECH_PREVIEW_LOAD=no +# These modules are loaded by the system if any RDMA devices is installed +# iSCSI over RDMA client support +ib_iser + +# iSCSI over RDMA target support +ib_isert + +# SCSI RDMA Protocol target driver +ib_srpt + +# User access to RDMA verbs (supports libibverbs) +ib_uverbs + +# User access to RDMA connection management (supports librdmacm) +rdma_ucm + +# RDS over RDMA support +# rds_rdma + +# NFS over RDMA client support +xprtrdma + +# NFS over RDMA server support +svcrdma diff --git a/redhat/rdma.modules-setup.sh b/redhat/rdma.modules-setup.sh index 803fc60..97b33b0 100644 --- a/redhat/rdma.modules-setup.sh +++ b/redhat/rdma.modules-setup.sh @@ -11,16 +11,21 @@ depends() { } install() { - inst /etc/rdma/rdma.conf inst /etc/rdma/mlx4.conf - inst /etc/rdma/sriov-vfs - inst /usr/libexec/rdma-init-kernel + inst /etc/rdma/modules/infiniband.conf + inst /etc/rdma/modules/iwarp.conf + inst /etc/rdma/modules/opa.conf + inst /etc/rdma/modules/rdma.conf + inst /etc/rdma/modules/roce.conf inst /usr/libexec/mlx4-setup.sh - inst /usr/libexec/rdma-set-sriov-vf inst /usr/lib/modprobe.d/libmlx4.conf inst_multiple lspci setpci awk sleep inst_multiple -o /etc/modprobe.d/mlx4.conf - inst_rules 98-rdma.rules 70-persistent-ipoib.rules + inst_rules 60-rdma-ndd.rules 60-rdma-persistent-naming.rules 70-persistent-ipoib.rules 75-rdma-description.rules 90-rdma-hw-modules.rules 90-rdma-ulp-modules.rules 90-rdma-umad.rules + inst_multiple -o \ + $systemdsystemunitdir/rdma-hw.target \ + $systemdsystemunitdir/rdma-load-modules@.service \ + $systemdsystemunitdir/rdma-ndd.service } installkernel() { diff --git a/srp_daemon/srp_daemon.c b/srp_daemon/srp_daemon.c index f14d9f5..bf689b1 100644 --- a/srp_daemon/srp_daemon.c +++ b/srp_daemon/srp_daemon.c @@ -133,7 +133,7 @@ static int check_process_uniqueness(struct config_t *conf) char path[256]; int fd; - snprintf(path, sizeof(path), SRP_DEAMON_LOCK_PREFIX "_%s_%d", + snprintf(path, sizeof(path), SRP_DAEMON_LOCK_PREFIX "_%s_%d", conf->dev_name, conf->port_num); if ((fd = open(path, O_CREAT|O_RDWR, @@ -142,7 +142,6 @@ static int check_process_uniqueness(struct config_t *conf) return -1; } - fchmod(fd, S_IRUSR|S_IRGRP|S_IROTH|S_IWUSR|S_IWGRP|S_IWOTH); if (0 != lockf(fd, F_TLOCK, 0)) { pr_err("failed to lock %s (errno: %d). possibly another " "srp_daemon is locking it\n", path, errno); @@ -231,7 +230,7 @@ static void usage(const char *argv0) fprintf(stderr, "-R perform complete Rescan every seconds\n"); fprintf(stderr, "-T Retries to connect to existing target after Timeout of seconds\n"); fprintf(stderr, "-l Transport retry count before failing IO. 
should be in range [2..7], (default 2)\n"); - fprintf(stderr, "-f use rules File to set to which target(s) to connect (default: " SRP_DEAMON_CONFIG_FILE ")\n"); + fprintf(stderr, "-f use rules File to set to which target(s) to connect (default: " SRP_DAEMON_CONFIG_FILE ")\n"); fprintf(stderr, "-t Timeout for mad response in milliseconds\n"); fprintf(stderr, "-r number of send Retries for each mad\n"); fprintf(stderr, "-n New connection command format - use also initiator extension\n"); @@ -1505,6 +1504,7 @@ static int parse_other_option(struct rule *rule, char *ptr) { static const char *const opt[] = { "allow_ext_sg=", + "ch_count=", "cmd_sg_entries=", "comp_vector=", "max_cmd_per_lun=", @@ -1735,7 +1735,7 @@ static int get_config(struct config_t *conf, int argc, char *argv[]) conf->retry_timeout = 20; conf->add_target_file = NULL; conf->print_initiator_ext = 0; - conf->rules_file = SRP_DEAMON_CONFIG_FILE; + conf->rules_file = SRP_DAEMON_CONFIG_FILE; conf->rules = NULL; conf->tl_retry_count = 0; diff --git a/suse/rdma-core.spec b/suse/rdma-core.spec index 019e1db..1b7bc72 100644 --- a/suse/rdma-core.spec +++ b/suse/rdma-core.spec @@ -23,7 +23,7 @@ %define git_ver %{nil} Name: rdma-core -Version: 29.0 +Version: 32.0 Release: 0 Summary: RDMA core userspace libraries and daemons License: GPL-2.0-only OR BSD-2-Clause @@ -453,12 +453,6 @@ mkdir -p %{buildroot}%{dracutlibdir}/modules.d/05rdma mkdir -p %{buildroot}%{sysmodprobedir} mkdir -p %{buildroot}%{_unitdir} -# SRIOV service -install -D -m0644 redhat/rdma.sriov-vfs %{buildroot}/%{_sysconfdir}/rdma/sriov-vfs -install -D -m0755 redhat/rdma.sriov-init %{buildroot}%{_libexecdir}/rdma-set-sriov-vf -install -D -m0644 suse/rdma.sriov-rules %{buildroot}%{_udevrulesdir}/98-rdma-sriov.rules -install -D -m0644 suse/rdma.sriov-service %{buildroot}%{_unitdir}/rdma-sriov.service - # Port type setup for mlx4 dual port cards install -D -m0644 redhat/rdma.mlx4.conf %{buildroot}/%{_sysconfdir}/rdma/mlx4.conf sed 's%/usr/libexec%/usr/lib%g' redhat/rdma.mlx4.sys.modprobe > %{buildroot}%{sysmodprobedir}/50-libmlx4.conf @@ -587,7 +581,6 @@ rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh %config(noreplace) %{_sysconfdir}/rdma/modules/opa.conf %config(noreplace) %{_sysconfdir}/rdma/modules/rdma.conf %config(noreplace) %{_sysconfdir}/rdma/modules/roce.conf -%config(noreplace) %{_sysconfdir}/rdma/sriov-vfs %if 0%{?dma_coherent} %config(noreplace) %{_sysconfdir}/modprobe.d/mlx4.conf %endif @@ -595,7 +588,6 @@ rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh %config(noreplace) %{_sysconfdir}/udev/rules.d/70-persistent-ipoib.rules %{_unitdir}/rdma-hw.target %{_unitdir}/rdma-load-modules@.service -%{_unitdir}/rdma-sriov.service %dir %{dracutlibdir} %dir %{dracutlibdir}/modules.d %dir %{dracutlibdir}/modules.d/05rdma @@ -606,9 +598,7 @@ rm -rf %{buildroot}/%{_sbindir}/srp_daemon.sh %{_udevrulesdir}/90-rdma-hw-modules.rules %{_udevrulesdir}/90-rdma-ulp-modules.rules %{_udevrulesdir}/90-rdma-umad.rules -%{_udevrulesdir}/98-rdma-sriov.rules %{sysmodprobedir}/50-libmlx4.conf -%{_libexecdir}/rdma-set-sriov-vf %{_libexecdir}/mlx4-setup.sh %{_libexecdir}/truescale-serdes.cmds %license COPYING.* diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index d90c89e..4de98d0 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -3,14 +3,21 @@ rdma_python_test(tests __init__.py + args_parser.py base.py + base_rdmacm.py + mlx5_base.py rdmacm_utils.py test_addr.py test_cq.py test_cq_events.py test_cqex.py test_device.py + test_efadv.py + test_mlx5_dc.py + 
test_mlx5_lag_affinity.py test_mlx5_pp.py + test_mlx5_uar.py test_mlx5_var.py test_mr.py test_odp.py @@ -20,6 +27,7 @@ rdma_python_test(tests test_qpex.py test_rdmacm.py test_relaxed_ordering.py + test_shared_pd.py utils.py ) diff --git a/tests/__init__.py b/tests/__init__.py index a746e71..3728c56 100644 --- a/tests/__init__.py +++ b/tests/__init__.py @@ -4,6 +4,8 @@ import importlib import os +from args_parser import parser + # Load every test as a module in the system so that unittest's loader can find it def _load_tests(): res = [] @@ -20,8 +22,23 @@ __test_modules__ = _load_tests() # a single test. tests = importlib.import_module(".", __name__) + +def _show_tests_and_exit(loader, standard_tests, pattern): + """ + Prints the full test names that are loaded with the current modules via + loadTestsFromModule protocol, without modifying standard_tests. + """ + for mod in __test_modules__: + for test in loader.loadTestsFromModule(mod, pattern): + for test_case in test: + print(test_case.id()) + return standard_tests + + def load_tests(loader, standard_tests, pattern): """Implement the loadTestsFromModule protocol""" + if parser.args['list_tests']: + return _show_tests_and_exit(loader, standard_tests, pattern) for mod in __test_modules__: standard_tests.addTests(loader.loadTestsFromModule(mod, pattern)) return standard_tests diff --git a/tests/args_parser.py b/tests/args_parser.py new file mode 100644 index 0000000..71a0f34 --- /dev/null +++ b/tests/args_parser.py @@ -0,0 +1,33 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2020 Kamal Heib , All rights reserved. See COPYING file + +import argparse +import sys + + +class ArgsParser(object): + def __init__(self): + self.args = None + + def get_config(self): + return self.args + + def parse_args(self): + parser = argparse.ArgumentParser() + parser.add_argument('--dev', + help='RDMA device to run the tests on') + parser.add_argument('-v', '--verbose', dest='verbosity', + action='store_const', + const=2, help='Verbose output') + parser.add_argument('--list-tests', action='store_true', default=False, + help='Print a list of the full test names that are ' + 'loaded by default and exit without running ' + 'them.') + ns, args = parser.parse_known_args() + self.args = vars(ns) + if self.args['verbosity']: + args += ['--verbose'] + sys.argv[1:] = args + + +parser = ArgsParser() diff --git a/tests/base.py b/tests/base.py index ece2443..3eb5f5d 100755 --- a/tests/base.py +++ b/tests/base.py @@ -1,21 +1,22 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) # Copyright (c) 2019 Mellanox Technologies, Inc . All rights reserved. 
See COPYING file +import subprocess import unittest import tempfile import random import errno import stat +import json import os -from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsUserError from pyverbs.qp import QPCap, QPInitAttrEx, QPInitAttr, QPAttr, QP -from pyverbs.cmid import CMID, AddrInfo, CMEventChannel, ConnParam +from pyverbs.srq import SRQ, SrqInitAttrEx, SrqInitAttr, SrqAttr +from pyverbs.pyverbs_error import PyverbsRDMAError from pyverbs.addr import AHAttr, GlobalRoute from pyverbs.xrcd import XRCD, XRCDInitAttr -from pyverbs.srq import SRQ, SrqInitAttrEx from pyverbs.device import Context -import pyverbs.cm_enums as ce +from args_parser import parser import pyverbs.device as d import pyverbs.enums as e from pyverbs.pd import PD @@ -34,6 +35,7 @@ TIMEOUT = 14 MLNX_VENDOR_ID = 0x02c9 CX3_MLNX_PART_ID = 4099 CX3Pro_MLNX_PART_ID = 4103 +DCT_KEY = 0xbadc0de # Dictionary: vendor_id -> array of part_ids of devices that lack RoCEv2 support ROCEV2_UNSUPPORTED_DEVS = {MLNX_VENDOR_ID: [CX3Pro_MLNX_PART_ID, CX3_MLNX_PART_ID]} @@ -43,20 +45,45 @@ def has_roce_hw_bug(vendor_id, vendor_part_id): return vendor_part_id in ROCEV2_UNSUPPORTED_DEVS.get(vendor_id, []) +def set_rnr_attributes(qp_attr): + """ + Set default QP RNR attributes. + :param qp_attr: The QPAttr to set its attributes + :return: None + """ + qp_attr.min_rnr_timer = MIN_RNR_TIMER + qp_attr.timeout = TIMEOUT + qp_attr.retry_cnt = RETRY_CNT + qp_attr.rnr_retry = RNR_RETRY + + class PyverbsAPITestCase(unittest.TestCase): + def __init__(self, methodName='runTest'): + super().__init__(methodName) + # Hold the command line arguments + self.config = parser.get_config() + def setUp(self): """ Opens the devices and queries them """ - lst = d.get_device_list() self.devices = [] - if len(lst) == 0: - raise unittest.SkipTest('No IB devices found') - for dev in lst: - c = d.Context(name=dev.name.decode()) + + dev_name = self.config['dev'] + if dev_name: + c = d.Context(name=dev_name) attr = c.query_device() attr_ex = c.query_device_ex() self.devices.append((c, attr, attr_ex)) + else: + for dev in d.get_device_list(): + c = d.Context(name=dev.name.decode()) + attr = c.query_device() + attr_ex = c.query_device_ex() + self.devices.append((c, attr, attr_ex)) + + if len(self.devices) == 0: + raise unittest.SkipTest('No IB devices found') def tearDown(self): for tup in self.devices: @@ -64,41 +91,49 @@ class PyverbsAPITestCase(unittest.TestCase): class RDMATestCase(unittest.TestCase): - """ - A base class for test cases which provides the option for user parameters. - These can be provided by manually adding the test case to the runner: - suite = unittest.TestSuite() - ... # Regular auto-detection of test cases, no parameters used. 
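The runner plumbing above is easiest to follow end to end. A minimal sketch of how the new argument parser is meant to be driven (the device name is only an illustrative placeholder):
```python
# Hypothetical invocations of the runner wired up by this patch:
#   ./tests/run_tests.py --dev mlx5_0 -v    # restrict tests to one device
#   ./tests/run_tests.py --list-tests       # print test IDs without running
from args_parser import parser

parser.parse_args()            # consumes --dev/--list-tests and forwards the
                               # remaining argv to unittest.main()
config = parser.get_config()
if config['dev']:
    print('Tests will only open device', config['dev'])
```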
- # Now follows your manual addition of test cases e.g: - suite.addTest(RDMATestCase.parametrize(, dev_name='..', - ib_port=1, gid_index=3, - pkey_index=42)) - """ ZERO_GID = '0000:0000:0000:0000' def __init__(self, methodName='runTest', dev_name=None, ib_port=None, gid_index=None, pkey_index=None): super(RDMATestCase, self).__init__(methodName) - self.dev_name = dev_name + # Hold the command line arguments + self.config = parser.get_config() + dev = self.config['dev'] + self.dev_name = dev_name if dev_name else dev self.ib_port = ib_port self.gid_index = gid_index self.pkey_index = pkey_index + self.ip_addr = None - @staticmethod - def parametrize(testcase_klass, dev_name=None, ib_port=None, gid_index=None, - pkey_index=None): + def is_eth_and_has_roce_hw_bug(self): """ - Create a test suite containing all the tests from the given subclass - with the given dev_name, port, gid index and pkey_index. + Check if the link layer is Ethernet and the device lacks RoCEv2 support + with a known HW bug. + return: True if the link layer is Ethernet and device is not supported """ - loader = unittest.TestLoader() - names = loader.getTestCaseNames(testcase_klass) - suite = unittest.TestSuite() - for n in names: - suite.addTest(testcase_klass(n, dev_name=dev_name, ib_port=ib_port, - gid_index=gid_index, - pkey_index=pkey_index)) - return suite + ctx = d.Context(name=self.dev_name) + port_attrs = ctx.query_port(self.ib_port) + dev_attrs = ctx.query_device() + vendor_id = dev_attrs.vendor_id + vendor_pid = dev_attrs.vendor_part_id + return port_attrs.link_layer == e.IBV_LINK_LAYER_ETHERNET and \ + has_roce_hw_bug(vendor_id, vendor_pid) + + @staticmethod + def get_net_name(dev): + out = subprocess.check_output(['ls', + '/sys/class/infiniband/{}/device/net/' + .format(dev)]) + return out.decode().split('\n')[0] + + @staticmethod + def get_ip_address(ifname): + out = subprocess.check_output(['ip', '-j', 'addr', 'show', ifname]) + loaded_json = json.loads(out.decode()) + interface = loaded_json[0]['addr_info'][0]['local'] + if 'fe80::' in interface: + interface = interface + '%' + ifname + return interface def setUp(self): """ @@ -135,10 +170,9 @@ class RDMATestCase(unittest.TestCase): if not self.args: raise unittest.SkipTest('No port is up, can\'t run traffic') # Choose one combination and use it - args = random.choice(self.args) - self.dev_name = args[0] - self.ib_port = args[1] - self.gid_index = args[2] + self._select_config() + self.dev_info = {'dev_name': self.dev_name, 'ib_port': self.ib_port, + 'gid_index': self.gid_index} def _add_gids_per_port(self, ctx, dev, port): # Don't add ports which are not active @@ -155,114 +189,39 @@ class RDMATestCase(unittest.TestCase): continue # Avoid RoCEv2 GIDs on unsupported devices if port_attrs.link_layer == e.IBV_LINK_LAYER_ETHERNET and \ - ctx.query_gid_type(port, idx) == e.IBV_GID_TYPE_ROCE_V2 and \ + ctx.query_gid_type(port, idx) == \ + e.IBV_GID_TYPE_SYSFS_ROCE_V2 and \ has_roce_hw_bug(vendor_id, vendor_pid): continue - self.args.append([dev, port, idx]) + if not os.path.exists('/sys/class/infiniband/{}/device/net/'.format(dev)): + self.args.append([dev, port, idx, None]) + continue + net_name = self.get_net_name(dev) + try: + ip_addr = self.get_ip_address(net_name) + except (KeyError, IndexError): + self.args.append([dev, port, idx, None]) + else: + self.args.append([dev, port, idx, ip_addr]) def _add_gids_per_device(self, ctx, dev): port_count = ctx.query_device().phys_port_cnt for port in range(port_count): self._add_gids_per_port(ctx, dev, port+1) - 
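For reference, the address-discovery helpers above reduce to the following self-contained sketch, mirroring get_net_name()/get_ip_address() (assumes iproute2's `ip` binary and a netdev-backed RDMA device):
```python
import json
import subprocess

def first_ip_of_rdma_dev(dev):
    # Map the RDMA device to its netdev, then ask iproute2 for a JSON
    # description of its addresses and take the first one.
    net_dir = '/sys/class/infiniband/{}/device/net/'.format(dev)
    ifname = subprocess.check_output(['ls', net_dir]).decode().split('\n')[0]
    out = subprocess.check_output(['ip', '-j', 'addr', 'show', ifname])
    addr = json.loads(out.decode())[0]['addr_info'][0]['local']
    if 'fe80::' in addr:
        addr += '%' + ifname  # link-local IPv6 needs the zone index
    return addr
```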
-class CMResources: - """ - CMResources class is a base aggregator object which contains basic - resources for RDMA CM communication. - """ - def __init__(self, **kwargs): - """ - :param kwargs: Arguments: - * *src* (str) - Local address to bind to (for passive side) - * *dst* (str) - Destination address to connect (for active side) - * *port* (str) - Port number of the address - * *is_async* (bool) - A flag which indicates if its asynchronous RDMACM - * *with_ext_qp* (bool) - If set, an external RC QP will be created and used by RDMACM - """ - src = kwargs.get('src') - dst = kwargs.get('dst') - self.is_server = True if dst is None else False - self.qp_init_attr = None - self.is_async = kwargs.get('is_async', False) - self.with_ext_qp = kwargs.get('with_ext_qp', False) - self.connected = False - # When passive side (server) listens to incoming connection requests, - # for each new request it creates a new cmid which is used to establish - # the connection with the remote side - self.child_id = None - self.msg_size = 1024 - self.num_msgs = 100 - self.channel = None - self.cq = None - self.qp = None - self.port = kwargs.get('port') if kwargs.get('port') else '7471' - self.mr = None - if self.is_server: - self.ai = AddrInfo(src, None, self.port, ce.RDMA_PS_TCP, - ce.RAI_PASSIVE) - else: - self.ai = AddrInfo(src, dst, self.port, ce.RDMA_PS_TCP) - if self.is_async: - self.create_event_channel() - self.cmid = CMID(creator=self.channel) + def _select_config(self): + args_with_inet_ip = [] + for arg in self.args: + if arg[3]: + args_with_inet_ip.append(arg) + if args_with_inet_ip: + args = random.choice(args_with_inet_ip) else: - self.cmid = CMID(creator=self.ai, - qp_init_attr=self.create_qp_init_attr()) - - def create_mr(self): - if self.is_server: - self.mr = self.child_id.reg_msgs(self.msg_size) - else: - self.mr = self.cmid.reg_msgs(self.msg_size) - - def create_event_channel(self): - self.channel = CMEventChannel() - - @staticmethod - def create_qp_init_attr(rcq=None, scq=None): - return QPInitAttr(qp_type=e.IBV_QPT_RC, rcq=rcq, scq=scq, - cap=QPCap(max_recv_wr=1)) - - @staticmethod - def create_conn_param(qp_num=0): - return ConnParam(qp_num=qp_num) - - def create_child_id(self, cm_event=None): - if not self.is_server: - raise PyverbsUserError('create_child_id can be used only in passive side') - if self.is_async: - self.child_id = CMID(creator=cm_event, listen_id=self.cmid) - else: - self.child_id = self.cmid.get_request() - - def create_qp(self): - """ - Create a rdmacm QP. If self.with_ext_qp is set, then an external CQ and - RC QP will be created and set in self.cq and self.qp - respectively. 
- """ - cmid = self.child_id if self.is_server else self.cmid - if not self.with_ext_qp: - cmid.create_qp(self.create_qp_init_attr()) - else: - self.cq = CQ(cmid.context, self.num_msgs, None, None, 0) - init_attr = self.create_qp_init_attr(rcq=self.cq, scq=self.cq) - self.qp = QP(cmid.pd, init_attr, QPAttr()) - - def modify_ext_qp_to_rts(self): - cmid = self.child_id if self.is_server else self.cmid - attr, mask = cmid.init_qp_attr(e.IBV_QPS_INIT) - self.qp.modify(attr, mask) - attr, mask = cmid.init_qp_attr(e.IBV_QPS_RTR) - self.qp.modify(attr, mask) - attr, mask = cmid.init_qp_attr(e.IBV_QPS_RTS) - self.qp.modify(attr, mask) + args = random.choice(self.args) + self.dev_name = args[0] + self.ib_port = args[1] + self.gid_index = args[2] + self.ip_addr = args[3] class BaseResources(object): @@ -278,10 +237,17 @@ class BaseResources(object): :param ib_port: IB port of the device to use (default: 1) :param gid_index: Which GID index to use (default: 0) """ - self.ctx = Context(name=dev_name) + self.dev_name = dev_name self.gid_index = gid_index - self.pd = PD(self.ctx) self.ib_port = ib_port + self.create_context() + self.create_pd() + + def create_context(self): + self.ctx = Context(name=self.dev_name) + + def create_pd(self): + self.pd = PD(self.ctx) class TrafficResources(BaseResources): @@ -289,31 +255,39 @@ class TrafficResources(BaseResources): Basic traffic class. It provides the basic RDMA resources and operations needed for traffic. """ - def __init__(self, dev_name, ib_port, gid_index): + def __init__(self, dev_name, ib_port, gid_index, with_srq=False, + qp_count=1): """ Initializes a TrafficResources object with the given values and creates basic RDMA resources. :param dev_name: Device name to be used :param ib_port: IB port of the device to use :param gid_index: Which GID index to use + :param with_srq: If True, create SRQ and attach to QPs + :param qp_count: Number of QPs to create """ super(TrafficResources, self).__init__(dev_name=dev_name, ib_port=ib_port, gid_index=gid_index) - self.psn = random.getrandbits(24) self.msg_size = 1024 self.num_msgs = 1000 self.port_attr = None self.mr = None + self.use_mr_prefetch = None + self.srq = None self.cq = None - self.qp = None - self.rqpn = 0 - self.rpsn = 0 + self.qps = [] + self.qps_num = [] + self.psns = [] + self.rqps_num = None + self.rpsns = None + self.with_srq = with_srq + self.qp_count = qp_count self.init_resources() @property - def qpn(self): - return self.qp.qp_num + def qp(self): + return self.qps[0] def init_resources(self): """ @@ -321,9 +295,11 @@ class TrafficResources(BaseResources): :return: None """ self.port_attr = self.ctx.query_port(self.ib_port) + if self.with_srq: + self.create_srq() self.create_cq() self.create_mr() - self.create_qp() + self.create_qps() def create_cq(self): """ @@ -341,24 +317,56 @@ class TrafficResources(BaseResources): """ self.mr = MR(self.pd, self.msg_size, e.IBV_ACCESS_LOCAL_WRITE) - def create_qp(self): + def create_qp_cap(self): + return QPCap(max_recv_wr=self.num_msgs) + + def create_qp_init_attr(self): + return QPInitAttr(qp_type=e.IBV_QPT_RC, scq=self.cq, rcq=self.cq, + srq=self.srq, cap=self.create_qp_cap()) + + def create_qp_attr(self): + return QPAttr(port_num=self.ib_port) + + def create_qps(self): """ - Initializes self.qp with an RC QP. + Initializes self.qps with RC QPs. 
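Outside the class, the SRQ-sharing pattern that TrafficResources now implements looks roughly as follows (a sketch assuming an already opened `pd`, `cq` and port number, not the test code itself):
```python
import errno
import random
import unittest

from pyverbs.pyverbs_error import PyverbsRDMAError
from pyverbs.srq import SRQ, SrqInitAttr, SrqAttr
from pyverbs.qp import QP, QPCap, QPInitAttr, QPAttr
import pyverbs.enums as e

def create_rc_qps_with_srq(pd, cq, ib_port, num_msgs=1000, qp_count=4):
    # One SRQ sized for all QPs, as in create_srq_attr() above.
    try:
        srq = SRQ(pd, SrqInitAttr(SrqAttr(max_wr=num_msgs * qp_count)))
    except PyverbsRDMAError as ex:
        if ex.error_code == errno.EOPNOTSUPP:
            raise unittest.SkipTest('Create SRQ is not supported')
        raise ex
    init_attr = QPInitAttr(qp_type=e.IBV_QPT_RC, scq=cq, rcq=cq, srq=srq,
                           cap=QPCap(max_recv_wr=num_msgs))
    qps, psns = [], []
    for _ in range(qp_count):
        qps.append(QP(pd, init_attr, QPAttr(port_num=ib_port)))
        psns.append(random.getrandbits(24))  # one initial PSN per QP
    return srq, qps, psns
```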
:return: None """ - qp_caps = QPCap(max_recv_wr=self.num_msgs) - qp_init_attr = QPInitAttr(qp_type=e.IBV_QPT_RC, scq=self.cq, - rcq=self.cq, cap=qp_caps) - qp_attr = QPAttr(port_num=self.ib_port) - self.qp = QP(self.pd, qp_init_attr, qp_attr) + qp_init_attr = self.create_qp_init_attr() + qp_attr = self.create_qp_attr() + for _ in range(self.qp_count): + try: + qp = QP(self.pd, qp_init_attr, qp_attr) + self.qps.append(qp) + self.qps_num.append(qp.qp_num) + self.psns.append(random.getrandbits(24)) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest(f'Create QP type {qp_init_attr.qp_type} is not supported') + raise ex + + def create_srq_attr(self): + return SrqAttr(max_wr=self.num_msgs*self.qp_count) + + def create_srq_init_attr(self): + return SrqInitAttr(self.create_srq_attr()) + + def create_srq(self): + srq_init_attr = self.create_srq_init_attr() + try: + self.srq = SRQ(self.pd, srq_init_attr) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create SRQ is not supported') + raise ex - def pre_run(self, rpsn, rqpn): + def pre_run(self, rpsns, rqps_num): """ - Modify the QP's state to RTS and fill receive queue with work + Modify the QP's states to RTS and fill receive queue with work requests. This method is not implemented in this class. - :param rpsn: Remote PSN - :param rqpn: Remote QPN + :param rpsns: Remote PSNs + :param rqps_num: Remote QPs Number :return: None """ raise NotImplementedError() @@ -372,33 +380,31 @@ class RCResources(TrafficResources): ibv_rc_pingpong). :return: None """ - attr = QPAttr(port_num=self.ib_port) - attr.dest_qp_num = self.rqpn + attr = self.create_qp_attr() attr.path_mtu = PATH_MTU attr.max_dest_rd_atomic = MAX_DEST_RD_ATOMIC - attr.min_rnr_timer = MIN_RNR_TIMER - attr.rq_psn = self.psn - attr.sq_psn = self.rpsn - attr.timeout = TIMEOUT - attr.retry_cnt = RETRY_CNT - attr.rnr_retry = RNR_RETRY + set_rnr_attributes(attr) attr.max_rd_atomic = MAX_RD_ATOMIC gr = GlobalRoute(dgid=self.ctx.query_gid(self.ib_port, self.gid_index), sgid_index=self.gid_index) ah_attr = AHAttr(port_num=self.ib_port, is_global=1, gr=gr, dlid=self.port_attr.lid) attr.ah_attr = ah_attr - self.qp.to_rts(attr) + for i in range(self.qp_count): + attr.dest_qp_num = self.rqps_num[i] + attr.rq_psn = self.psns[i] + attr.sq_psn = self.rpsns[i] + self.qps[i].to_rts(attr) - def pre_run(self, rpsn, rqpn): + def pre_run(self, rpsns, rqps_num): """ Configure Resources before running traffic - :param rpsn: Remote PSN (packet serial number) - :param rqpn: Remote QP number + :param rpsns: Remote PSNs (packet serial number) + :param rqps_num: Remote QPs number :return: None """ - self.rqpn = rqpn - self.rpsn = rpsn + self.rpsns = rpsns + self.rqps_num = rqps_num self.to_rts() @@ -411,18 +417,24 @@ class UDResources(TrafficResources): self.mr = MR(self.pd, self.msg_size + self.GRH_SIZE, e.IBV_ACCESS_LOCAL_WRITE) - def create_qp(self): - qp_caps = QPCap(max_recv_wr=self.num_msgs) - qp_init_attr = QPInitAttr(qp_type=e.IBV_QPT_UD, cap=qp_caps, - scq=self.cq, rcq=self.cq) - qp_attr = QPAttr(port_num=self.ib_port) + def create_qp_init_attr(self): + return QPInitAttr(qp_type=e.IBV_QPT_UD, scq=self.cq, + rcq=self.cq, srq=self.srq, cap=self.create_qp_cap()) + + def create_qps(self): + qp_init_attr = self.create_qp_init_attr() + qp_attr = self.create_qp_attr() qp_attr.qkey = self.UD_QKEY qp_attr.pkey_index = self.UD_PKEY_INDEX - self.qp = QP(self.pd, qp_init_attr, qp_attr) + for _ in range(self.qp_count): + qp = 
QP(self.pd, qp_init_attr, qp_attr) + self.qps.append(qp) + self.qps_num.append(qp.qp_num) + self.psns.append(random.getrandbits(24)) - def pre_run(self, rpsn, rqpn): - self.rqpn = rqpn - self.rpsn = rpsn + def pre_run(self, rpsns, rqps_num): + self.rpsns = rpsns + self.rqps_num = rqps_num class XRCResources(TrafficResources): @@ -430,23 +442,18 @@ class XRCResources(TrafficResources): self.temp_file = None self.xrcd_fd = -1 self.xrcd = None - self.srq = None - self.qp_count = qp_count self.sqp_lst = [] self.rqp_lst = [] - self.qps_num = [] - self.psns = [] - self.rqps_num = None - self.rpsns = None - super(XRCResources, self).__init__(dev_name, ib_port, gid_index) + super(XRCResources, self).__init__(dev_name, ib_port, gid_index, + qp_count=qp_count) def close(self): os.close(self.xrcd_fd) self.temp_file.close() - def create_qp(self): + def create_qps(self): """ - Initializes self.qp with an XRC SEND/RECV QP. + Initializes self.qps with XRC SEND/RECV QPs. :return: None """ qp_attr = QPAttr(port_num=self.ib_port) @@ -511,10 +518,7 @@ class XRCResources(TrafficResources): gr=gr, dlid=self.port_attr.lid) qp_attr = QPAttr() qp_attr.path_mtu = PATH_MTU - qp_attr.timeout = TIMEOUT - qp_attr.retry_cnt = RETRY_CNT - qp_attr.rnr_retry = RNR_RETRY - qp_attr.min_rnr_timer = MIN_RNR_TIMER + set_rnr_attributes(qp_attr) qp_attr.ah_attr = ah_attr for i in range(self.qp_count): qp_attr.dest_qp_num = self.rqps_num[i][1] diff --git a/tests/base_rdmacm.py b/tests/base_rdmacm.py new file mode 100755 index 0000000..67d00fb --- /dev/null +++ b/tests/base_rdmacm.py @@ -0,0 +1,154 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc . All rights reserved. See COPYING file + +import abc + +from pyverbs.cmid import CMID, AddrInfo, CMEventChannel, ConnParam, UDParam +from pyverbs.qp import QPCap, QPInitAttr, QPAttr, QP +from pyverbs.pyverbs_error import PyverbsUserError +import pyverbs.cm_enums as ce +import pyverbs.enums as e +from pyverbs.cq import CQ + + +GRH_SIZE = 40 +qp_type_per_ps = {ce.RDMA_PS_TCP: e.IBV_QPT_RC, ce.RDMA_PS_UDP: e.IBV_QPT_UD} + + +class CMResources(abc.ABC): + """ + CMResources class is an abstract base class which contains basic resources + for RDMA CM communication. + """ + def __init__(self, addr=None, passive=None, **kwargs): + """ + :param addr: Local address to bind to. + :param passive: Indicate if this CM is the passive CM. 
+ :param kwargs: Arguments: + * *port* (str) + Port number of the address + * *with_ext_qp* (bool) + If set, an external RC QP will be created and used by RDMACM + * *port_space* (str) + If set, indicates the CMIDs port space + """ + self.qp_init_attr = None + self.passive = passive + self.with_ext_qp = kwargs.get('with_ext_qp', False) + self.port = kwargs.get('port') if kwargs.get('port') else '7471' + self.port_space = kwargs.get('port_space', ce.RDMA_PS_TCP) + self.remote_operation = kwargs.get('remote_op') + self.qp_type = qp_type_per_ps[self.port_space] + self.qp_init_attr = QPInitAttr(qp_type=self.qp_type, cap=QPCap()) + self.connected = False + # When passive side (server) listens to incoming connection requests, + # for each new request it creates a new cmid which is used to establish + # the connection with the remote side + self.child_id = None + self.msg_size = 1024 + self.num_msgs = 10 + self.channel = None + self.cq = None + self.qp = None + self.mr = None + self.remote_qpn = None + self.ud_params = None + if self.passive: + self.ai = AddrInfo(src=addr, src_service=self.port, + port_space=self.port_space, flags=ce.RAI_PASSIVE) + else: + self.ai = AddrInfo(src=addr, dst=addr, dst_service=self.port, + port_space=self.port_space) + + def create_mr(self): + cmid = self.child_id if self.passive else self.cmid + mr_remote_function = {None: cmid.reg_msgs, 'read': cmid.reg_read, + 'write': cmid.reg_write} + self.mr = mr_remote_function[self.remote_operation](self.msg_size + GRH_SIZE) + + def create_event_channel(self): + self.channel = CMEventChannel() + + def create_qp_init_attr(self, rcq=None, scq=None): + return QPInitAttr(qp_type=self.qp_type, rcq=rcq, scq=scq, + cap=QPCap(max_recv_wr=1)) + + def create_conn_param(self, qp_num=0): + if self.with_ext_qp: + qp_num = self.qp.qp_num + return ConnParam(qp_num=qp_num) + + def set_ud_params(self, cm_event): + if self.port_space == ce.RDMA_PS_UDP: + self.ud_params = UDParam(cm_event) + + def my_qp_number(self): + if self.with_ext_qp: + return self.qp.qp_num + else: + cm = self.child_id if self.passive else self.cmid + return cm.qpn + + def create_qp(self): + """ + Create a rdmacm QP. If self.with_ext_qp is set, then an external CQ and + RC QP will be created and set in self.cq and self.qp + respectively. + """ + cmid = self.child_id if self.passive else self.cmid + if not self.with_ext_qp: + cmid.create_qp(self.create_qp_init_attr()) + else: + self.cq = CQ(cmid.context, self.num_msgs, None, None, 0) + init_attr = self.create_qp_init_attr(rcq=self.cq, scq=self.cq) + self.qp = QP(cmid.pd, init_attr, QPAttr()) + + def modify_ext_qp_to_rts(self): + cmid = self.child_id if self.passive else self.cmid + attr, mask = cmid.init_qp_attr(e.IBV_QPS_INIT) + self.qp.modify(attr, mask) + attr, mask = cmid.init_qp_attr(e.IBV_QPS_RTR) + self.qp.modify(attr, mask) + attr, mask = cmid.init_qp_attr(e.IBV_QPS_RTS) + self.qp.modify(attr, mask) + + @abc.abstractmethod + def create_child_id(self, cm_event=None): + pass + + +class AsyncCMResources(CMResources): + """ + AsyncCMResources class contains resources for RDMA CM asynchronous + communication. + :param addr: Local address to bind to. + :param passive: Indicate if this CM is the passive CM. 
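The three-step transition in modify_ext_qp_to_rts() above is the standard way to move an external QP through its states with rdmacm's help; condensed into a sketch (`cmid` and `qp` as in the class):
```python
import pyverbs.enums as e

def ext_qp_to_rts(cmid, qp):
    # rdmacm computes the attributes and mask needed for each state, so the
    # caller only has to apply them in INIT -> RTR -> RTS order.
    for state in (e.IBV_QPS_INIT, e.IBV_QPS_RTR, e.IBV_QPS_RTS):
        attr, mask = cmid.init_qp_attr(state)
        qp.modify(attr, mask)
```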
+ """ + def __init__(self, addr=None, passive=None, **kwargs): + super(AsyncCMResources, self).__init__(addr=addr, passive=passive, + **kwargs) + self.create_event_channel() + self.cmid = CMID(creator=self.channel, port_space=self.port_space) + + def create_child_id(self, cm_event=None): + if not self.passive: + raise PyverbsUserError('create_child_id can be used only in passive side') + self.child_id = CMID(creator=cm_event, listen_id=self.cmid) + + +class SyncCMResources(CMResources): + """ + SyncCMResources class contains resources for RDMA CM synchronous + communication. + :param addr: Local address to bind to. + :param passive: Indicate if this CM is the passive CM. + """ + def __init__(self, addr=None, passive=None, **kwargs): + super(SyncCMResources, self).__init__(addr=addr, passive=passive, + **kwargs) + self.cmid = CMID(creator=self.ai, qp_init_attr=self.qp_init_attr) + + def create_child_id(self, cm_event=None): + if not self.passive: + raise PyverbsUserError('create_child_id can be used only in passive side') + self.child_id = self.cmid.get_request() diff --git a/tests/mlx5_base.py b/tests/mlx5_base.py new file mode 100644 index 0000000..099906f --- /dev/null +++ b/tests/mlx5_base.py @@ -0,0 +1,97 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2020 NVIDIA Corporation . All rights reserved. See COPYING file + +import unittest +import random +import errno + +from pyverbs.providers.mlx5.mlx5dv import Mlx5Context, Mlx5DVContextAttr, \ + Mlx5DVQPInitAttr, Mlx5QP, Mlx5DVDCInitAttr +from tests.base import TrafficResources, set_rnr_attributes, DCT_KEY +from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsUserError +from pyverbs.qp import QPCap, QPInitAttrEx, QPAttr +import pyverbs.providers.mlx5.mlx5_enums as dve +from pyverbs.addr import AHAttr, GlobalRoute +import pyverbs.enums as e +from pyverbs.mr import MR + + +class Mlx5DcResources(TrafficResources): + def __init__(self, dev_name, ib_port, gid_index, send_ops_flags, + qp_count=1): + self.send_ops_flags = send_ops_flags + super().__init__(dev_name, ib_port, gid_index, with_srq=True, + qp_count=qp_count) + + def to_rts(self): + attr = self.create_qp_attr() + for i in range(self.qp_count): + self.qps[i].to_rts(attr) + self.dct_qp.to_rtr(attr) + + def pre_run(self, rpsns, rqps_num): + self.rpsns = rpsns + self.rqps_num = rqps_num + self.to_rts() + + def create_context(self): + mlx5dv_attr = Mlx5DVContextAttr() + try: + self.ctx = Mlx5Context(mlx5dv_attr, name=self.dev_name) + except PyverbsUserError as ex: + raise unittest.SkipTest(f'Could not open mlx5 context ({ex})') + except PyverbsRDMAError: + raise unittest.SkipTest('Opening mlx5 context is not supported') + + def create_mr(self): + access = e.IBV_ACCESS_REMOTE_WRITE | e.IBV_ACCESS_LOCAL_WRITE + self.mr = MR(self.pd, self.msg_size, access) + + def create_qp_cap(self): + return QPCap(100, 0, 1, 0) + + def create_qp_attr(self): + qp_attr = QPAttr(port_num=self.ib_port) + set_rnr_attributes(qp_attr) + qp_access = e.IBV_ACCESS_LOCAL_WRITE | e.IBV_ACCESS_REMOTE_WRITE + qp_attr.qp_access_flags = qp_access + gr = GlobalRoute(dgid=self.ctx.query_gid(self.ib_port, self.gid_index), + sgid_index=self.gid_index) + ah_attr = AHAttr(port_num=self.ib_port, is_global=1, gr=gr, + dlid=self.port_attr.lid) + qp_attr.ah_attr = ah_attr + return qp_attr + + def create_qp_init_attr(self, send_ops_flags=0): + comp_mask = e.IBV_QP_INIT_ATTR_PD + if send_ops_flags: + comp_mask |= e.IBV_QP_INIT_ATTR_SEND_OPS_FLAGS + return QPInitAttrEx(cap=self.create_qp_cap(), 
pd=self.pd, scq=self.cq, + rcq=self.cq, srq=self.srq, qp_type=e.IBV_QPT_DRIVER, + send_ops_flags=send_ops_flags, comp_mask=comp_mask, + sq_sig_all=1) + + def create_qps(self): + # Create the DCI QPs. + qp_init_attr = self.create_qp_init_attr(self.send_ops_flags) + try: + for _ in range(self.qp_count): + comp_mask = dve.MLX5DV_QP_INIT_ATTR_MASK_DC + attr = Mlx5DVQPInitAttr(comp_mask=comp_mask, + dc_init_attr=Mlx5DVDCInitAttr()) + qp = Mlx5QP(self.ctx, qp_init_attr, attr) + self.qps.append(qp) + self.qps_num.append(qp.qp_num) + self.psns.append(random.getrandbits(24)) + + # Create the DCT QP. + qp_init_attr = self.create_qp_init_attr() + dc_attr = Mlx5DVDCInitAttr(dc_type=dve.MLX5DV_DCTYPE_DCT, + dct_access_key=DCT_KEY) + attr = Mlx5DVQPInitAttr(comp_mask=dve.MLX5DV_QP_INIT_ATTR_MASK_DC, + dc_init_attr=dc_attr) + self.dct_qp = Mlx5QP(self.ctx, qp_init_attr, attr) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest(f'Create DC QP is not supported') + raise ex diff --git a/tests/rdmacm_utils.py b/tests/rdmacm_utils.py index c71bab1..834762c 100755 --- a/tests/rdmacm_utils.py +++ b/tests/rdmacm_utils.py @@ -3,235 +3,400 @@ """ Provide some useful helper function for pyverbs rdmacm' tests. """ -from tests.utils import validate, poll_cq, get_send_element, get_recv_wr -from pyverbs.pyverbs_error import PyverbsError -from tests.base import CMResources -from pyverbs.cmid import CMEvent +from tests.utils import validate, poll_cq, get_send_elements, get_recv_wr +from tests.base_rdmacm import AsyncCMResources, SyncCMResources +from pyverbs.cmid import CMEvent, AddrInfo, JoinMCAttrEx +from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError import pyverbs.cm_enums as ce -import os - -events_dict = {ce.RDMA_CM_EVENT_ADDR_ERROR: 'Resolve Address Error', - ce.RDMA_CM_EVENT_ROUTE_ERROR: 'Resolve Route Error', - ce.RDMA_CM_EVENT_CONNECT_ERROR: 'Connection Error', - ce.RDMA_CM_EVENT_UNREACHABLE: 'Node is Unreachable', - ce.RDMA_CM_EVENT_REJECTED: 'Connection Rejected', - ce.RDMA_CM_EVENT_DEVICE_REMOVAL: 'Device Removal', - ce.RDMA_CM_EVENT_MULTICAST_JOIN: 'Multicast Join', - ce.RDMA_CM_EVENT_MULTICAST_ERROR: 'Multicast Error', - ce.RDMA_CM_EVENT_ADDR_CHANGE: 'Address Change', - ce.RDMA_CM_EVENT_TIMEWAIT_EXIT: 'Time wait Exit'} - - -def _server_traffic_with_ext_qp(agr_obj, syncer): - recv_wr = get_recv_wr(agr_obj) - agr_obj.qp.post_recv(recv_wr) - syncer.wait() - for _ in range(agr_obj.num_msgs): - poll_cq(agr_obj.cq) - agr_obj.qp.post_recv(recv_wr) - msg_received = agr_obj.mr.read(agr_obj.msg_size, 0) - validate(msg_received, agr_obj.is_server, agr_obj.msg_size) - send_wr = get_send_element(agr_obj, agr_obj.is_server)[0] - agr_obj.qp.post_send(send_wr) - poll_cq(agr_obj.cq) - - -def server_traffic(agr_obj, syncer): - """ - RDMACM passive side traffic function which sends and receives a message, and - then validates the received message. This operation is executed - times. If agr_obj.with_ext_qp is set, the traffic will - use the external QP (agr_obj.qp). 
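The DCI/DCT split used by Mlx5DcResources above can be sketched in isolation (assumes an mlx5 context and a prepared QPInitAttrEx; 0xbadc0de mirrors the DCT_KEY constant the tests use):
```python
import pyverbs.providers.mlx5.mlx5_enums as dve
from pyverbs.providers.mlx5.mlx5dv import Mlx5QP, Mlx5DVQPInitAttr, \
    Mlx5DVDCInitAttr

def create_dc_pair(ctx, init_attr_ex):
    # DCI: the send side; as in the tests, the default Mlx5DVDCInitAttr()
    # is used for it.
    dci_attr = Mlx5DVQPInitAttr(comp_mask=dve.MLX5DV_QP_INIT_ATTR_MASK_DC,
                                dc_init_attr=Mlx5DVDCInitAttr())
    dci = Mlx5QP(ctx, init_attr_ex, dci_attr)
    # DCT: the receive-side target, reachable via a shared access key.
    dct_attr = Mlx5DVQPInitAttr(comp_mask=dve.MLX5DV_QP_INIT_ATTR_MASK_DC,
                                dc_init_attr=Mlx5DVDCInitAttr(
                                    dc_type=dve.MLX5DV_DCTYPE_DCT,
                                    dct_access_key=0xbadc0de))
    dct = Mlx5QP(ctx, init_attr_ex, dct_attr)
    return dci, dct
```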
- :param agr_obj: Aggregation object which contains all necessary resources - :param syncer: multiprocessing.Barrier object for processes synchronization - :return: None - """ - if agr_obj.with_ext_qp: - return _server_traffic_with_ext_qp(agr_obj, syncer) - send_msg = agr_obj.msg_size * 's' - cmid = agr_obj.child_id - for _ in range(agr_obj.num_msgs): - cmid.post_recv(agr_obj.mr) - syncer.wait() - syncer.wait() - cmid.get_recv_comp() - msg_received = agr_obj.mr.read(agr_obj.msg_size, 0) - validate(msg_received, agr_obj.is_server, agr_obj.msg_size) - agr_obj.mr.write(send_msg, agr_obj.msg_size) - cmid.post_send(agr_obj.mr) - cmid.get_send_comp() - syncer.wait() - - -def _client_traffic_with_ext_qp(agr_obj, syncer): - recv_wr = get_recv_wr(agr_obj) - syncer.wait() - for _ in range(agr_obj.num_msgs): - send_wr = get_send_element(agr_obj, agr_obj.is_server)[0] - agr_obj.qp.post_send(send_wr) - poll_cq(agr_obj.cq) - agr_obj.qp.post_recv(recv_wr) - poll_cq(agr_obj.cq) - msg_received = agr_obj.mr.read(agr_obj.msg_size, 0) - validate(msg_received, agr_obj.is_server, agr_obj.msg_size) - - -def client_traffic(agr_obj, syncer): - """ - RDMACM active side traffic function which sends and receives a message, and - then validates the received message. This operation is executed - times. If agr_obj.with_ext_qp is set, the traffic will - use the external QP (agr_obj.qp). - :param agr_obj: Aggregation object which contains all necessary resources - :param syncer: multiprocessing.Barrier object for processes synchronization - :return: None - """ - if agr_obj.with_ext_qp: - return _client_traffic_with_ext_qp(agr_obj, syncer) - send_msg = agr_obj.msg_size * 'c' - cmid = agr_obj.cmid - for _ in range(agr_obj.num_msgs): - agr_obj.mr.write(send_msg, agr_obj.msg_size) - syncer.wait() - cmid.post_send(agr_obj.mr) - cmid.get_send_comp() - syncer.wait() - cmid.post_recv(agr_obj.mr) - syncer.wait() - cmid.get_recv_comp() - msg_received = agr_obj.mr.read(agr_obj.msg_size, 0) - validate(msg_received, agr_obj.is_server, agr_obj.msg_size) - - -def event_handler(agr_obj): +from pyverbs.addr import AH +import pyverbs.enums as e +import abc +import errno + +GRH_SIZE = 40 +MULTICAST_QPN = 0xffffff + +class CMConnection(abc.ABC): """ - Handle and execute corresponding API for RDMACM events of asynchronous - communication - :param agr_obj: Aggregation object which contains all necessary resources - :return: None + RDMA CM base abstract connection class. The class contains the rdmacm + resources and other methods to easily establish a connection and run + traffic using the rdmacm resources. Each type of connection or traffic + should inherit from this class and implement the necessary methods such as + connection establishment and traffic. 
""" - cm_event = CMEvent(agr_obj.cmid.event_channel) - if cm_event.event_type == ce.RDMA_CM_EVENT_ADDR_RESOLVED: - agr_obj.cmid.resolve_route() - elif cm_event.event_type == ce.RDMA_CM_EVENT_ROUTE_RESOLVED: - agr_obj.create_qp() - param = agr_obj.create_conn_param() - if agr_obj.with_ext_qp: - param.qpn = agr_obj.qp.qp_num - agr_obj.cmid.connect(param) - elif cm_event.event_type == ce.RDMA_CM_EVENT_CONNECT_REQUEST: - agr_obj.create_child_id(cm_event) - param = agr_obj.create_conn_param() - agr_obj.create_qp() - if agr_obj.with_ext_qp: - agr_obj.modify_ext_qp_to_rts() - param.qpn = agr_obj.qp.qp_num - agr_obj.child_id.accept(param) - elif cm_event.event_type == ce.RDMA_CM_EVENT_ESTABLISHED: - agr_obj.connected = True - elif cm_event.event_type == ce.RDMA_CM_EVENT_CONNECT_RESPONSE: - agr_obj.connected = True - if agr_obj.with_ext_qp: - agr_obj.modify_ext_qp_to_rts() - agr_obj.cmid.establish() - elif cm_event.event_type == ce.RDMA_CM_EVENT_DISCONNECTED: - if agr_obj.is_server: - agr_obj.child_id.disconnect() - agr_obj.connected = False + def __init__(self, syncer=None, notifier=None): + """ + Initializes a connection object. + :param syncer: Barrier object to sync between all the test processes. + :param notifier: Queue object to pass objects between the connection + sides. + """ + self.syncer = syncer + self.notifier = notifier + self.cm_res = None + + def rdmacm_traffic(self, server=None, multicast=False): + """ + Run rdmacm traffic. This method runs the compatible traffic flow + depending on the CMResources. If self.with_ext_qp is set the traffic + will go through the external QP. + :param server: Run as server. + :param multicast: Run multicast traffic. + """ + server = server if server is not None else self.cm_res.passive + if self.cm_res.with_ext_qp: + if server: + self._ext_qp_server_traffic() + else: + self._ext_qp_client_traffic() else: - agr_obj.cmid.disconnect() - agr_obj.connected = False - else: - if cm_event.event_type in events_dict: - raise PyverbsError('Unexpected event - {}'.format( - events_dict[cm_event.event_type])) + if server: + self._cmid_server_traffic(multicast) + else: + self._cmid_client_traffic(multicast) + + def remote_traffic(self, passive, remote_op='write'): + """ + Run rdmacm remote traffic. This method runs RDMA remote traffic from + the active to the passive. + :param passive: If True, run as server. + :param remote_op: 'write'/'read', The type of the RDMA remote operation. 
+ """ + msg_size = self.cm_res.msg_size + if passive: + self.cm_res.mr.write((msg_size) * 's', msg_size) + mr_details = (self.cm_res.mr.rkey, self.cm_res.mr.buf) + self.notifier.put(mr_details) + self.syncer.wait() + self.syncer.wait() + if remote_op == 'write': + msg_received = self.cm_res.mr.read(msg_size, 0) + validate(msg_received, True, msg_size) else: - raise PyverbsError('The event {} is not supported'.format( - cm_event.event_type)) - cm_event.ack_cm_event() + self.cm_res.mr.write((msg_size) * 'c', msg_size) + self.syncer.wait() + rkey, remote_addr = self.notifier.get() + cmid = self.cm_res.cmid + post_func = cmid.post_write if remote_op == 'write' else \ + cmid.post_read + for _ in range(self.cm_res.num_msgs): + post_func(self.cm_res.mr, msg_size, remote_addr, rkey, + flags=e.IBV_SEND_SIGNALED) + cmid.get_send_comp() + self.syncer.wait() + if remote_op == 'read': + msg_received = self.cm_res.mr.read(msg_size, 0) + validate(msg_received, False, msg_size) + + def _ext_qp_server_traffic(self): + """ + RDMACM server side traffic function which sends and receives a message, + and then validates the received message. This traffic method uses the CM + external QP and CQ for send, recv and get_completion. + :return: None + """ + recv_wr = get_recv_wr(self.cm_res) + self.cm_res.qp.post_recv(recv_wr) + self.syncer.wait() + for _ in range(self.cm_res.num_msgs): + poll_cq(self.cm_res.cq) + self.cm_res.qp.post_recv(recv_wr) + msg_received = self.cm_res.mr.read(self.cm_res.msg_size, 0) + validate(msg_received, self.cm_res.passive, self.cm_res.msg_size) + send_wr = get_send_elements(self.cm_res, self.cm_res.passive)[0] + self.cm_res.qp.post_send(send_wr) + poll_cq(self.cm_res.cq) + + def _ext_qp_client_traffic(self): + """ + RDMACM client side traffic function which sends and receives a message, + and then validates the received message. This traffic method uses the CM + external QP and CQ for send, recv and get_completion. + :return: None + """ + recv_wr = get_recv_wr(self.cm_res) + self.syncer.wait() + for _ in range(self.cm_res.num_msgs): + send_wr = get_send_elements(self.cm_res, self.cm_res.passive)[0] + self.cm_res.qp.post_send(send_wr) + poll_cq(self.cm_res.cq) + self.cm_res.qp.post_recv(recv_wr) + poll_cq(self.cm_res.cq) + msg_received = self.cm_res.mr.read(self.cm_res.msg_size, 0) + validate(msg_received, self.cm_res.passive, self.cm_res.msg_size) + + def _cmid_server_traffic(self, multicast=False): + """ + RDMACM server side traffic function which sends and receives a message, + and then validates the received message. This traffic method uses the + RDMACM API for send, recv and get_completion. 
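The rkey/address handoff that remote_traffic() performs above is the crux of the one-sided flow. A sketch of the active side, assuming a `cm_res` with a registered MR and a multiprocessing Queue/Barrier shared with the passive process:
```python
import pyverbs.enums as e

def active_rdma_write(cm_res, syncer, notifier):
    syncer.wait()
    # The passive side published its MR's (rkey, address) via the queue.
    rkey, remote_addr = notifier.get()
    for _ in range(cm_res.num_msgs):
        # One-sided write: only the local side generates a completion.
        cm_res.cmid.post_write(cm_res.mr, cm_res.msg_size, remote_addr,
                               rkey, flags=e.IBV_SEND_SIGNALED)
        cm_res.cmid.get_send_comp()
    syncer.wait()  # let the passive side validate its buffer
```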
+ :return: None + """ + grh_offset = GRH_SIZE if self.cm_res.qp_type == e.IBV_QPT_UD else 0 + send_msg = (self.cm_res.msg_size + grh_offset) * 's' + cmid = self.cm_res.child_id if not multicast else self.cm_res.cmid + for _ in range(self.cm_res.num_msgs): + cmid.post_recv(self.cm_res.mr) + self.syncer.wait() + self.syncer.wait() + wc = cmid.get_recv_comp() + msg_received = self.cm_res.mr.read(self.cm_res.msg_size, grh_offset) + validate(msg_received, True, self.cm_res.msg_size) + if self.cm_res.port_space == ce.RDMA_PS_TCP: + self.cm_res.mr.write(send_msg, self.cm_res.msg_size) + cmid.post_send(self.cm_res.mr) + else: + if multicast: + ah = AH(cmid.pd, attr=self.cm_res.ud_params.ah_attr) + rqpn = MULTICAST_QPN + else: + ah = AH(cmid.pd, wc=wc, port_num=1, grh=self.cm_res.mr.buf) + rqpn = self.cm_res.remote_qpn + self.cm_res.mr.write(send_msg, self.cm_res.msg_size + GRH_SIZE) + cmid.post_ud_send(self.cm_res.mr, ah, rqpn=rqpn, + length=self.cm_res.msg_size) + cmid.get_send_comp() + self.syncer.wait() + + def _cmid_client_traffic(self, multicast=False): + """ + RDMACM client side traffic function which sends and receives a message, + and then validates the received message. This traffic method uses the + RDMACM API for send, recv and get_completion. + :return: None + """ + grh_offset = GRH_SIZE if self.cm_res.qp_type == e.IBV_QPT_UD else 0 + send_msg = (self.cm_res.msg_size + grh_offset) * 'c' + cmid = self.cm_res.cmid + for _ in range(self.cm_res.num_msgs): + self.cm_res.mr.write(send_msg, self.cm_res.msg_size + grh_offset) + self.syncer.wait() + if self.cm_res.port_space == ce.RDMA_PS_TCP: + cmid.post_send(self.cm_res.mr) + else: + ah = AH(cmid.pd, attr=self.cm_res.ud_params.ah_attr) + rqpn = MULTICAST_QPN if multicast else self.cm_res.ud_params.qp_num + cmid.post_ud_send(self.cm_res.mr, ah, rqpn=rqpn, + length=self.cm_res.msg_size) + cmid.get_send_comp() + cmid.post_recv(self.cm_res.mr) + self.syncer.wait() + self.syncer.wait() + cmid.get_recv_comp() + msg_received = self.cm_res.mr.read(self.cm_res.msg_size, grh_offset) + validate(msg_received, False, self.cm_res.msg_size) + + def event_handler(self, expected_event=None): + """ + Handle and execute corresponding API for RDMACM events of asynchronous + communication. + :param expected_event: The user expected event. + :return: None + """ + cm_event = CMEvent(self.cm_res.cmid.event_channel) + if cm_event.event_type == ce.RDMA_CM_EVENT_CONNECT_REQUEST: + self.cm_res.create_child_id(cm_event) + elif cm_event.event_type in [ce.RDMA_CM_EVENT_ESTABLISHED, + ce.RDMA_CM_EVENT_MULTICAST_JOIN]: + self.cm_res.set_ud_params(cm_event) + if expected_event and expected_event != cm_event.event_type: + raise PyverbsError('Expected this event: {}, got this event: {}'. + format(expected_event, cm_event.event_str())) + cm_event.ack_cm_event() + + @abc.abstractmethod + def establish_connection(self): + pass + @abc.abstractmethod + def disconnect(self): + pass -def sync_traffic(addr, syncer, notifier, is_server): + +class CMAsyncConnection(CMConnection): """ - RDMACM synchronous data and control path which first establish a connection - using RDMACM's synchronous API and then execute RDMACM synchronous traffic. - :param addr: Address to connect to and to bind to - :param syncer: multiprocessing.Barrier object for processes synchronization - :param notifier: Notify parent process about any exceptions or success - :param is_server: A flag which indicates if this is a server or client - :return: None + Implement RDMACM connection management for asynchronous CMIDs. 
It includes + connection establishment, disconnection and other methods such as traffic. + """ + def __init__(self, ip_addr, syncer=None, notifier=None, passive=False, **kwargs): + """ + Init the CMConnection and then init the AsyncCMResources. + :param ip_addr: IP address to use. + :param syncer: Barrier object to sync between all the test processes. + :param notifier: Queue object to pass objects between the connection + sides. + :param passive: Indicate if it's a passive side. + :param kwargs: Arguments used to initialize the CM resources. For more + info please check CMResources. + """ + super(CMAsyncConnection, self).__init__(syncer=syncer, notifier=notifier) + self.cm_res = AsyncCMResources(addr=ip_addr, passive=passive, **kwargs) + + def join_to_multicast(self, mc_addr=None, src_addr=None, extended=False): + """ + Join the CMID to a multicast group. + :param mc_addr: The multicast IP address. + :param src_addr: The CMID's source address. + :param extended: Use the join_multicast_ex API. + """ + self.cm_res.cmid.bind_addr(self.cm_res.ai) + resolve_addr_info = AddrInfo(src=src_addr, dst=mc_addr) + self.cm_res.cmid.resolve_addr(resolve_addr_info) + self.event_handler(expected_event=ce.RDMA_CM_EVENT_ADDR_RESOLVED) + self.cm_res.create_qp() + mc_addr_info = AddrInfo(src=mc_addr) + if not extended: + self.cm_res.cmid.join_multicast(addr=mc_addr_info) + else: + flags = ce.RDMA_MC_JOIN_FLAG_FULLMEMBER + comp_mask = ce.RDMA_CM_JOIN_MC_ATTR_ADDRESS | \ + ce.RDMA_CM_JOIN_MC_ATTR_JOIN_FLAGS + mcattr = JoinMCAttrEx(addr=mc_addr_info, comp_mask=comp_mask, + join_flags=flags) + self.cm_res.cmid.join_multicast(mc_attr=mcattr) + self.event_handler(expected_event=ce.RDMA_CM_EVENT_MULTICAST_JOIN) + self.cm_res.create_mr() + + def leave_multicast(self, mc_addr=None): + """ + Leave a multicast group. + :param mc_addr: The multicast IP address. + """ + mc_addr_info = AddrInfo(src=mc_addr) + self.cm_res.cmid.leave_multicast(mc_addr_info) + + def establish_connection(self): + """ + Establish an RDMACM connection between two Async CMIDs. 
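A usage sketch for the multicast path above (addresses are placeholders; the peer process is assumed to run the mirror-image flow, and RDMA_PS_UDP is used because the multicast traffic relies on UD parameters):
```python
from multiprocessing import Barrier, Queue

import pyverbs.cm_enums as ce
from tests.rdmacm_utils import CMAsyncConnection

# Placeholder addresses for illustration only.
player = CMAsyncConnection('10.0.0.1', syncer=Barrier(2), notifier=Queue(),
                           passive=False, port_space=ce.RDMA_PS_UDP)
player.join_to_multicast(mc_addr='224.0.0.100', src_addr='10.0.0.1',
                         extended=True)  # exercises the join_multicast_ex API
player.rdmacm_traffic(multicast=True)
player.leave_multicast(mc_addr='224.0.0.100')
```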
+ """ + if self.cm_res.passive: + self.cm_res.cmid.bind_addr(self.cm_res.ai) + self.cm_res.cmid.listen() + self.syncer.wait() + self.event_handler(expected_event=ce.RDMA_CM_EVENT_CONNECT_REQUEST) + self.cm_res.create_qp() + if self.cm_res.with_ext_qp: + self.set_cmids_qp_ece(self.cm_res.passive) + self.cm_res.modify_ext_qp_to_rts() + self.set_cmid_ece(self.cm_res.passive) + self.cm_res.child_id.accept(self.cm_res.create_conn_param()) + if self.cm_res.port_space == ce.RDMA_PS_TCP: + self.event_handler(expected_event=ce.RDMA_CM_EVENT_ESTABLISHED) + else: + self.cm_res.cmid.resolve_addr(self.cm_res.ai) + self.event_handler(expected_event=ce.RDMA_CM_EVENT_ADDR_RESOLVED) + self.syncer.wait() + self.cm_res.cmid.resolve_route() + self.event_handler(expected_event=ce.RDMA_CM_EVENT_ROUTE_RESOLVED) + self.cm_res.create_qp() + if self.cm_res.with_ext_qp: + self.set_cmid_ece(self.cm_res.passive) + self.cm_res.cmid.connect(self.cm_res.create_conn_param()) + if self.cm_res.with_ext_qp: + self.event_handler(expected_event=\ + ce.RDMA_CM_EVENT_CONNECT_RESPONSE) + self.set_cmids_qp_ece(self.cm_res.passive) + self.cm_res.modify_ext_qp_to_rts() + self.cm_res.cmid.establish() + else: + self.event_handler(expected_event=ce.RDMA_CM_EVENT_ESTABLISHED) + self.cm_res.create_mr() + self.sync_qp_numbers() + + def sync_qp_numbers(self): + """ + Sync the QP numbers of the connection's sides. + """ + if self.cm_res.passive: + self.syncer.wait() + self.notifier.put(self.cm_res.my_qp_number()) + self.syncer.wait() + self.cm_res.remote_qpn = self.notifier.get() else: - client = CMResources(dst=addr) - syncer.wait() - client.cmid.connect() - client.create_mr() - client_traffic(client, syncer) - client.cmid.disconnect() - except Exception as ex: - side = 'passive' if is_server else 'active' - notifier.put('Caught exception in {side} side process: pid {pid}\n' - .format(side=side, pid=os.getpid()) + - 'Exception message: {ex}'.format(ex=str(ex))) - else: - notifier.put(None) - - -def async_traffic_with_ext_qp(addr, syncer, notifier, is_server): - return async_traffic(addr, syncer, notifier, is_server, True) - - -def async_traffic(addr, syncer, notifier, is_server, with_ext_qp=False): + self.syncer.wait() + self.cm_res.remote_qpn = self.notifier.get() + self.notifier.put(self.cm_res.my_qp_number()) + self.syncer.wait() + + def disconnect(self): + """ + Disconnect the connection. + """ + if self.cm_res.port_space == ce.RDMA_PS_TCP: + if self.cm_res.passive: + self.cm_res.child_id.disconnect() + else: + self.event_handler(expected_event=ce.RDMA_CM_EVENT_DISCONNECTED) + self.cm_res.cmid.disconnect() + + def set_cmid_ece(self, passive): + """ + Set the local CMID's ECE. The ECE is taken from the CMID's QP ECE. + :param passive: Indicates if this CMID participates as the passive side + in this connection. + """ + cmid = self.cm_res.child_id if passive else self.cm_res.cmid + try: + ece = self.cm_res.qp.query_ece() + cmid.set_local_ece(ece) + except PyverbsRDMAError as ex: + if ex.error_code != errno.EOPNOTSUPP: + raise ex + + def set_cmids_qp_ece(self, passive): + """ + Set the CMID's QP ECE. + :param passive: Indicates if this CMID participates as the passive side + in this connection. 
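Condensed, the ECE negotiation above amounts to the following (a sketch; in the real flow the two halves run at different points of the handshake, and EOPNOTSUPP simply means the provider implements no ECE):
```python
import errno

from pyverbs.pyverbs_error import PyverbsRDMAError

def exchange_ece(cmid, qp):
    try:
        cmid.set_local_ece(qp.query_ece())  # advertise our ECE options
        qp.set_ece(cmid.get_remote_ece())   # adopt what the peer offered
    except PyverbsRDMAError as ex:
        if ex.error_code != errno.EOPNOTSUPP:
            raise ex
```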
+ """ + cmid = self.cm_res.child_id if passive else self.cm_res.cmid + try: + ece = cmid.get_remote_ece() + self.cm_res.qp.set_ece(ece) + except PyverbsRDMAError as ex: + if ex.error_code != errno.EOPNOTSUPP: + raise ex + +class CMSyncConnection(CMConnection): """ - RDMACM synchronous data and control path which first establish a connection - using RDMACM's synchronous API and then execute RDMACM synchronous traffic. - :param addr: Address to connect to and to bind to - :param syncer: multiprocessing.Barrier object for processes synchronization - :param notifier: Notify parent process about any exceptions or success - :param is_server: A flag which indicates if this is a server or client - :return: None + Implement RDMACM connection management for synchronous CMIDs. It includes + connection establishment, disconnection and other methods such as traffic. """ - try: - if is_server: - server = CMResources(src=addr) - server.cmid.listen() - syncer.wait() - server.create_child_id() - server.child_id.accept() - server.create_mr() - server_traffic(server, syncer) - server.child_id.disconnect() + def __init__(self, ip_addr, syncer=None, notifier=None, passive=False, **kwargs): + """ + Init the CMConnection and then init the SyncCMResources. + :param ip_addr: IP address to use. + :param syncer: Barrier object to sync between all the test processes. + :param notifier: Queue object to pass objects between the connection + sides. + :param passive: Indicate if it's a passive side. + :param kwargs: Arguments used to initialize the CM resources. For more + info please check CMResources. + """ + super(CMSyncConnection, self).__init__(syncer=syncer, notifier=notifier) + self.cm_res = SyncCMResources(addr=ip_addr, passive=passive, **kwargs) + + def establish_connection(self): + """ + Establish an RDMACM connection between two Sync CMIDs. + """ + if self.cm_res.passive: + self.cm_res.cmid.listen() + self.syncer.wait() + self.cm_res.create_child_id() + self.cm_res.child_id.accept() + self.cm_res.create_mr() else: - client = CMResources(src=addr, dst=addr, is_async=True, - with_ext_qp=with_ext_qp) - id = client.cmid - id.resolve_addr(client.ai) - syncer.wait() - while not client.connected: - event_handler(client) - client.create_mr() - client_traffic(client, syncer) - event_handler(client) - except Exception as ex: - side = 'passive' if is_server else 'active' - notifier.put('Caught exception in {side} side process: pid {pid}\n' - .format(side=side, pid=os.getpid()) + - 'Exception message: {ex}'.format(ex=str(ex))) - else: - notifier.put(None) + self.syncer.wait() + self.cm_res.cmid.connect() + self.cm_res.create_mr() + + def disconnect(self): + """ + Disconnect the connection. + """ + if self.cm_res.port_space == ce.RDMA_PS_TCP: + if self.cm_res.passive: + self.cm_res.child_id.disconnect() + else: + self.cm_res.cmid.disconnect() diff --git a/tests/run_tests.py b/tests/run_tests.py index 9e2e5d2..9d7edf2 100755 --- a/tests/run_tests.py +++ b/tests/run_tests.py @@ -2,6 +2,7 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) # Copyright (c) 2018, Mellanox Technologies. All rights reserved. 
See COPYING file +from args_parser import parser import unittest import os from importlib.machinery import SourceFileLoader @@ -9,4 +10,5 @@ from importlib.machinery import SourceFileLoader module_path = os.path.join(os.path.dirname(__file__), '__init__.py') tests = SourceFileLoader('tests', module_path).load_module() +parser.parse_args() unittest.main(module=tests) diff --git a/tests/test_addr.py b/tests/test_addr.py index 3789606..1ef800b 100644 --- a/tests/test_addr.py +++ b/tests/test_addr.py @@ -9,6 +9,7 @@ from pyverbs.addr import GlobalRoute, AHAttr, AH from tests.base import PyverbsAPITestCase import pyverbs.enums as e from pyverbs.pd import PD +import tests.utils as u class AHTest(PyverbsAPITestCase): @@ -26,7 +27,7 @@ class AHTest(PyverbsAPITestCase): state = ctx.query_port(port_num).state if state != e.IBV_PORT_ACTIVE and state != e.IBV_PORT_INIT: continue - gr = get_global_route(ctx, port_num=port_num) + gr = u.get_global_route(ctx, port_num=port_num) ah_attr = AHAttr(gr=gr, is_global=1, port_num=port_num) try: with AH(pd, attr=ah_attr): @@ -51,8 +52,8 @@ class AHTest(PyverbsAPITestCase): if port_attr.state != e.IBV_PORT_ACTIVE and \ port_attr.state != e.IBV_PORT_INIT: continue - if port_attr.link_layer == e.IBV_LINK_LAYER_INFINIBAND: - raise unittest.SkipTest('Can\'t run RoCE tests on IB link layer') + if port_attr.link_layer != e.IBV_LINK_LAYER_ETHERNET: + raise unittest.SkipTest('RoCE tests are only supported on Ethernet link layer') ah_attr = AHAttr(is_global=0, port_num=port_num) try: ah = AH(pd, attr=ah_attr) @@ -77,7 +78,7 @@ class AHTest(PyverbsAPITestCase): state = ctx.query_port(port_num).state if state != e.IBV_PORT_ACTIVE and state != e.IBV_PORT_INIT: continue - gr = get_global_route(ctx) + gr = u.get_global_route(ctx) ah_attr = AHAttr(gr=gr, is_global=1, port_num=port_num) try: with AH(pd, attr=ah_attr) as ah: @@ -89,18 +90,3 @@ class AHTest(PyverbsAPITestCase): raise ex if done == 0: raise unittest.SkipTest('No port is up, can\'t create AH') - - -def get_global_route(ctx, gid_index=0, port_num=1): - """ - Queries the provided Context's gid and creates a GlobalRoute - object with sgid_index and the queried GID as dgid. - :param ctx: Context object to query - :param gid_index: GID index to query and use. Default: 0, as it's always - valid - :param port_num: Number of the port to query. 
Default: 1 - :return: GlobalRoute object - """ - gid = ctx.query_gid(port_num, gid_index) - gr = GlobalRoute(dgid=gid, sgid_index=gid_index) - return gr diff --git a/tests/test_cq_events.py b/tests/test_cq_events.py index bcb3f7d..c75bf4e 100644 --- a/tests/test_cq_events.py +++ b/tests/test_cq_events.py @@ -1,3 +1,7 @@ +import errno +import unittest + +from pyverbs.pyverbs_error import PyverbsRDMAError from tests.base import RCResources, UDResources from tests.base import RDMATestCase from tests.utils import traffic @@ -28,12 +32,17 @@ class CqEventsTestCase(RDMATestCase): self.qp_dict = {'ud': CqEventsUD, 'rc': CqEventsRC} def create_players(self, qp_type): - client = self.qp_dict[qp_type](self.dev_name, self.ib_port, - self.gid_index) - server = self.qp_dict[qp_type](self.dev_name, self.ib_port, - self.gid_index) - client.pre_run(server.psn, server.qpn) - server.pre_run(client.psn, client.qpn) + try: + client = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + server = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create qp with attrs {} is not supported'.format(qp_type)) + raise ex + client.pre_run(server.psns, server.qps_num) + server.pre_run(client.psns, client.qps_num) return client, server def test_cq_events_ud(self): diff --git a/tests/test_cqex.py b/tests/test_cqex.py index 63c6c00..e0e5b40 100644 --- a/tests/test_cqex.py +++ b/tests/test_cqex.py @@ -59,12 +59,8 @@ class CqExTestCase(RDMATestCase): self.gid_index) server = self.qp_dict[qp_type](self.dev_name, self.ib_port, self.gid_index) - if qp_type == 'xrc': - client.pre_run(server.psns, server.qps_num) - server.pre_run(client.psns, client.qps_num) - else: - client.pre_run(server.psn, server.qpn) - server.pre_run(client.psn, client.qpn) + client.pre_run(server.psns, server.qps_num) + server.pre_run(client.psns, client.qps_num) return client, server def test_ud_traffic_cq_ex(self): diff --git a/tests/test_device.py b/tests/test_device.py index eb1e94f..dbe681d 100644 --- a/tests/test_device.py +++ b/tests/test_device.py @@ -6,33 +6,48 @@ Test module for pyverbs' device module. import unittest import resource import random +import errno from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError from tests.base import PyverbsAPITestCase import tests.utils as u import pyverbs.device as d +import pyverbs.enums as e PAGE_SIZE = resource.getpagesize() -class DeviceTest(unittest.TestCase): +class DeviceTest(PyverbsAPITestCase): """ Test various functionalities of the Device class. """ - def test_dev_list(self): - """ - Verify that it's possible to get IB devices list. - """ - d.get_device_list() + def setUp(self): + pass - @staticmethod - def get_device_list(): + def tearDown(self): + pass + + def get_device_list(self): lst = d.get_device_list() if len(lst) == 0: raise unittest.SkipTest('No IB device found') + dev_name = self.config['dev'] + if dev_name: + for dev in lst: + if dev.name.decode() == dev_name: + lst = [dev] + break + if len(lst) == 0: + raise PyverbsRDMAError(f'No IB device with name {dev_name} found') return lst + def test_dev_list(self): + """ + Verify that it's possible to get IB devices list. 
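The GID-table queries added below follow a size-then-skip pattern; extracted as a sketch mirroring test_query_gid_table() that follows (assumes an open Context `ctx`):
```python
import errno
import unittest

from pyverbs.pyverbs_error import PyverbsRDMAError

def query_all_gids(ctx):
    # Upper bound on entries: the sum of every port's GID table length.
    max_entries = 0
    for port_num in range(1, ctx.query_device().phys_port_cnt + 1):
        max_entries += ctx.query_port(port_num).gid_tbl_len
    try:
        return ctx.query_gid_table(max_entries)
    except PyverbsRDMAError as ex:
        # As in the test below, this API reports errors as negated errnos.
        if ex.error_code in [-errno.EOPNOTSUPP, -errno.EPROTONOSUPPORT]:
            raise unittest.SkipTest('ibv_query_gid_table is not supported')
        raise ex
```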
+ """ + self.get_device_list() + def test_open_dev(self): """ Test ibv_open_device() @@ -47,7 +62,16 @@ class DeviceTest(unittest.TestCase): for dev in self.get_device_list(): with d.Context(name=dev.name.decode()) as ctx: attr = ctx.query_device() - self.verify_device_attr(attr) + self.verify_device_attr(attr, dev) + + def test_query_pkey(self): + """ + Test ibv_query_pkey() + """ + for dev in self.get_device_list(): + with d.Context(name=dev.name.decode()) as ctx: + if dev.node_type == e.IBV_NODE_CA: + ctx.query_pkey(port_num=1, index=0) def test_query_gid(self): """ @@ -57,29 +81,64 @@ class DeviceTest(unittest.TestCase): with d.Context(name=dev.name.decode()) as ctx: ctx.query_gid(port_num=1, index=0) + def test_query_gid_table(self): + """ + Test ibv_query_gid_table() + """ + devs = self.get_device_list() + with d.Context(name=devs[0].name.decode()) as ctx: + device_attr = ctx.query_device() + max_entries = 0 + for port_num in range(1, device_attr.phys_port_cnt + 1): + port_attr = ctx.query_port(port_num) + max_entries += port_attr.gid_tbl_len + try: + ctx.query_gid_table(max_entries) + except PyverbsRDMAError as ex: + if ex.error_code in [-errno.EOPNOTSUPP, -errno.EPROTONOSUPPORT]: + raise unittest.SkipTest('ibv_query_gid_table is not'\ + ' supported on this device') + raise ex + + def test_query_gid_ex(self): + """ + Test ibv_query_gid_ex() + """ + devs = self.get_device_list() + with d.Context(name=devs[0].name.decode()) as ctx: + try: + ctx.query_gid_ex(port_num=1, gid_index=0) + except PyverbsRDMAError as ex: + if ex.error_code in [errno.EOPNOTSUPP, errno.EPROTONOSUPPORT]: + raise unittest.SkipTest('ibv_query_gid_ex is not'\ + ' supported on this device') + raise ex + @staticmethod - def verify_device_attr(attr): + def verify_device_attr(attr, device): """ Helper method that verifies correctness of some members of DeviceAttr object. 
:param attr: A DeviceAttr object + :param device: A Device object :return: None """ - assert attr.node_guid != 0 - assert attr.sys_image_guid != 0 + if device.node_type != e.IBV_NODE_UNSPECIFIED and device.node_type != e.IBV_NODE_UNKNOWN: + assert attr.node_guid != 0 + assert attr.sys_image_guid != 0 assert attr.max_mr_size > PAGE_SIZE assert attr.page_size_cap >= PAGE_SIZE assert attr.vendor_id != 0 - assert attr.vendor_part_id != 0 assert attr.max_qp > 0 assert attr.max_qp_wr > 0 assert attr.max_sge > 0 - assert attr.max_sge_rd > 0 + assert attr.max_sge_rd >= 0 assert attr.max_cq > 0 assert attr.max_cqe > 0 assert attr.max_mr > 0 assert attr.max_pd > 0 - assert attr.max_pkeys > 0 + if device.node_type == e.IBV_NODE_CA: + assert attr.max_pkeys > 0 def test_query_device_ex(self): """ @@ -88,7 +147,7 @@ class DeviceTest(unittest.TestCase): for dev in self.get_device_list(): with d.Context(name=dev.name.decode()) as ctx: attr_ex = ctx.query_device_ex() - self.verify_device_attr(attr_ex.orig_attr) + self.verify_device_attr(attr_ex.orig_attr, dev) @staticmethod def verify_port_attr(attr): @@ -147,7 +206,7 @@ class DMTest(PyverbsAPITestCase): """ for ctx, attr, attr_ex in self.devices: if attr_ex.max_dm_size == 0: - return + raise unittest.SkipTest('Device memory is not supported') dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, u.DM_ALIGNMENT) dm_attrs = u.get_dm_attrs(dm_len) @@ -160,7 +219,7 @@ class DMTest(PyverbsAPITestCase): """ for ctx, attr, attr_ex in self.devices: if attr_ex.max_dm_size == 0: - return + raise unittest.SkipTest('Device memory is not supported') dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, u.DM_ALIGNMENT) dm_attrs = u.get_dm_attrs(dm_len) @@ -173,7 +232,7 @@ class DMTest(PyverbsAPITestCase): """ for ctx, attr, attr_ex in self.devices: if attr_ex.max_dm_size == 0: - return + raise unittest.SkipTest('Device memory is not supported') dm_len = attr_ex.max_dm_size + 1 dm_attrs = u.get_dm_attrs(dm_len) try: @@ -202,7 +261,7 @@ class DMTest(PyverbsAPITestCase): """ for ctx, attr, attr_ex in self.devices: if attr_ex.max_dm_size == 0: - return + raise unittest.SkipTest('Device memory is not supported') dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, u.DM_ALIGNMENT) dm_attrs = u.get_dm_attrs(dm_len) @@ -216,7 +275,7 @@ class DMTest(PyverbsAPITestCase): """ for ctx, attr, attr_ex in self.devices: if attr_ex.max_dm_size == 0: - return + raise unittest.SkipTest('Device memory is not supported') dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, u.DM_ALIGNMENT) dm_attrs = u.get_dm_attrs(dm_len) @@ -233,7 +292,7 @@ class DMTest(PyverbsAPITestCase): """ for ctx, attr, attr_ex in self.devices: if attr_ex.max_dm_size == 0: - return + raise unittest.SkipTest('Device memory is not supported') dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, u.DM_ALIGNMENT) dm_attrs = u.get_dm_attrs(dm_len) @@ -257,7 +316,7 @@ class DMTest(PyverbsAPITestCase): """ for ctx, attr, attr_ex in self.devices: if attr_ex.max_dm_size == 0: - return + raise unittest.SkipTest('Device memory is not supported') dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, u.DM_ALIGNMENT) dm_attrs = u.get_dm_attrs(dm_len) diff --git a/tests/test_efadv.py b/tests/test_efadv.py new file mode 100644 index 0000000..0a847f3 --- /dev/null +++ b/tests/test_efadv.py @@ -0,0 +1,80 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright 2020 Amazon.com, Inc. or its affiliates. All rights reserved. +""" +Test module for efa direct-verbs. 
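Both the device-memory changes above and the new provider tests that follow lean on the same convention: probe an optional verbs feature by calling it and translate EOPNOTSUPP (or, for some queries, EPROTONOSUPPORT) into unittest.SkipTest, so unsupported hardware skips instead of failing. The tests inline the pattern; a hypothetical helper capturing the idiom would be:
```python
import errno
import unittest

from pyverbs.pyverbs_error import PyverbsRDMAError

def call_or_skip(func, *args, **kwargs):
    # Hypothetical helper, not part of the patch: run a verbs call and
    # convert "feature not supported" errors into a test skip.
    try:
        return func(*args, **kwargs)
    except PyverbsRDMAError as ex:
        if ex.error_code in (errno.EOPNOTSUPP, errno.EPROTONOSUPPORT):
            raise unittest.SkipTest(f'{getattr(func, "__name__", func)} is not supported on this device')
        raise ex
```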
+""" + +import errno +from pyverbs.addr import AHAttr +from pyverbs.base import PyverbsRDMAError +from pyverbs.cq import CQ +import pyverbs.enums as e +from pyverbs.pd import PD +import pyverbs.providers.efa.efadv as efa +from tests.base import PyverbsAPITestCase +import tests.utils as u +import unittest + + +class EfaQueryDeviceTest(PyverbsAPITestCase): + """ + Test various functionalities of the direct verbs class. + """ + def test_efadv_query(self): + """ + Verify that it's possible to read EFA direct-verbs. + """ + for ctx, attr, attr_ex in self.devices: + with efa.EfaContext(name=ctx.name) as efa_ctx: + try: + efa_attrs = efa_ctx.query_efa_device() + if self.config['verbosity']: + print(f'\n{efa_attrs}') + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Not supported on non EFA devices') + raise ex + + +class EfaAHTest(PyverbsAPITestCase): + """ + Test functionality of the EfaAH class + """ + def test_efadv_query_ah(self): + """ + Test efadv_query_ah() + """ + for ctx, attr, attr_ex in self.devices: + pd = PD(ctx) + try: + gr = u.get_global_route(ctx, port_num=1) + ah_attr = AHAttr(gr=gr, is_global=1, port_num=1) + ah = efa.EfaAH(pd, attr=ah_attr) + query_ah_attr = ah.query_efa_ah() + if self.config['verbosity']: + print(f'\n{query_ah_attr}') + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Not supported on non EFA devices') + raise ex + + +class EfaQPTest(PyverbsAPITestCase): + """ + Test SRD QP class + """ + def test_efadv_create_driver_qp(self): + """ + Test efadv_create_driver_qp() + """ + for ctx, attr, attr_ex in self.devices: + with PD(ctx) as pd: + with CQ(ctx, 100) as cq: + qia = u.get_qp_init_attr(cq, attr) + qia.qp_type = e.IBV_QPT_DRIVER + try: + qp = efa.SRDQP(pd, qia) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest("Create SRD QP is not supported") + raise ex diff --git a/tests/test_mlx5_dc.py b/tests/test_mlx5_dc.py new file mode 100755 index 0000000..36fd683 --- /dev/null +++ b/tests/test_mlx5_dc.py @@ -0,0 +1,56 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2020 NVIDIA Corporation . All rights reserved. See COPYING file + +from tests.mlx5_base import Mlx5DcResources +from tests.base import RDMATestCase +import pyverbs.enums as e +import tests.utils as u + + +class DCTest(RDMATestCase): + def setUp(self): + super().setUp() + self.iters = 10 + self.server = None + self.client = None + self.traffic_args = None + + def sync_remote_attr(self): + """ + Exchange the remote attributes between the server and the client. + """ + self.server.rkey = self.client.mr.rkey + self.server.raddr = self.client.mr.buf + self.client.rkey = self.server.mr.rkey + self.client.raddr = self.server.mr.buf + self.client.remote_dct_num = self.server.dct_qp.qp_num + self.server.remote_dct_num = self.client.dct_qp.qp_num + + def create_players(self, resource, **resource_arg): + """ + Init DC tests resources. + :param resource: The RDMA resources to use. + :param resource_arg: Dict of args that specify the resource specific + attributes. 
+ :return: None + """ + self.client = resource(**self.dev_info, **resource_arg) + self.server = resource(**self.dev_info, **resource_arg) + self.client.pre_run(self.server.psns, self.server.qps_num) + self.server.pre_run(self.client.psns, self.client.qps_num) + self.sync_remote_attr() + self.traffic_args = {'client': self.client, 'server': self.server, + 'iters': self.iters, 'gid_idx': self.gid_index, + 'port': self.ib_port} + + def test_dc_rdma_write(self): + self.create_players(Mlx5DcResources, qp_count=2, + send_ops_flags=e.IBV_QP_EX_WITH_RDMA_WRITE) + u.rdma_traffic(**self.traffic_args, new_send=True, + send_op=e.IBV_QP_EX_WITH_RDMA_WRITE) + + def test_dc_send(self): + self.create_players(Mlx5DcResources, qp_count=2, + send_ops_flags=e.IBV_QP_EX_WITH_SEND) + u.traffic(**self.traffic_args, new_send=True, + send_op=e.IBV_QP_EX_WITH_SEND) diff --git a/tests/test_mlx5_lag_affinity.py b/tests/test_mlx5_lag_affinity.py new file mode 100644 index 0000000..f93755f --- /dev/null +++ b/tests/test_mlx5_lag_affinity.py @@ -0,0 +1,53 @@ +import unittest +import errno + + +from pyverbs.qp import QP, QPAttr, QPInitAttr, QPCap +from pyverbs.pyverbs_error import PyverbsRDMAError +from tests.base import BaseResources, RDMATestCase +from pyverbs.providers.mlx5.mlx5dv import Mlx5QP +from tests.utils import requires_root_on_eth +import pyverbs.enums as e +from pyverbs.cq import CQ + + +class LagRawQP(BaseResources): + def __init__(self, dev_name): + super().__init__(dev_name, None, None) + self.cq = self.create_cq() + self.qp = self.create_qp() + + def create_cq(self): + return CQ(self.ctx, 100) + + @requires_root_on_eth() + def create_qp(self): + qia = QPInitAttr(e.IBV_QPT_RAW_PACKET, rcq=self.cq, scq=self.cq, + cap=QPCap()) + try: + qp = QP(self.pd, qia) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest("Create Raw Packet QP is not supported") + raise ex + qp.to_init(QPAttr()) + return qp + + +class LagPortTestCase(RDMATestCase): + def modify_lag(self, resources): + try: + port_num, active_port_num = Mlx5QP.query_lag_port(resources.qp) + # if port_num is 1 - modify to 2, else modify to 1 + new_port_num = (2 - port_num) + 1 + Mlx5QP.modify_lag_port(resources.qp, new_port_num) + port_num, active_port_num = Mlx5QP.query_lag_port(resources.qp) + self.assertEqual(port_num, new_port_num, 'Port num is not as expected') + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Set LAG affinity is not supported on this device') + raise ex + + def test_raw_modify_lag_port(self): + qp = LagRawQP(self.dev_name) + self.modify_lag(qp) diff --git a/tests/test_mlx5_uar.py b/tests/test_mlx5_uar.py new file mode 100644 index 0000000..9fd0085 --- /dev/null +++ b/tests/test_mlx5_uar.py @@ -0,0 +1,38 @@ +# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) +# Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file + +""" +Test module for Mlx5 UAR allocation. 
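In the LAG test above, Mlx5QP.query_lag_port() returns the configured and the currently active port numbers, and the expression (2 - port_num) + 1 simply swaps between the two LAG ports: 1 maps to 2 and 2 maps to 1. Extracted as a sketch (the QP is assumed to come from a LAG-capable mlx5 device):
```python
from pyverbs.providers.mlx5.mlx5dv import Mlx5QP

def toggle_lag_port(qp):
    # query_lag_port returns (configured port, currently active port).
    port_num, active_port_num = Mlx5QP.query_lag_port(qp)
    new_port_num = (2 - port_num) + 1  # 1 -> 2, 2 -> 1
    Mlx5QP.modify_lag_port(qp, new_port_num)
    return new_port_num
```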
+""" +import unittest +import errno + +from pyverbs.pyverbs_error import PyverbsRDMAError +from pyverbs.providers.mlx5.mlx5dv import Mlx5UAR +import pyverbs.providers.mlx5.mlx5_enums as e +from tests.base import BaseResources +from tests.base import RDMATestCase + + +class Mlx5UarRes(BaseResources): + def __init__(self, dev_name, ib_port=None, gid_index=None): + super().__init__(dev_name, ib_port, gid_index) + self.uars = [] + + +class Mlx5UarTestCase(RDMATestCase): + def setUp(self): + super().setUp() + self.uar_res = Mlx5UarRes(self.dev_name) + + def test_alloc_uar(self): + try: + for f in [e._MLX5DV_UAR_ALLOC_TYPE_BF, e._MLX5DV_UAR_ALLOC_TYPE_NC]: + self.uar_res.uars.append(Mlx5UAR(self.uar_res.ctx, f)) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP or ex.error_code == errno.EPROTONOSUPPORT: + raise unittest.SkipTest(f'UAR allocation (with flag={f}) is not supported') + raise ex + finally: + for uar in self.uar_res.uars: + uar.close() diff --git a/tests/test_mr.py b/tests/test_mr.py index b54e99c..adc649c 100644 --- a/tests/test_mr.py +++ b/tests/test_mr.py @@ -7,9 +7,11 @@ import unittest import random import errno +from tests.base import PyverbsAPITestCase, RCResources, RDMATestCase from pyverbs.pyverbs_error import PyverbsRDMAError, PyverbsError -from tests.base import PyverbsAPITestCase -from pyverbs.mr import MR, MW, DMMR +from pyverbs.mr import MR, MW, DMMR, MWBindInfo, MWBind +from pyverbs.qp import QPCap, QPInitAttr, QPAttr, QP +from pyverbs.wr import SendWR import pyverbs.device as d from pyverbs.pd import PD import pyverbs.enums as e @@ -150,77 +152,175 @@ class MRTest(PyverbsAPITestCase): mr.buf -class MWTest(PyverbsAPITestCase): +class MWRC(RCResources): + def __init__(self, dev_name, ib_port, gid_index, mw_type): + """ + Initialize Memory Window resources based on RC resources that include RC + QP. + :param dev_name: Device name to be used + :param ib_port: IB port of the device to use + :param gid_index: Which GID index to use + :param mw_type: The MW type to use + """ + super().__init__(dev_name=dev_name, ib_port=ib_port, + gid_index=gid_index) + self.mw_type = mw_type + access = e.IBV_ACCESS_REMOTE_WRITE | e.IBV_ACCESS_LOCAL_WRITE + self.mw_bind_info = MWBindInfo(self.mr, self.mr.buf, self.msg_size, + access) + self.mw_bind = MWBind(self.mw_bind_info, e.IBV_SEND_SIGNALED) + try: + self.mw = MW(self.pd, self.mw_type) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create MW is not supported') + raise ex + + def create_mr(self): + access = e.IBV_ACCESS_LOCAL_WRITE | e.IBV_ACCESS_MW_BIND + try: + self.mr = MR(self.pd, self.msg_size, access) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Reg MR with MW access is not supported') + raise ex + + def create_qp_attr(self): + qp_attr = QPAttr(port_num=self.ib_port) + qp_access = e.IBV_ACCESS_LOCAL_WRITE | e.IBV_ACCESS_REMOTE_WRITE + qp_attr.qp_access_flags = qp_access + return qp_attr + + +class MWTest(RDMATestCase): """ Test various functionalities of the MW class. 
""" - def test_reg_mw_type1(self): - """ - Test ibv_alloc_mw() for type 1 MW - """ - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - try: - with MW(pd, e.IBV_MW_TYPE_1): - pass - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create memory window of type 1 is not supported') - raise ex + def setUp(self): + super().setUp() + self.iters = 10 + self.server = None + self.client = None - def test_reg_mw_type2(self): - """ - Test ibv_alloc_mw() for type 2 MW - """ - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - try: - with MW(pd, e.IBV_MW_TYPE_2): - pass - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create memory window of type 2 is not supported') - raise ex + def create_players(self, resource, **resource_arg): + """ + Init memory window tests resources. + :param resource: The RDMA resources to use. + :param resource_arg: Dict of args that specify the resource specific + attributes. + :return: None + """ + self.client = resource(**self.dev_info, **resource_arg) + self.server = resource(**self.dev_info, **resource_arg) + self.client.pre_run(self.server.psns, self.server.qps_num) + self.server.pre_run(self.client.psns, self.client.qps_num) - def test_dereg_mw_type1(self): - """ - Test ibv_dealloc_mw() for type 1 MW - """ - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - try: - with MW(pd, e.IBV_MW_TYPE_1) as mw: - mw.close() - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create memory window of type 1 is not supported') - raise ex + def tearDown(self): + if self.server: + self.server.mw.close() + if self.client: + self.client.mw.close() + return super().tearDown() - def test_dereg_mw_type2(self): + def bind_mw_type_1(self): + self.server.qp.bind_mw(self.server.mw, self.server.mw_bind) + self.client.qp.bind_mw(self.client.mw, self.client.mw_bind) + # Poll the bind MW action completion. + u.poll_cq(self.server.cq) + u.poll_cq(self.client.cq) + self.server.rkey = self.client.mw.rkey + self.server.remote_addr = self.client.mr.buf + self.client.rkey = self.server.mw.rkey + self.client.remote_addr = self.server.mr.buf + + def bind_mw_type_2(self): + client_send_wr = SendWR(opcode=e.IBV_WR_BIND_MW) + client_send_wr.set_bind_wr(self.client.mw, self.client.mw_bind_info) + server_send_wr = SendWR(opcode=e.IBV_WR_BIND_MW) + server_send_wr.set_bind_wr(self.server.mw, self.server.mw_bind_info) + self.server.qp.post_send(server_send_wr) + self.client.qp.post_send(client_send_wr) + # Poll the bind MW WR. + u.poll_cq(self.server.cq) + u.poll_cq(self.client.cq) + self.server.rkey = client_send_wr.rkey + self.server.remote_addr = self.client.mr.buf + self.client.rkey = server_send_wr.rkey + self.client.remote_addr = self.server.mr.buf + + def invalidate_mw_type1(self): """ - Test ibv_dealloc_mw() for type 2 MW + Invalidate the MWs by rebind this MW with zero length. + :return: None """ - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - try: - with MW(pd, e.IBV_MW_TYPE_2) as mw: - mw.close() - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create memory window of type 2 is not supported') - raise ex + for player in [self.server, self.client]: + mw_bind_info = MWBindInfo(player.mr, player.mr.buf, 0, 0) + mw_bind = MWBind(mw_bind_info, e.IBV_SEND_SIGNALED) + player.qp.bind_mw(player.mw, mw_bind) + # Poll the bound MW action request completion. 
+ u.poll_cq(player.cq) + + def invalidate_mw_type2_local(self): + """ + Invalidate the MWs by post invalidation send WR from the local QP. + :return: None + """ + inv_send_wr = SendWR(opcode=e.IBV_WR_LOCAL_INV) + inv_send_wr.imm_data = self.server.rkey + self.client.qp.post_send(inv_send_wr) + inv_send_wr = SendWR(opcode=e.IBV_WR_LOCAL_INV) + inv_send_wr.imm_data = self.client.rkey + self.server.qp.post_send(inv_send_wr) + # Poll the invalidate MW WR. + u.poll_cq(self.server.cq) + u.poll_cq(self.client.cq) + + def invalidate_mw_type2_remote(self): + """ + Invalidate the MWs by sending invalidation send WR from the remote QP. + :return: None + """ + server_recv_wr = u.get_recv_wr(self.server) + client_recv_wr = u.get_recv_wr(self.client) + self.server.qp.post_recv(server_recv_wr) + self.client.qp.post_recv(client_recv_wr) + inv_send_wr = SendWR(opcode=e.IBV_WR_SEND_WITH_INV) + inv_send_wr.imm_data = self.client.rkey + self.client.qp.post_send(inv_send_wr) + inv_send_wr = SendWR(opcode=e.IBV_WR_SEND_WITH_INV) + inv_send_wr.imm_data = self.server.rkey + self.server.qp.post_send(inv_send_wr) + # Poll the invalidate MW send WR. + u.poll_cq(self.server.cq) + u.poll_cq(self.client.cq) + # Poll the invalidate MW recv WR. + u.poll_cq(self.server.cq) + u.poll_cq(self.client.cq) + + def test_mw_type1(self): + self.create_players(MWRC, mw_type=e.IBV_MW_TYPE_1) + self.bind_mw_type_1() + u.rdma_traffic(self.client, self.server, self.iters, self.gid_index, + self.ib_port, send_op=e.IBV_WR_RDMA_WRITE) + + def test_mw_type2(self): + self.create_players(MWRC, mw_type=e.IBV_MW_TYPE_2) + self.bind_mw_type_2() + u.rdma_traffic(self.client, self.server, self.iters, self.gid_index, + self.ib_port, send_op=e.IBV_WR_RDMA_WRITE) def test_reg_mw_wrong_type(self): """ Verify that trying to create a MW of a wrong type fails """ - for ctx, attr, attr_ex in self.devices: + with d.Context(name=self.dev_name) as ctx: with PD(ctx) as pd: try: - mw_type = random.randint(3, 100) + mw_type = 3 MW(pd, mw_type) - except PyverbsRDMAError: - pass + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create memory window of type {} is not supported'.format(mw_type)) else: raise PyverbsError('Created a MW with type {t}'.\ format(t=mw_type)) @@ -254,7 +354,7 @@ class DMMRTest(PyverbsAPITestCase): """ for ctx, attr, attr_ex in self.devices: if attr_ex.max_dm_size == 0: - return + raise unittest.SkipTest('Device memory is not supported') with PD(ctx) as pd: for i in range(10): dm_len = random.randrange(u.MIN_DM_SIZE, attr_ex.max_dm_size/2, diff --git a/tests/test_odp.py b/tests/test_odp.py index 0fa8d94..bef66a0 100755 --- a/tests/test_odp.py +++ b/tests/test_odp.py @@ -1,8 +1,9 @@ from pyverbs.mem_alloc import mmap, munmap, MAP_ANONYMOUS_, MAP_PRIVATE_, \ MAP_HUGETLB_ from tests.utils import requires_odp, requires_huge_pages, traffic, \ - xrc_traffic, create_custom_mr + xrc_traffic, create_custom_mr, poll_cq, post_send, GRH_SIZE from tests.base import RCResources, UDResources, XRCResources +from pyverbs.wr import SGE, SendWR, RecvWR from tests.base import RDMATestCase from pyverbs.mr import MR import pyverbs.enums as e @@ -12,15 +13,18 @@ HUGE_PAGE_SIZE = 0x200000 class OdpUD(UDResources): - @requires_odp('ud') + @requires_odp('ud', e.IBV_ODP_SUPPORT_SEND) def create_mr(self): - self.mr = create_custom_mr(self, e.IBV_ACCESS_ON_DEMAND, - self.msg_size + self.GRH_SIZE) + self.send_mr = MR(self.pd, self.msg_size + self.GRH_SIZE, + e.IBV_ACCESS_LOCAL_WRITE | e.IBV_ACCESS_ON_DEMAND) + 
self.recv_mr = MR(self.pd, self.msg_size + self.GRH_SIZE, + e.IBV_ACCESS_LOCAL_WRITE) class OdpRC(RCResources): def __init__(self, dev_name, ib_port, gid_index, is_huge=False, - user_addr=None): + user_addr=None, use_mr_prefetch=None, is_implicit=False, + prefetch_advice=e._IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE): """ Initialize an OdpRC object. :param dev_name: Device name to be used @@ -29,22 +33,43 @@ class OdpRC(RCResources): :param is_huge: If True, use huge pages for MR registration :param user_addr: The MR's buffer address. If None, the buffer will be allocated by pyverbs. + :param use_mr_prefetch: Describes the properties of the prefetch + operation. The options are 'sync', 'async' + and None to skip the prefetch operation. + :param is_implicit: If True, register implicit MR. + :param prefetch_advice: The advice of the prefetch request (ignored + if use_mr_prefetch is None). """ self.is_huge = is_huge self.user_addr = user_addr + self.is_implicit = is_implicit super(OdpRC, self).__init__(dev_name=dev_name, ib_port=ib_port, gid_index=gid_index) + self.use_mr_prefetch = use_mr_prefetch + self.prefetch_advice = prefetch_advice - @requires_odp('rc') + @requires_odp('rc', e.IBV_ODP_SUPPORT_SEND | e.IBV_ODP_SUPPORT_RECV) def create_mr(self): access = e.IBV_ACCESS_LOCAL_WRITE | e.IBV_ACCESS_ON_DEMAND if self.is_huge: access |= e.IBV_ACCESS_HUGETLB - self.mr = MR(self.pd, self.msg_size, access, address=self.user_addr) + self.mr = MR(self.pd, self.msg_size, access, address=self.user_addr, + implicit=self.is_implicit) + + +class OdpSrqRc(RCResources): + def __init__(self, dev_name, ib_port, gid_index, qp_count=1): + super(OdpSrqRc, self).__init__(dev_name=dev_name, ib_port=ib_port, + gid_index=gid_index, with_srq=True, + qp_count=qp_count) + + @requires_odp('rc', e.IBV_ODP_SUPPORT_SEND | e.IBV_ODP_SUPPORT_SRQ_RECV) + def create_mr(self): + self.mr = create_custom_mr(self, e.IBV_ACCESS_ON_DEMAND) class OdpXRC(XRCResources): - @requires_odp('xrc') + @requires_odp('xrc', e.IBV_ODP_SUPPORT_SEND | e.IBV_ODP_SUPPORT_SRQ_RECV) def create_mr(self): self.mr = create_custom_mr(self, e.IBV_ACCESS_ON_DEMAND) @@ -54,27 +79,20 @@ class OdpTestCase(RDMATestCase): super(OdpTestCase, self).setUp() self.iters = 100 self.user_addr = None - self.qp_dict = {'rc': OdpRC, 'ud': OdpUD, 'xrc': OdpXRC} - - def create_players(self, qp_type, is_huge=False): - if qp_type == 'rc': - client = self.qp_dict[qp_type](self.dev_name, self.ib_port, - self.gid_index, is_huge=is_huge, - user_addr=self.user_addr) - server = self.qp_dict[qp_type](self.dev_name, self.ib_port, - self.gid_index, is_huge=is_huge, - user_addr=self.user_addr) - else: - client = self.qp_dict[qp_type](self.dev_name, self.ib_port, - self.gid_index) - server = self.qp_dict[qp_type](self.dev_name, self.ib_port, - self.gid_index) - if qp_type == 'xrc': - client.pre_run(server.psns, server.qps_num) - server.pre_run(client.psns, client.qps_num) - else: - client.pre_run(server.psn, server.qpn) - server.pre_run(client.psn, client.qpn) + + def create_players(self, resource, **resource_arg): + """ + Init odp tests resources. + :param resource: The RDMA resources to use. A class of type + BaseResources. + :param resource_arg: Dict of args that specify the resource specific + attributes. + :return: The (client, server) resources. 
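Note the registration itself: ODP is just an extra access flag on the MR, and implicit ODP is selected with the implicit keyword seen in OdpRC.create_mr() above. A minimal sketch, assuming an ODP-capable device ('mlx5_0' is a placeholder name):
```python
import pyverbs.device as d
from pyverbs.pd import PD
from pyverbs.mr import MR
import pyverbs.enums as e

with d.Context(name='mlx5_0') as ctx, PD(ctx) as pd:
    access = e.IBV_ACCESS_LOCAL_WRITE | e.IBV_ACCESS_ON_DEMAND
    mr = MR(pd, 4096, access)                  # ODP MR: pages fault in on access
    imr = MR(pd, 4096, access, implicit=True)  # implicit ODP, as with is_implicit=True
```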
+ """ + client = resource(**self.dev_info, **resource_arg) + server = resource(**self.dev_info, **resource_arg) + client.pre_run(server.psns, server.qps_num) + server.pre_run(client.psns, client.qps_num) return client, server def tearDown(self): @@ -83,26 +101,82 @@ class OdpTestCase(RDMATestCase): super(OdpTestCase, self).tearDown() def test_odp_rc_traffic(self): - client, server = self.create_players('rc') + client, server = self.create_players(OdpRC) traffic(client, server, self.iters, self.gid_index, self.ib_port) - def test_odp_ud_traffic(self): - client, server = self.create_players('ud') + def test_odp_implicit_rc_traffic(self): + client, server = self.create_players(OdpRC, is_implicit=True) traffic(client, server, self.iters, self.gid_index, self.ib_port) + def test_odp_ud_traffic(self): + client, server = self.create_players(OdpUD) + # Implement the traffic here because OdpUD uses two different MRs for + # send and recv. + recv_sge = SGE(server.recv_mr.buf, server.msg_size + GRH_SIZE, + server.recv_mr.lkey) + server_recv_wr = RecvWR(sg=[recv_sge], num_sge=1) + send_sge = SGE(client.send_mr.buf + GRH_SIZE, client.msg_size, + client.send_mr.lkey) + client_send_wr = SendWR(num_sge=1, sg=[send_sge]) + for i in range(self.iters): + server.qp.post_recv(server_recv_wr) + post_send(client, client_send_wr, self.gid_index, self.ib_port) + poll_cq(client.cq) + poll_cq(server.cq) + def test_odp_xrc_traffic(self): - client, server = self.create_players('xrc') + client, server = self.create_players(OdpXRC) xrc_traffic(client, server) + def test_odp_rc_srq_traffic(self): + client, server = self.create_players(OdpSrqRc, qp_count=2) + traffic(client, server, self.iters, self.gid_index, self.ib_port) + @requires_huge_pages() def test_odp_rc_huge_traffic(self): - client, server = self.create_players('rc', is_huge=True) + client, server = self.create_players(OdpRC, is_huge=True) traffic(client, server, self.iters, self.gid_index, self.ib_port) @requires_huge_pages() def test_odp_rc_huge_user_addr_traffic(self): self.user_addr = mmap(length=HUGE_PAGE_SIZE, flags=MAP_ANONYMOUS_| MAP_PRIVATE_| MAP_HUGETLB_) - client, server = self.create_players('rc', is_huge=True) + client, server = self.create_players(OdpRC, is_huge=True, + user_addr=self.user_addr) + traffic(client, server, self.iters, self.gid_index, self.ib_port) + + def test_odp_sync_prefetch_rc_traffic(self): + for advice in [e._IBV_ADVISE_MR_ADVICE_PREFETCH, + e._IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE]: + client, server = self.create_players(OdpRC, use_mr_prefetch='sync', + prefetch_advice=advice) + traffic(client, server, self.iters, self.gid_index, self.ib_port) + + def test_odp_async_prefetch_rc_traffic(self): + for advice in [e._IBV_ADVISE_MR_ADVICE_PREFETCH, + e._IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE]: + client, server = self.create_players(OdpRC, use_mr_prefetch='async', + prefetch_advice=advice) + traffic(client, server, self.iters, self.gid_index, self.ib_port) + + def test_odp_implicit_sync_prefetch_rc_traffic(self): + client, server = self.create_players(OdpRC, use_mr_prefetch='sync', is_implicit=True) traffic(client, server, self.iters, self.gid_index, self.ib_port) + def test_odp_implicit_async_prefetch_rc_traffic(self): + client, server = self.create_players(OdpRC, use_mr_prefetch='async', is_implicit=True) + traffic(client, server, self.iters, self.gid_index, self.ib_port) + + def test_odp_prefetch_sync_no_page_fault_rc_traffic(self): + prefetch_advice = e._IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT + client, server = 
self.create_players(OdpRC, + use_mr_prefetch='sync', + prefetch_advice=prefetch_advice) + traffic(client, server, self.iters, self.gid_index, self.ib_port) + + def test_odp_prefetch_async_no_page_fault_rc_traffic(self): + prefetch_advice = e._IBV_ADVISE_MR_ADVICE_PREFETCH_NO_FAULT + client, server = self.create_players(OdpRC, + use_mr_prefetch='async', + prefetch_advice=prefetch_advice) + traffic(client, server, self.iters, self.gid_index, self.ib_port) diff --git a/tests/test_qp.py b/tests/test_qp.py index 612fca3..f18f885 100644 --- a/tests/test_qp.py +++ b/tests/test_qp.py @@ -1,5 +1,7 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) # Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file +# Copyright (c) 2020 Kamal Heib , All rights reserved. See COPYING file + """ Test module for pyverbs' qp module. """ @@ -11,6 +13,7 @@ import os from pyverbs.pyverbs_error import PyverbsRDMAError from pyverbs.qp import QPInitAttr, QPAttr, QP from tests.base import PyverbsAPITestCase +import pyverbs.utils as pu import pyverbs.enums as e from pyverbs.pd import PD from pyverbs.cq import CQ @@ -22,307 +25,266 @@ class QPTest(PyverbsAPITestCase): Test various functionalities of the QP class. """ - def test_create_qp_no_attr_connected(self): + def create_qp(self, creator, qp_init_attr, is_ex, with_attr, port_num=1): + """ + Auxiliary function to create QP object. + """ + try: + qp_attr = (None, QPAttr(port_num=port_num))[with_attr] + return QP(creator, qp_init_attr, qp_attr) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + with_str = ('without', 'with')[with_attr] + ('', ' extended')[is_ex] + qp_type_str = pu.qp_type_to_str(qp_init_attr.qp_type) + raise unittest.SkipTest(f'Create {qp_type_str} QP {with_str} attrs is not supported') + raise ex + + def create_qp_common_test(self, qp_type, qp_state, is_ex, with_attr): """ - Test QP creation via ibv_create_qp without a QPAttr object proivded. - Checked QP types are RC and UC. + Common function used by create QP tests. """ for ctx, attr, attr_ex in self.devices: with PD(ctx) as pd: with CQ(ctx, 100, None, None, 0) as cq: - qia = get_qp_init_attr(cq, attr) - qia.qp_type = e.IBV_QPT_RC - with QP(pd, qia) as qp: - assert qp.qp_state == e.IBV_QPS_RESET, 'RC QP should have been in RESET' - qia.qp_type = e.IBV_QPT_UC - with QP(pd, qia) as qp: - assert qp.qp_state == e.IBV_QPS_RESET, 'UC QP should have been in RESET' + port_num = 1 + if qp_type == e.IBV_QPT_RAW_PACKET: + eth_port = 0 + for i in range(1, attr.phys_port_cnt + 1): + if u.is_eth(ctx, i) and u.is_root(): + eth_port = i + port_num = eth_port + break + if eth_port == 0: + raise unittest.SkipTest('To Create RAW QP must be done by root on Ethernet link layer') + if is_ex: + qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, qp_type) + creator = ctx + else: + qia = u.get_qp_init_attr(cq, attr) + qia.qp_type = qp_type + creator = pd - def test_create_qp_no_attr(self): + qp = self.create_qp(creator, qia, is_ex, with_attr, port_num) + qp_type_str = pu.qp_type_to_str(qp_type) + qp_state_str = pu.qp_state_to_str(qp_state) + assert qp.qp_state == qp_state , f'{qp_type_str} QP should have been in {qp_state_str}' + + def test_create_rc_qp_no_attr(self): """ - Test QP creation via ibv_create_qp without a QPAttr object proivded. - Checked QP types are Raw Packet and UD. Raw Packet is skipped for - non-root users / Infiniband link layer. + Test RC QP creation via ibv_create_qp without a QPAttr object provided. 
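All of these cases funnel into create_qp_common_test(); the verbs behavior under test is that ibv_create_qp() leaves a new QP in RESET, while passing a QPAttr lets pyverbs drive the state machine (to INIT for connected types, all the way to RTS for UD and Raw Packet). Condensed, with cq and pd assumed from the fixtures above:
```python
from pyverbs.qp import QP, QPAttr, QPInitAttr, QPCap
import pyverbs.enums as e

qia = QPInitAttr(e.IBV_QPT_RC, scq=cq, rcq=cq, cap=QPCap())
qp = QP(pd, qia)             # no QPAttr: the QP stays in RESET
assert qp.qp_state == e.IBV_QPS_RESET
qp = QP(pd, qia, QPAttr())   # with QPAttr: pyverbs transitions an RC QP to INIT
assert qp.qp_state == e.IBV_QPS_INIT
```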
""" - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - with CQ(ctx, 100, None, None, 0) as cq: - for i in range(1, attr.phys_port_cnt + 1): - qia = get_qp_init_attr(cq, attr) - qia.qp_type = e.IBV_QPT_UD - with QP(pd, qia) as qp: - assert qp.qp_state == e.IBV_QPS_RESET, 'UD QP should have been in RESET' - if is_eth(ctx, i) and is_root(): - qia.qp_type = e.IBV_QPT_RAW_PACKET - with QP(pd, qia) as qp: - assert qp.qp_state == e.IBV_QPS_RESET, 'Raw Packet QP should have been in RESET' + self.create_qp_common_test(e.IBV_QPT_RC, e.IBV_QPS_RESET, False, False) - def test_create_qp_with_attr_connected(self): + def test_create_uc_qp_no_attr(self): """ - Test QP creation via ibv_create_qp without a QPAttr object proivded. - Checked QP types are RC and UC. + Test UC QP creation via ibv_create_qp without a QPAttr object provided. """ - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - with CQ(ctx, 100, None, None, 0) as cq: - qia = get_qp_init_attr(cq, attr) - qia.qp_type = e.IBV_QPT_RC - with QP(pd, qia, QPAttr()) as qp: - assert qp.qp_state == e.IBV_QPS_INIT, 'RC QP should have been in INIT' - qia.qp_type = e.IBV_QPT_UC - with QP(pd, qia, QPAttr()) as qp: - assert qp.qp_state == e.IBV_QPS_INIT, 'UC QP should have been in INIT' + self.create_qp_common_test(e.IBV_QPT_UC, e.IBV_QPS_RESET, False, False) - def test_create_qp_with_attr(self): + def test_create_ud_qp_no_attr(self): """ - Test QP creation via ibv_create_qp with a QPAttr object proivded. - Checked QP types are Raw Packet and UD. Raw Packet is skipped for - non-root users / Infiniband link layer. + Test UD QP creation via ibv_create_qp without a QPAttr object provided. """ - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - with CQ(ctx, 100, None, None, 0) as cq: - for i in range(1, attr.phys_port_cnt + 1): - qpts = [e.IBV_QPT_UD, e.IBV_QPT_RAW_PACKET] \ - if is_eth(ctx, i) else [e.IBV_QPT_UD] - qia = get_qp_init_attr(cq, attr) - qia.qp_type = e.IBV_QPT_UD - with QP(pd, qia, QPAttr()) as qp: - assert qp.qp_state == e.IBV_QPS_RTS, 'UD QP should have been in RTS' - if is_eth(ctx, i) and is_root(): - qia.qp_type = e.IBV_QPT_RAW_PACKET - with QP(pd, qia, QPAttr()) as qp: - assert qp.qp_state == e.IBV_QPS_RTS, 'Raw Packet QP should have been in RTS' - - def test_create_qp_ex_no_attr_connected(self): - """ - Test QP creation via ibv_create_qp_ex without a QPAttr object proivded. - Checked QP types are RC and UC. + self.create_qp_common_test(e.IBV_QPT_UD, e.IBV_QPS_RESET, False, False) + + def test_create_raw_qp_no_attr(self): """ - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - with CQ(ctx, 100, None, None, 0) as cq: - qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, e.IBV_QPT_RC) - try: - with QP(ctx, qia) as qp: - assert qp.qp_state == e.IBV_QPS_RESET, 'RC QP should have been in RESET' - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create QP with extended attrs is not supported') - raise ex - qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, e.IBV_QPT_UC) - try: - with QP(ctx, qia) as qp: - assert qp.qp_state == e.IBV_QPS_RESET, 'UC QP should have been in RESET' - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create QP with extended attrs is not supported') - raise ex - - def test_create_qp_ex_no_attr(self): - """ - Test QP creation via ibv_create_qp_ex without a QPAttr object proivded. - Checked QP types are Raw Packet and UD. 
Raw Packet is skipped for - non-root users / Infiniband link layer. + Test RAW Packet QP creation via ibv_create_qp without a QPAttr object + provided. + Raw Packet is skipped for non-root users / Infiniband link layer. """ - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - with CQ(ctx, 100, None, None, 0) as cq: - for i in range(1, attr.phys_port_cnt + 1): - qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, - e.IBV_QPT_UD) - try: - with QP(ctx, qia) as qp: - assert qp.qp_state == e.IBV_QPS_RESET, 'UD QP should have been in RESET' - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create QP with extended attrs is not supported') - raise ex - if is_eth(ctx, i) and is_root(): - qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, - e.IBV_QPT_RAW_PACKET) - try: - with QP(ctx, qia) as qp: - assert qp.qp_state == e.IBV_QPS_RESET, 'Raw Packet QP should have been in RESET' - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create QP with extended attrs is not supported') - raise ex - - def test_create_qp_ex_with_attr_connected(self): - """ - Test QP creation via ibv_create_qp_ex with a QPAttr object proivded. - Checked QP type are RC and UC. + self.create_qp_common_test(e.IBV_QPT_RAW_PACKET, e.IBV_QPS_RESET, False, False) + + def test_create_rc_qp_with_attr(self): """ - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - with CQ(ctx, 100, None, None, 0) as cq: - qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, - e.IBV_QPT_RC) - try: - with QP(ctx, qia, QPAttr()) as qp: - assert qp.qp_state == e.IBV_QPS_INIT, 'RC QP should have been in INIT' - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create QP with extended attrs is not supported') - raise ex - qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, - e.IBV_QPT_UC) - try: - with QP(ctx, qia, QPAttr()) as qp: - assert qp.qp_state == e.IBV_QPS_INIT, 'UC QP should have been in INIT' - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create QP with extended attrs is not supported') - raise ex - - def test_create_qp_ex_with_attr(self): - """ - Test QP creation via ibv_create_qp_ex with a QPAttr object proivded. - Checked QP types are Raw Packet and UD. Raw Packet is skipped for - non-root users / Infiniband link layer. + Test RC QP creation via ibv_create_qp with a QPAttr object provided. + """ + self.create_qp_common_test(e.IBV_QPT_RC, e.IBV_QPS_INIT, False, True) + + def test_create_uc_qp_with_attr(self): + """ + Test UC QP creation via ibv_create_qp with a QPAttr object provided. + """ + self.create_qp_common_test(e.IBV_QPT_UC, e.IBV_QPS_INIT, False, True) + + def test_create_ud_qp_with_attr(self): """ + Test UD QP creation via ibv_create_qp with a QPAttr object provided. + """ + self.create_qp_common_test(e.IBV_QPT_UD, e.IBV_QPS_RTS, False, True) + + def test_create_raw_qp_with_attr(self): + """ + Test RAW Packet QP creation via ibv_create_qp with a QPAttr object + provided. + Raw Packet is skipped for non-root users / Infiniband link layer. + """ + self.create_qp_common_test(e.IBV_QPT_RAW_PACKET, e.IBV_QPS_RTS, False, True) + + def test_create_rc_qp_ex_no_attr(self): + """ + Test RC QP creation via ibv_create_qp_ex without a QPAttr object + provided. 
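The is_ex flag in create_qp_common_test() decides the creator object: a legacy QP is created from the PD, while an extended QP (ibv_create_qp_ex) is created from the Context, with the PD carried inside the extended init attrs. Side by side, with the fixture objects (ctx, pd, cq, attr, attr_ex) assumed:
```python
# Legacy path: ibv_create_qp, creator is the PD.
qia = u.get_qp_init_attr(cq, attr)
qia.qp_type = e.IBV_QPT_RC
qp = QP(pd, qia)

# Extended path: ibv_create_qp_ex, creator is the Context;
# qia_ex.pd carries the PD instead.
qia_ex = get_qp_init_attr_ex(cq, pd, attr, attr_ex, e.IBV_QPT_RC)
qp_ex = QP(ctx, qia_ex)
```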
+ """ + self.create_qp_common_test(e.IBV_QPT_RC, e.IBV_QPS_RESET, True, False) + + def test_create_uc_qp_ex_no_attr(self): + """ + Test UC QP creation via ibv_create_qp_ex without a QPAttr object + provided. + """ + self.create_qp_common_test(e.IBV_QPT_UC, e.IBV_QPS_RESET, True, False) + + def test_create_ud_qp_ex_no_attr(self): + """ + Test UD QP creation via ibv_create_qp_ex without a QPAttr object + provided. + """ + self.create_qp_common_test(e.IBV_QPT_UD, e.IBV_QPS_RESET, True, False) + + def test_create_raw_qp_ex_no_attr(self): + """ + Test Raw Packet QP creation via ibv_create_qp_ex without a QPAttr object + provided. + Raw Packet is skipped for non-root users / Infiniband link layer. + """ + self.create_qp_common_test(e.IBV_QPT_RAW_PACKET, e.IBV_QPS_RESET, True, False) + + def test_create_rc_qp_ex_with_attr(self): + """ + Test RC QP creation via ibv_create_qp_ex with a QPAttr object provided. + """ + self.create_qp_common_test(e.IBV_QPT_RC, e.IBV_QPS_INIT, True, True) + + def test_create_uc_qp_ex_with_attr(self): + """ + Test UC QP creation via ibv_create_qp_ex with a QPAttr object provided. + """ + self.create_qp_common_test(e.IBV_QPT_UC, e.IBV_QPS_INIT, True, True) + + def test_create_ud_qp_ex_with_attr(self): + """ + Test UD QP creation via ibv_create_qp_ex with a QPAttr object provided. + """ + self.create_qp_common_test(e.IBV_QPT_UD, e.IBV_QPS_RTS, True, True) + + def test_create_raw_qp_ex_with_attr(self): + """ + Test Raw Packet QP creation via ibv_create_qp_ex with a QPAttr object + provided. + Raw Packet is skipped for non-root users / Infiniband link layer. + """ + self.create_qp_common_test(e.IBV_QPT_RAW_PACKET, e.IBV_QPS_RTS, True, True) + + def verify_qp_attrs(self, orig_cap, state, init_attr, attr): + self.assertEqual(state, attr.cur_qp_state) + self.assertLessEqual(orig_cap.max_send_wr, init_attr.cap.max_send_wr) + self.assertLessEqual(orig_cap.max_recv_wr, init_attr.cap.max_recv_wr) + self.assertLessEqual(orig_cap.max_send_sge, init_attr.cap.max_send_sge) + self.assertLessEqual(orig_cap.max_recv_sge, init_attr.cap.max_recv_sge) + self.assertLessEqual(orig_cap.max_inline_data, init_attr.cap.max_inline_data) + + def query_qp_common_test(self, qp_type): for ctx, attr, attr_ex in self.devices: with PD(ctx) as pd: with CQ(ctx, 100, None, None, 0) as cq: - for i in range(1, attr.phys_port_cnt + 1): - qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, - e.IBV_QPT_UD) - try: - with QP(ctx, qia, QPAttr()) as qp: - assert qp.qp_state == e.IBV_QPS_RTS, 'UD QP should have been in RTS' - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create QP with extended attrs is not supported') - raise ex - if is_eth(ctx, i) and is_root(): - qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, - e.IBV_QPT_RAW_PACKET) - try: - with QP(ctx, qia, QPAttr()) as qp: - assert qp.qp_state == e.IBV_QPS_RTS, 'Raw Packet QP should have been in RTS' - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create QP with extended attrs is not supported') - raise ex - - def test_query_qp(self): - """ - Queries a QP after creation. 
Verifies that its properties are as + port_num = 1 + if qp_type == e.IBV_QPT_RAW_PACKET: + eth_port = 0 + for i in range(1, attr.phys_port_cnt + 1): + if u.is_eth(ctx, i) and u.is_root(): + eth_port = i + port_num = eth_port + break + if eth_port == 0: + raise unittest.SkipTest('To Create RAW QP must be done by root on Ethernet link layer') + + # Legacy QP + qia = u.get_qp_init_attr(cq, attr) + qia.qp_type = qp_type + caps = qia.cap + qp = self.create_qp(pd, qia, False, False, port_num) + qp_attr, qp_init_attr = qp.query(e.IBV_QP_STATE | e.IBV_QP_CAP) + self.verify_qp_attrs(caps, e.IBV_QPS_RESET, qp_init_attr, qp_attr) + + # Extended QP + qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, qp_type) + caps = qia.cap # Save them to verify values later + qp = self.create_qp(ctx, qia, True, False, port_num) + qp_attr, qp_init_attr = qp.query(e.IBV_QP_STATE | e.IBV_QP_CAP) + self.verify_qp_attrs(caps, e.IBV_QPS_RESET, qp_init_attr, qp_attr) + + def test_query_rc_qp(self): + """ + Queries an RC QP after creation. Verifies that its properties are as expected. """ - for ctx, attr, attr_ex in self.devices: - with PD(ctx) as pd: - with CQ(ctx, 100, None, None, 0) as cq: - for i in range(1, attr.phys_port_cnt + 1): - qpts = get_qp_types(ctx, i) - for qpt in qpts: - # Extended QP - qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, - qpt) - caps = qia.cap # Save them to verify values later - try: - qp = QP(ctx, qia) - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create QP with extended attrs is not supported') - raise ex - qp_attr, qp_init_attr = qp.query(e.IBV_QP_CUR_STATE | - e.IBV_QP_CAP) - verify_qp_attrs(caps, e.IBV_QPS_RESET, qp_init_attr, - qp_attr) - # Legacy QP - qia = get_qp_init_attr(cq, attr) - qia.qp_type = qpt - caps = qia.cap # Save them to verify values later - qp = QP(pd, qia) - qp_attr, qp_init_attr = qp.query(e.IBV_QP_CUR_STATE | - e.IBV_QP_CAP) - verify_qp_attrs(caps, e.IBV_QPS_RESET, qp_init_attr, - qp_attr) - - def test_modify_qp(self): - """ - Queries a QP after calling modify(). Verifies that its properties are + self.query_qp_common_test(e.IBV_QPT_RC) + + def test_query_uc_qp(self): + """ + Queries an UC QP after creation. Verifies that its properties are as + expected. + """ + self.query_qp_common_test(e.IBV_QPT_UC) + + def test_query_ud_qp(self): + """ + Queries an UD QP after creation. Verifies that its properties are as + expected. + """ + self.query_qp_common_test(e.IBV_QPT_UD) + + def test_query_raw_qp(self): + """ + Queries an RAW Packet QP after creation. Verifies that its properties + are as expected. + Raw Packet is skipped for non-root users / Infiniband link layer. + """ + self.query_qp_common_test(e.IBV_QPT_RAW_PACKET) + + def test_modify_ud_qp(self): + """ + Queries a UD QP after calling modify(). Verifies that its properties are as expected. 
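For reference, QP.query() mirrors ibv_query_qp(): it takes a bitmask naming the attributes to fetch and returns the runtime attributes together with the init attributes. With a freshly created qp as above:
```python
qp_attr, qp_init_attr = qp.query(e.IBV_QP_STATE | e.IBV_QP_CAP)
assert qp_attr.cur_qp_state == e.IBV_QPS_RESET
qp_attr, _ = qp.query(e.IBV_QP_QKEY)  # a single attribute works too
```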
""" for ctx, attr, attr_ex in self.devices: with PD(ctx) as pd: with CQ(ctx, 100, None, None, 0) as cq: - # Extended QP - qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, e.IBV_QPT_UD) - try: - qp = QP(ctx, qia) - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Create QP with extended attrs is not supported') - raise ex + # Legacy QP + qia = u.get_qp_init_attr(cq, attr) + qia.qp_type = e.IBV_QPT_UD + qp = self.create_qp(pd, qia, False, False) qa = QPAttr() qa.qkey = 0x123 qp.to_init(qa) - qp_attr, qp_iattr = qp.query(e.IBV_QP_QKEY) - assert qp_attr.qkey == qa.qkey, 'Extended QP, QKey is not as expected' + qp_attr, _ = qp.query(e.IBV_QP_QKEY) + assert qp_attr.qkey == qa.qkey, 'Legacy QP, QKey is not as expected' qp.to_rtr(qa) qa.sq_psn = 0x45 qp.to_rts(qa) - qp_attr, qp_iattr = qp.query(e.IBV_QP_SQ_PSN) - assert qp_attr.sq_psn == qa.sq_psn, 'Extended QP, SQ PSN is not as expected' + qp_attr, _ = qp.query(e.IBV_QP_SQ_PSN) + assert qp_attr.sq_psn == qa.sq_psn, 'Legacy QP, SQ PSN is not as expected' qa.qp_state = e.IBV_QPS_RESET qp.modify(qa, e.IBV_QP_STATE) - assert qp.qp_state == e.IBV_QPS_RESET, 'Extended QP, QP state is not as expected' - # Legacy QP - qia = get_qp_init_attr(cq, attr) - qp = QP(pd, qia) + assert qp.qp_state == e.IBV_QPS_RESET, 'Legacy QP, QP state is not as expected' + # Extended QP + qia = get_qp_init_attr_ex(cq, pd, attr, attr_ex, e.IBV_QPT_UD) + qp = self.create_qp(ctx, qia, True, False) qa = QPAttr() qa.qkey = 0x123 qp.to_init(qa) - qp_attr, qp_iattr = qp.query(e.IBV_QP_QKEY) - assert qp_attr.qkey == qa.qkey, 'Legacy QP, QKey is not as expected' + qp_attr, _ = qp.query(e.IBV_QP_QKEY) + assert qp_attr.qkey == qa.qkey, 'Extended QP, QKey is not as expected' qp.to_rtr(qa) qa.sq_psn = 0x45 qp.to_rts(qa) - qp_attr, qp_iattr = qp.query(e.IBV_QP_SQ_PSN) - assert qp_attr.sq_psn == qa.sq_psn, 'Legacy QP, SQ PSN is not as expected' + qp_attr, _ = qp.query(e.IBV_QP_SQ_PSN) + assert qp_attr.sq_psn == qa.sq_psn, 'Extended QP, SQ PSN is not as expected' qa.qp_state = e.IBV_QPS_RESET qp.modify(qa, e.IBV_QP_STATE) - assert qp.qp_state == e.IBV_QPS_RESET, 'Legacy QP, QP state is not as expected' - - -def get_qp_types(ctx, port_num): - """ - Returns a list of the commonly used QP types. Raw Packet QP will not be - included if link layer is not Ethernet or it current user is not root. - :param ctx: The device's Context, to query the port's link layer - :param port_num: Port number to query - :return: An array of QP types that can be created on this port - """ - qpts = [e.IBV_QPT_RC, e.IBV_QPT_UC, e.IBV_QPT_UD] - if is_eth(ctx, port_num) and is_root(): - qpts.append(e.IBV_QPT_RAW_PACKET) - return qpts - - -def verify_qp_attrs(orig_cap, state, init_attr, attr): - assert state == attr.cur_qp_state - assert orig_cap.max_send_wr <= init_attr.cap.max_send_wr - assert orig_cap.max_recv_wr <= init_attr.cap.max_recv_wr - assert orig_cap.max_send_sge <= init_attr.cap.max_send_sge - assert orig_cap.max_recv_sge <= init_attr.cap.max_recv_sge - assert orig_cap.max_inline_data <= init_attr.cap.max_inline_data - - -def get_qp_init_attr(cq, attr): - """ - Creates a QPInitAttr object with a QP type of the provided array and - other random values. 
- :param cq: CQ to be used as send and receive CQ - :param attr: Device attributes for capability checks - :return: An initialized QPInitAttr object - """ - qp_cap = u.random_qp_cap(attr) - sig = random.randint(0, 1) - return QPInitAttr(scq=cq, rcq=cq, cap=qp_cap, sq_sig_all=sig) + assert qp.qp_state == e.IBV_QPS_RESET, 'Extended QP, QP state is not as expected' def get_qp_init_attr_ex(cq, pd, attr, attr_ex, qpt): @@ -341,17 +303,3 @@ def get_qp_init_attr_ex(cq, pd, attr, attr_ex, qpt): qia.recv_cq = cq qia.pd = pd # Only XRCD can be created without a PD return qia - - -def is_eth(ctx, port_num): - """ - Querires the device's context's port for its link layer. - :param ctx: The Context to query - :param port_num: Which Context's port to query - :return: True if the port's link layer is Ethernet, else False - """ - return ctx.query_port(port_num).link_layer == e.IBV_LINK_LAYER_ETHERNET - - -def is_root(): - return os.geteuid() == 0 diff --git a/tests/test_qpex.py b/tests/test_qpex.py index c6786c7..77763da 100644 --- a/tests/test_qpex.py +++ b/tests/test_qpex.py @@ -38,21 +38,26 @@ def create_qp_ex(agr_obj, qp_type, send_flags): if ex.error_code == errno.EOPNOTSUPP: raise unittest.SkipTest('Extended QP is not supported on this device') raise ex - return qp + if qp_type != e.IBV_QPT_XRC_SEND: + agr_obj.qps.append(qp) + agr_obj.qps_num.append(qp.qp_num) + agr_obj.psns.append(random.getrandbits(24)) + else: + return qp class QpExUDSend(UDResources): - def create_qp(self): - self.qp = create_qp_ex(self, e.IBV_QPT_UD, e.IBV_QP_EX_WITH_SEND) + def create_qps(self): + create_qp_ex(self, e.IBV_QPT_UD, e.IBV_QP_EX_WITH_SEND) class QpExRCSend(RCResources): - def create_qp(self): - self.qp = create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_SEND) + def create_qps(self): + create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_SEND) class QpExXRCSend(XRCResources): - def create_qp(self): + def create_qps(self): qp_attr = QPAttr(port_num=self.ib_port) qp_attr.pkey_index = 0 for _ in range(self.qp_count): @@ -71,17 +76,17 @@ class QpExXRCSend(XRCResources): class QpExUDSendImm(UDResources): - def create_qp(self): - self.qp = create_qp_ex(self, e.IBV_QPT_UD, e.IBV_QP_EX_WITH_SEND_WITH_IMM) + def create_qps(self): + create_qp_ex(self, e.IBV_QPT_UD, e.IBV_QP_EX_WITH_SEND_WITH_IMM) class QpExRCSendImm(RCResources): - def create_qp(self): - self.qp = create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_SEND_WITH_IMM) + def create_qps(self): + create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_SEND_WITH_IMM) class QpExXRCSendImm(XRCResources): - def create_qp(self): + def create_qps(self): qp_attr = QPAttr(port_num=self.ib_port) qp_attr.pkey_index = 0 for _ in range(self.qp_count): @@ -101,16 +106,16 @@ class QpExXRCSendImm(XRCResources): class QpExRCRDMAWrite(RCResources): - def create_qp(self): - self.qp = create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_RDMA_WRITE) + def create_qps(self): + create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_RDMA_WRITE) def create_mr(self): self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_WRITE) class QpExRCRDMAWriteImm(RCResources): - def create_qp(self): - self.qp = create_qp_ex(self, e.IBV_QPT_RC, + def create_qps(self): + create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM) def create_mr(self): @@ -118,30 +123,30 @@ class QpExRCRDMAWriteImm(RCResources): class QpExRCRDMARead(RCResources): - def create_qp(self): - self.qp = create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_RDMA_READ) + def create_qps(self): + create_qp_ex(self, e.IBV_QPT_RC, 
e.IBV_QP_EX_WITH_RDMA_READ) def create_mr(self): self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_READ) class QpExRCAtomicCmpSwp(RCResources): - def create_qp(self): - self.qp = create_qp_ex(self, e.IBV_QPT_RC, + def create_qps(self): + create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP) self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_ATOMIC) class QpExRCAtomicFetchAdd(RCResources): - def create_qp(self): - self.qp = create_qp_ex(self, e.IBV_QPT_RC, + def create_qps(self): + create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD) self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_ATOMIC) class QpExRCBindMw(RCResources): - def create_qp(self): - self.qp = create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_BIND_MW) + def create_qps(self): + create_qp_ex(self, e.IBV_QPT_RC, e.IBV_QP_EX_WITH_BIND_MW) def create_mr(self): self.mr = u.create_custom_mr(self, e.IBV_ACCESS_REMOTE_WRITE) @@ -164,27 +169,27 @@ class QpExTestCase(RDMATestCase): 'rc_bind_mw': QpExRCBindMw} def create_players(self, qp_type): - client = self.qp_dict[qp_type](self.dev_name, self.ib_port, - self.gid_index) - server = self.qp_dict[qp_type](self.dev_name, self.ib_port, - self.gid_index) - if 'xrc' in qp_type: - client.pre_run(server.psns, server.qps_num) - server.pre_run(client.psns, client.qps_num) - else: - client.pre_run(server.psn, server.qpn) - server.pre_run(client.psn, client.qpn) + try: + client = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + server = self.qp_dict[qp_type](self.dev_name, self.ib_port, + self.gid_index) + except PyverbsRDMAError as ex: + if ex.error_code == errno.EOPNOTSUPP: + raise unittest.SkipTest('Create player with {} is not supported'.format(qp_type)) + client.pre_run(server.psns, server.qps_num) + server.pre_run(client.psns, client.qps_num) return client, server def test_qp_ex_ud_send(self): client, server = self.create_players('ud_send') u.traffic(client, server, self.iters, self.gid_index, self.ib_port, - is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_SEND) + new_send=True, send_op=e.IBV_QP_EX_WITH_SEND) def test_qp_ex_rc_send(self): client, server = self.create_players('rc_send') u.traffic(client, server, self.iters, self.gid_index, self.ib_port, - is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_SEND) + new_send=True, send_op=e.IBV_QP_EX_WITH_SEND) def test_qp_ex_xrc_send(self): client, server = self.create_players('xrc_send') @@ -193,12 +198,12 @@ class QpExTestCase(RDMATestCase): def test_qp_ex_ud_send_imm(self): client, server = self.create_players('ud_send_imm') u.traffic(client, server, self.iters, self.gid_index, self.ib_port, - is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_SEND_WITH_IMM) + new_send=True, send_op=e.IBV_QP_EX_WITH_SEND_WITH_IMM) def test_qp_ex_rc_send_imm(self): client, server = self.create_players('rc_send_imm') u.traffic(client, server, self.iters, self.gid_index, self.ib_port, - is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_SEND_WITH_IMM) + new_send=True, send_op=e.IBV_QP_EX_WITH_SEND_WITH_IMM) def test_qp_ex_xrc_send_imm(self): client, server = self.create_players('xrc_send_imm') @@ -211,7 +216,7 @@ class QpExTestCase(RDMATestCase): client.raddr = server.mr.buf server.raddr = client.mr.buf u.rdma_traffic(client, server, self.iters, self.gid_index, self.ib_port, - is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_RDMA_WRITE) + new_send=True, send_op=e.IBV_QP_EX_WITH_RDMA_WRITE) def test_qp_ex_rc_rdma_write_imm(self): client, server = self.create_players('rc_write_imm') @@ -220,7 +225,7 @@ class QpExTestCase(RDMATestCase): 
client.raddr = server.mr.buf server.raddr = client.mr.buf u.traffic(client, server, self.iters, self.gid_index, self.ib_port, - is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM) + new_send=True, send_op=e.IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM) def test_qp_ex_rc_rdma_read(self): client, server = self.create_players('rc_read') @@ -230,7 +235,7 @@ class QpExTestCase(RDMATestCase): server.raddr = client.mr.buf server.mr.write('s' * server.msg_size, server.msg_size) u.rdma_traffic(client, server, self.iters, self.gid_index, self.ib_port, - is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_RDMA_READ) + new_send=True, send_op=e.IBV_QP_EX_WITH_RDMA_READ) def test_qp_ex_rc_atomic_cmp_swp(self): client, server = self.create_players('rc_cmp_swp') @@ -242,7 +247,7 @@ class QpExTestCase(RDMATestCase): server.raddr = client.mr.buf server.mr.write('s' * 8, 8) u.rdma_traffic(client, server, self.iters, self.gid_index, self.ib_port, - is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP) + new_send=True, send_op=e.IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP) def test_qp_ex_rc_atomic_fetch_add(self): client, server = self.create_players('rc_fetch_add') @@ -254,47 +259,4 @@ class QpExTestCase(RDMATestCase): server.raddr = client.mr.buf server.mr.write('s' * 8, 8) u.rdma_traffic(client, server, self.iters, self.gid_index, self.ib_port, - is_cq_ex=False, send_op=e.IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD) - - def test_qp_ex_rc_bind_mw(self): - """ - Verify bind memory window operation using the new post_send API. - Instead of checking through regular pingpong style traffic, we'll - do as follows: - - Register an MR with remote write access - - Bind a MW without remote write permission to the MR - - Verify that remote write fails - Since it's a unique flow, it's an integral part of that test rather - than a utility method. - """ - client, server = self.create_players('rc_bind_mw') - client_sge = u.get_send_element(client, False)[1] - # Create a MW and bind it - server.qp.wr_start() - server.qp.wr_id = 0x123 - server.qp.wr_flags = e.IBV_SEND_SIGNALED - bind_info = MWBindInfo(server.mr, server.mr.buf, server.mr.length, - e.IBV_ACCESS_LOCAL_WRITE) - try: - mw = MW(server.pd, mw_type=e.IBV_MW_TYPE_2) - except PyverbsRDMAError as ex: - if ex.error_code == errno.EOPNOTSUPP: - raise unittest.SkipTest('Memory Window allocation is not supported') - raise ex - new_key = inc_rkey(server.mr.rkey) - server.qp.wr_bind_mw(mw, new_key, bind_info) - server.qp.wr_complete() - u.poll_cq(server.cq) - # Verify that remote write fails - client.qp.wr_start() - client.qp.wr_id = 0x124 - client.qp.wr_flags = e.IBV_SEND_SIGNALED - client.qp.wr_rdma_write(new_key, server.mr.buf) - client.qp.wr_set_sge(client_sge) - client.qp.wr_complete() - try: - u.poll_cq(client.cq) - except PyverbsRDMAError as ex: - if ex.error_code != e.IBV_WC_REM_ACCESS_ERR: - raise ex - + new_send=True, send_op=e.IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD) diff --git a/tests/test_rdmacm.py b/tests/test_rdmacm.py index 880f9a9..b8fd1d9 100755 --- a/tests/test_rdmacm.py +++ b/tests/test_rdmacm.py @@ -1,87 +1,180 @@ # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB) # Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. 
diff --git a/tests/test_rdmacm.py b/tests/test_rdmacm.py
index 880f9a9..b8fd1d9 100755
--- a/tests/test_rdmacm.py
+++ b/tests/test_rdmacm.py
@@ -1,87 +1,180 @@
 # SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
 # Copyright (c) 2019 Mellanox Technologies, Inc. All rights reserved. See COPYING file
-from tests.rdmacm_utils import sync_traffic, async_traffic, \
-    async_traffic_with_ext_qp
-from pyverbs.pyverbs_error import PyverbsError
-from tests.base import RDMATestCase
 import multiprocessing as mp
-import pyverbs.device as d
-import subprocess
 import unittest
-import json
+import os
+
+from tests.rdmacm_utils import CMSyncConnection, CMAsyncConnection
+from pyverbs.pyverbs_error import PyverbsError
+from tests.utils import requires_mcast_support
+from tests.base import RDMATestCase
+import pyverbs.cm_enums as ce
+
 NUM_OF_PROCESSES = 2
+MC_IP_PREFIX = '230'


 class CMTestCase(RDMATestCase):
     def setUp(self):
-        if self.dev_name is not None:
-            net_name = self.get_net_name(self.dev_name)
-            try:
-                self.ip_addr = self.get_ip_address(net_name)
-            except KeyError:
-                raise unittest.SkipTest('Device {} doesn\'t have net interface'
-                                        .format(self.dev_name))
-        else:
-            dev_list = d.get_device_list()
-            for dev in dev_list:
-                net_name = self.get_net_name(dev.name.decode())
-                try:
-                    self.ip_addr = self.get_ip_address(net_name)
-                except IndexError:
-                    continue
-                else:
-                    self.dev_name = dev.name.decode()
-                    break
-            if self.dev_name is None:
-                raise unittest.SkipTest('No devices with net interface')
         super().setUp()
+        if not self.ip_addr:
+            raise unittest.SkipTest('Device {} doesn\'t have net interface'
+                                    .format(self.dev_name))

-    @staticmethod
-    def get_net_name(dev):
-        out = subprocess.check_output(['ls', '/sys/class/infiniband/{}/device/net/'
-                                       .format(dev)])
-        return out.decode().split('\n')[0]
-
-    @staticmethod
-    def get_ip_address(ifname):
-        out = subprocess.check_output(['ip', '-j', 'addr', 'show', ifname])
-        loaded_json = json.loads(out.decode())
-        interface = loaded_json[0]['addr_info'][0]['local']
-        if 'fe80::' in interface:
-            interface = interface + '%' + ifname
-        return interface
-
-    @staticmethod
-    def two_nodes_rdmacm_traffic(ip_addr, traffic_func):
+    def two_nodes_rdmacm_traffic(self, connection_resources, test_flow,
+                                 **resource_kwargs):
+        """
+        Init and manage the rdmacm test processes. If needed, terminate those
+        processes and raise an exception.
+        :param connection_resources: The CMConnection resources to use.
+        :param test_flow: The target RDMACM flow method to run.
+        :param resource_kwargs: Dict of args that specify the CMResources
+                                specific attributes. Each test case can pass
+                                here, as keyword arguments, the specific
+                                CMResources attributes it requires.
+        :return: None
+        """
+        if resource_kwargs.get('port_space', None) == ce.RDMA_PS_UDP and \
+                self.is_eth_and_has_roce_hw_bug():
+            raise unittest.SkipTest('Device {} doesn\'t support UDP with RoCEv2'
+                                    .format(self.dev_name))
         ctx = mp.get_context('fork')
-        syncer = ctx.Barrier(NUM_OF_PROCESSES, timeout=5)
-        notifier = ctx.Queue()
-        passive = ctx.Process(target=traffic_func,
-                              args=[ip_addr, syncer, notifier, True])
-        active = ctx.Process(target=traffic_func,
-                             args=[ip_addr, syncer, notifier, False])
+        self.syncer = ctx.Barrier(NUM_OF_PROCESSES, timeout=15)
+        self.notifier = ctx.Queue()
+        passive = ctx.Process(target=test_flow,
+                              kwargs={'connection_resources': connection_resources,
+                                      'passive': True, **resource_kwargs})
+        active = ctx.Process(target=test_flow,
+                             kwargs={'connection_resources': connection_resources,
+                                     'passive': False, **resource_kwargs})
         passive.start()
         active.start()
-        while notifier.empty():
-            pass
-
-        for _ in range(NUM_OF_PROCESSES):
-            res = notifier.get()
+        passive.join(15)
+        active.join(15)
+        # If the processes are still alive, kill them and fail the test.
+        proc_killed = False
+        for proc in [passive, active]:
+            if proc.is_alive():
+                proc.terminate()
+                proc_killed = True
+        # Check if the test processes raised exceptions.
+        if not self.notifier.empty():
+            res = self.notifier.get()
             if res is not None:
-                passive.terminate()
-                active.terminate()
                 raise PyverbsError(res)
+        # Raise an exception if a test process had to be terminated.
+        if proc_killed:
+            raise Exception('RDMA CM test process is stuck, kill the test')
+
+    def rdmacm_traffic(self, connection_resources=None, passive=None, **kwargs):
+        """
+        Run RDMACM traffic between two CMIDs.
+        :param connection_resources: The connection resources to use.
+        :param passive: Indicates whether this CMID is the passive side.
+        :return: None
+        """
+        try:
+            player = connection_resources(ip_addr=self.ip_addr,
+                                          syncer=self.syncer,
+                                          notifier=self.notifier,
+                                          passive=passive, **kwargs)
+            player.establish_connection()
+            player.rdmacm_traffic()
+            player.disconnect()
+        except Exception as ex:
+            side = 'passive' if passive else 'active'
+            self.notifier.put('Caught exception in {side} side process: pid '
+                              '{pid}\n'.format(side=side, pid=os.getpid()) +
+                              'Exception message: {ex}'.format(ex=str(ex)))
+
+    def rdmacm_multicast_traffic(self, connection_resources=None, passive=None,
+                                 extended=False, **kwargs):
+        """
+        Run RDMACM multicast traffic between two CMIDs.
+        :param connection_resources: The connection resources to use.
+        :param passive: Indicates whether this CMID is the passive side.
+        :param extended: Use an extended multicast join request. This request
+                         allows the CMID to join with specific join flags.
+        :param kwargs: Arguments to be passed to the connection_resources.
+        :return: None
+        """
+        try:
+            player = connection_resources(ip_addr=self.ip_addr, syncer=self.syncer,
+                                          notifier=self.notifier, passive=False,
+                                          **kwargs)
+            mc_addr = MC_IP_PREFIX + self.ip_addr[self.ip_addr.find('.'):]
+            player.join_to_multicast(src_addr=self.ip_addr, mc_addr=mc_addr,
+                                     extended=extended)
+            player.rdmacm_traffic(server=passive, multicast=True)
+            player.leave_multicast(mc_addr=mc_addr)
+        except Exception as ex:
+            side = 'passive' if passive else 'active'
+            self.notifier.put('Caught exception in {side} side process: pid {pid}\n'
+                              .format(side=side, pid=os.getpid()) +
+                              'Exception message: {ex}'.format(ex=str(ex)))
+
+    def rdmacm_remote_traffic(self, connection_resources=None, passive=None,
+                              remote_op='write', **kwargs):
+        """
+        Run RDMACM remote traffic between two CMIDs.
+        :param connection_resources: The connection resources to use.
+        :param passive: Indicates whether this CMID is the passive side.
+        :param remote_op: The remote operation in the traffic.
+        :param kwargs: Arguments to be passed to the connection_resources.
+        :return: None
+        """
+        try:
+            player = connection_resources(ip_addr=self.ip_addr,
+                                          syncer=self.syncer,
+                                          notifier=self.notifier,
+                                          passive=passive,
+                                          remote_op=remote_op, **kwargs)
+            player.establish_connection()
+            player.remote_traffic(passive=passive, remote_op=remote_op)
+            player.disconnect()
+        except Exception as ex:
+            while not self.notifier.empty():
+                self.notifier.get()
+            side = 'passive' if passive else 'active'
+            msg = f'Caught exception in {side} side process: pid {os.getpid()}\n' \
+                  f'Exception message: {str(ex)}'
+            self.notifier.put(msg)

-        passive.join()
-        active.join()

     def test_rdmacm_sync_traffic(self):
-        self.two_nodes_rdmacm_traffic(self.ip_addr, sync_traffic)
+        self.two_nodes_rdmacm_traffic(CMSyncConnection, self.rdmacm_traffic)

     def test_rdmacm_async_traffic(self):
-        self.two_nodes_rdmacm_traffic(self.ip_addr, async_traffic)
+        self.two_nodes_rdmacm_traffic(CMAsyncConnection, self.rdmacm_traffic)
+
+    @requires_mcast_support()
+    def test_rdmacm_async_multicast_traffic(self):
+        self.two_nodes_rdmacm_traffic(CMAsyncConnection,
+                                      self.rdmacm_multicast_traffic,
+                                      port_space=ce.RDMA_PS_UDP)
+
+    @requires_mcast_support()
+    def test_rdmacm_async_ex_multicast_traffic(self):
+        self.two_nodes_rdmacm_traffic(CMAsyncConnection,
+                                      self.rdmacm_multicast_traffic,
+                                      port_space=ce.RDMA_PS_UDP, extended=True)

     def test_rdmacm_async_traffic_external_qp(self):
-        self.two_nodes_rdmacm_traffic(self.ip_addr, async_traffic_with_ext_qp)
+        self.two_nodes_rdmacm_traffic(CMAsyncConnection, self.rdmacm_traffic,
+                                      with_ext_qp=True)
+
+    def test_rdmacm_async_udp_traffic(self):
+        self.two_nodes_rdmacm_traffic(CMAsyncConnection, self.rdmacm_traffic,
+                                      port_space=ce.RDMA_PS_UDP)
+
+    def test_rdmacm_async_read(self):
+        self.two_nodes_rdmacm_traffic(CMAsyncConnection,
+                                      self.rdmacm_remote_traffic,
+                                      remote_op='read')
+
+    def test_rdmacm_async_write(self):
+        self.two_nodes_rdmacm_traffic(CMAsyncConnection,
+                                      self.rdmacm_remote_traffic,
+                                      remote_op='write')
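The process handling in `two_nodes_rdmacm_traffic()` is plain `multiprocessing`; stripped of the CM specifics it reduces to the pattern below. The `flow()` body is a hypothetical stand-in for one side of the traffic:

```python
import multiprocessing as mp

def flow(passive, syncer, notifier):
    try:
        syncer.wait()              # rendezvous with the peer process
        # ... one side of the RDMACM traffic would run here ...
    except Exception as ex:
        notifier.put(str(ex))      # report failures to the parent

ctx = mp.get_context('fork')
syncer = ctx.Barrier(2, timeout=15)
notifier = ctx.Queue()
procs = [ctx.Process(target=flow, args=(p, syncer, notifier))
         for p in (True, False)]
for p in procs:
    p.start()
for p in procs:
    p.join(15)                     # bounded wait, as in the test above
```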
diff --git a/tests/test_relaxed_ordering.py b/tests/test_relaxed_ordering.py
index 27af992..3a6b18a 100644
--- a/tests/test_relaxed_ordering.py
+++ b/tests/test_relaxed_ordering.py
@@ -1,8 +1,12 @@
+import unittest
+import errno
+
 from tests.base import RCResources, UDResources, XRCResources
 from tests.utils import traffic, xrc_traffic
 from tests.base import RDMATestCase
 from pyverbs.mr import MR
 import pyverbs.enums as e
+from pyverbs.pyverbs_error import PyverbsRDMAError


 class RoUD(UDResources):
@@ -30,16 +34,17 @@ class RoTestCase(RDMATestCase):
         self.qp_dict = {'rc': RoRC, 'ud': RoUD, 'xrc': RoXRC}

     def create_players(self, qp_type):
-        client = self.qp_dict[qp_type](self.dev_name, self.ib_port,
-                                       self.gid_index)
-        server = self.qp_dict[qp_type](self.dev_name, self.ib_port,
-                                       self.gid_index)
-        if qp_type == 'xrc':
-            client.pre_run(server.psns, server.qps_num)
-            server.pre_run(client.psns, client.qps_num)
-        else:
-            client.pre_run(server.psn, server.qpn)
-            server.pre_run(client.psn, client.qpn)
+        try:
+            client = self.qp_dict[qp_type](self.dev_name, self.ib_port,
+                                           self.gid_index)
+            server = self.qp_dict[qp_type](self.dev_name, self.ib_port,
+                                           self.gid_index)
+        except PyverbsRDMAError as ex:
+            if ex.error_code == errno.EOPNOTSUPP:
+                raise unittest.SkipTest('Create player with attrs {} is not supported'.format(qp_type))
+            raise ex
+        client.pre_run(server.psns, server.qps_num)
+        server.pre_run(client.psns, client.qps_num)
         return client, server

     def test_ro_rc_traffic(self):
diff --git a/tests/test_shared_pd.py b/tests/test_shared_pd.py
new file mode 100644
index 0000000..2a27886
--- /dev/null
+++ b/tests/test_shared_pd.py
@@ -0,0 +1,95 @@
+# SPDX-License-Identifier: (GPL-2.0 OR Linux-OpenIB)
+# Copyright (c) 2020 Mellanox Technologies, Inc. All rights reserved. See COPYING file
+"""
+Test module for Shared PD.
+"""
+import unittest
+import errno
+import os
+
+from tests.test_qpex import QpExRCRDMAWrite
+from tests.base import RDMATestCase
+from pyverbs.device import Context
+from pyverbs.pd import PD
+from pyverbs.mr import MR
+import pyverbs.enums as e
+import tests.utils as u
+
+
+def get_import_res_class(base_class):
+    """
+    This function creates a class that inherits base_class of any BaseResources
+    type. Its purpose is to behave exactly as base_class does, except for the
+    object creation: instead of creating context, PD and MR, it imports them.
+    Hence the returned class must be initialized with (cmd_fd, pd_handle,
+    mr_handle, mr_addr, **kwargs), where kwargs are the arguments needed
+    (if any) for base_class. In addition it has an unimport_resources() method
+    which unimports all the resources and closes the imported PD object.
+    :param base_class: The base resources class to inherit from
+    :return: ImportResources(cmd_fd, pd_handle, mr_handle, mr_addr, **kwargs)
+             class
+    """
+    class ImportResources(base_class):
+        def __init__(self, cmd_fd, pd_handle, mr_handle, mr_addr=None, **kwargs):
+            self.cmd_fd = cmd_fd
+            self.pd_handle = pd_handle
+            self.mr_handle = mr_handle
+            self.mr_addr = mr_addr
+            super(ImportResources, self).__init__(**kwargs)
+
+        def create_context(self):
+            try:
+                self.ctx = Context(cmd_fd=self.cmd_fd)
+            except u.PyverbsRDMAError as ex:
+                if ex.error_code in [errno.EOPNOTSUPP, errno.EPROTONOSUPPORT]:
+                    raise unittest.SkipTest('Importing a device is not supported')
+                raise ex
+
+        def create_pd(self):
+            self.pd = PD(self.ctx, handle=self.pd_handle)
+
+        def create_mr(self):
+            self.mr = MR(self.pd, handle=self.mr_handle, address=self.mr_addr)
+
+        def unimport_resources(self):
+            self.mr.unimport()
+            self.pd.unimport()
+            self.pd.close()
+
+    return ImportResources
+
+
+class SharedPDTestCase(RDMATestCase):
+    def setUp(self):
+        super().setUp()
+        self.iters = 10
+        self.server_res = None
+        self.imported_res = []
+
+    def tearDown(self):
+        for res in self.imported_res:
+            res.unimport_resources()
+        super().tearDown()
+
+    def test_imported_rc_ex_rdma_write(self):
+        setup_params = {'dev_name': self.dev_name, 'ib_port': self.ib_port,
+                        'gid_index': self.gid_index}
+        self.server_res = QpExRCRDMAWrite(**setup_params)
+        cmd_fd_dup = os.dup(self.server_res.ctx.cmd_fd)
+        import_cls = get_import_res_class(QpExRCRDMAWrite)
+        server_import = import_cls(
+            cmd_fd_dup, self.server_res.pd.handle, self.server_res.mr.handle,
+            # The imported MR's address is NULL, so using the address of the
+            # "main" MR object to be able to validate the message
+            self.server_res.mr.buf,
+            **setup_params)
+        self.imported_res.append(server_import)
+        client = QpExRCRDMAWrite(**setup_params)
+        client.pre_run(server_import.psns, server_import.qps_num)
+        server_import.pre_run(client.psns, client.qps_num)
+        client.rkey = server_import.mr.rkey
+        server_import.rkey = client.mr.rkey
+        client.raddr = server_import.mr.buf
+        server_import.raddr = client.mr.buf
+        u.rdma_traffic(client, server_import, self.iters, self.gid_index,
+                       self.ib_port, send_op=e.IBV_QP_EX_WITH_RDMA_WRITE,
+                       new_send=True)
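The class factory above hides a short linear flow. Condensed into one process for illustration (the device name and message size are assumptions; the real test hands `cmd_fd` to a second process):

```python
import os
from pyverbs.device import Context
from pyverbs.pd import PD
from pyverbs.mr import MR
import pyverbs.enums as e

ctx = Context(name='mlx5_0')                 # owning side (assumed device)
pd = PD(ctx)
mr = MR(pd, 4096, e.IBV_ACCESS_LOCAL_WRITE)

cmd_fd = os.dup(ctx.cmd_fd)                  # fd that would cross processes
imp_ctx = Context(cmd_fd=cmd_fd)             # import the device
imp_pd = PD(imp_ctx, handle=pd.handle)       # import the PD by handle
imp_mr = MR(imp_pd, handle=mr.handle)        # import the MR by handle
imp_mr.unimport()                            # the importer cleans up first
imp_pd.unimport()
```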
diff --git a/tests/utils.py b/tests/utils.py
index 45bb735..29f643f 100755
--- a/tests/utils.py
+++ b/tests/utils.py
@@ -4,6 +4,7 @@ Provide some useful helper function for pyverbs' tests.
 """
 from itertools import combinations as com
+import errno
 import unittest
 import random
 import socket
@@ -11,11 +12,12 @@ import os

 from pyverbs.pyverbs_error import PyverbsError, PyverbsRDMAError
 from pyverbs.addr import AHAttr, AH, GlobalRoute
+from tests.base import XRCResources, DCT_KEY
 from pyverbs.wr import SGE, SendWR, RecvWR
-from pyverbs.qp import QPCap, QPInitAttrEx
+from pyverbs.qp import QPCap, QPInitAttr, QPInitAttrEx
+from tests.mlx5_base import Mlx5DcResources
 from pyverbs.base import PyverbsRDMAErrno
 from pyverbs.mr import MW, MWBindInfo
-from tests.base import XRCResources
 from pyverbs.cq import PollCqAttr
 import pyverbs.device as d
 import pyverbs.enums as e
@@ -238,6 +240,19 @@ def random_qp_init_attr_ex(attr_ex, attr, qpt=None):
     return qia


+def get_qp_init_attr(cq, attr):
+    """
+    Creates a QPInitAttr object with random QP capabilities and a random
+    signaling mode.
+    :param cq: CQ to be used as send and receive CQ
+    :param attr: Device attributes for capability checks
+    :return: An initialized QPInitAttr object
+    """
+    qp_cap = random_qp_cap(attr)
+    sig = random.randint(0, 1)
+    return QPInitAttr(scq=cq, rcq=cq, cap=qp_cap, sq_sig_all=sig)
+
+
 def wc_status_to_str(status):
     try:
         return \
@@ -267,12 +282,17 @@ def create_custom_mr(agr_obj, additional_access_flags=0, size=None):
     :param size: MR's length. If None, agr_obj.msg_size is used.
     """
     mr_length = size if size else agr_obj.msg_size
-    return MR(agr_obj.pd, mr_length,
-              e.IBV_ACCESS_LOCAL_WRITE | additional_access_flags)
+    try:
+        return MR(agr_obj.pd, mr_length,
+                  e.IBV_ACCESS_LOCAL_WRITE | additional_access_flags)
+    except PyverbsRDMAError as ex:
+        if ex.error_code == errno.EOPNOTSUPP:
+            raise unittest.SkipTest(f'Create custom mr with additional access flags {additional_access_flags} is not supported')
+        raise ex


 # Traffic helpers
-def get_send_element(agr_obj, is_server):
+def get_send_elements(agr_obj, is_server, opcode=e.IBV_WR_SEND):
     """
     Creates a single SGE and a single Send WR for agr_obj's QP type. The
     content of the message is either 's' for server side or 'c' for client
     side.
@@ -287,8 +307,10 @@
     msg = (agr_obj.msg_size + offset) * ('s' if is_server else 'c')
     mr.write(msg, agr_obj.msg_size + offset)
     sge = SGE(mr.buf + offset, agr_obj.msg_size, mr.lkey)
-    return SendWR(num_sge=1, sg=[sge]), sge
-
+    send_wr = SendWR(opcode=opcode, num_sge=1, sg=[sge])
+    if opcode in [e.IBV_WR_RDMA_WRITE, e.IBV_WR_RDMA_READ]:
+        send_wr.set_wr_rdma(int(agr_obj.rkey), int(agr_obj.remote_addr))
+    return send_wr, sge

 def get_recv_wr(agr_obj):
     """
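With the opcode-aware `get_send_elements()`, an RDMA work request now carries the remote key and address on the WR itself. A short sketch of the same construction, assuming `mr`, `qp`, `msg_size`, `rkey` and `remote_addr` already exist:

```python
from pyverbs.wr import SGE, SendWR
import pyverbs.enums as e

sge = SGE(mr.buf, msg_size, mr.lkey)         # local buffer to send from
wr = SendWR(opcode=e.IBV_WR_RDMA_WRITE, num_sge=1, sg=[sge])
wr.set_wr_rdma(int(rkey), int(remote_addr))  # remote side of the operation
qp.post_send(wr, None)
```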
@@ -313,52 +335,71 @@
     return AH(agr_obj.pd, attr=ah_attr)


+def get_global_route(ctx, gid_index=0, port_num=1):
+    """
+    Queries the provided Context's gid and creates a GlobalRoute
+    object with sgid_index and the queried GID as dgid.
+    :param ctx: Context object to query
+    :param gid_index: GID index to query and use. Default: 0, as it's always
+                      valid
+    :param port_num: Number of the port to query. Default: 1
+    :return: GlobalRoute object
+    """
+    gid = ctx.query_gid(port_num, gid_index)
+    gr = GlobalRoute(dgid=gid, sgid_index=gid_index)
+    return gr
+
+
 def xrc_post_send(agr_obj, qp_num, send_object, gid_index, port, send_op=None):
-    agr_obj.qp = agr_obj.sqp_lst[qp_num]
+    agr_obj.qps = agr_obj.sqp_lst
     if send_op:
         post_send_ex(agr_obj, send_object, gid_index, port, send_op)
     else:
         post_send(agr_obj, send_object, gid_index, port)


-def post_send_ex(agr_obj, send_object, gid_index, port, send_op=None):
-    qp_type = agr_obj.qp.qp_type
-    agr_obj.qp.wr_start()
-    agr_obj.qp.wr_id = 0x123
-    agr_obj.qp.wr_flags = e.IBV_SEND_SIGNALED
+def post_send_ex(agr_obj, send_object, gid_index, port, send_op=None, qp_idx=0):
+    qp = agr_obj.qps[qp_idx]
+    qp_type = qp.qp_type
+    qp.wr_start()
+    qp.wr_id = 0x123
+    qp.wr_flags = e.IBV_SEND_SIGNALED
     if send_op == e.IBV_QP_EX_WITH_SEND:
-        agr_obj.qp.wr_send()
+        qp.wr_send()
     elif send_op == e.IBV_QP_EX_WITH_RDMA_WRITE:
-        agr_obj.qp.wr_rdma_write(agr_obj.rkey, agr_obj.raddr)
+        qp.wr_rdma_write(agr_obj.rkey, agr_obj.raddr)
     elif send_op == e.IBV_QP_EX_WITH_SEND_WITH_IMM:
-        agr_obj.qp.wr_send_imm(IMM_DATA)
+        qp.wr_send_imm(IMM_DATA)
     elif send_op == e.IBV_QP_EX_WITH_RDMA_WRITE_WITH_IMM:
-        agr_obj.qp.wr_rdma_write_imm(agr_obj.rkey, agr_obj.raddr, IMM_DATA)
+        qp.wr_rdma_write_imm(agr_obj.rkey, agr_obj.raddr, IMM_DATA)
     elif send_op == e.IBV_QP_EX_WITH_RDMA_READ:
-        agr_obj.qp.wr_rdma_read(agr_obj.rkey, agr_obj.raddr)
+        qp.wr_rdma_read(agr_obj.rkey, agr_obj.raddr)
     elif send_op == e.IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP:
         # We're checking the returned value (remote's content), so cmp/swp
         # values are of no importance.
-        agr_obj.qp.wr_atomic_cmp_swp(agr_obj.rkey, agr_obj.raddr, 42, 43)
+        qp.wr_atomic_cmp_swp(agr_obj.rkey, agr_obj.raddr, 42, 43)
     elif send_op == e.IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD:
-        agr_obj.qp.wr_atomic_fetch_add(agr_obj.rkey, agr_obj.raddr, 1)
+        qp.wr_atomic_fetch_add(agr_obj.rkey, agr_obj.raddr, 1)
     elif send_op == e.IBV_QP_EX_WITH_BIND_MW:
         bind_info = MWBindInfo(agr_obj.mr, agr_obj.mr.buf, agr_obj.mr.rkey,
                                e.IBV_ACCESS_REMOTE_WRITE)
         mw = MW(agr_obj.pd, mw_type=e.IBV_MW_TYPE_2)
         # A new rkey is needed to be set into bind_info, modify rkey
-        agr_obj.qp.wr_bind_mw(mw, agr_obj.mr.rkey + 12, bind_info)
-        agr_obj.qp.wr_send()
+        qp.wr_bind_mw(mw, agr_obj.mr.rkey + 12, bind_info)
+        qp.wr_send()
     if qp_type == e.IBV_QPT_UD:
         ah = get_global_ah(agr_obj, gid_index, port)
-        agr_obj.qp.wr_set_ud_addr(ah, agr_obj.rqpn, agr_obj.UD_QKEY)
+        qp.wr_set_ud_addr(ah, agr_obj.rqps_num[qp_idx], agr_obj.UD_QKEY)
     if qp_type == e.IBV_QPT_XRC_SEND:
-        agr_obj.qp.wr_set_xrc_srqn(agr_obj.remote_srqn)
-    agr_obj.qp.wr_set_sge(send_object)
-    agr_obj.qp.wr_complete()
+        qp.wr_set_xrc_srqn(agr_obj.remote_srqn)
+    if isinstance(agr_obj, Mlx5DcResources):
+        ah = get_global_ah(agr_obj, gid_index, port)
+        qp.wr_set_dc_addr(ah, agr_obj.remote_dct_num, DCT_KEY)
+    qp.wr_set_sge(send_object)
+    qp.wr_complete()
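UD (and, on mlx5, DC) sends resolve their destination per work request through an address handle. A hedged sketch of the `get_global_route()`/AH path used above; the `AHAttr` keyword names are assumptions based on pyverbs conventions:

```python
from pyverbs.addr import AHAttr, AH

# Sketch only: ctx, pd, qp, remote_qpn and qkey are assumed to exist.
gr = get_global_route(ctx, gid_index=0, port_num=1)  # GID 0 is always valid
ah_attr = AHAttr(port_num=1, is_global=1, gr=gr)     # global (GRH) routing
ah = AH(pd, attr=ah_attr)
qp.wr_set_ud_addr(ah, remote_qpn, qkey)              # extended-QP UD address
```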

-def post_send(agr_obj, send_wr, gid_index, port):
+def post_send(agr_obj, send_wr, gid_index, port, qp_idx=0):
     """
     Post a single send WR to the QP. Post_send's second parameter (send bad wr)
     is ignored for simplicity. For UD traffic an address vector is added as
@@ -367,26 +408,28 @@
     :param send_wr: Send work request to post send
     :param gid_index: Local gid index
     :param port: IB port number
+    :param qp_idx: QP index to use
     :return: None
     """
     qp_type = agr_obj.qp.qp_type
     if qp_type == e.IBV_QPT_UD:
         ah = get_global_ah(agr_obj, gid_index, port)
-        send_wr.set_wr_ud(ah, agr_obj.rqpn, agr_obj.UD_QKEY)
-    agr_obj.qp.post_send(send_wr, None)
+        send_wr.set_wr_ud(ah, agr_obj.rqps_num[qp_idx], agr_obj.UD_QKEY)
+    agr_obj.qps[qp_idx].post_send(send_wr, None)


-def post_recv(qp, recv_wr, num_wqes=1):
+def post_recv(agr_obj, recv_wr, qp_idx=0, num_wqes=1):
     """
     Call the QP's post_recv() method <num_wqes> times. Post_recv's second
     parameter (recv bad wr) is ignored for simplicity.
-    :param qp: QP which posts receive work request
+    :param agr_obj: Aggregation object which contains the SRQ or QPs to use
     :param recv_wr: Receive work request to post
+    :param qp_idx: QP index which posts receive work request
     :param num_wqes: Number of WQEs to post
     :return: None
     """
+    receive_queue = agr_obj.srq if agr_obj.srq else agr_obj.qps[qp_idx]
     for _ in range(num_wqes):
-        qp.post_recv(recv_wr, None)
+        receive_queue.post_recv(recv_wr, None)


 def poll_cq(cq, count=1, data=None):
@@ -484,13 +527,14 @@
                            format(exp=expected_str, rcv=received_str))


-def send(agr_obj, send_wr, gid_index, port, send_op=None):
-    if send_op:
-        return post_send_ex(agr_obj, send_wr, gid_index, port, send_op)
-    return post_send(agr_obj, send_wr, gid_index, port)
+def send(agr_obj, send_object, gid_index, port, send_op=None, new_send=False,
+         qp_idx=0):
+    if new_send:
+        return post_send_ex(agr_obj, send_object, gid_index, port, send_op,
+                            qp_idx)
+    return post_send(agr_obj, send_object, gid_index, port, qp_idx)


-def traffic(client, server, iters, gid_idx, port, is_cq_ex=False, send_op=None):
+def traffic(client, server, iters, gid_idx, port, is_cq_ex=False, send_op=None,
+            new_send=False):
     """
     Runs basic traffic between two sides
     :param client: client side, clients base class is BaseTraffic
     :param server: server side, server base class is BaseTraffic
@@ -499,7 +543,8 @@
     :param gid_idx: local gid index
     :param port: IB port
     :param is_cq_ex: If True, use poll_cq_ex() rather than poll_cq()
-    :param send_op: If not None, new post send API is assumed.
+    :param send_op: The send_wr opcode.
+    :param new_send: If True, use the new post send API.
     :return:
     """
     poll = poll_cq_ex if is_cq_ex else poll_cq
@@ -508,31 +553,49 @@
         imm_data = IMM_DATA
     else:
         imm_data = None
-    # Using the new post send API, we need the SGE, not the SendWR
-    send_element_idx = 1 if send_op else 0
     s_recv_wr = get_recv_wr(server)
     c_recv_wr = get_recv_wr(client)
-    post_recv(client.qp, c_recv_wr, client.num_msgs)
-    post_recv(server.qp, s_recv_wr, server.num_msgs)
+    for qp_idx in range(server.qp_count):
+        # Prepare the receive queue with RecvWRs
+        post_recv(client, c_recv_wr, qp_idx=qp_idx)
+        post_recv(server, s_recv_wr, qp_idx=qp_idx)
     read_offset = GRH_SIZE if client.qp.qp_type == e.IBV_QPT_UD else 0
     for _ in range(iters):
-        c_send_wr = get_send_element(client, False)[send_element_idx]
-        send(client, c_send_wr, gid_idx, port, send_op)
-        poll(client.cq)
-        poll(server.cq, data=imm_data)
-        post_recv(server.qp, s_recv_wr)
-        msg_received = server.mr.read(server.msg_size, read_offset)
-        validate(msg_received, True, server.msg_size)
-        s_send_wr = get_send_element(server, True)[send_element_idx]
-        send(server, s_send_wr, gid_idx, port, send_op)
-        poll(server.cq)
-        poll(client.cq, data=imm_data)
-        post_recv(client.qp, c_recv_wr)
-        msg_received = client.mr.read(client.msg_size, read_offset)
-        validate(msg_received, False, client.msg_size)
-
-
-def rdma_traffic(client, server, iters, gid_idx, port, is_cq_ex=False, send_op=None):
+        for qp_idx in range(server.qp_count):
+            c_send_wr, c_sg = get_send_elements(client, False)
+            if client.use_mr_prefetch:
+                flags = e._IBV_ADVISE_MR_FLAG_FLUSH
+                if client.use_mr_prefetch == 'async':
+                    flags = 0
+                prefetch_mrs(client, [c_sg], advice=client.prefetch_advice,
+                             flags=flags)
+            c_send_object = c_sg if send_op else c_send_wr
+            send(client, c_send_object, gid_idx, port, send_op, new_send,
+                 qp_idx)
+            poll(client.cq)
+            poll(server.cq, data=imm_data)
+            post_recv(server, s_recv_wr, qp_idx=qp_idx)
+            msg_received = server.mr.read(server.msg_size, read_offset)
+            validate(msg_received, True, server.msg_size)
+            s_send_wr, s_sg = get_send_elements(server, True)
+            if server.use_mr_prefetch:
+                flags = e._IBV_ADVISE_MR_FLAG_FLUSH
+                if server.use_mr_prefetch == 'async':
+                    flags = 0
+                prefetch_mrs(server, [s_sg], advice=server.prefetch_advice,
+                             flags=flags)
+            s_send_object = s_sg if send_op else s_send_wr
+            send(server, s_send_object, gid_idx, port, send_op, new_send,
+                 qp_idx)
+            poll(server.cq)
+            poll(client.cq, data=imm_data)
+            post_recv(client, c_recv_wr, qp_idx=qp_idx)
+            msg_received = client.mr.read(client.msg_size, read_offset)
+            validate(msg_received, False, client.msg_size)
+
+
+def rdma_traffic(client, server, iters, gid_idx, port, new_send=False,
+                 send_op=None):
     """
     Runs basic RDMA traffic between two sides. No receive WQEs are posted. For
     RDMA send with immediate, use traffic().
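`traffic()` can now prefetch each send buffer before posting it. A sketch of how a test might opt in; the `'async'` marker and the `prefetch_advice` attribute are assumptions read off the loop above:

```python
import pyverbs.enums as e
import tests.utils as u

# Assumed resource attributes consumed by u.traffic() above.
client.use_mr_prefetch = 'async'  # any truthy value; 'async' drops the FLUSH flag
client.prefetch_advice = e._IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE
u.traffic(client, server, 10, 0, 1)  # iters=10, gid_idx=0, port=1
```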
@@ -541,18 +604,19 @@
     :param client: client side, clients base class is BaseTraffic
     :param server: server side, server base class is BaseTraffic
     :param iters: number of traffic iterations
     :param gid_idx: local gid index
     :param port: IB port
-    :param is_cq_ex: If True, use poll_cq_ex() rather than poll_cq()
-    :param send_op: If not None, new post send API is assumed.
+    :param new_send: If True, use the new post send API.
+    :param send_op: The send_wr opcode.
     :return:
     """
     # Using the new post send API, we need the SGE, not the SendWR
-    send_element_idx = 1 if send_op else 0
-    same_side_check = (send_op == e.IBV_QP_EX_WITH_RDMA_READ or
-                       send_op == e.IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP or
-                       send_op == e.IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD)
+    send_element_idx = 1 if new_send else 0
+    same_side_check = send_op in [e.IBV_QP_EX_WITH_RDMA_READ,
+                                  e.IBV_QP_EX_WITH_ATOMIC_CMP_AND_SWP,
+                                  e.IBV_QP_EX_WITH_ATOMIC_FETCH_AND_ADD,
+                                  e.IBV_WR_RDMA_READ]
     for _ in range(iters):
-        c_send_wr = get_send_element(client, False)[send_element_idx]
-        send(client, c_send_wr, gid_idx, port, send_op)
+        c_send_wr = get_send_elements(client, False, send_op)[send_element_idx]
+        send(client, c_send_wr, gid_idx, port, send_op, new_send)
         poll_cq(client.cq)
         if same_side_check:
             msg_received = client.mr.read(client.msg_size, 0)
@@ -560,10 +624,10 @@
         msg_received = server.mr.read(server.msg_size, 0)
         validate(msg_received, False if same_side_check else True,
                  server.msg_size)
-        s_send_wr = get_send_element(server, True)[send_element_idx]
+        s_send_wr = get_send_elements(server, True, send_op)[send_element_idx]
         if same_side_check:
             client.mr.write('c' * client.msg_size, client.msg_size)
-        send(server, s_send_wr, gid_idx, port, send_op)
+        send(server, s_send_wr, gid_idx, port, send_op, new_send)
         poll_cq(server.cq)
         if same_side_check:
             msg_received = server.mr.read(client.msg_size, 0)
@@ -595,13 +659,13 @@ def xrc_traffic(client, server, is_cq_ex=False, send_op=None):
     client.remote_srqn = server.srq.get_srq_num()
     s_recv_wr = get_recv_wr(server)
     c_recv_wr = get_recv_wr(client)
-    post_recv(client.srq, c_recv_wr, client.qp_count*client.num_msgs)
-    post_recv(server.srq, s_recv_wr, server.qp_count*server.num_msgs)
+    post_recv(client, c_recv_wr, num_wqes=client.qp_count*client.num_msgs)
+    post_recv(server, s_recv_wr, num_wqes=server.qp_count*server.num_msgs)
     # Using the new post send API, we need the SGE, not the SendWR
     send_element_idx = 1 if send_op else 0
     for _ in range(client.num_msgs):
         for i in range(server.qp_count):
-            c_send_wr = get_send_element(client, False)[send_element_idx]
+            c_send_wr = get_send_elements(client, False)[send_element_idx]
             if send_op is None:
                 c_send_wr.set_qp_type_xrc(client.remote_srqn)
             xrc_post_send(client, i, c_send_wr, 0, 0, send_op)
@@ -609,7 +673,7 @@
             poll(server.cq)
             msg_received = server.mr.read(server.msg_size, 0)
             validate(msg_received, True, server.msg_size)
-            s_send_wr = get_send_element(server, True)[send_element_idx]
+            s_send_wr = get_send_elements(server, True)[send_element_idx]
             if send_op is None:
                 s_send_wr.set_qp_type_xrc(server.remote_srqn)
             xrc_post_send(server, i, s_send_wr, 0, 0, send_op)
@@ -620,33 +684,68 @@


 # Decorators
-def requires_odp(qp_type):
+def requires_odp(qp_type, required_odp_caps):
+    def outer(func):
+        def inner(instance):
+            odp_supported(instance.ctx, qp_type, required_odp_caps)
+            if getattr(instance, 'is_implicit', False):
+                odp_implicit_supported(instance.ctx)
+            return func(instance)
+        return inner
+    return outer
+
+
+def requires_root_on_eth(port_num=1):
+    def outer(func):
+        def inner(instance):
+            if not (is_eth(instance.ctx, port_num) and is_root()):
+                raise unittest.SkipTest('Must be run by root on Ethernet link layer')
+            return func(instance)
+        return inner
+    return outer
+
+
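A test opts into these gates declaratively; for example (hypothetical test method, assuming the test-case instance exposes a `ctx` attribute as the decorator requires):

```python
import pyverbs.enums as e
import tests.utils as u

class OdpTestCase(RDMATestCase):  # hypothetical test case
    @u.requires_odp('rc', e.IBV_ODP_SUPPORT_SEND | e.IBV_ODP_SUPPORT_RECV)
    def test_odp_rc_traffic(self):
        client, server = self.create_players('rc')
        u.traffic(client, server, self.iters, self.gid_index, self.ib_port)
```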
+def requires_mcast_support():
+    """
+    Skip the decorated test if the device doesn't support multicast.
+    """
     def outer(func):
         def inner(instance):
-            odp_supported(instance.ctx, qp_type)
+            ctx = d.Context(name=instance.dev_name)
+            if ctx.query_device().max_mcast_grp == 0:
+                raise unittest.SkipTest('Multicast is not supported on this device')
             return func(instance)
         return inner
     return outer


-def odp_supported(ctx, qp_type):
+def odp_supported(ctx, qp_type, required_odp_caps):
     """
     Check device ODP capabilities, support only send/recv so far.
     :param ctx: Device Context
     :param qp_type: QP type ('rc', 'ud' or 'uc')
+    :param required_odp_caps: The ODP capability mask required by the test
     :return: None
     """
     odp_caps = ctx.query_device_ex().odp_caps
     if odp_caps.general_caps == 0:
         raise unittest.SkipTest('ODP is not supported - No ODP caps')
     qp_odp_caps = getattr(odp_caps, '{}_odp_caps'.format(qp_type))
-    has_odp_send = qp_odp_caps & e.IBV_ODP_SUPPORT_SEND
-    has_odp_recv = qp_odp_caps & e.IBV_ODP_SUPPORT_SRQ_RECV if qp_type == 'xrc'\
-        else qp_odp_caps & e.IBV_ODP_SUPPORT_RECV
-    if has_odp_send == 0:
-        raise unittest.SkipTest('ODP is not supported - ODP send not supported')
-    if has_odp_recv == 0:
-        raise unittest.SkipTest('ODP is not supported - ODP recv not supported')
+    if required_odp_caps & qp_odp_caps != required_odp_caps:
+        raise unittest.SkipTest('ODP is not supported - ODP recv/send is not supported')
+
+
+def odp_implicit_supported(ctx):
+    """
+    Check device ODP implicit capability.
+    :param ctx: Device Context
+    :return: None
+    """
+    odp_caps = ctx.query_device_ex().odp_caps
+    has_odp_implicit = odp_caps.general_caps & e.IBV_ODP_SUPPORT_IMPLICIT
+    if has_odp_implicit == 0:
+        raise unittest.SkipTest('ODP implicit is not supported')


 def requires_huge_pages():
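Called directly, the reworked check takes the exact capability mask a test needs (context creation and device name are assumed):

```python
import pyverbs.device as d
import pyverbs.enums as e

ctx = d.Context(name='mlx5_0')  # assumed device name
# Raises unittest.SkipTest unless RC ODP supports both directions.
odp_supported(ctx, 'rc', e.IBV_ODP_SUPPORT_SEND | e.IBV_ODP_SUPPORT_RECV)
```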
@@ -669,3 +768,35 @@ def huge_pages_supported():
     with open(huge_path, 'r') as f:
         if not int(f.read()):
             raise unittest.SkipTest('There are no huge pages of size 2M allocated')
+
+
+def prefetch_mrs(agr_obj, sg_list, advice=e._IBV_ADVISE_MR_ADVICE_PREFETCH_WRITE,
+                 flags=e._IBV_ADVISE_MR_FLAG_FLUSH):
+    """
+    Pre-fetch a range of an on-demand paging MR.
+    :param agr_obj: Aggregation object which contains all resources necessary
+    :param sg_list: SGE list
+    :param advice: The requested advice value
+    :param flags: Describes the properties of the advice operation
+    :return: None
+    """
+    try:
+        agr_obj.pd.advise_mr(advice, flags, sg_list)
+    except PyverbsRDMAError as ex:
+        if ex.error_code == errno.EOPNOTSUPP:
+            raise unittest.SkipTest(f'Advise MR with flags ({flags}) and advice ({advice}) is not supported')
+        raise ex
+
+
+def is_eth(ctx, port_num):
+    """
+    Queries the device context's port for its link layer.
+    :param ctx: The Context to query
+    :param port_num: Which Context's port to query
+    :return: True if the port's link layer is Ethernet, else False
+    """
+    return ctx.query_port(port_num).link_layer == e.IBV_LINK_LAYER_ETHERNET
+
+
+def is_root():
+    return os.geteuid() == 0
diff --git a/util/util.c b/util/util.c
index 8c5f8f1..6af0b3a 100644
--- a/util/util.c
+++ b/util/util.c
@@ -1,4 +1,8 @@
 /* GPLv2 or OpenIB.org BSD (MIT) See COPYING file */
+#include <stdlib.h>
+#include <sys/random.h>
+#include <sys/types.h>
+#include <time.h>
 #include <unistd.h>
 #include <fcntl.h>
 #include <util/util.h>
@@ -20,3 +24,24 @@ int set_fd_nonblock(int fd, bool nonblock)
 		return -1;
 	return 0;
 }
+
+#ifndef GRND_INSECURE
+#define GRND_INSECURE 0x0004
+#endif
+unsigned int get_random(void)
+{
+	static unsigned int seed;
+	ssize_t sz;
+
+	if (!seed) {
+		sz = getrandom(&seed, sizeof(seed),
+			       GRND_NONBLOCK | GRND_INSECURE);
+		if (sz < 0)
+			sz = getrandom(&seed, sizeof(seed), GRND_NONBLOCK);
+
+		if (sz != sizeof(seed))
+			seed = time(NULL);
+	}
+
+	return rand_r(&seed);
+}
diff --git a/util/util.h b/util/util.h
index 514302b..0f2c35c 100644
--- a/util/util.h
+++ b/util/util.h
@@ -42,4 +42,5 @@ int set_fd_nonblock(int fd, bool nonblock);
 int open_cdev(const char *devname_hint, dev_t cdev);
 
+unsigned int get_random(void);
 #endif