From 6d2c1b06b13c16a073b6a56760a28d8aa754ee22 Mon Sep 17 00:00:00 2001 From: Packit Date: Sep 11 2020 16:17:44 +0000 Subject: libvma-9.0.2 base --- diff --git a/.gitignore b/.gitignore new file mode 100644 index 0000000..f99e630 --- /dev/null +++ b/.gitignore @@ -0,0 +1,64 @@ +# Object files +*.o +*.lo +*.la + +# Libraries +.libs +*.lib +*.a + +# Shared objects +*.so +*.so.* + +# Dependencies +.dirstamp +.deps + +# Automake +Makefile.in +Makefile +aclocal.m4 +autom4te.cache/ +config.h +config.h.in +config.h.in~ +config.log +config.status +config/aux/config.guess +config/aux/config.sub +config/aux/install-sh +config/aux/missing +config/aux/decomp +config/aux/ltmain.sh +config/aux/compile +config/aux/depcomp +config/m4/libtool.m4 +config/m4/ltoptions.m4 +config/m4/ltsugar.m4 +config/m4/ltversion.m4 +config/m4/lt~obsolete.m4 +configure +libtool + +# Tool +cov-int/ +jenkins/ + +# Project/Eclipse +.cproject +.project +tags + +# VMA specific +src/utils/timetest +src/stats/vma_stats +src/vlogger/vlogger_test +src/state_machine/state_machine_test +stamp-h1 + +# build products +VMA_VERSION +build/libvma.spec +debian/changelog diff --git a/.gittemplate b/.gittemplate new file mode 100644 index 0000000..44874f7 --- /dev/null +++ b/.gittemplate @@ -0,0 +1,5 @@ +issue: + +log:
+ +reviewed by: diff --git a/COPYING b/COPYING new file mode 100644 index 0000000..86822e3 --- /dev/null +++ b/COPYING @@ -0,0 +1,342 @@ + +1. ------------------------------------------------------------------------ +2. GNU GENERAL PUBLIC LICENSE +3. Version 2, June 1991 +4. +5. Copyright (C) 1989, 1991 Free Software Foundation, Inc. +6. 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +7. Everyone is permitted to copy and distribute verbatim copies +8. of this license document, but changing it is not allowed. +9. +10. Preamble +11. +12. The licenses for most software are designed to take away your +13. freedom to share and change it. By contrast, the GNU General Public +14. License is intended to guarantee your freedom to share and change free +15. software--to make sure the software is free for all its users. This +16. General Public License applies to most of the Free Software +17. Foundation's software and to any other program whose authors commit to +18. using it. (Some other Free Software Foundation software is covered by +19. the GNU Library General Public License instead.) You can apply it to +20. your programs, too. +21. +22. When we speak of free software, we are referring to freedom, not +23. price. Our General Public Licenses are designed to make sure that you +24. have the freedom to distribute copies of free software (and charge for +25. this service if you wish), that you receive source code or can get it +26. if you want it, that you can change the software or use pieces of it +27. in new free programs; and that you know you can do these things. +28. +29. To protect your rights, we need to make restrictions that forbid +30. anyone to deny you these rights or to ask you to surrender the rights. +31. These restrictions translate to certain responsibilities for you if you +32. distribute copies of the software, or if you modify it. +33. +34. For example, if you distribute copies of such a program, whether +35. 
gratis or for a fee, you must give the recipients all the rights that +36. you have. You must make sure that they, too, receive or can get the +37. source code. And you must show them these terms so they know their +38. rights. +39. +40. We protect your rights with two steps: (1) copyright the software, and +41. (2) offer you this license which gives you legal permission to copy, +42. distribute and/or modify the software. +43. +44. Also, for each author's protection and ours, we want to make certain +45. that everyone understands that there is no warranty for this free +46. software. If the software is modified by someone else and passed on, we +47. want its recipients to know that what they have is not the original, so +48. that any problems introduced by others will not reflect on the original +49. authors' reputations. +50. +51. Finally, any free program is threatened constantly by software +52. patents. We wish to avoid the danger that redistributors of a free +53. program will individually obtain patent licenses, in effect making the +54. program proprietary. To prevent this, we have made it clear that any +55. patent must be licensed for everyone's free use or not licensed at all. +56. +57. The precise terms and conditions for copying, distribution and +58. modification follow. +59. +60. GNU GENERAL PUBLIC LICENSE +61. TERMS AND CONDITIONS FOR COPYING, DISTRIBUTION AND MODIFICATION +62. +63. 0. This License applies to any program or other work which contains +64. a notice placed by the copyright holder saying it may be distributed +65. under the terms of this General Public License. The "Program", below, +66. refers to any such program or work, and a "work based on the Program" +67. means either the Program or any derivative work under copyright law: +68. that is to say, a work containing the Program or a portion of it, +69. either verbatim or with modifications and/or translated into another +70. language. 
(Hereinafter, translation is included without limitation in +71. the term "modification".) Each licensee is addressed as "you". +72. +73. Activities other than copying, distribution and modification are not +74. covered by this License; they are outside its scope. The act of +75. running the Program is not restricted, and the output from the Program +76. is covered only if its contents constitute a work based on the +77. Program (independent of having been made by running the Program). +78. Whether that is true depends on what the Program does. +79. +80. 1. You may copy and distribute verbatim copies of the Program's +81. source code as you receive it, in any medium, provided that you +82. conspicuously and appropriately publish on each copy an appropriate +83. copyright notice and disclaimer of warranty; keep intact all the +84. notices that refer to this License and to the absence of any warranty; +85. and give any other recipients of the Program a copy of this License +86. along with the Program. +87. +88. You may charge a fee for the physical act of transferring a copy, and +89. you may at your option offer warranty protection in exchange for a fee. +90. +91. 2. You may modify your copy or copies of the Program or any portion +92. of it, thus forming a work based on the Program, and copy and +93. distribute such modifications or work under the terms of Section 1 +94. above, provided that you also meet all of these conditions: +95. +96. a) You must cause the modified files to carry prominent notices +97. stating that you changed the files and the date of any change. +98. +99. b) You must cause any work that you distribute or publish, that in +100. whole or in part contains or is derived from the Program or any +101. part thereof, to be licensed as a whole at no charge to all third +102. parties under the terms of this License. +103. +104. c) If the modified program normally reads commands interactively +105. 
when run, you must cause it, when started running for such +106. interactive use in the most ordinary way, to print or display an +107. announcement including an appropriate copyright notice and a +108. notice that there is no warranty (or else, saying that you provide +109. a warranty) and that users may redistribute the program under +110. these conditions, and telling the user how to view a copy of this +111. License. (Exception: if the Program itself is interactive but +112. does not normally print such an announcement, your work based on +113. the Program is not required to print an announcement.) +114. +115. These requirements apply to the modified work as a whole. If +116. identifiable sections of that work are not derived from the Program, +117. and can be reasonably considered independent and separate works in +118. themselves, then this License, and its terms, do not apply to those +119. sections when you distribute them as separate works. But when you +120. distribute the same sections as part of a whole which is a work based +121. on the Program, the distribution of the whole must be on the terms of +122. this License, whose permissions for other licensees extend to the +123. entire whole, and thus to each and every part regardless of who wrote it. +124. +125. Thus, it is not the intent of this section to claim rights or contest +126. your rights to work written entirely by you; rather, the intent is to +127. exercise the right to control the distribution of derivative or +128. collective works based on the Program. +129. +130. In addition, mere aggregation of another work not based on the Program +131. with the Program (or with a work based on the Program) on a volume of +132. a storage or distribution medium does not bring the other work under +133. the scope of this License. +134. +135. 3. You may copy and distribute the Program (or a work based on it, +136. under Section 2) in object code or executable form under the terms of +137. 
Sections 1 and 2 above provided that you also do one of the following: +138. +139. a) Accompany it with the complete corresponding machine-readable +140. source code, which must be distributed under the terms of Sections +141. 1 and 2 above on a medium customarily used for software interchange; or, +142. +143. b) Accompany it with a written offer, valid for at least three +144. years, to give any third party, for a charge no more than your +145. cost of physically performing source distribution, a complete +146. machine-readable copy of the corresponding source code, to be +147. distributed under the terms of Sections 1 and 2 above on a medium +148. customarily used for software interchange; or, +149. +150. c) Accompany it with the information you received as to the offer +151. to distribute corresponding source code. (This alternative is +152. allowed only for noncommercial distribution and only if you +153. received the program in object code or executable form with such +154. an offer, in accord with Subsection b above.) +155. +156. The source code for a work means the preferred form of the work for +157. making modifications to it. For an executable work, complete source +158. code means all the source code for all modules it contains, plus any +159. associated interface definition files, plus the scripts used to +160. control compilation and installation of the executable. However, as a +161. special exception, the source code distributed need not include +162. anything that is normally distributed (in either source or binary +163. form) with the major components (compiler, kernel, and so on) of the +164. operating system on which the executable runs, unless that component +165. itself accompanies the executable. +166. +167. If distribution of executable or object code is made by offering +168. access to copy from a designated place, then offering equivalent +169. access to copy the source code from the same place counts as +170. 
distribution of the source code, even though third parties are not +171. compelled to copy the source along with the object code. +172. +173. 4. You may not copy, modify, sublicense, or distribute the Program +174. except as expressly provided under this License. Any attempt +175. otherwise to copy, modify, sublicense or distribute the Program is +176. void, and will automatically terminate your rights under this License. +177. However, parties who have received copies, or rights, from you under +178. this License will not have their licenses terminated so long as such +179. parties remain in full compliance. +180. +181. 5. You are not required to accept this License, since you have not +182. signed it. However, nothing else grants you permission to modify or +183. distribute the Program or its derivative works. These actions are +184. prohibited by law if you do not accept this License. Therefore, by +185. modifying or distributing the Program (or any work based on the +186. Program), you indicate your acceptance of this License to do so, and +187. all its terms and conditions for copying, distributing or modifying +188. the Program or works based on it. +189. +190. 6. Each time you redistribute the Program (or any work based on the +191. Program), the recipient automatically receives a license from the +192. original licensor to copy, distribute or modify the Program subject to +193. these terms and conditions. You may not impose any further +194. restrictions on the recipients' exercise of the rights granted herein. +195. You are not responsible for enforcing compliance by third parties to +196. this License. +197. +198. 7. If, as a consequence of a court judgment or allegation of patent +199. infringement or for any other reason (not limited to patent issues), +200. conditions are imposed on you (whether by court order, agreement or +201. otherwise) that contradict the conditions of this License, they do not +202. excuse you from the conditions of this License. 
If you cannot +203. distribute so as to satisfy simultaneously your obligations under this +204. License and any other pertinent obligations, then as a consequence you +205. may not distribute the Program at all. For example, if a patent +206. license would not permit royalty-free redistribution of the Program by +207. all those who receive copies directly or indirectly through you, then +208. the only way you could satisfy both it and this License would be to +209. refrain entirely from distribution of the Program. +210. +211. If any portion of this section is held invalid or unenforceable under +212. any particular circumstance, the balance of the section is intended to +213. apply and the section as a whole is intended to apply in other +214. circumstances. +215. +216. It is not the purpose of this section to induce you to infringe any +217. patents or other property right claims or to contest validity of any +218. such claims; this section has the sole purpose of protecting the +219. integrity of the free software distribution system, which is +220. implemented by public license practices. Many people have made +221. generous contributions to the wide range of software distributed +222. through that system in reliance on consistent application of that +223. system; it is up to the author/donor to decide if he or she is willing +224. to distribute software through any other system and a licensee cannot +225. impose that choice. +226. +227. This section is intended to make thoroughly clear what is believed to +228. be a consequence of the rest of this License. +229. +230. 8. If the distribution and/or use of the Program is restricted in +231. certain countries either by patents or by copyrighted interfaces, the +232. original copyright holder who places the Program under this License +233. may add an explicit geographical distribution limitation excluding +234. those countries, so that distribution is permitted only in or among +235. countries not thus excluded. 
In such case, this License incorporates +236. the limitation as if written in the body of this License. +237. +238. 9. The Free Software Foundation may publish revised and/or new versions +239. of the General Public License from time to time. Such new versions will +240. be similar in spirit to the present version, but may differ in detail to +241. address new problems or concerns. +242. +243. Each version is given a distinguishing version number. If the Program +244. specifies a version number of this License which applies to it and "any +245. later version", you have the option of following the terms and conditions +246. either of that version or of any later version published by the Free +247. Software Foundation. If the Program does not specify a version number of +248. this License, you may choose any version ever published by the Free Software +249. Foundation. +250. +251. 10. If you wish to incorporate parts of the Program into other free +252. programs whose distribution conditions are different, write to the author +253. to ask for permission. For software which is copyrighted by the Free +254. Software Foundation, write to the Free Software Foundation; we sometimes +255. make exceptions for this. Our decision will be guided by the two goals +256. of preserving the free status of all derivatives of our free software and +257. of promoting the sharing and reuse of software generally. +258. +259. NO WARRANTY +260. +261. 11. BECAUSE THE PROGRAM IS LICENSED FREE OF CHARGE, THERE IS NO WARRANTY +262. FOR THE PROGRAM, TO THE EXTENT PERMITTED BY APPLICABLE LAW. EXCEPT WHEN +263. OTHERWISE STATED IN WRITING THE COPYRIGHT HOLDERS AND/OR OTHER PARTIES +264. PROVIDE THE PROGRAM "AS IS" WITHOUT WARRANTY OF ANY KIND, EITHER EXPRESSED +265. OR IMPLIED, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF +266. MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE. THE ENTIRE RISK AS +267. TO THE QUALITY AND PERFORMANCE OF THE PROGRAM IS WITH YOU. SHOULD THE +268. 
PROGRAM PROVE DEFECTIVE, YOU ASSUME THE COST OF ALL NECESSARY SERVICING, +269. REPAIR OR CORRECTION. +270. +271. 12. IN NO EVENT UNLESS REQUIRED BY APPLICABLE LAW OR AGREED TO IN WRITING +272. WILL ANY COPYRIGHT HOLDER, OR ANY OTHER PARTY WHO MAY MODIFY AND/OR +273. REDISTRIBUTE THE PROGRAM AS PERMITTED ABOVE, BE LIABLE TO YOU FOR DAMAGES, +274. INCLUDING ANY GENERAL, SPECIAL, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING +275. OUT OF THE USE OR INABILITY TO USE THE PROGRAM (INCLUDING BUT NOT LIMITED +276. TO LOSS OF DATA OR DATA BEING RENDERED INACCURATE OR LOSSES SUSTAINED BY +277. YOU OR THIRD PARTIES OR A FAILURE OF THE PROGRAM TO OPERATE WITH ANY OTHER +278. PROGRAMS), EVEN IF SUCH HOLDER OR OTHER PARTY HAS BEEN ADVISED OF THE +279. POSSIBILITY OF SUCH DAMAGES. +280. +281. END OF TERMS AND CONDITIONS +282. +283. How to Apply These Terms to Your New Programs +284. +285. If you develop a new program, and you want it to be of the greatest +286. possible use to the public, the best way to achieve this is to make it +287. free software which everyone can redistribute and change under these terms. +288. +289. To do so, attach the following notices to the program. It is safest +290. to attach them to the start of each source file to most effectively +291. convey the exclusion of warranty; and each file should have at least +292. the "copyright" line and a pointer to where the full notice is found. +293. +294. +295. Copyright (C) +296. +297. This program is free software; you can redistribute it and/or modify +298. it under the terms of the GNU General Public License as published by +299. the Free Software Foundation; either version 2 of the License, or +300. (at your option) any later version. +301. +302. This program is distributed in the hope that it will be useful, +303. but WITHOUT ANY WARRANTY; without even the implied warranty of +304. MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +305. GNU General Public License for more details. +306. +307. 
You should have received a copy of the GNU General Public License +308. along with this program; if not, write to the Free Software +309. Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA +310. +311. +312. Also add information on how to contact you by electronic and paper mail. +313. +314. If the program is interactive, make it output a short notice like this +315. when it starts in an interactive mode: +316. +317. Gnomovision version 69, Copyright (C) year name of author +318. Gnomovision comes with ABSOLUTELY NO WARRANTY; for details type `show w'. +319. This is free software, and you are welcome to redistribute it +320. under certain conditions; type `show c' for details. +321. +322. The hypothetical commands `show w' and `show c' should show the appropriate +323. parts of the General Public License. Of course, the commands you use may +324. be called something other than `show w' and `show c'; they could even be +325. mouse-clicks or menu items--whatever suits your program. +326. +327. You should also get your employer (if you work as a programmer) or your +328. school, if any, to sign a "copyright disclaimer" for the program, if +329. necessary. Here is a sample; alter the names: +330. +331. Yoyodyne, Inc., hereby disclaims all copyright interest in the program +332. `Gnomovision' (which makes passes at compilers) written by James Hacker. +333. +334. , 1 April 1989 +335. Ty Coon, President of Vice +336. +337. This General Public License does not permit incorporating your program into +338. proprietary programs. If your program is a subroutine library, you may +339. consider it more useful to permit linking proprietary applications with the +340. library. If this is what you want to do, use the GNU Library General +341. Public License instead of this License. 
diff --git a/INSTALL b/INSTALL new file mode 100644 index 0000000..23e5f25 --- /dev/null +++ b/INSTALL @@ -0,0 +1,236 @@ +Installation Instructions +************************* + +Copyright (C) 1994, 1995, 1996, 1999, 2000, 2001, 2002, 2004, 2005 Free +Software Foundation, Inc. + +This file is free documentation; the Free Software Foundation gives +unlimited permission to copy, distribute and modify it. + +Basic Installation +================== + +These are generic installation instructions. + + The `configure' shell script attempts to guess correct values for +various system-dependent variables used during compilation. It uses +those values to create a `Makefile' in each directory of the package. +It may also create one or more `.h' files containing system-dependent +definitions. Finally, it creates a shell script `config.status' that +you can run in the future to recreate the current configuration, and a +file `config.log' containing compiler output (useful mainly for +debugging `configure'). + + It can also use an optional file (typically called `config.cache' +and enabled with `--cache-file=config.cache' or simply `-C') that saves +the results of its tests to speed up reconfiguring. (Caching is +disabled by default to prevent problems with accidental use of stale +cache files.) + + If you need to do unusual things to compile the package, please try +to figure out how `configure' could check whether to do them, and mail +diffs or instructions to the address given in the `README' so they can +be considered for the next release. If you are using the cache, and at +some point `config.cache' contains results you don't want to keep, you +may remove or edit it. + + The file `configure.ac' (or `configure.in') is used to create +`configure' by a program called `autoconf'. You only need +`configure.ac' if you want to change it or regenerate `configure' using +a newer version of `autoconf'. + +The simplest way to compile this package is: + + 1. 
`cd' to the directory containing the package's source code and type + `./configure' to configure the package for your system. If you're + using `csh' on an old version of System V, you might need to type + `sh ./configure' instead to prevent `csh' from trying to execute + `configure' itself. + + Running `configure' takes awhile. While running, it prints some + messages telling which features it is checking for. + + 2. Type `make' to compile the package. + + 3. Optionally, type `make check' to run any self-tests that come with + the package. + + 4. Type `make install' to install the programs and any data files and + documentation. + + 5. You can remove the program binaries and object files from the + source code directory by typing `make clean'. To also remove the + files that `configure' created (so you can compile the package for + a different kind of computer), type `make distclean'. There is + also a `make maintainer-clean' target, but that is intended mainly + for the package's developers. If you use it, you may have to get + all sorts of other programs in order to regenerate files that came + with the distribution. + +Compilers and Options +===================== + +Some systems require unusual options for compilation or linking that the +`configure' script does not know about. Run `./configure --help' for +details on some of the pertinent environment variables. + + You can give `configure' initial values for configuration parameters +by setting variables in the command line or in the environment. Here +is an example: + + ./configure CC=c89 CFLAGS=-O2 LIBS=-lposix + + *Note Defining Variables::, for more details. + +Compiling For Multiple Architectures +==================================== + +You can compile the package for more than one kind of computer at the +same time, by placing the object files for each architecture in their +own directory. To do this, you must use a version of `make' that +supports the `VPATH' variable, such as GNU `make'. 
`cd' to the +directory where you want the object files and executables to go and run +the `configure' script. `configure' automatically checks for the +source code in the directory that `configure' is in and in `..'. + + If you have to use a `make' that does not support the `VPATH' +variable, you have to compile the package for one architecture at a +time in the source code directory. After you have installed the +package for one architecture, use `make distclean' before reconfiguring +for another architecture. + +Installation Names +================== + +By default, `make install' installs the package's commands under +`/usr/local/bin', include files under `/usr/local/include', etc. You +can specify an installation prefix other than `/usr/local' by giving +`configure' the option `--prefix=PREFIX'. + + You can specify separate installation prefixes for +architecture-specific files and architecture-independent files. If you +pass the option `--exec-prefix=PREFIX' to `configure', the package uses +PREFIX as the prefix for installing programs and libraries. +Documentation and other data files still use the regular prefix. + + In addition, if you use an unusual directory layout you can give +options like `--bindir=DIR' to specify different values for particular +kinds of files. Run `configure --help' for a list of the directories +you can set and what kinds of files go in them. + + If the package supports it, you can cause programs to be installed +with an extra prefix or suffix on their names by giving `configure' the +option `--program-prefix=PREFIX' or `--program-suffix=SUFFIX'. + +Optional Features +================= + +Some packages pay attention to `--enable-FEATURE' options to +`configure', where FEATURE indicates an optional part of the package. +They may also pay attention to `--with-PACKAGE' options, where PACKAGE +is something like `gnu-as' or `x' (for the X Window System). 
The +`README' should mention any `--enable-' and `--with-' options that the +package recognizes. + + For packages that use the X Window System, `configure' can usually +find the X include and library files automatically, but if it doesn't, +you can use the `configure' options `--x-includes=DIR' and +`--x-libraries=DIR' to specify their locations. + +Specifying the System Type +========================== + +There may be some features `configure' cannot figure out automatically, +but needs to determine by the type of machine the package will run on. +Usually, assuming the package is built to be run on the _same_ +architectures, `configure' can figure that out, but if it prints a +message saying it cannot guess the machine type, give it the +`--build=TYPE' option. TYPE can either be a short name for the system +type, such as `sun4', or a canonical name which has the form: + + CPU-COMPANY-SYSTEM + +where SYSTEM can have one of these forms: + + OS KERNEL-OS + + See the file `config.sub' for the possible values of each field. If +`config.sub' isn't included in this package, then this package doesn't +need to know the machine type. + + If you are _building_ compiler tools for cross-compiling, you should +use the option `--target=TYPE' to select the type of system they will +produce code for. + + If you want to _use_ a cross compiler, that generates code for a +platform different from the build platform, you should specify the +"host" platform (i.e., that on which the generated programs will +eventually be run) with `--host=TYPE'. + +Sharing Defaults +================ + +If you want to set default values for `configure' scripts to share, you +can create a site shell script called `config.site' that gives default +values for variables like `CC', `cache_file', and `prefix'. +`configure' looks for `PREFIX/share/config.site' if it exists, then +`PREFIX/etc/config.site' if it exists. Or, you can set the +`CONFIG_SITE' environment variable to the location of the site script. 
+A warning: not all `configure' scripts look for a site script. + +Defining Variables +================== + +Variables not defined in a site shell script can be set in the +environment passed to `configure'. However, some packages may run +configure again during the build, and the customized values of these +variables may be lost. In order to avoid this problem, you should set +them in the `configure' command line, using `VAR=value'. For example: + + ./configure CC=/usr/local2/bin/gcc + +causes the specified `gcc' to be used as the C compiler (unless it is +overridden in the site shell script). Here is a another example: + + /bin/bash ./configure CONFIG_SHELL=/bin/bash + +Here the `CONFIG_SHELL=/bin/bash' operand causes subsequent +configuration-related scripts to be executed by `/bin/bash'. + +`configure' Invocation +====================== + +`configure' recognizes the following options to control how it operates. + +`--help' +`-h' + Print a summary of the options to `configure', and exit. + +`--version' +`-V' + Print the version of Autoconf used to generate the `configure' + script, and exit. + +`--cache-file=FILE' + Enable the cache: use and save the results of the tests in FILE, + traditionally `config.cache'. FILE defaults to `/dev/null' to + disable caching. + +`--config-cache' +`-C' + Alias for `--cache-file=config.cache'. + +`--quiet' +`--silent' +`-q' + Do not print messages saying which checks are being made. To + suppress all normal output, redirect it to `/dev/null' (any error + messages will still be shown). + +`--srcdir=DIR' + Look for the package's source code in directory DIR. Usually + `configure' can determine that directory automatically. + +`configure' also accepts some other, not widely useful, options. Run +`configure --help' for more details. + diff --git a/LICENSE b/LICENSE new file mode 100644 index 0000000..4501421 --- /dev/null +++ b/LICENSE @@ -0,0 +1,31 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ diff --git a/Makefile.am b/Makefile.am new file mode 100644 index 0000000..3872264 --- /dev/null +++ b/Makefile.am @@ -0,0 +1,62 @@ +SUBDIRS := src tools + + +DIST_SUBDIRS := src tests tools + +noinst_SCRIPTS = \ + $(wildcard contrib/scripts/*) + +EXTRA_DIST = \ + build \ + contrib \ + debian \ + install.sh \ + journal.txt \ + README.txt \ + VMA_VERSION + + +.PHONY: tests + +mydocdir = $(if $(docdir),$(docdir),${datadir}/doc/$(distdir)) +mydoc_DATA = README.txt journal.txt VMA_VERSION + +install-exec-hook: + if type systemctl >/dev/null 2>&1; then \ + cp $(top_builddir)/contrib/scripts/vma.init $(DESTDIR)$(sbindir)/vma; \ + chmod 755 $(DESTDIR)$(sbindir)/vma; \ + mkdir -p $(DESTDIR)$(prefix)/lib/systemd/system/; \ + cp $(top_builddir)/contrib/scripts/vma.service $(DESTDIR)$(prefix)/lib/systemd/system/vma.service; \ + chmod 644 $(DESTDIR)$(prefix)/lib/systemd/system/vma.service; \ + else \ + mkdir -p $(DESTDIR)/$(sysconfdir)/init.d; \ + cp $(top_builddir)/contrib/scripts/vma.init $(DESTDIR)$(sysconfdir)/init.d/vma; \ + chmod 755 $(DESTDIR)$(sysconfdir)/init.d/vma; \ + fi + +uninstall-hook: + if type systemctl >/dev/null 2>&1; then \ + rm -rf $(DESTDIR)$(sbindir)/vma; \ + rm -rf $(DESTDIR)$(prefix)/lib/systemd/system/vma.service; \ + else \ + rm -rf $(DESTDIR)$(sysconfdir)/init.d/vma; \ + fi + +install-all: install + +uninstall-all: uninstall + +tests: + $(MAKE) + $(MAKE) -C tests/gtest + $(MAKE) -C tests/latency_test + $(MAKE) -C tests/throughput_test + $(MAKE) -C tests/pps_test + +demo: + $(MAKE) + $(MAKE) -C src/vma/infra + +rpmspec: build/libvma.spec + +debian: debian/changelog diff --git a/README.md b/README.md new file mode 100644 index 0000000..6b4e1a6 --- /dev/null +++ b/README.md @@ -0,0 +1,20 @@ +[![GitHub version](https://badge.fury.io/gh/mellanox%2Flibvma.svg)](https://badge.fury.io/gh/mellanox%2Flibvma) +[![Coverity Scan Build Status](https://scan.coverity.com/projects/8025/badge.svg)](https://scan.coverity.com/projects/libvma) + +### Introduction 
+Mellanox's Messaging Accelerator (VMA) boosts performance for message-based and streaming applications such as those found in financial services market data environments and Web2.0 clusters. It allows application written over standard socket API +to run over Ethernet and/or Infiniband from user-space with full network stack bypass. + +The [VMA architecture](https://github.com/Mellanox/libvma/wiki/Architecture) page includes additional information. + +### Download +Get all download and installation information [here](https://github.com/Mellanox/libvma/wiki/Downloads). +or some quick instruction in order to [build VMA from source](https://github.com/Mellanox/libvma/wiki/Build-Instruction). + +### Technical Support +Have a question? please open a [github issue](https://github.com/Mellanox/libvma/issues) or contact support@mellanox.com. + +### Additional Information +* Refer to the libvma [README.txt](https://github.com/Mellanox/libvma/blob/master/README.txt) +* Main VMA page on Mellanox.com: http://www.mellanox.com/vma/ +* Check out the rest of the Wiki pages in this project diff --git a/README.txt b/README.txt new file mode 100644 index 0000000..c3d5a4d --- /dev/null +++ b/README.txt @@ -0,0 +1,1178 @@ +Update: 12 Aug 2018 + +Introduction +============ + +Mellanox's Messaging Accelerator (VMA) is dynamically linked user space Linux +library for transparently enhancing the performance of networking-heavy +applications. It boosts performance for message-based and streaming applications +such as those found in financial services market data environments and Web2.0 +clusters. +It allows application written over standard socket API to run over Infiniband +and/or Ethernet from user-space with full network stack bypass. 
+The result is a reduction in latency by as much as 300%, +an increase in application throughput by as much as 200%, +higher packets rates and better CPU utilization as compared to applications +running on standard Ethernet or InfiniBand interconnect networks. + +Build libvma from source +======================== + +Prerequisites: +1. MLNX_OFED as described in the "Pre Installation" step of next section. +2. Or, upstream kernel and userspace verbs libraries (libibverbs, libmlx4, libmlx5, librdmacm) +3. Autoconf, Automake, libtool, unzip, patch, libnl-devel (netlink 1 or 3) + +Build: +1. ./autogen.sh +2. ./configure --with-ofed=/usr --prefix=/usr --libdir=/usr/lib64 --includedir=/usr/include --docdir=/usr/share/doc/libvma --sysconfdir=/etc +3. make + +You will find libvma.so in path_to_vma_dir/src/vma/.libs/libvma.so. + +Install: +1. sudo make install + +Tip: +./install.sh can do the build and install steps for you. + + +Install libvma from rpm or debian +================================= + +Pre Installation: +1. If possible, install latest MLNX_OFED with the --vma option. + This will also install libvma, and you can skip to "Running" step. +2. If installing over existing MLNX_OFED, add the following to + /etc/modprobe.d/mlnx.conf: + options ib_uverbs disable_raw_qp_enforcement=1 + options mlx4_core fast_drop=1 + options mlx4_core log_num_mgm_entry_size=-1 + And restart openibd or rdma service depending on your system configuration + +Installing: +Install the package as any other rpm or debian package [rpm -i libvma.X.Y.Z-R.rpm]. 
+The installation copies the VMA library to: /usr/lib[64]/libvma.so +The VMA monitoring utility is installed at: /usr/bin/vma_stat +The VMA extra socket API is located at: /usr/include/mellanox/vma_extra.h +The installation location of the README.txt and version information file +(VMA_VERSION), are as follows: +- Redhat: /usr/share/doc/libvma-X.Y.Z-R/ +- SuSE: /usr/share/doc/packages/libvma-X.Y.Z-R/ + +Post Installation: +When working over Infiniband, we recommend to manually add persistence +for the following system parameters: +1. Force IPoIB to work in 'datagram' mode (disabling IPoIB 'connected' mode) + Modify "SET_IPOIB_CM=no" in file "/etc/infiniband/openib.conf" +2. Force IGMP to work in V2 on IB interfaces + Add "net.ipv4.conf.all.force_igmp_version = 2" in file "/etc/sysctl.conf" + +Upgrading: +Use rpm update procedure: # rpm -U libvma.X.Y.Z-R.rpm +You can upgrade by uninstalling (rpm -e) the previously installed package +before starting to install the new VMA rpm. + +Uninstalling: +When uninstalling remember to uninstall (rpm -e) the package before you +uninstall ofed. + +Running: +Set the environment variable LD_PRELOAD to libvma.so and run your application. +Example: # LD_PRELOAD=libvma.so iperf -uc 224.22.22.22 -t 5 + + + +Configuration Values +==================== + +On default startup the VMA library logs to stderr the VMA version, the modified +configuration parameters being used and their values. +Please notice that except VMA_TRACELEVEL, VMA logs just those parameters whose value != default. 
+ +Example: + VMA INFO: --------------------------------------------------------------------------- + VMA INFO: VMA_VERSION: 8.6.9-0 Development Snapshot built on Jun 26 2018 14:36:59 + VMA INFO: Git: 41d72e46ba99badb1d7be73ac42af8b7ee6879e8 + VMA INFO: Cmd Line: sockperf sr + VMA INFO: Current Time: Tue Jun 26 14:49:19 2018 + VMA INFO: Pid: 20439 + VMA INFO: OFED Version: MLNX_OFED_LINUX-4.4-0.1.8.0: + VMA INFO: Architecture: x86_64 + VMA INFO: Node: r-aa-apollo03.mtr.labs.mlnx + VMA INFO: --------------------------------------------------------------------------- + VMA INFO: Log Level DETAILS [VMA_TRACELEVEL] + VMA DETAILS: Log Details 0 [VMA_LOG_DETAILS] + VMA DETAILS: Log Colors Enabled [VMA_LOG_COLORS] + VMA DETAILS: Log File [VMA_LOG_FILE] + VMA DETAILS: Stats File [VMA_STATS_FILE] + VMA DETAILS: Stats shared memory directory /tmp/ [VMA_STATS_SHMEM_DIR] + VMA DETAILS: VMAD output directory /tmp/vma/ [VMA_VMAD_NOTIFY_DIR] + VMA DETAILS: Stats FD Num (max) 100 [VMA_STATS_FD_NUM] + VMA DETAILS: Conf File /etc/libvma.conf [VMA_CONFIG_FILE] + VMA DETAILS: Application ID VMA_DEFAULT_APPLICATION_ID [VMA_APPLICATION_ID] + VMA DETAILS: Polling CPU idle usage Disabled [VMA_CPU_USAGE_STATS] + VMA DETAILS: SigIntr Ctrl-C Handle Disabled [VMA_HANDLE_SIGINTR] + VMA DETAILS: SegFault Backtrace Disabled [VMA_HANDLE_SIGSEGV] + VMA DETAILS: Ring allocation logic TX 0 (Ring per interface) [VMA_RING_ALLOCATION_LOGIC_TX] + VMA DETAILS: Ring allocation logic RX 0 (Ring per interface) [VMA_RING_ALLOCATION_LOGIC_RX] + VMA DETAILS: Ring migration ratio TX 100 [VMA_RING_MIGRATION_RATIO_TX] + VMA DETAILS: Ring migration ratio RX 100 [VMA_RING_MIGRATION_RATIO_RX] + VMA DETAILS: Ring limit per interface 0 (no limit) [VMA_RING_LIMIT_PER_INTERFACE] + VMA DETAILS: Ring On Device Memory TX 0 [VMA_RING_DEV_MEM_TX] + VMA DETAILS: TCP max syn rate 0 (no limit) [VMA_TCP_MAX_SYN_RATE] + VMA DETAILS: Tx Mem Segs TCP 1000000 [VMA_TX_SEGS_TCP] + VMA DETAILS: Tx Mem Bufs 200000 [VMA_TX_BUFS] + VMA 
DETAILS: Tx Mem Buf size 0 [VMA_TX_BUF_SIZE] + VMA DETAILS: Tx QP WRE 2048 [VMA_TX_WRE] + VMA DETAILS: Tx QP WRE Batching 64 [VMA_TX_WRE_BATCHING] + VMA DETAILS: Tx Max QP INLINE 204 [VMA_TX_MAX_INLINE] + VMA DETAILS: Tx MC Loopback Enabled [VMA_TX_MC_LOOPBACK] + VMA DETAILS: Tx non-blocked eagains Disabled [VMA_TX_NONBLOCKED_EAGAINS] + VMA DETAILS: Tx Prefetch Bytes 256 [VMA_TX_PREFETCH_BYTES] + VMA DETAILS: Rx Mem Bufs 200000 [VMA_RX_BUFS] + VMA DETAILS: Rx QP WRE 16000 [VMA_RX_WRE] + VMA DETAILS: Rx QP WRE Batching 64 [VMA_RX_WRE_BATCHING] + VMA DETAILS: Rx Byte Min Limit 65536 [VMA_RX_BYTES_MIN] + VMA DETAILS: Rx Poll Loops 100000 [VMA_RX_POLL] + VMA DETAILS: Rx Poll Init Loops 0 [VMA_RX_POLL_INIT] + VMA DETAILS: Rx UDP Poll OS Ratio 100 [VMA_RX_UDP_POLL_OS_RATIO] + VMA DETAILS: HW TS Conversion 3 [VMA_HW_TS_CONVERSION] + VMA DETAILS: Rx Poll Yield Disabled [VMA_RX_POLL_YIELD] + VMA DETAILS: Rx Prefetch Bytes 256 [VMA_RX_PREFETCH_BYTES] + VMA DETAILS: Rx Prefetch Bytes Before Poll 0 [VMA_RX_PREFETCH_BYTES_BEFORE_POLL] + VMA DETAILS: Rx CQ Drain Rate Disabled [VMA_RX_CQ_DRAIN_RATE_NSEC] + VMA DETAILS: GRO max streams 32 [VMA_GRO_STREAMS_MAX] + VMA DETAILS: TCP 3T rules Disabled [VMA_TCP_3T_RULES] + VMA DETAILS: ETH MC L2 only rules Disabled [VMA_ETH_MC_L2_ONLY_RULES] + VMA DETAILS: Force Flowtag for MC Disabled [VMA_MC_FORCE_FLOWTAG] + VMA DETAILS: Select Poll (usec) 100000 [VMA_SELECT_POLL] + VMA DETAILS: Select Poll OS Force Disabled [VMA_SELECT_POLL_OS_FORCE] + VMA DETAILS: Select Poll OS Ratio 10 [VMA_SELECT_POLL_OS_RATIO] + VMA DETAILS: Select Skip OS 4 [VMA_SELECT_SKIP_OS] + VMA DETAILS: CQ Drain Interval (msec) 10 [VMA_PROGRESS_ENGINE_INTERVAL] + VMA DETAILS: CQ Drain WCE (max) 10000 [VMA_PROGRESS_ENGINE_WCE_MAX] + VMA DETAILS: CQ Interrupts Moderation Enabled [VMA_CQ_MODERATION_ENABLE] + VMA DETAILS: CQ Moderation Count 48 [VMA_CQ_MODERATION_COUNT] + VMA DETAILS: CQ Moderation Period (usec) 50 [VMA_CQ_MODERATION_PERIOD_USEC] + VMA DETAILS: CQ AIM Max 
Count 560 [VMA_CQ_AIM_MAX_COUNT] + VMA DETAILS: CQ AIM Max Period (usec) 250 [VMA_CQ_AIM_MAX_PERIOD_USEC] + VMA DETAILS: CQ AIM Interval (msec) 250 [VMA_CQ_AIM_INTERVAL_MSEC] + VMA DETAILS: CQ AIM Interrupts Rate (per sec) 5000 [VMA_CQ_AIM_INTERRUPTS_RATE_PER_SEC] + VMA DETAILS: CQ Poll Batch (max) 16 [VMA_CQ_POLL_BATCH_MAX] + VMA DETAILS: CQ Keeps QP Full Enabled [VMA_CQ_KEEP_QP_FULL] + VMA DETAILS: QP Compensation Level 256 [VMA_QP_COMPENSATION_LEVEL] + VMA DETAILS: Offloaded Sockets Enabled [VMA_OFFLOADED_SOCKETS] + VMA DETAILS: Timer Resolution (msec) 10 [VMA_TIMER_RESOLUTION_MSEC] + VMA DETAILS: TCP Timer Resolution (msec) 100 [VMA_TCP_TIMER_RESOLUTION_MSEC] + VMA DETAILS: TCP control thread 0 (Disabled) [VMA_TCP_CTL_THREAD] + VMA DETAILS: TCP timestamp option 0 [VMA_TCP_TIMESTAMP_OPTION] + VMA DETAILS: TCP nodelay 0 [VMA_TCP_NODELAY] + VMA DETAILS: TCP quickack 0 [VMA_TCP_QUICKACK] + VMA DETAILS: Exception handling mode -1(just log debug message) [VMA_EXCEPTION_HANDLING] + VMA DETAILS: Avoid sys-calls on tcp fd Disabled [VMA_AVOID_SYS_CALLS_ON_TCP_FD] + VMA DETAILS: Allow privileged sock opt Enabled [VMA_ALLOW_PRIVILEGED_SOCK_OPT] + VMA DETAILS: Delay after join (msec) 0 [VMA_WAIT_AFTER_JOIN_MSEC] + VMA DETAILS: Internal Thread Affinity -1 [VMA_INTERNAL_THREAD_AFFINITY] + VMA DETAILS: Internal Thread Cpuset [VMA_INTERNAL_THREAD_CPUSET] + VMA DETAILS: Internal Thread Arm CQ Disabled [VMA_INTERNAL_THREAD_ARM_CQ] + VMA DETAILS: Internal Thread TCP Handling 0 (deferred) [VMA_INTERNAL_THREAD_TCP_TIMER_HANDLING] + VMA DETAILS: Thread mode Multi spin lock [VMA_THREAD_MODE] + VMA DETAILS: Buffer batching mode 1 (Batch and reclaim buffers) [VMA_BUFFER_BATCHING_MODE] + VMA DETAILS: Mem Allocate type 1 (Contig Pages) [VMA_MEM_ALLOC_TYPE] + VMA DETAILS: Num of UC ARPs 3 [VMA_NEIGH_UC_ARP_QUATA] + VMA DETAILS: UC ARP delay (msec) 10000 [VMA_NEIGH_UC_ARP_DELAY_MSEC] + VMA DETAILS: Num of neigh restart retries 1 [VMA_NEIGH_NUM_ERR_RETRIES] + VMA DETAILS: IPOIB support 
Enabled [VMA_IPOIB] + VMA DETAILS: SocketXtreme Disabled [VMA_SOCKETXTREME] + VMA DETAILS: BF (Blue Flame) Enabled [VMA_BF] + VMA DETAILS: fork() support Enabled [VMA_FORK] + VMA DETAILS: close on dup2() Enabled [VMA_CLOSE_ON_DUP2] + VMA DETAILS: MTU 0 (follow actual MTU) [VMA_MTU] + VMA DETAILS: MSS 0 (follow VMA_MTU) [VMA_MSS] + VMA DETAILS: TCP CC Algorithm 0 (LWIP) [VMA_TCP_CC_ALGO] + VMA DETAILS: Polling Rx on Tx TCP Disabled [VMA_RX_POLL_ON_TX_TCP] + VMA DETAILS: Trig dummy send getsockname() Disabled [VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME] + VMA INFO: --------------------------------------------------------------------------- + +VMA_TRACELEVEL +Logging level the VMA library will be using. Default is info +Example: # VMA_TRACELEVEL=debug + +none + Print no log at all +panic + Panic level logging, this would generally cause fatal behavior and an exception + will be thrown by the VMA library. Typically, this is caused by memory + allocation problems. This level is rarely used. +error + Runtime ERRORs in the VMA. + Typically, these can provide insight for the developer of wrong internal + logic like: Errors from underlying OS or Infiniband verbs calls. internal + double mapping/unmapping of objects. +warn + Runtime warning that do not disrupt the workflow of the application but + might warn of a problem in the setup or the overall setup configuration. + Typically, these can be address resolution failure (due to wrong routing + setup configuration), corrupted ip packets in the receive path or + unsupported functions requested by the user application +info + General information passed to the user of the application. Bring up + configuration logging or some general info to help the user better + use the VMA library +details + Complete VMA's configuration information. + Very high level insight of some of the critical decisions done in VMA. +debug + High level insight to the operations done in the VMA. 
All socket API calls + are logged and internal high level control channels log there activity. +fine + Low level run time logging of activity. This logging level includes basic + Tx and Rx logging in the fast path and it will lower application + performance. It is recommended to use this level with VMA_LOG_FILE parameter. +finer + Very low level run time logging of activity! + This logging level will DRASTICALLY lower application performance. + It is recommended to use this level with VMA_LOG_FILE parameter. +all + today this level is identical to finer + +VMA_LOG_DETAILS +Add details on each log line. +0 = Basic log line +1 = ThreadId +2 = ProcessId + ThreadId +3 = Time + ProcessId + ThreadId [Time is in milli-seconds from start of process] +Default value is 0 + +VMA_LOG_COLORS +Use color scheme when logging. Red for errors, purple for warnings and dim for +low level debugs. VMA_LOG_COLORS is automatically disabled when logging is direct +to a non terminal device (e.g. VMA_LOG_FILE is configured). +Default value is 1 (Enabled) + +VMA_LOG_FILE +Redirect all VMA logging to a specific user defined file. +This is very useful when raising the VMA_TRACELEVEL +VMA will replace a single '%d' appearing in the log file name with the pid of +the process loaded with VMA. This can help in running multiple instances of VMA +each with it's own log file name. +Example: VMA_LOG_FILE=/tmp/vma_log.txt + +VMA_SPEC +VMA predefined specification profiles. + +latency + Optimized for use cases that are keen on latency. i.e. Ping-Pong tests. 
+ + Latency SPEC changes the following default configuration + VMA_RING_DEV_MEM_TX = 16384 (default: 0) + VMA_TX_WRE = 256 (default: 2048) + VMA_TX_WRE_BATCHING = 4 (default: 64) + VMA_RX_WRE = 256 (default: 16000) + VMA_RX_WRE_BATCHING = 4 (default: 64) + VMA_RX_POLL = -1 (default: 100000) + VMA_TSO = Disable (default: Enable) + VMA_RX_PREFETCH_BYTES_BEFORE_POLL = 256 (default: 0) + VMA_GRO_STREAMS_MAX = 0 (default: 32) + VMA_SELECT_POLL = -1 (default: 100000) + VMA_SELECT_POLL_OS_FORCE = Enable (default: Disabled) + VMA_SELECT_POLL_OS_RATIO = 1 (default: 10) + VMA_SELECT_SKIP_OS = 1 (default: 4) + VMA_PROGRESS_ENGINE_INTERVAL = 100 (default: 10) + VMA_CQ_MODERATION_ENABLE = Disable (default: Enabled) + VMA_CQ_AIM_MAX_COUNT = 128 (default: 560) + VMA_CQ_AIM_INTERVAL_MSEC = Disable (default: 250) + VMA_CQ_KEEP_QP_FULL = Disable (default: Enable) + VMA_TCP_NODELAY = Enable (default: Disable) + VMA_AVOID_SYS_CALLS_ON_TCP_FD = Enable (default: Disable) + VMA_INTERNAL_THREAD_AFFINITY = 0 (default: -1) + VMA_THREAD_MODE = Single (default: Multi spin lock) + VMA_MEM_ALLOC_TYPE = 2 (default: 1 (Contig Pages)) + + Example: VMA_SPEC=latency + +multi_ring_latency + Optimized for use cases that are keen on latency where two applications communicate using send-only and receive-only TCP sockets + + Multi ring latency SPEC changes the following default configuration + VMA_MEM_ALLOC_TYPE = 2 (default: 1 (Contig Pages)) + VMA_SELECT_POLL = -1 (default: 100000) + VMA_RX_POLL = -1 (default: 100000) + VMA_RING_ALLOCATION_LOGIC_TX = 20 (default: Ring per interface) + VMA_RING_ALLOCATION_LOGIC_RX = 20 (default: Ring per interface) + VMA_SELECT_POLL_OS_RATIO = 0 (default: 10) + VMA_SELECT_SKIP_OS = 0 (default: 4) + VMA_RX_POLL_ON_TX_TCP = true (default: false) + VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME = true (default: false) + + Example: VMA_SPEC=multi_ring_latency + +VMA_STATS_FILE +Redirect socket statistics to a specific user defined file. 
+VMA will dump each socket's statistics into a file when closing the socket.
+Example: VMA_STATS_FILE=/tmp/stats
+
+VMA_STATS_SHMEM_DIR
+Set the directory path for VMA to create the shared memory files for vma_stats.
+No files will be created when setting this value to empty string "".
+Default value is /tmp/
+
+VMA_VMAD_NOTIFY_DIR
+Set the directory path for VMA to write files used by vmad.
+Default value is /tmp/vma/
+Note: when used, vmad must be run with --notify-dir pointing to the same folder.
+
+VMA_STATS_FD_NUM
+Max number of sockets monitored by VMA statistic mechanism.
+Value range is 0 to 1024.
+Default value is 100
+
+VMA_CONFIG_FILE
+Sets the full path to the VMA configuration file.
+Default value is: /etc/libvma.conf
+Example: VMA_CONFIG_FILE=/tmp/libvma.conf
+
+VMA_APPLICATION_ID
+Specify a group of rules from libvma.conf for VMA to apply.
+Example: 'VMA_APPLICATION_ID=iperf_server'.
+Default is "VMA_DEFAULT_APPLICATION_ID" (match only the '*' group rule)
+
+VMA_CPU_USAGE_STATS
+Calculate VMA CPU usage during polling HW loops.
+This information is available through VMA stats utility.
+Default value is 0 (Disabled)
+
+VMA_HANDLE_SIGINTR
+When Enabled, VMA handler will be called when interrupt signal is sent to the process.
+VMA will also call the application's handler if it exists.
+Value range is 0 to 1
+Default value is 0 (Disabled)
+
+VMA_HANDLE_SIGSEGV
+When Enabled, print backtrace if segmentation fault happens.
+Value range is 0 to 1
+Default value is 0 (Disabled)
+
+VMA_TX_SEGS_TCP
+Number of TCP LWIP segments allocation for each VMA process.
+Default value is 1000000
+
+VMA_TX_BUFS
+Number of global Tx data buffer elements allocation.
+Default value is 200000
+
+VMA_TX_BUF_SIZE
+Size of Tx data buffer elements allocation.
+Cannot be less than MTU (Maximum Transfer Unit) or greater than 0xFF00.
+Default value is calculated based on VMA_MTU and VMA_MSS.
+
+VMA_TX_WRE
+Number of Work Request Elements allocated in all transmit QP's.
+The number of QP's can change according to the number of network offloaded
+interfaces.
+Default value is 2048
+
+VMA_TX_WRE_BATCHING
+The number of Tx Work Request Elements used until a completion signal is requested.
+Tuning this parameter allows a better control of the jitter encountered from the
+Tx CQE handling. Setting a high batching value results in high PPS and lower
+average latency. Setting a low batching value results in lower latency std-dev.
+Value range is 1-64
+Default value is 64
+
+VMA_TX_MAX_INLINE
+Max send inline data set for QP.
+Data copied into the INLINE space is at least 32 bytes of headers and
+the rest can be user datagram payload.
+VMA_TX_MAX_INLINE=0 disables INLINEing on the Tx transmit path.
+In older releases this parameter was called: VMA_MAX_INLINE
+Default VMA_TX_MAX_INLINE is 204
+
+VMA_TX_MC_LOOPBACK
+This parameter sets the initial value used by VMA internally to control the
+multicast loopback packets behavior during transmission.
+An application that calls setsockopt() with IP_MULTICAST_LOOP will run over
+the initial value set by this parameter.
+Read more in 'Multicast loopback behavior' in notes section below
+Default value is 1 (Enabled)
+
+VMA_TX_NONBLOCKED_EAGAINS
+Return value 'OK' on all send operations done on non-blocked UDP sockets. This
+is the OS default behavior. The datagram sent is silently dropped inside VMA
+or the network stack.
+When set Enabled (set to 1), VMA will return with error EAGAIN if it was unable
+to accomplish the send operation and the datagram was dropped.
+In both cases a dropped Tx statistical counter is incremented.
+Default value is 0 (Disabled)
+
+VMA_TX_PREFETCH_BYTES
+Accelerate offloaded send operation by optimizing cache. Different values
+give optimized send rate on different machines. We recommend you tune this
+for your specific hardware.
+Value range is 0 to MTU size +Disable with a value of 0 +Default value is 256 bytes + +VMA_RING_ALLOCATION_LOGIC_TX +VMA_RING_ALLOCATION_LOGIC_RX +Ring allocation logic is used to separate the traffic to different rings. +By default all sockets use the same ring for both RX and TX over the same interface. +Even when specifying the logic to be per socket or thread, for different interfaces +we use different rings. This is useful when tuning for a multi-threaded application +and aiming for HW resource separation. +Warning: This feature might hurt performance for applications which their main +processing loop is based in select() and/or poll(). +The logic options are: +0 - Ring per interface +1 - Ring per ip address (using ip address) +10 - Ring per socket (using socket fd as separator) +20 - Ring per thread (using the id of the thread in which the socket was created) +30 - Ring per core (using cpu id) +31 - Ring per core - attach threads : attach each thread to a cpu core +Default value is 0 + +VMA_RING_MIGRATION_RATIO_TX +VMA_RING_MIGRATION_RATIO_RX +Ring migration ratio is used with the "ring per thread" logic in order to decide when +it is beneficial to replace the socket's ring with the ring allocated for the current thread. +Each VMA_RING_MIGRATION_RATIO iterations (of accessing the ring) we check the current +thread ID and see if our ring is matching the current thread. +If not, we consider ring migration. If we keep accessing the ring from the same thread for some +iterations, we migrate the socket to this thread ring. +Use a value of -1 in order to disable migration. +Default value is 100 + +VMA_RING_LIMIT_PER_INTERFACE +Limit the number of rings that can be allocated per interface. +For example, in ring allocation per socket logic, if the number of sockets using +the same interface is larger than the limit, then several sockets will be sharing the +same ring. 
+[Note:VMA_RX_BUFS might need to be adjusted in order to have enough buffers for all +rings in the system. Each ring consume VMA_RX_WRE buffers.] +Use a value of 0 for unlimited number of rings. +Default value is 0 (no limit) + +VMA_RING_DEV_MEM_TX +VMA can use the On Device Memory to store the egress packet if it does not fit into +the BF inline buffer. This improves application egress latency by reducing PCI transactions. +Using VMA_RING_DEV_MEM_TX, the user can set the amount of On Device Memory buffer allocated +for each TX ring. +The total size of the On Device Memory is limited to 256k for a single port HCA and to +128k for dual port HCA. +Default value is 0 + +VMA_RX_BUFS +Number Rx data buffer elements allocation for the processes. These data buffers +may be used by all QPs on all HCAs +Default value is 200000 + +VMA_RX_WRE +Number of Work Request Elements allocated in all receive QP's. +Default value is 16000 + +VMA_RX_WRE_BATCHING +Number of Work Request Elements and RX buffers to batch before recycling. +Batching decrease latency mean, but might increase latency STD. +Value range is 1-1024. +Default value is 64 + +VMA_RX_BYTES_MIN +Minimum value in bytes that will be used per socket by VMA when applications +call to setsockopt(SO_RCVBUF). If application tries to set a smaller value then +configured in VMA_RX_BYTES_MIN, VMA will force this minimum limit value on the +socket.VMA offloaded socket's receive max limit of ready bytes count. If the +application does not drain a sockets and the byte limit is reached, new +received datagrams will be dropped. +Monitor of the applications socket's usage of current, max and dropped bytes +and packet counters can be done with vma_stats. +Default value is 65536 + +VMA_RX_POLL +The number of times to poll on Rx path for ready packets before going to sleep +(wait for interrupt in blocked mode) or return -1 (in non-blocked mode). 
+This Rx polling is done when the application is working with direct blocked +calls to read(), recv(), recvfrom() & recvmsg(). +When Rx path has successful poll hits (see performance monitoring) the latency +is improved dramatically. This comes on account of CPU utilization. +Value range is -1, 0 to 100,000,000 +Where value of -1 is used for infinite polling +Default value is 100000 + +VMA_TSO +With Segmentation Offload, or TCP Large Send, TCP can pass a buffer to be +transmitted that is bigger than the maximum transmission unit (MTU) supported +by the medium. Intelligent adapters implement large sends by using the +prototype TCP and IP headers of the incoming send buffer to carve out segments +of required size. Copying the prototype header and options, then calculating +the sequence number and checksum fields creates TCP segment headers. +Expected benefits: Throughput increase and CPU unload. +Default value is 1 (Enable) + +VMA_RX_POLL_INIT +VMA maps all UDP sockets as potential offloaded capable. Only after the +ADD_MEMBERSHIP does the offload start to work and the CQ polling kicks in VMA. +This parameter control the polling count during this transition phase where the +socket is a UDP unicast socket and no multicast addresses where added to it. +Once the first ADD_MEMBERSHIP is called the above VMA_RX_POLL takes effect. +Value range is similar to the above VMA_RX_POLL +Default value is 0 + +VMA_RX_UDP_POLL_OS_RATIO +The above parameter will define the ratio between VMA CQ poll and OS FD poll. +This will result in a single poll of the not-offloaded sockets every +VMA_RX_UDP_POLL_OS_RATIO offloaded socket (CQ) polls. No matter if the CQ poll +was a hit or miss. No matter if the socket is blocking or non-blocking. +When disabled, only offloaded sockets are polled. 
+This parameter replaces the two old parameters: VMA_RX_POLL_OS_RATIO and +VMA_RX_SKIP_OS +Disable with 0 +Default value is 100 + +VMA_HW_TS_CONVERSION +The above parameter defines the time stamp conversion method. +The value of VMA_HW_TS_CONVERSION is determined by all devices - i.e if the hardware of +one device does not support the conversion, then it will be disabled for the other devices. +Options = [0,1,2,3,4]: +0 = Disabled +1 = Raw-HW time - only convert the time stamp to seconds.nano_seconds time + units (or disable if hardware does not supports). +2 = Best possible - Raw-HW or system time - Sync to system time, then Raw hardware time - + disable if none of them are supported by hardware. +3 = Sync to system time - convert the time stamp to seconds.nano_seconds time units. + comparable to receive software timestamp. + disable if hardware does not supports. +4 = PTP Sync - convert the time stamp to seconds.nano_seconds time units. + in case it is not supported - will apply option 3 (or disable + if hardware does not supports). +Default value: 3 + +VMA_RX_POLL_YIELD +When an application is running with multiple threads, on a limited number of +cores, there is a need for each thread polling inside the VMA (read, readv, +recv & recvfrom) to yield the CPU to other polling thread so not to starve +them from processing incoming packets. +Default value is 0 (Disable) + +VMA_RX_PREFETCH_BYTES +Size of receive buffer to prefetch into cache while processing ingress packets. +The default is a single cache line of 64 bytes which should be at least 32 +bytes to cover the IPoIB+IP+UDP headers and a small part of the users payload. +Increasing this can help improve performance for larger user payload sizes. +Value range is 32 bytes to MTU size +Default value is 256 bytes + +VMA_RX_PREFETCH_BYTES_BEFORE_POLL +Same as the above VMA_RX_PREFETCH_BYTES, only that prefetch is done before +actually getting the packets. +This benefit low pps traffic latency. +Disable with 0. 
+Default value is 0 + +VMA_RX_CQ_DRAIN_RATE_NSEC +Socket's receive path CQ drain logic rate control. +When disabled (Default) the socket's receive path will first try to return a +ready packet from the socket's receive ready packet queue. Only if that queue +is empty will the socket check the CQ for ready completions for processing. +When enabled, even if the socket's receive ready packet queue is not empty it +will still check the CQ for ready completions for processing. This CQ polling +rate is controls in nano-second resolution to prevent CPU consumption because +of over CQ polling. This will enable a more 'real time' monitoring of the +sockets ready packet queue. +Recommended value is 100-5000 (nsec) +Default value is 0 (Disable) + +VMA_RX_POLL_ON_TX_TCP +This parameter enables/disables TCP RX polling during TCP TX operation for faster +TCP ACK reception. +Default: 0 (Disable) + +VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME +This parameter triggers dummy packet send from getsockname(), this +will warm up the caches. +For more information regarding dummy send, see VMA user manual document. +Default: 0 (Disable) + +VMA_GRO_STREAMS_MAX +Control the number of TCP streams to perform GRO (generic receive offload) simultaneously. +Disable GRO with a value of 0. +Default value is 32 + +VMA_TCP_3T_RULES +Use only 3 tuple rules for TCP, instead of using 5 tuple rules. +This can improve performance for a server with listen socket which accept many +connections. + +VMA_ETH_MC_L2_ONLY_RULES +Use only L2 rules for Ethernet Multicast. +All loopback traffic will be handled by VMA instead of OS. + +VMA_MC_FORCE_FLOWTAG +Forces the use of flow tag acceleration for multicast flows where setsockopt(SO_REUSEADDR) is +set. +Applicable if there are no other sockets opened for the same flow in system. + +VMA_SELECT_POLL +The duration in micro-seconds (usec) in which to poll the hardware on Rx path before +going to sleep (pending an interrupt blocking on OS select(), poll() or epoll_wait(). 
+The max polling duration will be limited by the timeout the user is using when
+calling select(), poll() or epoll_wait().
+When select(), poll() or epoll_wait() path has successful receive poll hits
+(see performance monitoring) the latency is improved dramatically. This comes
+on account of CPU utilization.
+Value range is -1, 0 to 100,000,000
+Where value of -1 is used for infinite polling
+Where value of 0 is used for no polling (interrupt driven)
+Default value is 100000
+
+VMA_SELECT_POLL_OS_FORCE
+This flag forces to poll the OS file descriptors while user thread calls
+select(), poll() or epoll_wait() even when no offloaded sockets are mapped.
+Enabling this flag causes VMA to set VMA_SELECT_POLL_OS_RATIO and
+VMA_SELECT_SKIP_OS to 1. This will result in VMA_SELECT_POLL number of
+times VMA will poll the OS file descriptors, alongside with offloaded
+sockets, if such sockets exist.
+Note that setting VMA_SELECT_SKIP_OS and VMA_SELECT_POLL_OS_RATIO
+directly will override the values these parameters get while
+VMA_SELECT_POLL_OS_FORCE is enabled.
+Enable with 1
+Disable with 0
+Default value is 0
+
+VMA_SELECT_POLL_OS_RATIO
+This will enable polling of the OS file descriptors while user thread calls
+select() or poll() and the VMA is busy in the offloaded sockets polling loop.
+This will result in a single poll of the not-offloaded sockets every
+VMA_SELECT_POLL_OS_RATIO offloaded sockets (CQ) polls.
+When disabled, only offloaded sockets are polled.
+(See VMA_SELECT_POLL for more info)
+Disable with 0
+Default value is 10
+
+VMA_SELECT_SKIP_OS
+Similar to VMA_RX_SKIP_OS, but in select() or poll() this will force the VMA
+to check the non offloaded fd even though an offloaded socket has ready
+packets found while polling.
+Default value is 4
+
+VMA_PROGRESS_ENGINE_INTERVAL
+VMA Internal thread safe check that the CQ is drained at least once
+every N milliseconds.
+This mechanism allows VMA to progress the TCP stack even when the application
+doesn't access its socket (so it doesn't provide a context to VMA).
+If CQ was already drained by the application receive
+socket API calls then this thread goes back to sleep without any processing.
+Disable with 0
+Default value is 10 msec
+
+VMA_PROGRESS_ENGINE_WCE_MAX
+Each time VMA's internal thread starts its CQ draining, it will stop when
+it reaches this max value.
+The application is not limited by this value in the number of CQ elements
+it can process from calling any of the receive path socket APIs.
+Default value is 10000
+
+VMA_CQ_MODERATION_ENABLE
+Enable CQ interrupt moderation.
+Default value is 1 (Enabled)
+
+VMA_CQ_MODERATION_COUNT
+Number of packets to hold before generating interrupt.
+Default value is 48
+
+VMA_CQ_MODERATION_PERIOD_USEC
+Period in micro-seconds for holding the packet before generating interrupt.
+Default value is 50
+
+VMA_CQ_AIM_MAX_COUNT
+Maximum count value to use in the adaptive interrupt moderation algorithm.
+Default value is 560
+
+VMA_CQ_AIM_MAX_PERIOD_USEC
+Maximum period value to use in the adaptive interrupt moderation algorithm.
+Default value is 250
+
+VMA_CQ_AIM_INTERVAL_MSEC
+Frequency of interrupt moderation adaptation.
+Interval in milliseconds between adaptation attempts.
+Use value of 0 to disable adaptive interrupt moderation.
+Default value is 250
+
+VMA_CQ_AIM_INTERRUPTS_RATE_PER_SEC
+Desired interrupts rate per second for each ring (CQ).
+The count and period parameters for CQ moderation will change automatically
+to achieve the desired interrupt rate for the current traffic rate.
+Default value is 5000
+
+VMA_CQ_POLL_BATCH_MAX
+Max size of the array while polling the CQs in the VMA
+Default value is 16
+
+VMA_CQ_KEEP_QP_FULL
+If disabled (default), CQ will not try to compensate for each poll on the
+receive path. It will use a "debt" to remember how many WRE miss from each QP
+to fill it when buffers become available.
+If enabled, CQ will try to compensate QP for each polled receive completion. If
+buffers are short it will re-post a recently completed buffer. This causes a packet
+drop and will be monitored in the vma_stats.
+Default value is 1 (Enabled)
+
+VMA_QP_COMPENSATION_LEVEL
+Number of spare receive buffer CQ holds to allow for filling up QP while full
+receive buffers are being processed inside VMA.
+Default value is 256 buffers
+
+VMA_OFFLOADED_SOCKETS
+Create all sockets as offloaded/not-offloaded by default.
+Value of 1 is for offloaded, 0 for not-offloaded.
+Default value is 1 (Enabled)
+
+VMA_TIMER_RESOLUTION_MSEC
+Control VMA internal thread wakeup timer resolution (in milliseconds)
+Default value is 10 (milliseconds)
+
+VMA_TCP_TIMER_RESOLUTION_MSEC
+Control VMA internal TCP timer resolution (fast timer) (in milliseconds).
+Minimum value is the internal thread wakeup timer resolution (VMA_TIMER_RESOLUTION_MSEC).
+Default value is 100 (milliseconds)
+
+VMA_TCP_CTL_THREAD
+Do all TCP control flows in the internal thread.
+This feature should be kept disabled if using blocking poll/select (epoll is OK).
+Use value of 0 to disable.
+Use value of 1 for waking up the thread when there is work to do.
+Use value of 2 for waiting for thread timer to expire.
+Default value is disabled
+
+VMA_TCP_TIMESTAMP_OPTION
+If set, enable TCP timestamp option.
+Currently, LWIP is not supporting RTTM and PAWS mechanisms.
+See RFC1323 for info.
+Use value of 0 to disable.
+Use value of 1 for enable.
+Use value of 2 for OS follow up.
+Disabled by default (enabling causes a slight performance degradation).
+
+VMA_TCP_NODELAY
+If set, disable the Nagle algorithm option for each TCP socket during initialization.
+This means that TCP segments are always sent as soon as possible, even if there is
+only a small amount of data.
+For more information on TCP_NODELAY flag refer to TCP manual page.
+Valid Values are:
+Use value of 0 to disable.
+Use value of 1 for enable.
+Default value is Disabled.
+
+VMA_TCP_QUICKACK
+If set, disable delayed acknowledge ability.
+This means that TCP responds after every packet.
+For more information on TCP_QUICKACK flag refer to TCP manual page.
+Valid Values are:
+Use value of 0 to disable.
+Use value of 1 for enable.
+Default value is Disabled.
+
+VMA_EXCEPTION_HANDLING
+Mode for handling missing support or error cases in Socket API or functionality by VMA.
+Useful for quickly identifying VMA unsupported Socket API or features
+Use value of -2 to exit() on VMA startup failure.
+Use value of -1 for just handling at DEBUG severity.
+Use value of 0 to log DEBUG message and try recovering via Kernel network stack (un-offloading the socket).
+Use value of 1 to log ERROR message and try recovering via Kernel network stack (un-offloading the socket).
+Use value of 2 to log ERROR message and return API respective error code.
+Use value of 3 to log ERROR message and abort application (throw vma_error exception).
+Default value is -1 (notice, that in the future the default value will be changed to 0)
+
+VMA_AVOID_SYS_CALLS_ON_TCP_FD
+For TCP fd, avoid system calls for the supported options of:
+ioctl, fcntl, getsockopt, setsockopt.
+Non-supported options will go to OS.
+To activate, use VMA_AVOID_SYS_CALLS_ON_TCP_FD=1.
+Default value is disabled
+
+VMA_INTERNAL_THREAD_AFFINITY
+Control which CPU core(s) the VMA internal thread is serviced on. The cpu set
+should be provided as *EITHER* a hexadecimal value that represents a bitmask. *OR* as a
+comma delimited list of values (ranges are ok). Both the bitmask and comma delimited list
+methods are identical to what is supported by the taskset command. See the man page
+on taskset for additional information.
+Where value of -1 disables internal thread affinity setting by VMA
+Bitmask Examples:
+0x00000001 - Run on processor 0.
+0x00000007 - Run on processors 0,1, and 2.
+Comma Delimited Examples:
+0,4,8 - Run on processors 0,4, and 8.
+0,1,7-10 - Run on processors 0,1,7,8,9 and 10. +Default value is -1 (Disabled). + +VMA_INTERNAL_THREAD_CPUSET +Select a cpuset for VMA internal thread (see man page of cpuset). +The value is the path to the cpuset (for example: /dev/cpuset/my_set), or an empty +string to run it on the same cpuset the process runs on. +Default value is an empty string. + +VMA_INTERNAL_THREAD_TCP_TIMER_HANDLING +Select the internal thread policy when handling TCP timers +Use value of 0 for deferred handling. The internal thread will not handle TCP timers upon timer +expiration (once every 100ms) in order to let application threads handling it first +Use value of 1 for immediate handling. The internal thread will try locking and handling TCP timers upon +timer expiration (once every 100ms). Application threads may be blocked till internal thread finishes handling TCP timers +Default value is 0 (deferred handling) + +VMA_INTERNAL_THREAD_ARM_CQ +Wakeup the internal thread for each packet that the CQ receive. +Poll and process the packet and bring it to the socket layer. +This can minimize latency in case of a busy application which is not available to +receive the packet when it arrived. +However, this might decrease performance in case of high pps rate application. +Default value is 0 (Disabled) + +VMA_WAIT_AFTER_JOIN_MSEC +This parameter indicates the time of delay the first packet send after +receiving the multicast JOINED event from the SM +This is helpful to over come loss of first few packets of an outgoing stream +due to SM lengthy handling of MFT configuration on the switch chips +Default value is 0 (milliseconds) + +VMA_THREAD_MODE +By default VMA is ready for multi-threaded applications, meaning it is thread safe. +If the users application is a single threaded one, then using this configuration +parameter you can help eliminate VMA locks and get even better performance. 
+Single threaded application value is 0 +Multi threaded application using spin lock value is 1 +Multi threaded application using mutex lock value is 2 +Multi threaded application with more threads than cores using spin lock value is 3 +Default value is 1 (Multi with spin lock) + +VMA_BUFFER_BATCHING_MODE +Batching of returning Rx buffers and pulling Tx buffers per socket. +In case the value is 0 then VMA will not use buffer batching. +In case the value is 1 then VMA will use buffer batching and will try to periodically reclaim unused buffers. +In case the value is 2 then VMA will use buffer batching with no reclaim. +[future: other values are reserved] +Default value is 1 + +VMA_MEM_ALLOC_TYPE +This replaces the VMA_HUGETBL parameter logic. +VMA will try to allocate data buffers as configured: + 0 - "ANON" - using malloc + 1 - "CONTIG" - using contiguous pages + 2 - "HUGEPAGES" - using huge pages. +OFED will also try to allocate QP & CQ memory accordingly: + 0 - "ANON" - default - use current pages ANON small ones. + "HUGE" - force huge pages + "CONTIG" - force contig pages + 1 - "PREFER_CONTIG" - try contig fallback to ANON small pages. + "PREFER_HUGE" - try huge fallback to ANON small pages. + 2 - "ALL" - try huge fallback to contig if failed fallback to ANON small pages. +To override OFED use: (MLX_QP_ALLOC_TYPE, MLX_CQ_ALLOC_TYPE) +Default value is 1 (Contiguous pages) + +The following VMA neigh parameters are for advanced users or Mellanox support only: + +VMA_NEIGH_UC_ARP_QUATA +VMA will send UC ARP in case neigh state is NUD_STALE. +In case that neigh state is still NUD_STALE VMA will try +VMA_NEIGH_UC_ARP_QUATA retries to send UC ARP again and then will send BC ARP. + +VMA_NEIGH_UC_ARP_DELAY_MSEC +This parameter indicates number of msec to wait between every UC ARP. + +VMA_NEIGH_NUM_ERR_RETRIES +This number indicates number of retries to restart neigh state machine in case neigh got ERROR event. 
+Default value is 1 + +VMA_BF +This flag enables / disables BF (Blue Flame) usage of the ConnectX +Default value is 1 (Enabled) + +VMA_FORK +Control whether VMA should support fork. Setting this flag on will cause VMA to +call ibv_fork_init() function. ibv_fork_init() initializes libibverbs's data +structures to handle fork() function calls correctly and avoid data corruption. +If ibv_fork_init() is not called or returns a non-zero status, then libibverbs +data structures are not fork()-safe and the effect of an application calling +fork() is undefined. +ibv_fork_init() works on Linux kernels 2.6.17 and higher which support the +MADV_DONTFORK flag for madvise(). +Note that VMA's default with huge pages enabled (VMA_HUGETBL) you should use an +OFED stack version that support fork()ing of with huge pages (OFED 1.5 and higher). +Default value is 0 (Disabled) + +VMA_CLOSE_ON_DUP2 +When this parameter is enabled, VMA will handle the duplicate fd (oldfd), +as if it was closed (clear internal data structures) and only then, +will forward the call to the OS. +This is, in practice, a very rudimentary dup2 support. +It only supports the case, where dup2 is used to close file descriptors, +Default value is 1 (Enabled) + +VMA_SOCKETXTREME +When this parameter is enabled, VMA operates in SocketXtreme mode. +SocketXtreme mode brings latency down, eliminating copy operations and +increasing throughput allowing applications to further utilize true kernel +bypass architecture. An application should use a socket extension API named SocketXtreme. +Default value is 0 (Disabled) + +VMA_MTU +Size of each Rx and Tx data buffer (Maximum Transfer Unit). +This value sets the fragmentation size of the packets sent by the VMA library. +If VMA_MTU is 0 then for each interface VMA will follow the actual MTU. 
+If VMA_MTU is greater than 0 then this MTU value is applicable to all interfaces regardless of their actual MTU
+Default value is 0 (following interface actual MTU)
+
+VMA_MSS
+VMA_MSS defines the max TCP payload size that can be sent without IP fragmentation.
+Value of 0 will set VMA's TCP MSS to be aligned with VMA_MTU configuration
+(leaving 40 bytes room for IP + TCP headers; "TCP MSS = VMA_MTU - 40").
+Other VMA_MSS values will force VMA's TCP MSS to that specific value.
+Default value is 0 (following VMA_MTU)
+
+VMA_TCP_CC_ALGO
+TCP congestion control algorithm.
+The default algorithm coming with LWIP is a variation of Reno/New-Reno.
+The new Cubic algorithm was adapted from the FreeBSD implementation.
+Use value of 0 for LWIP algorithm.
+Use value of 1 for Cubic algorithm.
+Use value of 2 in order to disable the congestion algorithm.
+Default value is 0 (LWIP).
+
+VMA_TCP_MAX_SYN_RATE
+Limit the number of TCP SYN packets that VMA will handle
+per second per listen socket.
+For example, in case you use 10 for this value then VMA will accept at most 10
+(could be less) new connections per second per listen socket.
+Use a value of 0 for un-limiting the number of TCP SYN packets that can be handled.
+Value range is 0 to 100000.
+Default value is 0 (no limit)
+
+
+VMA Monitoring & Performance Counters
+=====================================
+The VMA internal performance counters include information per user
+sockets and a global view on select() and epoll_wait() usage by the application.
+
+Use the 'vma_stats' included utility to view the per socket information and
+performance counters during run time.
+Usage: + vma_stats [-p pid] [-k directory] [-v view] [-d details] [-i interval] + +Defaults: + find_pid=enabled, directory="/tmp/", view=1, details=1, interval=1, + +Options: + -p, --pid= Show VMA statistics for process with pid: + -k, --directory= Set shared memory directory path to + -n, --name= Show VMA statistics for application: + -f, --find_pid Find and show statistics for VMA instance running (default) + -F, --forbid_clean By setting this flag inactive shared objects would not be removed + -i, --interval= Print report every seconds + -c, --cycles= Do report print cycles and exit, use 0 value for infinite (default) + -v, --view=<1|2|3|4|5> Set view type:1- basic info,2- extra info,3- full info,4- mc groups,5- similar to 'netstat -tunaep' + -d, --details=<1|2> Set details mode:1- to see totals,2- to see deltas + -z, --zero Zero counters + -l, --log_level= Set VMA log level to (1 <= level <= 7) + -S, --fd_dump= [] Dump statistics for fd number using log level . use 0 value for all open fds + -D, --details_level= Set VMA log details level to (0 <= level <= 3) + -s, --sockets= Log only sockets that match or , format: 4-16 or 1,9 (or combination) + -V, --version Print version + -h, --help Print this help message + + +Use VMA_STATS_FILE to get internal VMA statistics like vma_stats provide. +If this parameter is set and the user application performed transmit or receive +activity on a socket, then these values will be logs once the sockets are closed. + +Below is a logout example of a socket performance counters. +Below the logout example there is some explanations about the numbers. 
+ +VMA: [fd=10] Tx Offload: 455 KB / 233020 / 0 / 3 [bytes/packets/drops/errors] +VMA: [fd=10] Tx OS info: 0 KB / 0 / 0 [bytes/packets/errors] +VMA: [fd=10] Rx Offload: 455 KB / 233020 / 0 / 0 [bytes/packets/eagains/errors] +VMA: [fd=10] Rx byte: max 200 / dropped 0 (0.00%) / limit 2000000 +VMA: [fd=10] Rx pkt : max 1 / dropped 0 (0.00%) +VMA: [fd=10] Rx OS info: 0 KB / 0 / 0 [bytes/packets/errors] +VMA: [fd=10] Rx poll: 0 / 233020 (100.00%) [miss/hit] + +Looking good :) +- No errors on transmit or receive on this socket (user fd=10) +- All the traffic was offloaded. No packets transmitted or receive via the OS. +- Just about no missed Rx polls (see VMA_RX_POLL & VMA_SELECT_POLL), meaning + the receiving thread did not get to a blocked state to cause a contexts + switch and hurt latency. +- No dropped packets caused by socket receive buffer limit (see VMA_RX_BYTES_MIN) + +Interrupt Moderation +==================== +The basic idea behind interrupt moderation is that the HW will not generate +interrupt for each packet, but instead only after some amount of packets received +or after the packet was held for some time. + +The adaptive interrupt moderation change this packet count and time period +automatically to reach a desired rate of interrupts. + + +1. Use VMA_RX_POLL=0 and VMA_SELECT_POLL=0 to work in interrupt driven mode. + +2. Control the period and frame count parameters with: + VMA_CQ_MODERATION_COUNT - hold #count frames before interrupt + VMA_CQ_MODERATION_PERIOD_USEC - hold #usec before interrupt + +3. Control the adaptive algorithm with the following: + VMA_CQ_AIM_MAX_COUNT - max possible #count frames to hold + VMA_CQ_AIM_MAX_PERIOD_USEC - max possible #usec to hold + VMA_CQ_AIM_INTERRUPTS_RATE_PER_SEC - desired interrupt rate + VMA_CQ_AIM_INTERVAL_MSEC - frequency of adaptation + +4. Disable CQ moderation with VMA_CQ_MODERATION_ENABLE=0 +5. 
Disable Adaptive CQ moderation with VMA_CQ_AIM_INTERVAL_MSEC=0
+
+
+
+
+Notes
+=====
+* Multicast loopback behavior:
+  There is a different behavior between IPoIB and VMA when dealing with
+  multicast packets at the same machine:
+  - When sending from VMA application to IPoIB application on the same
+    machine the packet will never be accepted by the IPoIB side (even when
+    the loopback is enabled)
+  - When sending from IPoIB application to VMA application on the same
+    machine, the packet will always be accepted by the VMA side (even when
+    the loopback is disabled)
+
+
+Troubleshooting
+===============
+
+* High log level:
+
+  VMA WARNING: *************************************************************
+  VMA WARNING: * VMA is currently configured with high log level *
+  VMA WARNING: * Application performance will decrease in this log level! *
+  VMA WARNING: * This log level is recommended for debugging purposes only *
+  VMA WARNING: *************************************************************
+This warning message means that you are using VMA with high log level:
+VMA_TRACELEVEL variable value is set to 4 or more.
+In order to fix it - set VMA_TRACELEVEL to its default value: 3
+
+
+* Ethernet RAW_PACKET_QP limited to privileged users
+
+  VMA WARNING: ******************************************************************************
+  VMA WARNING: * Verbs RAW_PACKET QP type creation is limited for root user access *
+  VMA WARNING: * Working in this mode might causes VMA malfunction over Ethernet interfaces *
+  VMA WARNING: * WARNING: the following steps will restart your network interface! *
+  VMA WARNING: * 1. "echo options ib_uverbs disable_raw_qp_enforcement=1 > /etc/modprobe.d/ib_uverbs.conf" *
+  VMA WARNING: * 2.
"/etc/init.d/openibd restart" * + VMA WARNING: * Read the RAW_PACKET QP root access enforcement section in the VMA's User Manual for more information * + VMA WARNING: ****************************************************************************** +This warning message means that VMA tried to create a HW QP resource over Eth +interface while the kernel requires this operation to be done only by privileged +users. root can enable this for regular users as well by: + 1. "echo options ib_uverbs disable_raw_qp_enforcement=1 > /etc/modprobe.d/ib_uverbs.conf" + 2. Restart openibd or rdma service depending on your system configuration + + +* CAP_NET_RAW and root access + +VMA_WARNING: ****************************************************************************** +VMA_WARNING: * Interface will not be offloaded. +VMA_WARNING: * Offloaded resources are restricted to root or user with CAP_NET_RAW privileges +VMA_WARNING: * Read the CAP_NET_RAW and root access section in the VMA's User Manual for more information +VMA_WARNING: ****************************************************************************** +This warning message means that VMA tried to create a hardware QP resource +while the kernel requires this operation to be performed only by privileged +users. Run as user root or grant CAP_NET_RAW privileges to your user +1. "setcap cap_net_raw=ep /usr/bin/sockperf" +2. "chmod u+s " + +* Huge pages out of resource: + + VMA WARNING: *************************************************************** + VMA WARNING: * NO IMMEDIATE ACTION NEEDED! * + VMA WARNING: * Not enough hugepage resources for VMA memory allocation. * + VMA WARNING: * VMA will continue working with regular memory allocation. * + VMA INFO : * Optional: 1. Switch to a different memory allocation type * + VMA_INFO : * (VMA_MEM_ALLOC_TYPE= 0 or 1) * + VMA INFO : * 2. 
Restart process after increasing the number of *
+ VMA INFO : * hugepages resources in the system: *
+ VMA INFO : * "cat /proc/meminfo | grep -i HugePage" *
+ VMA INFO : * "echo 1000000000 > /proc/sys/kernel/shmmax" *
+ VMA INFO : * "echo 800 > /proc/sys/vm/nr_hugepages" *
+ VMA WARNING: * Please refer to the memory allocation section in the VMA's *
+ VMA WARNING: * User Manual for more information *
+ VMA WARNING: ***************************************************************
+This warning message means that you are using VMA with hugepages memory allocation,
+but not enough huge pages resources are available in the system.
+If you want VMA to take full advantage of the performance benefits of huge pages then
+you should restart the application after adding more hugepages resources in your
+system similar to the details in the warning message above or trying to free unused hugepages
+shared memory segments with the below script.
+
+NOTE: Use 'ipcs -m' and 'ipcrm -m shmid' to check and clean unused shared memory segments.
+Below is a short script to help you release VMA's unused huge pages resources:
+ for shmid in `ipcs -m | grep 0x00000000 | awk '{print $2}'`;
+ do echo 'Clearing' $shmid; ipcrm -m $shmid;
+ done;
+
+
+* Not supported Bonding Configuration:
+
+ VMA WARNING: ******************************************************************************
+ VMA WARNING: VMA doesn't support current bonding configuration of bond0.
+ VMA WARNING: The only supported bonding mode is "802.3ad(#4)" or "active-backup(#1)"
+ VMA WARNING: with "fail_over_mac=1" or "fail_over_mac=0".
+ VMA WARNING: The effect of working in unsupported bonding mode is undefined.
+ VMA WARNING: Read more about Bonding in the VMA's User Manual
+ VMA WARNING: ******************************************************************************
+
+This warning message means that VMA has detected a bonding device which is configured
+to work in a mode which is not supported by VMA, this means that VMA will not support
+high availability events for that interface.
+VMA currently supports just active-backup(#1) or 802.3ad(#4) and fail_over_mac = 1 or 0 mode.
+In order to fix this issue please change the bonding configuration.
+
+Example:
+
+Let's assume that the bonding device is bond0, which has two slaves: ib0 and
+ib1.
+
+Shut down the bond0 interface:
+#ifconfig bond0 down
+
+Find all the slaves of bond0:
+#cat /sys/class/net/bond0/bonding/slaves
+ib0 ib1
+
+Free all the slaves:
+#echo -ib0 > /sys/class/net/bond0/bonding/slaves
+#echo -ib1 > /sys/class/net/bond0/bonding/slaves
+
+Change the bond mode:
+#echo active-backup > /sys/class/net/bond0/bonding/mode
+
+Change the fail_over_mac mode:
+#echo 1 > /sys/class/net/bond0/bonding/fail_over_mac
+
+Enslave the interfaces back:
+#echo +ib0 > /sys/class/net/bond0/bonding/slaves
+#echo +ib1 > /sys/class/net/bond0/bonding/slaves
+
+Bring up the bonding interface:
+#ifconfig bond0 up
+OR
+#ifconfig bond0 <ip-address> netmask <netmask> up
+
+* Not supported Bonding & VLAN Configuration:
+
+ VMA WARNING: ******************************************************************
+ VMA WARNING: bond0.10: vlan over bond while fail_over_mac=1 is not offloaded
+ VMA WARNING: ******************************************************************
+
+This warning message means that VMA has detected a bonding device which is configured with
+VLAN over it while fail_over_mac=1.
+This means that the bond will not be offloaded.
+In order to fix this issue please change the bonding configuration.
+ diff --git a/VMA_VERSION.in b/VMA_VERSION.in new file mode 100644 index 0000000..ee279bb --- /dev/null +++ b/VMA_VERSION.in @@ -0,0 +1,4 @@ +VMA VERSION: @VERSION@-@VMA_LIBRARY_RELEASE@ +VMA SVN-REVISION: @VMA_LIBRARY_REVISION@ +BUILD DATE: @BUILD_DATE@ @BUILD_TIME@ + diff --git a/autogen.sh b/autogen.sh new file mode 100755 index 0000000..de3e135 --- /dev/null +++ b/autogen.sh @@ -0,0 +1,10 @@ +#!/bin/sh + +set -e +rm -rf autom4te.cache +mkdir -p config +autoreconf -v --install || exit 1 +rm -rf autom4te.cache + +exit 0 + diff --git a/build/README b/build/README new file mode 100644 index 0000000..13cb5e2 --- /dev/null +++ b/build/README @@ -0,0 +1,28 @@ +OPEN-SOURCE USER: DO NOT USE THE SCRIPT NAMED "MELLANOX-INTERNAL". +THIS SCRIPT IS PART OF MELLANOX INTERNAL AUTOMATION SYSTEM AND WILL NOT WORK FOR YOU. + +Building +======== +Build packages: + ./build/build_rpm.sh + ./build/build_deb.sh + +In order to install the rpm: +rpm -ivh + + +Building - Mellanox internal +============================ + +In order to create the rpm you should: +1) Update the version number in the configure.ac file: + Update the variables: VMA_LIBRARY_MAJOR, VMA_LIBRARY_MINOR, VMA_LIBRARY_REVISION, VMA_LIBRARY_RELEASE + The format of the version will be: VMA_LIBRARY_MAJOR.VMA_LIBRARY_MINOR.VMA_LIBRARY_REVISION-VMA_LIBRARY_RELEASE +2) After editing configure.ac run autogen +3) Update README.txt and journal.txt with all the major version changes +4) Commit configure.ac, configure and journal.txt +5) Create git TAG named MA_LIBRARY_MAJOR.VMA_LIBRARY_MINOR.VMA_LIBRARY_REVISION-VMA_LIBRARY_RELEASE +6) Run the internal script from the build machine and you are done. + +In order to install the rpm: +rpm -ivh diff --git a/build/build_deb.sh b/build/build_deb.sh new file mode 100755 index 0000000..857f209 --- /dev/null +++ b/build/build_deb.sh @@ -0,0 +1,53 @@ +#!/bin/bash + +BASE_DIR=`pwd` +script_dir=`dirname $(readlink -f $0)` +cd $script_dir/.. 
+ +BUILD_DIR=`pwd`/build_debian +mkdir -p $BUILD_DIR + +LOG_FILE=$BUILD_DIR/build_debian.log + +echo "Running ./autogen.sh ..." +./autogen.sh > $LOG_FILE 2>&1 + +echo "Running ./configure ..." +./configure >> $LOG_FILE 2>&1 +if [ $? -ne 0 ]; then + echo "configure failed! see $LOG_FILE" + cd $BASE_DIR + exit 1 +fi + +echo "Running make dist ..." +make dist >> $LOG_FILE 2>&1 +if [ $? -ne 0 ]; then + echo "make dist failed! see $LOG_FILE" + cd $BASE_DIR + exit 2 +fi + +cp libvma*.tar.gz $BUILD_DIR/ +cd $BUILD_DIR +tar xzvf libvma*.tar.gz >> $LOG_FILE 2>&1 +cd $(find . -maxdepth 1 -type d -name "libvma*") +VMA_DIR=`pwd` + +echo "Running dpkg-buildpackage ... this might take a while ..." +dpkg-buildpackage -us -uc >> $LOG_FILE 2>&1 +if [ $? -ne 0 ]; then + echo "dpkg-buildpackage failed! see $LOG_FILE" + cd $BASE_DIR + exit 3 +fi + +cd .. + +rm -rf $VMA_DIR + +echo "Debian file are under $BUILD_DIR" + +rm -rf $LOG_FILE + +cd $BASE_DIR diff --git a/build/build_git_rpm_deb_mellanox_internal.sh b/build/build_git_rpm_deb_mellanox_internal.sh new file mode 100755 index 0000000..a9009d5 --- /dev/null +++ b/build/build_git_rpm_deb_mellanox_internal.sh @@ -0,0 +1,1178 @@ +#!/bin/bash + +# Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. 
+# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +function usage { + +cat << EOF + +The building environment works on 4 possible modes: + * Release + * Daily + * Local + * Wrapper + +usage: $0 options + +GENERAL OPTIONS: +---------------- + -h Show this message + -r -R Release mode + -w -W Wrapper mode + -d -D Daily mode + -l -L Local mode + +RELEASE MODE OPTIONS: +--------------------- + -r + + Optional: + -o -O Override existing release + -c -C Make coverity + -b -B Make Bullseye + +WRAPPER MODE OPTIONS: +--------------------- + -w -f ... 
+ + can run this format "-w -f " several times in order to + build different branches on several machines in the same run + Example: ./build_vma_rpm.sh -w vma_6.1 -f alf1 alf2 -w vma_6.2 vma_6.3 -f alf3 + ==> will build vma_6.1 on alf1, alf2 and vma_6.2,vma_6.3 on alf3 + +DAILY MODE OPTIONS: +------------------- + -d + + Optional: + -s -S Secure copy the rpms created to bgate + -c -C Make coverity + -b -B Make Bullseye + +LOCAL MODE OPTIONS: +------------------- + -l + + run the script from the HEAD of the workspace folder + + Optional: + -n -N Add a name to rpm + +EOF + +cleanFilesAndExit + +} + +function echoStep { + +echo -e "\e[00;32m $1 \e[0m" + +} + +function echoMsgToUser { + +echo -e "\e[00;34m $1 \e[0m" + +} + +function echoErr { + +echo -e "\e[00;31m $1 \e[00m" + +} + +function echoDebug { + +echo -e "\e[00;35m $1 \e[00m" + +} + +function isValidGitBranch { + +git branch -r > /tmp/git_list.tmp +folder=$1 +ret=0 +grep "origin/$folder$" /tmp/git_list.tmp > /dev/null +if [[ $? != 0 ]]; then + ret=1; +fi +rm -f /tmp/git_list.tmp +return $ret + +} + +function checkLegalWrapperModeParams { + +if [ $# -eq 0 ]; then + usage; +fi +machines=0 +firstFlag=1 +numOfBranches=0 +numOfMachines=0 +i=0 +j=0 + +for PARAM in $@; do + if [ $PARAM == "-c" ] || [ $PARAM == "-C" ]; then + continue + fi + if [ $PARAM == "-b" ] || [ $PARAM == "-B" ]; then + continue + fi + if [ $PARAM == "-w" ]; then + if [ $firstFlag == 1 ]; then + firstFlag=0; + continue + fi + if [ "$numOfBranches" == "0" ]; then + usage; + fi + if [ "$numOfMachines" == "0" ]; then + usage; + fi + + numOfBranches=0 + numOfMachines=0 + i=0 + j=0 + machines=0 + continue + fi + if [ $PARAM == "-f" ]; then + machines=1; + continue + fi + if [ $machines == 1 ]; then + eval machine_$i=$PARAM + curr_machine=$(eval echo \$machine_$i) + ping -c 1 $curr_machine + if [ $? 
!= 0 ]; then + echoErr "\"$curr_machine\" is illegal machine" + # mail report + tmp_file="/tmp/log_$RANDOM.txt" + echo "Script failed: ssh: connect to host \"$curr_machine\": No route to host" > $tmp_file + /bin/mail -s "$SUBJECT" "$EMAIL" < $tmp_file + usage; + fi + i=`expr $i + 1` + numOfMachines=`expr $i` + else + eval build_branch_$j=$PARAM + curr_build_branch=$(eval echo \$build_branch_$j) + isValidGitBranch $curr_build_branch + if [[ $? != 0 ]]; then + echoErr "\"$curr_build_branch\" is illegal branch folder" + echoErr "run \"git branch -r\" for the complete list" + # mail report + tmp_file="/tmp/log_$RANDOM.txt" + echo "Script failed: \"$curr_build_branch\" is illegal branch folder" > $tmp_file + /bin/mail -s "$SUBJECT" "$EMAIL" < $tmp_file + usage; + fi + j=`expr $j + 1` + numOfBranches=`expr $j` + fi +done + +if [[ "$numOfBranches" == "0" || "$numOfMachines" == "0" ]]; then + usage; +fi + +} + +#if there is no new commit for the input branch argument, no need to build new rpm +function needToBuildDaily { + +branch_folder=$1 +last_daily=`ls -Atr /.autodirect/mswg/release/vma/daily/"$branch_folder"/ | tail -1` #last daily rpm created +if [[ $? 
!= 0 ]]; then # folder doesn't exist + return 0; +fi +commitDate=`git log -1 --format="%ci" | grep -Po '\d{4}[\-]\d{2}[\-]\d{2}'` #date of last commit +commitTime=`git log -1 --format="%ci" | grep -Po '\d{2}[\:]\d{2}[\:]\d{2}'` #time of last commit +DATE=`date +""%Y"-"%m"-"%d""` #today's date + +#if last commit is not from today, and rpm was created since last commit- no need to build new rpm +if [ "$commitDate" != "$DATE" ]; then + if [ "$last_daily" \> "$commitDate" ]; then + echoMsgToUser "Branch $branch_folder: No changes from last build- no need to build again" + echo "No changes from last build- no new rpm" >> ~/script_status + return 1; #nothing changed + fi + if [ "$last_daily" == "$commitDate" ] && [ "$commitTime" \< "$buildTime" ]; then + echoMsgToUser "Branch $branch_folder: No changes from last build- no need to build again" + echo "No changes from last build- no new rpm" >> ~/script_status + return 1; #nothing changed + fi +fi + +return 0 + +} + +function runWrapper { + +SCRIPT_STATUS_FILE="~/script_status" +rm -f ~/script_status +touch ~/script_status +echo "BUILD VMA RPM- DAILY STATUS" > ~/script_status + +if [ $# -eq 0 ]; then + usage; +fi +machines=0 +make_cov=0 +make_bullseye=0 +firstFlag=1 +numOfBranches=0 +numOfMachines=0 +i=0 +j=0 + +for PARAM in $@; do + if [ $PARAM == "-c" ] || [ $PARAM == "-C" ]; then + make_cov=1; + continue + fi + if [ $PARAM == "-b" ] || [ $PARAM == "-B" ]; then + make_bullseye=1; + continue + fi + if [ $PARAM == "-w" ]; then + if [ $firstFlag == 1 ]; then + firstFlag=0; + continue + fi + + startBuild $numOfBranches $numOfMachines + + numOfBranches=0 + numOfMachines=0 + i=0 + j=0 + machines=0 + continue + fi + if [ $PARAM == "-f" ]; then + machines=1; + continue + fi + if [ $machines == 1 ]; then + eval machine_$i=$PARAM + i=`expr $i + 1` + numOfMachines=`expr $i` + else + eval build_branch_$j=$PARAM + j=`expr $j + 1` + numOfBranches=`expr $j` + fi +done + +startBuild $numOfBranches $numOfMachines + +mailScriptStatus + 
+exit + +} + +function startBuild { + +numOfBranches=$1 +numOfMachines=$2 + +script_flags=" " +if [ $make_cov == 1 ]; then + script_flags=" -c $script_flags" +fi +if [ $make_bullseye == 1 ]; then + script_flags=" -b $script_flags" +fi + +for (( i=0; i<$numOfBranches; i++ )); do + curr_branch=$(eval echo \$build_branch_$i) + echo "============================================" >> ~/script_status + curr_branch_name=$curr_branch + if [ "$curr_branch" == "master" ]; then + curr_branch_name=$master_name + fi + echo "Branch: $curr_branch" >> ~/script_status + echo "===============" >> ~/script_status + cd $vma_repos_base + PARENT=`ps --no-heading -o %c -p $PPID` + if [ $PARENT == bash ]; then + rm -rf $vma_repos_dir + git clone /.autodirect/mswg/git/accl/vma.git/ vma_build_repos #TODO: add rm -rf... at the end + #git clone /.autodirect/mtrswgwork/ork/workspace/git/vma/ vma_build_repos #debug + fi + cd $vma_repos_dir + git checkout $curr_branch + needToBuildDaily $curr_branch + if [[ $? == 0 ]]; then + if [ -z "$last_daily" ]; then + since="yesterday" + else + since="$last_daily $buildTime" + fi + for (( j=0; j<$numOfMachines; j++ )); do + working_machine=$(eval echo \$machine_$j) + ssh $working_machine rm -f /tmp/$script + scp $full_path_script $working_machine:/tmp/$script + ssh $working_machine chmod 777 /tmp/$script + ssh $working_machine bash /tmp/$script $script_flags -d $curr_branch -f $SCRIPT_STATUS_FILE + ssh $working_machine rm -f /tmp/$script + echoStep "$working_machine is finished" + done + #exit #debug + cd $vma_repos_base + if [ $PARENT == bash ]; then + rm -rf $vma_repos_dir + #git clone /.autodirect/mtrswgwork/ork/workspace/git/vma/ vma_build_repos #debug + git clone /.autodirect/mswg/git/accl/vma.git/ vma_build_repos #TODO: add rm -rf... 
at the end + fi + cd $vma_repos_dir + git checkout $curr_branch + + DATE=`date +""%Y"-"%m"-"%d""` #today's date + if [ $WRAPPER_MODE == 1 ] && [ "$curr_branch" != "vma_6.4_deliberate_failures" ]; then + echo " " >> ~/script_status + cd $vma_repos_dir + git checkout $curr_branch + git log --since="$since" >> ~/script_status + echo " " >> ~/script_status + fi + + cd /.autodirect/mswg/release/vma/ + folderName=$(readlink -f "latest_daily-"$curr_branch""/ | grep -o "20.*") + if [ "$folderName" == "$DATE" ]; then + cd latest_daily-"$curr_branch" + bgate_curr_branch=$curr_branch + if [ "$curr_branch" == "master" ]; then + bgate_curr_branch=$master_name + fi + #ssh ork@bgate.mellanox.com mkdir -p /hpc/home/vma/daily/"$bgate_curr_branch"/"$folderName" + #scp -r . ork@bgate.mellanox.com:/hpc/home/vma/daily/"$bgate_curr_branch"/"$folderName" + ssh ophirmu@r-sw-hail25 mkdir -p /net/10.224.1.11/vol/asal_home/vma/daily/"$bgate_curr_branch"/"$folderName" + scp -r . ophirmu@r-sw-hail25:/net/10.224.1.11/vol/asal_home/vma/daily/"$bgate_curr_branch"/"$folderName" + fi + cd - + fi +done + +} + +function parseArguments { + +EMAIL="sw-dev-vma@mellanox.com mellanox-CSA-team@asaltech.com" +#EMAIL="ork@mellanox.com" # for debug +SUBJECT="VMA Daily Build" + +RELEASE_MODE=0 +DAILY_MODE=0 +LOCAL_MODE=0 +WRAPPER_MODE=0 + +BRANCH_INITIALIZED=0 +branch_folder= + +mswg_vma_folder="/.autodirect/mswg/release/vma" +mswg_daily_folder="/.autodirect/mswg/release/vma/daily" + +#MAIL_ALERT=0 +rpm_name= +OVERRIDE=0 +make_cov=0 +make_bullseye=0 +copy_to_bgate=0 + +master_name="vma_7.0" +buildTime="18:00:00" + +LOG_SCRIPT_STATUS=0 +log_file= + +if [ "$1" == "-h" ]; then + usage; +fi + +pwd_dir=`pwd` +cd $vma_repos_base +PARENT=`ps --no-heading -o %c -p $PPID` +if [ $PARENT == bash ]; then + rm -rf $vma_repos_dir + #git clone /.autodirect/mtrswgwork/ork/workspace/git/vma vma_build_repos #debug + git clone /.autodirect/mswg/git/accl/vma.git/ vma_build_repos #TODO: add rm -rf... 
at the end +fi +cd $vma_repos_dir + +while getopts wWd:D:r:R:l:L:n:N:f:F:oOsScCbBh OPTION +do + case $OPTION in + h) # help + usage + ;; + d|D|r|R) # daily \ release mode + branch_folder=$OPTARG + + #### branch folder arg- checks that the requested branch folder exists under https://sirius.voltaire.com/repos/enterprise/mce/branches/ + isValidGitBranch $branch_folder + if [[ $? != 0 ]]; then + echoErr "\"$branch_folder\" is illegal branch folder" + echoErr "run \"git branch -r\" for the complete list" + usage; + fi + BRANCH_INITIALIZED=1 + + #### sign which mode is selected + if [[ $OPTION == "d" ]] || [[ $OPTION == "D" ]]; then + DAILY_MODE=1; + fi + if [[ $OPTION == "r" ]] || [[ $OPTION == "R" ]]; then + RELEASE_MODE=1; + fi + ;; + c|C) # make coverity + make_cov=1 + ;; + b|B) # make coverity + make_bullseye=1 + ;; + w|W) + WRAPPER_MODE=1; + checkLegalWrapperModeParams $@ + runWrapper $@; + ;; + l|L) # local mode + LOCAL_MODE=1 + rm -rf $vma_repos_dir + cd $pwd_dir + target_dir=$OPTARG + if [ ! -d $target_dir ]; then + echoErr "folder $target_dir does not exist" + usage; + fi + ;; + n|N) # rpm's name + rpm_name="-$OPTARG" + ;; + f|F) # log the status of the script (success/failiure) to a log file + LOG_SCRIPT_STATUS=1 + log_file=$OPTARG + ;; +# m|M) # mail errors to user +# MAIL_ALERT=1 +# ;; + o|O) # override existing release + OVERRIDE=1 + ;; + s|S) # copy rpms to bgate, if on daily mode + copy_to_bgate=1 + ;; + ?) + usage + ;; + esac +done + +#### check if arguments are valid #### +total_modes=$(( $DAILY_MODE + $RELEASE_MODE + $LOCAL_MODE + $WRAPPER_MODE )) # exactly one mode can be chosen +if [ $total_modes != 1 ]; then + echoErr "Please choose exactly one mode- daily/release/local/wrapper" + usage +fi + +if [ ! 
-x $rpm_name ] && [ $LOCAL_MODE != 1 ]; then # costumer's name argument was added but not on local mode + echoErr "-n flag is available only when using -l (local mode)" + usage +fi + +if [ $OVERRIDE == 1 ] && [ $RELEASE_MODE != 1 ]; then # override argument was added but not on release mode + echoErr "-o flag is available only when using -r (release mode)" + usage +fi + +if [ $copy_to_bgate == 1 ] && [ $DAILY_MODE != 1 ]; then # copy to bgate available only on daily mode + echoErr "-s flag is available only when using -d (daily mode)" + usage +fi + +if [ $LOG_SCRIPT_STATUS == 1 ] && [ $LOCAL_MODE == 1 ]; then + echoErr "-f flag is available only when using -r (release mode) or -d (daily mode)" + usage +fi + +if [ $make_cov == 1 ] && [ $LOCAL_MODE == 1 ]; then + echoErr "-c flag is available only when using -r (release mode) or -d (daily mode)" + usage +fi + +if [ "$branch_folder" != "vma_6.3" ] && [ "$branch_folder" != "master" ]; then #only vma_6.3 can be build with coverity + make_cov=0 +fi + +} + +function build_vma_src_rpm { + +APP_NAME=libvma +VMA_DIR=vma + +cd .. +echoStep `pwd` +DATE=`git log -1 --format="%ci" | grep -Po '\d{4}[\-]\d{2}[\-]\d{2}'` +TIME=`git log -1 --format="%ci" | grep -Po '\d{2}[\:]\d{2}[\:]\d{2}'` +cd build + +grep -e "VMA_LIBRARY_MAJOR=" -e "VMA_LIBRARY_MINOR=" -e "VMA_LIBRARY_REVISION=" -e "VMA_LIBRARY_RELEASE=" $VMA_DIR/configure.ac |head -4 > temp +. 
./temp +VERSION=$VMA_LIBRARY_MAJOR.$VMA_LIBRARY_MINOR.$VMA_LIBRARY_REVISION +#VERSION=`grep "VMA_VERSION" $VMA_DIR/version.h |awk -F "\"" '{print $2'}` +#RELEASE=`grep "VMA_RELEASE" $VMA_DIR/version.h |awk -F "\"" '{print $2'}` + +VMA_DIR_NAME=libvma-$VERSION + +if [ $# -lt 1 ]; then + RPM_DIR=$(rpm --eval '%{_topdir}'); +else + RPM_DIR=$1; +fi + +#sed -e 's/__VERSION/'$VERSION'/g' -e 's/__RELEASE/'$VMA_LIBRARY_RELEASE'/g' -e 's/__DATE/'$DATE'/g' -e 's/__TIME/'$TIME'/g' -e 's/__MAJOR/'$VMA_LIBRARY_MAJOR'/g' $APP_NAME.spec > $APP_NAME-$VERSION.spec +#sed -e 's/__VERSION/'$VERSION'/g' -e 's/__RELEASE/'$VMA_LIBRARY_RELEASE'/g'-e 's/__DATE/'$DATE'/g' -e 's/__TIME/'$TIME'/g' -e 's/__MAJOR/'$VMA_LIBRARY_MAJOR'/g' $APP_NAME.spec > $APP_NAME-$VERSION.spec +#sed -e 's/__VERSION/'$VERSION'/g' -e 's/__RELEASE/'$VMA_LIBRARY_RELEASE'/g' -e 's/__DATE/'$DATE'/g' -e 's/__TIME/'$TIME'/g' $VMA_DIR/vma_version_template > $VMA_DIR/VMA_VERSION + +rm -f libvma*.tar.gz > /dev/null > /dev/null 2>&1 +rm -f $RPM_DIR/SRPMS/libvma* > /dev/null > /dev/null 2>&1 +rm -rf $VMA_DIR_NAME > /dev/null > /dev/null 2>&1 + +mkdir $VMA_DIR_NAME +mkdir $VMA_DIR_NAME/build +#cp -r $APP_NAME.spec $VMA_DIR_NAME/build +cp -r $VMA_DIR $VMA_DIR_NAME/build/$VMA_DIR_NAME/ # copy vma & udp_test +cd $VMA_DIR_NAME +cd build +cd $VMA_DIR_NAME +#./autogen.sh +autogenWrap +prepare_debian_files "debian" +./configure > /dev/null > /dev/null 2>&1 +cp -r build/$APP_NAME.spec ../ +cp -r build/$APP_NAME.spec ../../../$APP_NAME-$VERSION.spec +make dist > /dev/null > /dev/null 2>&1 +cp $VMA_DIR_NAME.tar.gz ../../../ +cd .. +#tar zcvf ../../$VMA_DIR_NAME.tar.gz --exclude .git $VMA_DIR_NAME > /dev/null > /dev/null 2>&1 +cd .. +cd .. 
+ +sudo cp *.gz $APP_NAME-$VERSION.spec $RPM_DIR/SOURCES/ > /dev/null > /dev/null 2>&1 +sudo rpmbuild --define "_topdir $RPM_DIR" -bs $APP_NAME-$VERSION.spec + +rm -f $VMA_DIR_NAME.tar.gz temp > /dev/null > /dev/null 2>&1 +rm -rf $VMA_DIR_NAME > /dev/null > /dev/null 2>&1 +rm -rf $APP_NAME-$VERSION.spec > /dev/null > /dev/null 2>&1 + +#if [ ! -f $RPM_DIR/SRPMS/libvma* ]; then +# exit 1 +#fi +echo $RPM_DIR/SRPMS/libvma* + +} + +function cleanFilesAndExit { + +#### clear the workspace folder if the script is on daily/release mode +PARENT=`ps --no-heading -o %c -p $PPID` +if [ ! $LOCAL_MODE == 1 ] && [ $PARENT == bash ]; then + rm -rf $vma_repos_dir; +fi + +exit 1 +} + +function errorOccured { + +#### clear the workspace folder if the script is on daily/release mode +if [ ! $LOCAL_MODE == 1 ] && [ $PARENT == bash ]; then + rm -rf $vma_repos_dir; +fi + +echoErr "failed on step: $1" +#if [ $MAIL_ALERT == 1 ]; then +# sendErrorMailToUser $1; +#fi + +if [ "$LOG_SCRIPT_STATUS" == 1 ]; then + name=`uname -n` + echo "Machine: $name - failed on step: $1" >> $log_file +fi + +} + +function mailScriptStatus { + +# Left in case we want to add the logs as attachments +#ls ~/*_$DATE.log +#if [ $? == 0 ]; then +# mutt -s "$SUBJECT" `for file in ~/*_$DATE.log; do echo -n "-a ${file} "; done` "$EMAIL" < ~/script_status +# rm -f ~/*_$DATE.log +#else +# /bin/mail -s "$SUBJECT" "$EMAIL" < ~/script_status +#fi + +/bin/mail -s "$SUBJECT" "$EMAIL" < ~/script_status + +rm -f ~/script_status + +} + +function git_co { + +#### check out the requested branch +echoStep "git checkout $branch_folder" + +git checkout $branch_folder +if [[ $? 
!= 0 ]]; then + errorOccured "git checkout $branch_folder" + finishScript; +fi + +} + +function getVmaParamsFromConfigure.ac { + +grep_line=$(egrep "VMA_LIBRARY_MAJOR=[0-9]{1,}" configure.ac) +vma_ver_major=$(echo $grep_line | awk -F '=' '{print $2}') + +grep_line=$(egrep "VMA_LIBRARY_MINOR=[0-9]{1,}" configure.ac) +vma_ver_minor=$(echo $grep_line | awk -F '=' '{print $2}') + +grep_line=$(egrep "VMA_LIBRARY_REVISION=[0-9]{1,}" configure.ac) +vma_ver_revision=$(echo $grep_line | awk -F '=' '{print $2}') + +grep_line=$(egrep "VMA_LIBRARY_RELEASE=[0-9]{1,}" configure.ac) +vma_ver_release=$(echo $grep_line | awk -F '=' '{print $2}') + +} + +function areFilesUpdated { + +jurnal_version=`cat journal.txt | head -1` +configure_version=`echo "Version $vma_ver_major.$vma_ver_minor.$vma_ver_revision-$vma_ver_release":` # configure.ac version in same format as journal.txt (i.e. Version 6.3.22-0:) +if [[ $jurnal_version != $configure_version ]]; then + echoMsgToUser "Configure.ac or journal.ac are not updated" + echoMsgToUser "version defined in configure.ac = $configure_version" + echoMsgToUser "version defined in journal.txt = $jurnal_version" + echoMsgToUser "Do you want to continue anyway?" + while true; do + read yn + case $yn in + y|Y ) break;; + n|N ) cleanFilesAndExit;; + * ) echo "Please answer y or n";; + esac + done +fi + +} + +function isReleaseExists { + +mswg_vma_version_folder="vma_v_"$vma_ver_major"."$vma_ver_minor"."$vma_ver_revision"-"$vma_ver_release"" +if [[ $OVERRIDE == 0 ]]; then + ls $mswg_vma_folder | grep $mswg_vma_version_folder + if [[ $? == 0 ]]; then + echoMsgToUser "This version already exist: $mswg_vma_folder/$mswg_vma_version_folder" + echoMsgToUser "Do you want to continue (a new rpm will be created instead)? 
y/n" + while true; do + read yn + case $yn in + y|Y ) + rm -rf $mswg_vma_folder/$mswg_vma_version_folder + break;; + n|N ) cleanFilesAndExit;; + * ) echo "Please answer y or n";; + esac + done + fi +fi +cd $mswg_vma_folder +rm -rf "$mswg_vma_version_folder" +cd - + +} + +function set_topdir { + +#check _topdir folder +redhatFlag=0 +suseFlag=0 +distribution=$(cat /etc/issue | grep "Red Hat") +if [ "$distribution" != "" ]; then + topdir="/usr/src/redhat" + redhatFlag=1 +else + topdir="/usr/src/packages" + suseFlag=1 +fi + +} + +function finishScript { + +PARENT=`ps --no-heading -o %c -p $PPID` +if [ ! $LOCAL_MODE == 1 ] && [ $PARENT == bash ]; then + rm -rf $workspace_folder; +fi + +cd $vma_repos_base +rm -rf $vma_repos_dir + +if [ "$1" == "s" ]; then + #create TAG- if release mode + if [ "$RELEASE_MODE" == 1 ]; then + finalRpm="$mswg_vma_folder/vma_v_"$fullVersion"" #TODO:replace svn_revision with??????? + echoStep "rpm was created successfuly under $finalRpm" + else + echoStep "rpm was created successfuly under $target_dir" + fi +fi + +exit + +} + +function handleError { + +retVal=$1 +command=$2 +if [[ $retVal != 0 ]]; then + errorOccured "$command" + PARENT=`ps --no-heading -o %c -p $PPID` + if [ $PARENT == bash ]; then + finishScript; + else + exit + fi +fi + +} + +function prepare_deb_tarball { + pathToWorkspace=$1 + + debBuildDir="/tmp" + debBuildContainer="deb_build" + debBuildDirFinal="$debBuildDir/$debBuildContainer/" + + cd $debBuildDir + sudo rm -rf $debBuildContainer + mkdir $debBuildContainer + cd $debBuildContainer + + DEB_VMA_VERSION="$vma_ver_major.$vma_ver_minor.$vma_ver_revision" + DEB_VMA_RELEASE="$vma_ver_release" + + cp -rf $pathToWorkspace libvma-$DEB_VMA_VERSION.$DEB_VMA_RELEASE + rm -rf libvma-$DEB_VMA_VERSION.$DEB_VMA_RELEASE/.git + + cd libvma-$DEB_VMA_VERSION.$DEB_VMA_RELEASE + #./autogen.sh + autogenWrap + cd .. 
+ + prepare_debian_files "libvma-$DEB_VMA_VERSION.$DEB_VMA_RELEASE/debian" + + srcDebTarName=libvma_$DEB_VMA_VERSION.$DEB_VMA_RELEASE.orig.tar.gz + currpwd=`pwd` + srcDebTarPath="$currpwd/$srcDebTarName" + + tar czvf $srcDebTarName libvma-$DEB_VMA_VERSION.$DEB_VMA_RELEASE + +} + + +function prepare_debian_files { + + pathToDebianDir=$1 + debUserName="Or Kehati" + debUserEmail="ork@mellanox.com" + localArch="`eval arch`" + debArch="any" + #debArch="$localArch" + #if [[ "$localArch" == "x86_64" ]]; then + # debArch="amd64"; + #elif [[ "$localArch" == "x86" ]]; then + # debArch="i386"; + #fi + debDate=`date -R` + + DEB_VMA_VERSION="$vma_ver_major.$vma_ver_minor.$vma_ver_revision" + DEB_VMA_RELEASE="$vma_ver_release" + DEB_VMA_DATE="$debDate" + DEB_VMA_ARCH="$debArch" + DEB_VMA_USERNAME="$debUserName" + DEB_VMA_USER_EMAIL="$debUserEmail" + + #mv $pathToDebianDir/postinst $pathToDebianDir/postinst.template + #mv $pathToDebianDir/postrm $pathToDebianDir/postrm.template + #mv $pathToDebianDir/changelog $pathToDebianDir/changelog.template + #mv $pathToDebianDir/control $pathToDebianDir/control.template + #mv $pathToDebianDir/copyright $pathToDebianDir/copyright.template + #mv $pathToDebianDir/rules $pathToDebianDir/rules.template + #sed -e "s/__DEB_VMA_VERSION/$DEB_VMA_VERSION/g" -e "s/__DEB_VMA_RELEASE/$DEB_VMA_RELEASE/g" $pathToDebianDir/postinst.template > $pathToDebianDir/postinst + #sed -e "s/__DEB_VMA_VERSION/$DEB_VMA_VERSION/g" -e "s/__DEB_VMA_RELEASE/$DEB_VMA_RELEASE/g" $pathToDebianDir/postrm.template > $pathToDebianDir/postrm + + #sed -e "s/__DEB_VMA_VERSION/$DEB_VMA_VERSION/g" -e "s/__DEB_VMA_RELEASE/$DEB_VMA_RELEASE/g" -e "s/__DEB_VMA_DATE/$DEB_VMA_DATE/g" -e "s/__DEB_VMA_USERNAME/$DEB_VMA_USERNAME/g" -e "s/__DEB_VMA_USER_EMAIL/$DEB_VMA_USER_EMAIL/g" $pathToDebianDir/changelog.template > $pathToDebianDir/changelog + + #sed -e "s/__DEB_VMA_ARCH/$DEB_VMA_ARCH/g" -e "s/__DEB_VMA_USERNAME/$DEB_VMA_USERNAME/g" -e "s/__DEB_VMA_USER_EMAIL/$DEB_VMA_USER_EMAIL/g" -e 
"s/__DEB_VMA_VERSION/$DEB_VMA_VERSION/g" -e "s/__DEB_VMA_RELEASE/$DEB_VMA_RELEASE/g" $pathToDebianDir/control.template > $pathToDebianDir/control + + #sed -e "s/__DEB_VMA_VERSION/$DEB_VMA_VERSION/g" -e "s/__DEB_VMA_RELEASE/$DEB_VMA_RELEASE/g" -e "s/__DEB_VMA_DATE/$DEB_VMA_DATE/g" -e "s/__DEB_VMA_ARCH/$DEB_VMA_ARCH/g" $pathToDebianDir/copyright.template > $pathToDebianDir/copyright + + #sed -e "s/__VMA_DEB_DATE/$DATE/g" -e "s/__VMA_DEB_TIME/$TIME/g" $pathToDebianDir/rules.template > $pathToDebianDir/rules + + #rm -f $pathToDebianDir/postinst.template + #rm -f $pathToDebianDir/postrm.template + #rm -f $pathToDebianDir/changelog.template + #rm -f $pathToDebianDir/control.template + #rm -f $pathToDebianDir/copyright.template + #rm -f $pathToDebianDir/rules.template + +} + +function build_deb { + srcRpmFile=$1 + pathToFinalDir=$2 + debFinalFile=$pathToFinalDir + ubuntuMachine="hail14-vm03-ub12-x64-ofed20" + debBuildDir="/tmp" + debBuildContainer="deb_build" + debBuildDirFinal="$debBuildDir/$debBuildContainer/" + libvmaDir="libvma"-"$vma_ver_major"."$vma_ver_minor"."$vma_ver_revision" + ssh $ubuntuMachine "cd $debBuildDir; sudo rm -rf $debBuildContainer" + ssh $ubuntuMachine "cd $debBuildDir; mkdir $debBuildContainer" + ssh $ubuntuMachine "cp $srcRpmFile $debBuildDirFinal" + ssh $ubuntuMachine "cd $debBuildDirFinal; rpm2cpio *.rpm | cpio -idmv > /dev/null 2>&1" + ssh $ubuntuMachine "cd $debBuildDirFinal; tar xzvf *.tar.gz > /dev/null 2>&1" + ssh $ubuntuMachine "cd $debBuildDirFinal$libvmaDir; sudo dpkg-buildpackage -us -uc 2>&1" + ssh $ubuntuMachine "cd $debBuildDirFinal; cp *.deb "$pathToFinalDir"" + ssh $ubuntuMachine "cd $debBuildDir; sudo rm -rf $debBuildContainer" + + #sudo rm -rf $debBuildDirFinal +} + +function runStep { + +currStep=$1 +echoStep "$currStep" +$currStep +handleError $? 
"$currStep" + +} + +# from http://superuser.com/questions/39751/add-directory-to-path-if-its-not-already-there +function pathadd { + export PATH="$1:$PATH" + #if [ -d "$1" ] && [[ ":$PATH:" != *":$1:"* ]]; then + # PATH="${PATH:+"$PATH:"}$1" + #fi +} + +function autogenWrap { + mv ./config/config.guess ./config/config.guess.override + mv ./config/config.sub ./config/config.sub.override + ./autogen.sh + rm ./config/config.guess + rm ./config/config.sub + mv ./config/config.guess.override ./config/config.guess + mv ./config/config.sub.override ./config/config.sub +} +################################## main ################################## +script=`basename $0` +script_dir=`dirname $(readlink -f $0)` +full_path_script=$script_dir/$script + +#vma_repos_base=/.autodirect/mtrswgwork/ork/ # for debug +vma_repos_base=/.autodirect/mswg/projects/vma/vma_git +vma_repos_dir=$vma_repos_base/vma_build_repos + +path_to_bullseye=/.autodirect/mswg/release/vma/bullseye/bin + +parseArguments $@ +all_flags=$@ +#mswg_vma_folder="/.autodirect/mtrswgwork/ork/tmp/vma" # for debug + +if [ $LOCAL_MODE == 1 ]; then + #### local mode- no need to checkout, current folder contains the code + workspace_folder=`pwd`; +else + #### daily/release mode- check out the requested revision from the branch + cd $vma_repos_dir + workspace_folder=$vma_repos_dir + target_dir=$mswg_vma_folder + git_co; +fi + +if [ ! -f ./autogen.sh ] && [ $LOCAL_MODE == 1 ]; then + echoErr "please run the script from the HEAD of the workspace folder" + script=`basename $0` + echoErr "i.e. 
./build/$script $all_flags" + usage +fi + +getVmaParamsFromConfigure.ac +if [ $RELEASE_MODE == 1 ]; then + #### release mode- check if configure.ac and jurnal.txt are updated + areFilesUpdated + + #### release mode- check if a release already exists for this version + isReleaseExists +fi + +#currPwd=`pwd` +#prepare_deb_tarball "$currPwd" +#cd $currPwd + +echoStep "clean project" +make clean +make distclean +chmod -R 777 build/ + +#### create build/vma dir instead of running script ./get_src.sh +mkdir -p /tmp/vma +cp -r ./* /tmp/vma +mkdir -p build/vma +cp -r /tmp/vma/* build/vma/ +rm -rf /tmp/vma + +#runStep "./autogen.sh" +autogenWrap + +runStep "./configure --enable-debug" + +if [ "$make_cov" == 1 ]; then + runStep "make cov" +#else + #runStep "make" +fi + +curr_pwd=$PWD + +chmod -R 777 build/ +cd build/ + +set_topdir + +#if [ "$make_bullseye" == 1 ]; then +# pathadd "$path_to_bullseye" +# export COVFILE="$curr_pwd/test.cov" +# #export COVFILE="/tmp/test.cov" +# cov01 -1 +# cov01 -s +#fi + +echoStep "build_vma_src_rpm" +if [ "$(rpm --eval '%{_topdir}')" == "$topdir" ]; then + build_vma_src_rpm > /tmp/tmp_file +# build_vma_src_rpm +else + for d in SRPMS SPECS SOURCES RPMS BUILD; do sudo mkdir -p "$topdir"/"$d"; done + build_vma_src_rpm $topdir > /tmp/tmp_file +# build_vma_src_rpm $topdir +fi +handleError $? 
"build_vma_src_rpm $topdir" + +# move to source package folder +srcRpm=$(cat /tmp/tmp_file | grep Wrote: | grep -o '/.*') +srcRpmPath=$(cat /tmp/tmp_file | grep Wrote: | grep -o '/.*/') +rm -rf /tmp/tmp_file +cd $srcRpmPath + +machine=`uname -m` +name=`uname -n` +fullVersion="$vma_ver_major"."$vma_ver_minor"."$vma_ver_revision"-"$vma_ver_release" +finalRpm= +finalCoverity= +finalBullseye= +err=0 +# make rpm and output errors +echoStep "sudo BUILD_32=$i BUILD_BULLSEYE=$make_bullseye rpmbuild --rebuild --define _topdir $topdir $srcRpm" +sudo BUILD_32=$i BUILD_BULLSEYE=$make_bullseye rpmbuild --rebuild --define "_topdir $topdir" $srcRpm > /tmp/build_vma_rpm.log +if [[ $? != 0 ]]; then + errorOccured "sudo BUILD_32=$i BUILD_BULLSEYE=$make_bullseye rpmbuild --rebuild libvma-$fullVersion.src.rpm" $i + i=`expr $i + 1` +fi + +#if [ "$make_bullseye" == 1 ]; then + #cov01 -0 +#fi + +# find path to rpm +path=$(cat /tmp/build_vma_rpm.log | grep Wrote: | head -1 | grep -o '/.*') +pattern=" |'" +if [[ $path =~ $pattern ]]; then # in case $path contains space + path=`echo $path | awk '{print $1}'` +fi +echo "path=$path" + +path_devel=$(cat /tmp/build_vma_rpm.log | grep Wrote: | grep devel | grep -o '/.*') +pattern=" |'" +if [[ $path_devel =~ $pattern ]]; then # in case $path contains space + path_devel=`echo $path_devel | awk '{print $1}'` +fi +echo "path_devel=$path_devel" + +path_util=$(cat /tmp/build_vma_rpm.log | grep Wrote: | grep util | grep -o '/.*') +pattern=" |'" +if [[ $path_util =~ $pattern ]]; then # in case $path contains space + path_util=`echo $path_util | awk '{print $1}'` +fi +echo "path_util=$path_util" + +rm -f /tmp/build_vma_rpm.log + +# copy to correct location +if [ "$DAILY_MODE" == 1 ]; then #daily + cd $mswg_vma_folder + date=$(date +%Y-%m-%d) + mkdir -p daily/"$branch_folder"/"$date" + #create symbolic link + rm -rf latest_daily-"$branch_folder" + ln -s daily/"$branch_folder"/"$date" latest_daily-"$branch_folder" + if [ "$make_bullseye" == 1 ]; then + cp 
$path* daily/"$branch_folder"/"$date"/libvma-"$fullVersion"-"$machine"-"$name"."$date".bullseye.rpm + else + cp $path* daily/"$branch_folder"/"$date"/ + cp $path_devel* daily/"$branch_folder"/"$date"/ + cp $path_util* daily/"$branch_folder"/"$date"/ + #cp $path* daily/"$branch_folder"/"$date"/libvma-"$fullVersion"-"$machine"-"$name"."$date".rpm + cp $srcRpm daily/"$branch_folder"/"$date"/libvma-"$fullVersion"-"$machine"-"$name"."$date".src.rpm + fi + #cp $path* daily/"$branch_folder"/"$date"/ + if [[ $? != 0 ]]; then + err=1; + echo "Machine: $name - failed on step: cp $path daily/$branch_folder/$date/" >> $log_file + fi + finalRpm=$mswg_vma_folder/daily/"$branch_folder"/"$date"/libvma-"$fullVersion"-"$machine"-"$name"."$date".rpm + if [ "$make_cov" == 1 ]; then + cp -r $workspace_folder/cov-build daily/"$branch_folder"/"$date" + finalCoverity=$mswg_vma_folder/daily/"$branch_folder"/"$date"/cov-build + fi + if [ "$make_bullseye" == 1 ]; then + finalRpm=$mswg_vma_folder/daily/"$branch_folder"/"$date"/libvma-"$fullVersion"-"$machine"-"$name"."$date".bullseye.rpm + cp "/tmp/test.cov" daily/"$branch_folder"/"$date"/libvma-"$fullVersion"-"$machine"-"$name"."$date".cov + finalBullseye=$mswg_vma_folder/daily/"$branch_folder"/"$date"/libvma-"$fullVersion"-"$machine"-"$name"."$date".bullseye.cov + fi + build_deb "$mswg_vma_folder/daily/$branch_folder/$date/libvma-$fullVersion-$machine-$name.$date.src.rpm" "$mswg_vma_folder/daily/$branch_folder/$date/" + + if [ "$copy_to_bgate" == 1 ]; then + bgate_branch_folder=$branch_folder + if [ "$branch_folder" == "master" ]; then + bgate_branch_folder=$master_name + fi + #ssh ork@bgate.mellanox.com mkdir -p /hpc/home/vma/daily/"$bgate_branch_folder"/"$date" + ssh ophirmu@r-sw-hail25 mkdir -p /net/10.224.1.11/vol/asal_home/vma/daily/"$bgate_branch_folder"/"$date" + #scp -r daily/"$branch_folder"/"$date"/* ork@bgate.mellanox.com:/hpc/home/vma/daily/"$bgate_branch_folder"/"$date" + scp -r daily/"$branch_folder"/"$date"/* 
ophirmu@r-sw-hail25:/net/10.224.1.11/vol/asal_home/vma/daily/"$bgate_branch_folder"/"$date" + fi +fi +if [ "$RELEASE_MODE" == 1 ]; then #release + cd $mswg_vma_folder + mkdir -p "vma_v_"$fullVersion""/src/ + #create symbolic link + rm -rf latest_release + ln -s "vma_v_"$fullVersion"" latest_release + cd "vma_v_"$fullVersion"" + #copy src rpm + cp $srcRpm src/ + if [[ $? != 0 ]]; then + err=1; + echo "Machine: $name - failed on step: cp $srcRpm src/" >> $log_file + fi + cp $path* . + cp $path_devel* . + cp $path_util* . + #copy the rpm (short name) to vma dir + #ln -s src/*x86_64.rpm libvma-""$fullVersion"-"$machine"".rpm + + cp "$workspace_folder"/README.txt . + cp "$workspace_folder"/journal.txt . + if [ "$make_cov" == 1 ]; then + cp -r $workspace_folder/cov-build . + fi + if [ "$make_bullseye" == 1 ]; then + cp "/tmp/test.cov" libvma-""$fullVersion"-"$machine"".cov + fi + localPath=`pwd` + finalRpm="$localPath/libvma-""$fullVersion"-"$machine"".rpm" + + build_deb $localPath/src/libvma-*.src.rpm "$localPath/" + + echo "libvma-$fullVersion.src.rpm" > $localPath/src/latest.txt + + echo "libvma-$fullVersion.src.rpm" > $mswg_vma_folder/source_rpms/latest.txt + ln -s $localPath/src/libvma-*.src.rpm $mswg_vma_folder/source_rpms/libvma-$fullVersion.src.rpm + #ln -s $localPath/src/libvma-$fullVersion.src.rpm $mswg_vma_folder/source_rpms/libvma-$fullVersion.src.rpm + +fi +if [ "$LOCAL_MODE" == 1 ]; then #local + #cp $path* "$target_dir"/libvma-""$fullVersion"-"$machine""$rpm_name"".rpm + cp $path* "$target_dir"/ + cp $path_util* "$target_dir"/ + cp $path_devel* "$target_dir"/ + if [[ $? 
!= 0 ]]; then + err=1; + echo "Machine: $name - failed on step: cp $path $target_dir/" >> $log_file + fi + finalRpm="$target_dir"/libvma-""$fullVersion"-"$machine""$rpm_name"".rpm; +fi + +if [ "$LOG_SCRIPT_STATUS" == 1 ]; then + if [ $err == 0 ]; then + echo "Machine: $name - rpm was created successfuly, location: $finalRpm" >> $log_file + echo "Machine: $name - deb was created successfuly, location: $debFinalFile" >> $log_file + if [ "$make_cov" == 1 ]; then + echo "Coverity was created successfuly, location: $finalCoverity" >> $log_file + fi + if [ "$make_bullseye" == 1 ]; then + echo "Bullseye .cov file was created successfuly, location: $finalBullseye" >> $log_file + fi + fi +fi + +if [ $LOCAL_MODE != 1 ]; then + cd $vma_repos_dir + rm -rf vma_repos +fi + +finishScript "s" #finish script successfuly diff --git a/build/build_rpm.sh b/build/build_rpm.sh new file mode 100755 index 0000000..fa3aac0 --- /dev/null +++ b/build/build_rpm.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +BASE_DIR=`pwd` +script_dir=`dirname $(readlink -f $0)` +cd $script_dir/.. + +LOG_FILE=`pwd`/build_rpm.log + +echo "Running ./autogen.sh ..." +./autogen.sh > $LOG_FILE 2>&1 + +echo "Running ./configure ..." +./configure >> $LOG_FILE 2>&1 +if [ $? -ne 0 ]; then + echo "configure failed! see $LOG_FILE" + cd $BASE_DIR + exit +fi + +echo "Running make dist ..." +make dist >> $LOG_FILE 2>&1 +if [ $? -ne 0 ]; then + echo "make dist failed! see $LOG_FILE" + cd $BASE_DIR + exit +fi + +echo "Running rpmbuild ... this might take a while ..." +rpmbuild -ta libvma*.tar.gz >> $LOG_FILE 2>&1 +if [ $? -ne 0 ]; then + echo "rpmbuild failed! 
see $LOG_FILE" + cd $BASE_DIR + exit +fi + +grep Wrote build_rpm.log +rm -rf $LOG_FILE + +cd $BASE_DIR diff --git a/build/libvma.spec.in b/build/libvma.spec.in new file mode 100644 index 0000000..f11e48d --- /dev/null +++ b/build/libvma.spec.in @@ -0,0 +1,187 @@ +%global version @VERSION@ +%global release @VMA_LIBRARY_RELEASE@ +%global pkgname @PACKAGE@ +%global major_release @VMA_LIBRARY_MAJOR@ +%global revision @VMA_LIBRARY_REVISION@ +%define date __DATE +%define time __TIME + +%global pmake %{__make} %{?_smp_mflags} %{?mflags} V=1 + +%global include_dir %{_includedir}/mellanox +%global vma_ver_rel %{name}-%{version}-%{release} +%global vma_datadir %{_datadir}/%{vma_ver_rel} + +%global ofed_ver %(eval 'ofed_info -s 2> /dev/null || echo "unknown"') +%global ofed_dir %{?_ofed_dir}%{?!_ofed_dir:%{_prefix}} + +%global _use_internal_dependency_generator 0 +%global use_systemd %(if ( test -d "%{_unitdir}" > /dev/null); then echo -n '1'; else echo -n '0'; fi) + +Summary: A library for boosting TCP and UDP traffic (over RDMA hardware) +Name: libvma +Version: %{version} +Release: 1%{?dist} +License: GPLv2 +Group: System Environment/Libraries +Url: https://github.com/Mellanox/libvma +BuildRoot: %(mktemp -ud %{_tmppath}/%{name}-%{version}-%{release}-XXXXXX) +Source: %{pkgname}-%{version}.tar.gz +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig +Prefix: %{_prefix} + +%description + +VMA library is a performance booster of TCP and UDP traffic +Part of Mellanox's enhanced services +Allows application written over standard socket API +To run over Infiniband/Ethernet from userspace with full network stack bypass +and get better throughput, latency and packets/sec rate + +%package devel +Summary: Header files and link required to develop with Libvma +Group: System Environment/Libraries +Requires: %{name}%{?_isa} = %{version}-%{release} + +%description devel +Headers and symbolic link required to compile and link with the Libvma library.
 + +%package utils +Summary: Libvma utilities +Group: System Environment/Libraries +Requires: %{name}%{?_isa} = %{version}-%{release} + +%description utils +Example tests and tools for collecting and analyzing Libvma statistics + +%prep +%setup -q + +%build + +export revision=1 +#export CFLAGS CXXFLAGS LDFLAGS +export CXXFLAGS='%{optflags}' + +%configure --with-ofed=%{ofed_dir} --enable-opt-log=none +%{pmake} +cp -f src/vma/.libs/%{name}.so %{name}-debug.so +%{pmake} clean + +%configure --with-ofed=%{ofed_dir} --docdir=%{_docdir}/%{name}-%{version} +%{pmake} + +%install +[ "${RPM_BUILD_ROOT}" != "/" -a -d ${RPM_BUILD_ROOT} ] && rm -rf ${RPM_BUILD_ROOT} + +mkdir -p $RPM_BUILD_ROOT%{include_dir} +mkdir -p $RPM_BUILD_ROOT%{vma_datadir}/scripts +mkdir -p $RPM_BUILD_ROOT%{_sysconfdir} +mkdir -p $RPM_BUILD_ROOT%{_libdir} + +%{pmake} DESTDIR=${RPM_BUILD_ROOT} install + +rm -f $RPM_BUILD_ROOT%{_libdir}/*.la + +install -m 755 tests/vma_perf_envelope/vma_perf_envelope.sh $RPM_BUILD_ROOT/%{vma_datadir}/scripts/vma_perf_envelope.sh +install -m 644 src/vma/vma_extra.h $RPM_BUILD_ROOT/%{include_dir}/vma_extra.h +install -m 644 src/vma/util/libvma.conf $RPM_BUILD_ROOT/%{_sysconfdir}/ +install -s -m 755 src/stats/vma_stats $RPM_BUILD_ROOT/%{_bindir}/vma_stats +install -s -m 755 tools/daemon/vmad $RPM_BUILD_ROOT/%{_sbindir}/vmad +install -m 755 ./%{name}-debug.so $RPM_BUILD_ROOT/%{_libdir}/%{name}-debug.so +%if "%{use_systemd}" == "1" +install -D -m 644 contrib/scripts/vma.service $RPM_BUILD_ROOT/%{_unitdir}/vma.service +install -m 755 contrib/scripts/vma.init $RPM_BUILD_ROOT/%{_sbindir}/vma +%else +install -m 755 contrib/scripts/vma.init $RPM_BUILD_ROOT/%{_sysconfdir}/init.d/vma +%endif + +%post +if [ `grep memlock /etc/security/limits.conf |grep unlimited |wc -l` -le 0 ]; then + echo "* - memlock unlimited" >> /etc/security/limits.conf + echo "* soft memlock unlimited" >> /etc/security/limits.conf + echo "* hard memlock unlimited" >> /etc/security/limits.conf + echo "- Changing max 
locked memory to unlimited (in /etc/security/limits.conf)" + echo " Please log out from the shell and login again in order to update this change " + echo " Read more about this topic in the VMA's User Manual" +fi +/sbin/ldconfig + +# Package setup, not upgrade +if [ $1 = 1 ]; then + if type systemctl >/dev/null 2>&1; then + systemctl --no-reload enable vma.service >/dev/null 2>&1 || true + elif [ -e /sbin/chkconfig ]; then + /sbin/chkconfig --add vma + elif [ -e /usr/sbin/update-rc.d ]; then + /usr/sbin/update-rc.d vma defaults + else + /usr/lib/lsb/install_initd /etc/init.d/vma + fi +fi + +%preun +# Package removal, not upgrade +if [ $1 = 0 ]; then + if type systemctl >/dev/null 2>&1; then + systemctl --no-reload disable vma.service >/dev/null 2>&1 || true + systemctl stop vma.service || true + elif [ -e /sbin/chkconfig ]; then + /etc/init.d/vma stop + /sbin/chkconfig --del vma + elif [ -e /usr/sbin/update-rc.d ]; then + /etc/init.d/vma stop + /usr/sbin/update-rc.d -f vma remove + else + /etc/init.d/vma stop + /usr/lib/lsb/remove_initd /etc/init.d/vma + fi + rm -f /var/cache/vma/* +fi + + +%clean +[ "${RPM_BUILD_ROOT}" != "/" -a -d ${RPM_BUILD_ROOT} ] && rm -rf ${RPM_BUILD_ROOT} + +%postun +# Package upgrade +/sbin/ldconfig +if type systemctl >/dev/null 2>&1; then + systemctl --system daemon-reload >/dev/null 2>&1 || true +fi + + +%files +%defattr(-,root,root,-) +%{_libdir}/%{name}*.so.* +%{_libdir}/%{name}.so +%{_libdir}/%{name}-debug.so +%{_docdir}/%{name}-%{version}/README.txt +%{_docdir}/%{name}-%{version}/journal.txt +%{_docdir}/%{name}-%{version}/VMA_VERSION +%config(noreplace) %{_sysconfdir}/libvma.conf +%{_sysconfdir}/security/limits.d/30-libvma-limits.conf +%{_sbindir}/vmad +%if "%{use_systemd}" == "1" +%{_prefix}/lib/systemd/system/vma.service +%{_sbindir}/vma +%else +%{_sysconfdir}/init.d/vma +%endif + +%files devel +%defattr(-,root,root,-) +%{include_dir}/vma_extra.h + +%files utils +%defattr(-,root,root) +%{_bindir}/vma_stats 
+%{vma_datadir}/scripts/vma_perf_envelope.sh + +%changelog +* Mon Nov 28 2016 Igor Ivanov +- Add daemon + +* Mon Jan 4 2016 Avner BenHanoch 7.0.12-1 +- Initial Packaging diff --git a/build/libvma_fedora.spec b/build/libvma_fedora.spec new file mode 100644 index 0000000..0c31a17 --- /dev/null +++ b/build/libvma_fedora.spec @@ -0,0 +1,113 @@ +Name: libvma +Version: 8.0.1 +Release: 1%{?dist} +Summary: A library for boosting TCP and UDP traffic (over RDMA hardware) + +License: GPLv2 or BSD +Url: https://github.com/Mellanox/libvma +Source: http://www.mellanox.com/downloads/Accelerator/%{name}-%{version}.tar.gz +#arm is excluded since libvma fails to compile on arm. +#Reason: libvma uses assembly commands that are not supported by arm. +ExcludeArch: %{arm} +Requires: pam +Requires(post): /sbin/ldconfig +Requires(postun): /sbin/ldconfig + +BuildRequires: librdmacm-devel libibverbs-devel libnl3-devel +BuildRequires: automake autoconf libtool + +%global use_systemd %(if ( test -d "%{_unitdir}" > /dev/null); then echo -n '1'; else echo -n '0'; fi) + +%description +libvma is a LD_PRELOAD-able library that boosts performance +of TCP and UDP traffic. +It allows application written over standard socket API to handle +fast path data traffic from user space over Ethernet and/or +Infiniband with full network stack bypass and get better throughput, +latency and packets/sec rate. +No application binary change is required for that. +libvma is supported by RDMA capable devices that support +"verbs" IBV_QPT_RAW_PACKET QP for Ethernet and/or IBV_QPT_UD QP for IPoIB. + +%package devel +Summary: Header files required to develop with libvma +Requires: %{name}%{?_isa} = %{version}-%{release} + +%description devel +Headers files required to develop with the libvma library. + +%package utils +Summary: Libvma utilities +Requires: %{name}%{?_isa} = %{version}-%{release} + +%description utils +Tools for collecting and analyzing libvma statistic. 
+ +%prep +%setup -q + +%build +./autogen.sh +%configure +make %{?_smp_mflags} V=1 + +%install +%make_install +rm -f $RPM_BUILD_ROOT%{_libdir}/*.la + +%post -p /sbin/ldconfig +%postun -p /sbin/ldconfig + +%files +%{_libdir}/%{name}*.so.* +#libvma.so in needed in the main package so that +#'LD_PRELOAD=libvma.so ' works. +%{_libdir}/%{name}.so +%license COPYING LICENSE +%doc README.txt journal.txt VMA_VERSION +%config(noreplace) %{_sysconfdir}/libvma.conf +%config(noreplace) %{_sysconfdir}/security/limits.d/30-libvma-limits.conf +%{_sbindir}/vmad +%config(noreplace) %{_sysconfdir}/init.d/vma +%if "%{use_systemd}" == "1" +%config(noreplace) %{_sysconfdir}/systemd/system/vma.service +%endif + +%files devel +%{_includedir}/* + +%files utils +%{_bindir}/vma_stats + +%changelog +* Thu Dec 03 2016 Alex Vainman +- Add daemon + +* Thu Mar 13 2016 Alex Vainman - 8.0.1-1 +- New upstream release +- Move to dual license: GPLv2 or BSD +- ExcludeArch update +- Removal of extra space in: + config(noreplace) {_sysconfdir}/security/limits.d/30-libvma-limits.conf +- Add V=1 to make + +* Wed Mar 2 2016 Alex Vainman - 7.0.14-2 +- Added reasoning for archs exclusion +- Package description improvement +- Removal of the pre scriplet +- Added COPYING and LICENSE files to the package + +* Sun Feb 21 2016 Alex Vainman - 7.0.14-1 +- New upstream release +- Removal of redundant macros and obsolete/unneeded tags +- Added ExcludeArch, BuildRequires and Require sections +- Fixes and cleanups in the build and installation sections +- Install 30-libvma-limits.conf file under + /etc/security/limits.d/ +- Fixes related to files/directories ownerships +- Removal of vma_perf_envelope.sh from the utility package +- Update Source tag URL +- Fix most of the rpmlint warnings + +* Mon Jan 4 2016 Avner BenHanoch - 7.0.12-1 +- Initial Packaging diff --git a/config/m4/dpcp.m4 b/config/m4/dpcp.m4 new file mode 100644 index 0000000..117caeb --- /dev/null +++ b/config/m4/dpcp.m4 @@ -0,0 +1,73 @@ +# dpcp.m4 - 
Library to operate with DevX +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +# See file LICENSE for terms. +# + +########################## +# libdpcp usage support +# +AC_DEFUN([DPCP_CAPABILITY_SETUP], +[ +AC_ARG_WITH([dpcp], + AC_HELP_STRING([--with-dpcp(=DIR)], + [Search for dpcp headers and libraries in DIR (default NO)]), + [], + [with_dpcp=no] +) +if test -z "$with_dpcp" || test "$with_dpcp" = "yes"; then + with_dpcp=/usr +fi + +FUNC_CHECK_WITHDIR([dpcp], [$with_dpcp], [include/mellanox/dpcp.h]) + +vma_cv_dpcp=0 +AS_IF([test "x$with_dpcp" == xno], + [], + [ + vma_cv_dpcp_save_CPPFLAGS="$CPPFLAGS" + vma_cv_dpcp_save_CXXFLAGS="$CXXFLAGS" + vma_cv_dpcp_save_CFLAGS="$CFLAGS" + vma_cv_dpcp_save_LDFLAGS="$LDFLAGS" + vma_cv_dpcp_save_LIBS="$LIBS" + + vma_cv_dpcp_CPPFLAGS="-I$with_dpcp/include" + vma_cv_dpcp_LIBS="-ldpcp -lmlx5" + vma_cv_dpcp_LDFLAGS="-L$with_dpcp/lib -Wl,--rpath,$with_dpcp/lib" + if test -d "$with_dpcp/lib64"; then + vma_cv_dpcp_LDFLAGS="-L$with_dpcp/lib64 -Wl,--rpath,$with_dpcp/lib64" + fi + + CPPFLAGS="$vma_cv_dpcp_CPPFLAGS $CPPFLAGS" + CXXFLAGS="-std=c++11 $CXXFLAGS" + LDFLAGS="$vma_cv_dpcp_LDFLAGS $LDFLAGS" + LIBS="$vma_cv_dpcp_LIBS $LIBS" + + AC_LANG_PUSH([C++]) + AC_CHECK_HEADER( + [mellanox/dpcp.h], + [AC_LINK_IFELSE([AC_LANG_PROGRAM([[#include ]], + [[dpcp::provider *provider; + dpcp::provider::get_instance(provider);]])], + [vma_cv_dpcp=1]) + ]) + AC_LANG_POP() + + CPPFLAGS="$vma_cv_dpcp_save_CPPFLAGS" + CXXFLAGS="$vma_cv_dpcp_save_CXXFLAGS" + CFLAGS="$vma_cv_dpcp_save_CFLAGS" + LDFLAGS="$vma_cv_dpcp_save_LDFLAGS" + LIBS="$vma_cv_dpcp_save_LIBS" + ]) + +AC_MSG_CHECKING([for dpcp support]) +if test "$vma_cv_dpcp" -ne 0; then + CPPFLAGS="$CPPFLAGS $vma_cv_dpcp_CPPFLAGS" + LDFLAGS="$LDFLAGS $vma_cv_dpcp_LDFLAGS" + AC_SUBST([DPCP_LIBS], ["-ldpcp"]) + AC_DEFINE_UNQUOTED([DEFINED_DPCP], [1], [Define to 1 to use DPCP]) + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi +]) diff --git a/config/m4/func.m4 
b/config/m4/func.m4 new file mode 100644 index 0000000..6bfe57e --- /dev/null +++ b/config/m4/func.m4 @@ -0,0 +1,58 @@ +# func.m4 - Collection of functions +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +# See file LICENSE for terms. +# + +########################## +# Configure functions +# +# Some helper script functions +# +AC_DEFUN([FUNC_CONFIGURE_INIT], +[ +show_section_title() +{ + cat < /dev/null`" = "x"], + [AC_MSG_RESULT([not found]) + AC_MSG_WARN([Expected file $2/$3 not found]) + AC_MSG_ERROR([Cannot continue])], + [AC_MSG_RESULT([($2)])] + ) + ] + ) + ] + ) +]) diff --git a/config/m4/nl.m4 b/config/m4/nl.m4 new file mode 100644 index 0000000..c410324 --- /dev/null +++ b/config/m4/nl.m4 @@ -0,0 +1,58 @@ +# nl.m4 - Detect nl package +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +# See file LICENSE for terms. +# + +########################## +# Checking nl library +# +AC_DEFUN([CHECK_NL_LIB], +[ +# checking for libnl1 or libnl3 in libibverbs +if test -f "$ac_cv_ofed_path/lib64/libibverbs.so" ; then + libibverbs_file="$ac_cv_ofed_path/lib64/libibverbs.so" +elif test -f "$(ls -d $ac_cv_ofed_path/lib/$(uname -m)-linux-*)/libibverbs.so" ; then + libibverbs_file="$(ls -d $ac_cv_ofed_path/lib/$(uname -m)-linux-*)/libibverbs.so" +else + libibverbs_file="$ac_cv_ofed_path/lib/libibverbs.so" +fi + +PKG_CHECK_MODULES([LIBNL3],[libnl-route-3.0],[use_libnl3=yes] ,[use_libnl3=no]) +PKG_CHECK_MODULES([LIBNL1],[libnl-1], [use_libnl1=yes] , [use_libnl1=no]) + +ldd $libibverbs_file | grep libnl >/dev/null 2>&1 +if test $? -eq 0 ; then + # When linking with libibverbs library, we must ensure that we pick the same version + # of libnl that libibverbs picked. Prefer libnl-3 unless libibverbs linked to libnl-1 + ldd $libibverbs_file | grep -e 'libnl3' -e 'libnl-3' >/dev/null 2>&1 + if test $? 
-eq 0 ; then + # libnl3 case + if test "$use_libnl3" == no; then + AC_MSG_ERROR([libibverbs is linked with libnl3 while libnl3-devel\libnl3-route-devel are not installed. Please install libnl3-devel\libnl3-route-devel and try again]) + fi + use_libnl1=no + else + # libnl1 case + if test "$use_libnl1" == no; then + AC_MSG_ERROR([libibverbs is linked with libnl1 while libnl1-devel is not installed. Please install libnl1-devel and try again]) + fi + use_libnl3=no + fi +fi + +if test "$use_libnl3" == yes; then + AC_SUBST([LIBNL_LIBS], "$LIBNL3_LIBS") + AC_SUBST([LIBNL_CFLAGS], "$LIBNL3_CFLAGS") + AC_SUBST([LIBNLX_DEVEL], "libnl3-devel") + AC_DEFINE([HAVE_LIBNL3], [1], [Use libnl-3]) +elif test "$use_libnl1" == yes; then + AC_SUBST([LIBNL_LIBS], "$LIBNL1_LIBS") + AC_SUBST([LIBNL_CFLAGS], "$LIBNL1_CFLAGS") + AC_SUBST([LIBNLX_DEVEL], "libnl-devel") + AC_DEFINE([HAVE_LIBNL1], [1], [Use libnl-1]) +else + AC_MSG_ERROR([libvma needs libnl3-devel,libnl3-route-devel\libnl1-devel (better libnl3)]) +fi + +]) diff --git a/config/m4/opt.m4 b/config/m4/opt.m4 new file mode 100644 index 0000000..2f7d1bc --- /dev/null +++ b/config/m4/opt.m4 @@ -0,0 +1,47 @@ +# opt.m4 - Macros to control optimization +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +# See file LICENSE for terms. 
+# + +########################## +# Logging control +# +# VMA defined log levels +# +AC_DEFUN([OPT_VMA_LOGGING], +[ +AC_DEFINE(DEFINED_VLOG_INIT, -2, VMA Log Init Definition) +AC_DEFINE(DEFINED_VLOG_NONE, -1, VMA Log None Definition) +AC_DEFINE(DEFINED_VLOG_PANIC, 0, VMA Log Panic Definition) +AC_DEFINE(DEFINED_VLOG_ERROR, 1, VMA Log Error Definition) +AC_DEFINE(DEFINED_VLOG_WARNING, 2, VMA Log Warning Definition) +AC_DEFINE(DEFINED_VLOG_INFO, 3, VMA Log Info Definition) +AC_DEFINE(DEFINED_VLOG_DETAILS, 4, VMA Log Details Definition) +AC_DEFINE(DEFINED_VLOG_DEBUG, 5, VMA Log Debug Definition) +AC_DEFINE(DEFINED_VLOG_FINE, 6, VMA Log Fine Definition) +AC_DEFINE(DEFINED_VLOG_FINER, 7, VMA Log Finer Definition) +AC_DEFINE(DEFINED_VLOG_ALL, 8, VMA Log All Definition) + +AC_ARG_ENABLE([opt-log], + AC_HELP_STRING([--enable-opt-log], + [Optimize latency (none, medium, high) by limiting max log level (default=medium)]),, + enableval=medium) +AC_MSG_CHECKING([for logging optimization]) +enable_opt_log=DEFINED_VLOG_ALL +case "$enableval" in + no | none) + ;; + yes | medium) + enable_opt_log=DEFINED_VLOG_DEBUG + ;; + high) + enable_opt_log=DEFINED_VLOG_DETAILS + ;; + *) + AC_MSG_ERROR([Unrecognized --enable-opt-log parameter as $enableval]) + ;; +esac +AC_DEFINE_UNQUOTED([VMA_MAX_DEFINED_LOG_LEVEL], [$enable_opt_log], [Log optimization level]) +AC_MSG_RESULT([$enableval]) +]) diff --git a/config/m4/prof.m4 b/config/m4/prof.m4 new file mode 100644 index 0000000..02bbb9a --- /dev/null +++ b/config/m4/prof.m4 @@ -0,0 +1,252 @@ +# prof.m4 - Profiling, instrumentation +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +# See file LICENSE for terms. 
+# + +########################## +# libibprof profiling support +# +AC_DEFUN([PROF_IBPROF_SETUP], +[ +AC_ARG_WITH([ibprof], + AC_HELP_STRING([--with-ibprof], + [Search ibprof location (default NO)]), + [], + [with_ibprof=no] +) + +AS_IF([test "x$with_ibprof" == xno], + [], + [AC_CHECK_HEADER( + [$with_ibprof/include/ibprof_api.h], + [ + CFLAGS="$CFLAGS -DVMA_TIME_IBPROF" + CXXFLAGS="$CXXFLAGS -DVMA_TIME_IBPROF" + CPPFLAGS="$CPPFLAGS -I$with_ibprof/include" + if test -d "$with_ibprof/lib64"; then + LDFLAGS="$LDFLAGS -L$with_ibprof/lib64 -Wl,--rpath,$with_ibprof/lib64" + else + LDFLAGS="$LDFLAGS -L$with_ibprof/lib -Wl,--rpath,$with_ibprof/lib" + fi + AC_SUBST([LIBIBPROF_LIBS], "-libprof") + ], + [AC_MSG_ERROR([ibprof support requested, but <$with_ibprof/include/ibprof_api.h> not found.])]) +]) +]) + +########################## +# +# RDTSC measurements support +# +# ****** Total VMA RX******** +# RDTSC_MEASURE_RX_CQE_RECEIVEFROM +# +# ******* Verbs Poll *********** +# RDTSC_MEASURE_RX_VERBS_IDLE_POLL +# RDTSC_MEASURE_RX_VERBS_READY_POLL +# +# ******* LWIP *********** +# RDTSC_MEASURE_RX_LWIP +# +# ******* Other RX *********** +# RDTSC_MEASURE_RX_DISPATCH_PACKET +# RDTSC_MEASURE_RX_AFTER_PROCCESS_BUFFER_TO_RECIVEFROM +# RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL +# RDTSC_MEASURE_RX_READY_POLL_TO_LWIP +# RDTSC_MEASURE_RX_LWIP_TO_RECEVEFROM +# +# ****** Total VMA TX ******** +# RDTSC_MEASURE_TX_SENDTO_TO_AFTER_POST_SEND + +# ******* Verbs Post Send *********** +# RDTSC_MEASURE_TX_VERBS_POST_SEND + +# ******* App *********** +# RDTSC_MEASURE_RECEIVEFROM_TO_SENDTO +# +AC_DEFUN([PROF_RDTSC_SETUP], +[ +AC_MSG_CHECKING([if rdtsc-rx-cqe-recvfrom is enabled]) +AC_ARG_WITH([rdtsc-rx-cqe-recvfrom], + AC_HELP_STRING([--with-rdtsc-rx-cqe-recvfrom], + [Enable rdtsc measurement of rx CQE recvfrom]), + [], + [with_rdtsc_rx_cqe_recvfrom=no] +) + +AS_IF([test "x$with_rdtsc_rx_cqe_recvfrom" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + 
[AC_DEFINE([RDTSC_MEASURE_RX_CQE_RECEIVEFROM], 1, [Define to 1 to enable rdtsc measurement of rx CQE recvfrom.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + + +AC_MSG_CHECKING([if rdtsc-rx-verbs-idle-poll is enabled]) +AC_ARG_WITH([rdtsc-rx-verbs-idle-poll], + AC_HELP_STRING([--with-rdtsc-rx-verbs-idle-poll], + [Enable rdtsc measurement of rx verbs idle poll]), + [], + [with_rdtsc_rx_verbs_idle_poll=no] +) + +AS_IF([test "x$with_rdtsc_rx_verbs_idle_poll" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_RX_VERBS_IDLE_POLL], 1, [Define to 1 to enable rdtsc measurement of rx verbs idle poll.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + +AC_MSG_CHECKING([if rdtsc-rx-verbs-ready-poll is enabled]) +AC_ARG_WITH([rdtsc-rx-verbs-ready-poll], + AC_HELP_STRING([--with-rdtsc-rx-verbs-ready-poll], + [Enable rdtsc measurement of rx verbs ready poll]), + [], + [with_rdtsc_rx_verbs_ready_poll=no] +) + +AS_IF([test "x$with_rdtsc_rx_verbs_ready_poll" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_RX_VERBS_READY_POLL], 1, [Define to 1 to enable rdtsc measurement of rx verbs ready poll.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + +AC_MSG_CHECKING([if rdtsc-rx-lwip is enabled]) +AC_ARG_WITH([rdtsc-rx-lwip], + AC_HELP_STRING([--with-rdtsc-rx-lwip], + [Enable rdtsc measurement of rx lwip]), + [], + [with_rdtsc_rx_lwip=no] +) + +AS_IF([test "x$with_rdtsc_rx_lwip" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_RX_LWIP], 1, [Define to 1 to enable rdtsc measurement of rx lwip.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + +AC_MSG_CHECKING([if rdtsc-rx-dispatch-packet is enabled]) +AC_ARG_WITH([rdtsc-rx-dispatch-packet], + AC_HELP_STRING([--with-rdtsc-rx-dispatch-packet], + [Enable rdtsc measurement of rx dispatch packet]), + [], + 
[with_rdtsc_rx_dispatch_packet=no] +) + +AS_IF([test "x$with_rdtsc_rx_dispatch_packet" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_RX_DISPATCH_PACKET], 1, [Define to 1 to enable rdtsc measurement of rx dispatch packet.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + +AC_MSG_CHECKING([if rdtsc-rx-after-process-buffer-to-receivefrom is enabled]) +AC_ARG_WITH([rdtsc-rx-after-process-buffer-to-receivefrom], + AC_HELP_STRING([--with-rdtsc-rx-after-process-buffer-to-receivefrom], + [Enable rdtsc measurement of rx after process buffer to receivefrom]), + [], + [with_rdtsc_rx_after_process_buffer_to_receivefrom=no] +) + +AS_IF([test "x$with_rdtsc_rx_after_process_buffer_to_receivefrom" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_RX_AFTER_PROCCESS_BUFFER_TO_RECIVEFROM], 1, [Define to 1 to enable rdtsc measurement of rx after process buffer to receivefrom.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + +AC_MSG_CHECKING([if rdtsc-rx-vma-tcp-idle-poll is enabled]) +AC_ARG_WITH([rdtsc-rx-vma-tcp-idle-poll], + AC_HELP_STRING([--with-rdtsc-rx-vma-tcp-idle-poll], + [Enable rdtsc measurement of rx vma tcp idle poll]), + [], + [with_rdtsc_rx_vma_tcp_idle_poll=no] +) + +AS_IF([test "x$with_rdtsc_rx_vma_tcp_idle_poll" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL], 1, [Define to 1 to enable rdtsc measurement of rx vma tcp idle poll.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + +AC_MSG_CHECKING([if rdtsc-rx-ready-poll-to-lwip is enabled]) +AC_ARG_WITH([rdtsc-rx-ready-poll-to-lwip], + AC_HELP_STRING([--with-rdtsc-rx-ready-poll-to-lwip], + [Enable rdtsc measurement of rx ready poll to lwip]), + [], + [with_rdtsc_rx_ready_poll_to_lwip=no] +) + +AS_IF([test "x$with_rdtsc_rx_ready_poll_to_lwip" == xyes], + 
[AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_RX_READY_POLL_TO_LWIP], 1, [Define to 1 to enable rdtsc measurement of rx ready poll to lwip.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + +AC_MSG_CHECKING([if rdtsc-rx-lwip-to-receivefrom is enabled]) +AC_ARG_WITH([rdtsc-rx-lwip-to-receivefrom], + AC_HELP_STRING([--with-rdtsc-rx-lwip-to-receivefrom], + [Enable rdtsc measurement of rx lwip to receivefrom]), + [], + [with_rdtsc_rx_lwip_to_receivefrom=no] +) + +AS_IF([test "x$with_rdtsc_rx_lwip_to_receivefrom" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_RX_LWIP_TO_RECEVEFROM], 1, [Define to 1 to enable rdtsc measurement of rx lwip to receivefrom.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + + +AC_MSG_CHECKING([if rdtsc-tx-sendto-to-after-post-send is enabled]) +AC_ARG_WITH([rdtsc-tx-sendto-to-after-post-send], + AC_HELP_STRING([--with-rdtsc-tx-sendto-to-after-post-send], + [Enable rdtsc measurement of tx sendto to after post send]), + [], + [with_rdtsc_tx_sendto_to_after_post_send=no] +) + +AS_IF([test "x$with_rdtsc_tx_sendto_to_after_post_send" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_TX_SENDTO_TO_AFTER_POST_SEND], 1, [Define to 1 to enable rdtsc measurement of tx sendto to after port send.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + +AC_MSG_CHECKING([if rdtsc-tx-verbs-post-send is enabled]) +AC_ARG_WITH([rdtsc-tx-verbs-post-send], + AC_HELP_STRING([--with-rdtsc-tx-verbs-post-send], + [Enable rdtsc measurement of tx verbs post send]), + [], + [with_rdtsc_tx_verbs_post_send=no] +) + +AS_IF([test "x$with_rdtsc_tx_verbs_post_send" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_TX_VERBS_POST_SEND], 1, [Define to 1 to enable rdtsc measurement of tx verbs post 
send.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + + +AC_MSG_CHECKING([if rdtsc-receivefrom-to-sendto is enabled]) +AC_ARG_WITH([rdtsc-receivefrom-to-sendto], + AC_HELP_STRING([--with-rdtsc-receivefrom-to-sendto], + [Enable rdtsc measurement of receivefrom to sendto]), + [], + [with_rdtsc_receivefrom_to_sendto=no] +) + +AS_IF([test "x$with_rdtsc_receivefrom_to_sendto" == xyes], + [AC_DEFINE([RDTSC_MEASURE], 1, [Define to 1 to enable rdtsc measurements.])] + [AC_DEFINE([RDTSC_MEASURE_RECEIVEFROM_TO_SENDTO], 1, [Define to 1 to enable rdtsc measurement of receivefrom to sendto.])] + [AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) +]) \ No newline at end of file diff --git a/config/m4/verbs.m4 b/config/m4/verbs.m4 new file mode 100644 index 0000000..f46ac28 --- /dev/null +++ b/config/m4/verbs.m4 @@ -0,0 +1,211 @@ +# verbs.m4 - Parsing verbs capabilities +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +# See file LICENSE for terms. +# + + +# Check attributes +# Usage: CHECK_VERBS_ATTRIBUTE([attribute], [header file], [definition]) +# Note: +# - [definition] can be omitted if it is equal to attribute +# +AC_DEFUN([CHECK_VERBS_ATTRIBUTE], [ + AC_TRY_LINK( + [#include <$2>], + [int attr = (int)$1; attr = attr;], + [vma_cv_attribute_$1=yes], + [vma_cv_attribute_$1=no]) + + AC_MSG_CHECKING([for attribute $1]) + AC_MSG_RESULT([$vma_cv_attribute_$1]) + AS_IF([test "x$3" != "x"], [vma_cv_attribute_ex_$3=$vma_cv_attribute_$1]) + AS_IF([test "x$vma_cv_attribute_$1" = "xyes"], [ + AS_IF([test "x$3" = "x"], + [AC_DEFINE_UNQUOTED([DEFINED_$1], [1], [Define to 1 if attribute $1 is supported])], + [AC_DEFINE_UNQUOTED([DEFINED_$3], [1], [Define to 1 if attribute $1 is supported])] + ) + ]) +]) + +# Check attributes +# Usage: CHECK_VERBS_MEMBER([attribute], [header file], [definition]) +# +AC_DEFUN([CHECK_VERBS_MEMBER], [ + AC_CHECK_MEMBER( $1, [AC_DEFINE_UNQUOTED([DEFINED_$3], [1], [Define to 1 if attribute $1 is supported])], [], 
[[#include <$2>]]) +]) + +########################## +# Configure ofed capabilities +# +AC_DEFUN([VERBS_CAPABILITY_SETUP], +[ + +AC_CHECK_HEADERS([infiniband/verbs.h], , + [AC_MSG_ERROR([Unable to find the libibverbs-devel header files])]) + +AC_CHECK_HEADERS([rdma/rdma_cma.h], , + [AC_MSG_ERROR([Unable to find the librdmacm-devel header files])]) + +AC_CHECK_LIB(ibverbs, + ibv_get_device_list, [VERBS_LIBS="$VERBS_LIBS -libverbs"], + AC_MSG_ERROR([ibv_get_device_list() not found.])) + +AC_CHECK_LIB(rdmacm, + rdma_create_id, [VERBS_LIBS="$VERBS_LIBS -lrdmacm"], + AC_MSG_ERROR([rdma_create_id() not found.])) + +AC_SUBST([VERBS_LIBS]) + +# Save LIBS +verbs_saved_libs=$LIBS +LIBS="$LIBS $VERBS_LIBS" + + +# Check if VERBS version +# +vma_cv_verbs=0 +vma_cv_verbs_str="None" +AC_TRY_LINK( +#include +, +[ + int access = (int)IBV_EXP_ACCESS_ALLOCATE_MR; + access = access; +], +[ + vma_cv_verbs=2 + vma_cv_verbs_str="Experimental" +], +[ + AC_CHECK_HEADER([infiniband/verbs.h], + [AC_CHECK_MEMBERS([struct ibv_query_device_ex_input.comp_mask], + [vma_cv_verbs=3 vma_cv_verbs_str="Upstream"], + [vma_cv_verbs=1 vma_cv_verbs_str="Legacy"], + [[#include ]] )], + [], + [AC_MSG_ERROR([Can not detect VERBS version])] + ) +]) +AC_MSG_CHECKING([for OFED Verbs version]) +AC_MSG_RESULT([$vma_cv_verbs_str]) +AC_DEFINE_UNQUOTED([DEFINED_VERBS_VERSION], [$vma_cv_verbs], [Define found Verbs version]) + + +# Check if direct hardware operations can be used instead of VERBS API +# +vma_cv_directverbs=0 +case "$vma_cv_verbs" in + 1) + ;; + 2) + AC_CHECK_HEADER([infiniband/mlx5_hw.h], + [AC_CHECK_DECL([MLX5_ETH_INLINE_HEADER_SIZE], + [vma_cv_directverbs=$vma_cv_verbs], [], [[#include ]])]) + ;; + 3) + AC_CHECK_HEADER([infiniband/mlx5dv.h], + [AC_CHECK_LIB(mlx5, + mlx5dv_init_obj, [VERBS_LIBS="$VERBS_LIBS -lmlx5" vma_cv_directverbs=$vma_cv_verbs])]) + ;; + *) + AC_MSG_ERROR([Unrecognized parameter 'vma_cv_verbs' as $vma_cv_verbs]) + ;; +esac +AC_MSG_CHECKING([for direct verbs support]) +if test 
"$vma_cv_directverbs" -ne 0; then + AC_DEFINE_UNQUOTED([DEFINED_DIRECT_VERBS], [$vma_cv_directverbs], [Direct VERBS support]) + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi + + +# Check +# +CHECK_VERBS_ATTRIBUTE([IBV_CQ_ATTR_MODERATE], [infiniband/verbs.h], [IBV_CQ_ATTR_MODERATE]) +CHECK_VERBS_ATTRIBUTE([IBV_QPT_RAW_PACKET], [infiniband/verbs.h]) +CHECK_VERBS_ATTRIBUTE([IBV_WC_WITH_VLAN], [infiniband/verbs.h]) +CHECK_VERBS_ATTRIBUTE([IBV_ACCESS_ALLOCATE_MR], [infiniband/verbs.h]) +CHECK_VERBS_ATTRIBUTE([IBV_QP_CREATE_SOURCE_QPN], [infiniband/verbs.h], [IBV_QP_INIT_SOURCE_QPN]) +CHECK_VERBS_ATTRIBUTE([IBV_FLOW_SPEC_IB], [infiniband/verbs.h], [IBV_FLOW_SPEC_IB]) +CHECK_VERBS_ATTRIBUTE([IBV_DEVICE_RAW_IP_CSUM], [infiniband/verbs.h]) +CHECK_VERBS_ATTRIBUTE([IBV_SEND_IP_CSUM], [infiniband/verbs.h]) +CHECK_VERBS_ATTRIBUTE([IBV_FLOW_SPEC_ACTION_TAG], [infiniband/verbs.h], [IBV_FLOW_TAG]) +CHECK_VERBS_ATTRIBUTE([IBV_WC_EX_WITH_COMPLETION_TIMESTAMP], [infiniband/verbs.h], [IBV_CQ_TIMESTAMP]) +CHECK_VERBS_MEMBER([struct ibv_device_attr_ex.orig_attr], [infiniband/verbs.h], [IBV_DEVICE_ATTR_EX]) +CHECK_VERBS_MEMBER([struct ibv_alloc_dm_attr.length], [infiniband/verbs.h], [IBV_DM]) +CHECK_VERBS_MEMBER([struct ibv_packet_pacing_caps.qp_rate_limit_min], [infiniband/verbs.h], [IBV_PACKET_PACING_CAPS]) +CHECK_VERBS_MEMBER([struct ibv_qp_rate_limit_attr.max_burst_sz], [infiniband/verbs.h], [IBV_QP_SUPPORT_BURST]) + +# Check +# +if test "x$vma_cv_verbs" == x2; then + CHECK_VERBS_ATTRIBUTE([IBV_EXP_CQ_MODERATION], [infiniband/verbs_exp.h], [IBV_CQ_ATTR_MODERATE]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_WR_NOP], [infiniband/verbs_exp.h], [IBV_WR_NOP]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_ACCESS_ALLOCATE_MR], [infiniband/verbs_exp.h]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_QP_INIT_ATTR_ASSOCIATED_QPN], [infiniband/verbs_exp.h], [IBV_QP_INIT_SOURCE_QPN]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_FLOW_SPEC_IB], [infiniband/verbs_exp.h], [IBV_FLOW_SPEC_IB]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_SEND_IP_CSUM], 
[infiniband/verbs_exp.h]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_DEVICE_ATTR_MAX_DM_SIZE], [infiniband/verbs_exp.h], [IBV_DM]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_QP_RATE_LIMIT], [infiniband/verbs_exp.h], [IBV_PACKET_PACING_CAPS]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_QP_SUPPORT_BURST], [infiniband/verbs_exp.h], [IBV_QP_SUPPORT_BURST]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_WR_TSO], [infiniband/verbs_exp.h], [OPCODE_TSO]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_DEVICE_CROSS_CHANNEL], [infiniband/verbs_exp.h], [IBV_DEVICE_CROSS_CHANNEL]) + + # + # Experimental Verbs CQ + # + AC_ARG_ENABLE([exp-cq], + AC_HELP_STRING([--disable-exp-cq], + [Disable experimental Verbs CQ (disables UDP RX HW Timestamp, RX CSUM verification offload and Multi Packet RQ)]), + [enable_exp_cq=no], + [enable_exp_cq=yes] + ) + + AS_IF([test "x$enable_exp_cq" == xyes], + [AC_DEFINE([DEFINED_IBV_EXP_CQ], 1, [Define to 1 if Experimental Verbs CQ was enabled at configure time])] + + CHECK_VERBS_ATTRIBUTE([IBV_EXP_CQ_TIMESTAMP], [infiniband/verbs_exp.h], [IBV_CQ_TIMESTAMP]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_VALUES_CLOCK_INFO], [infiniband/verbs_exp.h], [IBV_CLOCK_INFO]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_DEVICE_RX_CSUM_L4_PKT], [infiniband/verbs_exp.h]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT], [infiniband/verbs_exp.h]) + CHECK_VERBS_ATTRIBUTE([IBV_EXP_FLOW_SPEC_ACTION_TAG], [infiniband/verbs_exp.h], [IBV_FLOW_TAG]) + ) + + have_mp_rq=yes + AC_CHECK_DECLS([IBV_EXP_DEVICE_ATTR_VLAN_OFFLOADS, + IBV_EXP_DEVICE_ATTR_MAX_CTX_RES_DOMAIN, + IBV_EXP_CQ_RX_UDP_PACKET, + MLX5_CQE_L3_HDR_TYPE_MASK, + MLX5_CQE_L4_OK, + MLX5_CQE_L4_HDR_TYPE_UDP], + [], + [have_mp_rq=no], + [[#include ] + [#include ]]) + + AC_MSG_CHECKING([for multi packet RQ support]) + AS_IF([test "x$have_mp_rq" == xyes -a "x$enable_exp_cq" == xyes -a "x$vma_cv_directverbs" == x2], + [AC_DEFINE([HAVE_MP_RQ], 1, [MP_RQ QP supported])] [AC_MSG_RESULT([yes (warning: this feature is deprecated and will be removed in a future release)])], + 
[AC_MSG_RESULT([no])] + ) + + AC_CHECK_FUNCS([rdma_lib_reset]) + AC_CHECK_FUNCS([ibv_exp_get_device_list]) +fi + +# Check Upstream +# +if test "x$vma_cv_verbs" == x3; then + CHECK_VERBS_ATTRIBUTE([IBV_WR_TSO], [infiniband/verbs.h], [OPCODE_TSO]) + + if test "x$vma_cv_directverbs" == x3; then + CHECK_VERBS_ATTRIBUTE([MLX5_OPCODE_NOP], [infiniband/mlx5dv.h], [IBV_WR_NOP]) + CHECK_VERBS_MEMBER([struct mlx5dv_clock_info.last_cycles], [infiniband/mlx5dv.h], [IBV_CLOCK_INFO]) + fi +fi + +# Restore LIBS +LIBS=$verbs_saved_libs +]) diff --git a/configure.ac b/configure.ac new file mode 100644 index 0000000..1df73e2 --- /dev/null +++ b/configure.ac @@ -0,0 +1,501 @@ +# Indicate that we require autoconf 2.59 or later. Ths is needed because we +# use some autoconf macros only available in 2.59. +# +AC_PREREQ(2.59) + + +dnl===-----------------------------------------------------------------------=== +dnl=== +dnl=== SECTION 1: Setting product version +dnl=== +dnl===-----------------------------------------------------------------------=== + +# Update version number here: +# +define([vma_ver_major], 9) +define([vma_ver_minor], 0) +define([vma_ver_revision], 2) +define([vma_ver_release], 0) + + +# Initialize autoconf and define the package name, version number and +# email address for reporting bugs. 
+# +AC_INIT(libvma, [vma_ver_major.vma_ver_minor.vma_ver_revision], support@mellanox.com) + +# Definitions will be placed in this file rather than +# in the DEFS variable +# +AC_CONFIG_HEADER([config.h]) + +VMA_LIBRARY_MAJOR=9 +VMA_LIBRARY_MINOR=0 +VMA_LIBRARY_REVISION=2 +VMA_LIBRARY_RELEASE=0 + +AC_DEFINE_UNQUOTED(VMA_LIBRARY_MAJOR, ${VMA_LIBRARY_MAJOR}, [VMA Major Version]) +AC_SUBST(VMA_LIBRARY_MAJOR) +AC_DEFINE_UNQUOTED(VMA_LIBRARY_MINOR, ${VMA_LIBRARY_MINOR}, [VMA Minor Version]) +AC_SUBST(VMA_LIBRARY_MINOR) +AC_DEFINE_UNQUOTED(VMA_LIBRARY_REVISION, ${VMA_LIBRARY_REVISION}, [VMA Revision]) +AC_SUBST(VMA_LIBRARY_REVISION) +AC_DEFINE_UNQUOTED(VMA_LIBRARY_RELEASE, ${VMA_LIBRARY_RELEASE}, [VMA Release]) +AC_SUBST(VMA_LIBRARY_RELEASE) + +AM_CONDITIONAL(IS_RELEASE_ZERO, test ${VMA_LIBRARY_RELEASE} -eq 0) + +GIT_VER=`git describe --long --abbrev=40 --dirty --tags 2> /dev/null || echo ""` +if test -n "$GIT_VER"; then GIT_VER=`echo $GIT_VER | sed -e 's/-dirty/+/' | sed s/.*-g//`; else GIT_VER=""; fi + +AC_DEFINE_UNQUOTED(VMA_GIT_VERSION, "${GIT_VER}", [VMA Git Version]) + +dateopt="" + if test -n "$SOURCE_DATE_EPOCH" ; then + dateopt="-u -d @$SOURCE_DATE_EPOCH" +fi +AC_SUBST([BUILD_DATE], [$(date $dateopt +'%b/%d/%Y')]) +AC_SUBST([BUILD_TIME], [$(date $dateopt +'%H:%M:%S')]) +AC_SUBST([BUILD_DATE_CHANGELOG], [$(date $dateopt +'%a, %d %b %Y %T %z')]) + +dnl===-----------------------------------------------------------------------=== +dnl=== +dnl=== SECTION 2: Initialization & Setup +dnl=== +dnl===-----------------------------------------------------------------------=== + +# Verify that the source directory is valid. +# +AC_CONFIG_SRCDIR(src) + +# Place for the extra autoconf files. +# +AC_CONFIG_AUX_DIR(config/aux) + +# Place all our m4 macro into the config subdirectory. 
+# +AC_CONFIG_MACRO_DIR(config/m4) + +# Init automake and libtool +# +AM_INIT_AUTOMAKE(foreign [subdir-objects]) + +# Set non-verbose make by default +# +m4_ifdef([AM_SILENT_RULES], [AM_SILENT_RULES([yes])]) + +m4_include([config/m4/func.m4]) +m4_include([config/m4/opt.m4]) +m4_include([config/m4/verbs.m4]) +m4_include([config/m4/dpcp.m4]) +m4_include([config/m4/nl.m4]) +m4_include([config/m4/prof.m4]) + +FUNC_CONFIGURE_INIT() + + +dnl===-----------------------------------------------------------------------=== +dnl=== +dnl=== SECTION 3: Checking for programs we need +dnl=== +dnl===-----------------------------------------------------------------------=== +show_section_title "Configure build tools" + +# Find compiler, libtools, etc +# +AC_PROG_CC +AC_PROG_CXX +AC_PROG_LIBTOOL +AC_PROG_YACC +AM_PROG_LEX + +# Check for pkg-config package +# +AC_CHECK_PROG( + [have_pkg_config], + [pkg-config], + m4_ifdef([PKG_PROG_PKG_CONFIG], yes, no_pkg_m4), + no_pkg_config) +case "$have_pkg_config" in + no_pkg_m4) + AC_MSG_ERROR([ + *** You do not have pkg.m4 properly installed. + *** aclocal can not find one. + *** Set the environment variable ACLOCAL="aclocal -I/path/to/pkg.m4" + ]) + ;; + no_pkg_config) + AC_MSG_ERROR([ + *** The pkg-config is required to build the library. + *** Make sure it is installed or set path to pkg-config. 
+ ]) + ;; + *) + ;; +esac +PKG_PROG_PKG_CONFIG() + + +dnl===-----------------------------------------------------------------------=== +dnl=== +dnl=== SECTION 4: Setting compiler specific options +dnl=== +dnl===-----------------------------------------------------------------------=== +show_section_title "Setting compiler specific options" + +AC_MSG_CHECKING([for compiler]) +case $CC in + gcc*|g++*) + AC_MSG_RESULT([gcc]) + CFLAGS="$CFLAGS -Wall -Wextra -Werror -Wundef -ffunction-sections -fdata-sections -Wsequence-point -pipe -Winit-self -Wmissing-include-dirs" + CXXFLAGS="$CXXFLAGS -Wshadow -Wall -Wextra -Werror -Wundef -ffunction-sections -fdata-sections -Wsequence-point -pipe -Winit-self -Wmissing-include-dirs" + ;; + icc*|icpc*) + AC_MSG_RESULT([icc]) + CFLAGS="$CFLAGS -Wall -Werror" + CXXFLAGS="$CXXFLAGS -Wall -Werror" + ;; + clang*|clang++*) + AC_MSG_RESULT([clang]) + CFLAGS="$CFLAGS -Wall -Werror -Wno-format-security -Wno-self-assign" + CXXFLAGS="$CXXFLAGS -Wall -Werror -Wno-overloaded-virtual" + ;; + *) + AC_MSG_RESULT([unknown]) + ;; +esac + + +dnl===-----------------------------------------------------------------------=== +dnl=== +dnl=== SECTION 5: Checking for project configuration +dnl=== +dnl===-----------------------------------------------------------------------=== +show_section_title "Configure project" + +OS=`cat /etc/issue | awk '{ print $3}'` + +CFLAGS="-D_GNU_SOURCE -fPIC $CFLAGS" +CXXFLAGS="-D_GNU_SOURCE -fPIC $CXXFLAGS" + +# gcov support +# +AC_MSG_CHECKING( + [for gcov support]) +AC_ARG_ENABLE([gcov], + AC_HELP_STRING([--enable-gcov], + [turn on code coverage analysis tools]), + [CFLAGS+=" --coverage -O0"; CXXFLAGS+=" --coverage -O0"; LIBS+=" -lgcov"; VMA_GCOV=1; + AC_MSG_RESULT([yes])], + [VMA_GCOV=0 + AC_MSG_RESULT([no])] +) + +# Performance time points support +# +AC_MSG_CHECKING( + [for time measurement support]) +AC_ARG_ENABLE([time_measure], + AC_HELP_STRING([--enable-time-measure], + [turn on time measuring]), + [CPPFLAGS+=" 
-DVMA_TIME_MEASURE"; + AC_MSG_RESULT([yes])], + [AC_MSG_RESULT([no])] +) + +# Valgrind support +# +AC_ARG_WITH([valgrind], + AC_HELP_STRING([--with-valgrind], + [Enable Valgrind annotations (small runtime overhead, default NO)]), + [], + [with_valgrind=no] +) + +AC_MSG_CHECKING( + [for valgrind support]) +AS_IF([test "x$with_valgrind" == xno], + [AC_DEFINE([NVALGRIND], 1, [Define to 1 to disable Valgrind annotations.]) + AC_MSG_RESULT([no]) + ], + [AC_CHECK_HEADER([valgrind/memcheck.h], [], + [AC_MSG_ERROR([Valgrind memcheck support requested, but not found, install valgrind-devel rpm.])]) + if test -d $with_valgrind; then + CPPFLAGS="$CPPFLAGS -I$with_valgrind/include" + fi + AC_MSG_RESULT([yes]) + ] +) + +# Debug configuration. +# +with_debug_info=yes +with_debug=no +AC_ARG_ENABLE([debug], + AC_HELP_STRING([--enable-debug], + [Enable debug mode build]), + [with_debug=yes]) + +AC_ARG_ENABLE([debuginfo], + AC_HELP_STRING([--disable-debuginfo], + [Don't include debug information]), + [with_debug_info=no; with_debug=no]) + +if test "x${with_debug}" = "xyes" ; then + CFLAGS="-g -D_DEBUG $CFLAGS" + CXXFLAGS="-g -D_DEBUG $CXXFLAGS" +else + CFLAGS="$CFLAGS -O3" + CXXFLAGS="$CXXFLAGS -O3" + + if test "x${with_debug_info}" = "xyes" ; then + CFLAGS="-g $CFLAGS" + CXXFLAGS="-g $CXXFLAGS" + fi +fi + +if test ${date:-""} != "" ; then + CFLAGS="-DVMA_DATE_TIME='\"$date-$time\"' $CFLAGS" + CXXFLAGS="-DVMA_DATE_TIME='\"$date-$time\"' $CXXFLAGS" +fi + +if test ${revision:-0} -ne 0 ; then + CFLAGS="-DVMA_SVN_REVISION=$revision $CFLAGS" + CXXFLAGS="-DVMA_SVN_REVISION=$revision $CXXFLAGS" +fi + +# OFED configuration. 
+# +AC_MSG_CHECKING([for OFED path]) +AC_ARG_WITH(ofed, + AC_HELP_STRING([--with-ofed], [Path to OFED install]), + [ac_cv_ofed_path=$withval], + [if test -e "/etc/infiniband/info" ; then + ac_cv_ofed_path=`grep prefix /etc/infiniband/info | awk -F "=" '{print $2}'` + else + ac_cv_ofed_path="/usr" + fi]) +CPPFLAGS="$CPPFLAGS -I$ac_cv_ofed_path/include" +if test -d "$ac_cv_ofed_path/lib64" ; then + LDFLAGS="$LDFLAGS -L$ac_cv_ofed_path/lib64" +elif test -d "$ac_cv_ofed_path/lib/$(uname -m)-linux-*" ; then + LDFLAGS="$LDFLAGS -L$(ls -d $ac_cv_ofed_path/lib/$(uname -m)-linux-*)" +else + LDFLAGS="$LDFLAGS -L$ac_cv_ofed_path/lib" +fi +AC_MSG_RESULT($ac_cv_ofed_path) + +VERBS_CAPABILITY_SETUP() +OPT_VMA_LOGGING() +PROF_IBPROF_SETUP() + +# Enable internal performance counters +# Note: uncomment setup to activate this ability +# +#PROF_RDTSC_SETUP() + +# VMA SocketXtreme configuration +# +AC_ARG_ENABLE([vmapoll],, + AC_MSG_ERROR([--enable-vmapoll option is deprecated. Please use VMA_SOCKETXTREME=1 environment variable])) + +AC_ARG_ENABLE([socketxtreme],, + AC_MSG_ERROR([--enable-socketxtreme option is deprecated. Please use VMA_SOCKETXTREME=1 environment variable])) + +# Thread locking control +# +AC_ARG_ENABLE([thread-lock], + AC_HELP_STRING([--enable-thread-lock], + [Enable thread locking (default=yes)])) +AC_MSG_CHECKING( + [for thread locking support]) +if test "x$enable_thread_lock" = "xno"; then + if test "x$enable_socketxtreme" = xyes; then + AC_DEFINE([DEFINED_NO_THREAD_LOCK], 1, [Define to 1 to disable thread locking]) + AC_MSG_RESULT([no]) + else + AC_MSG_RESULT([yes (socketxtreme is not enabled)]) + fi +else + AC_MSG_RESULT([yes]) +fi + +# Enable tcp tx window availability +# +AC_ARG_ENABLE([tcp-tx-wnd-availability], + AC_HELP_STRING([--enable-tcp-tx-wnd-availability], + [Enable TCP Tx window availability + (TCP packets will only be sent if their size (hdr options + data) is less than or equal to the window size. 
+ Otherwise -1 is returned and errno is set to EAGAIN) (default=no)])) +AC_MSG_CHECKING( + [for tcp tx window availability support]) +if test "x$enable_tcp_tx_wnd_availability" = "xyes"; then + AC_DEFINE(DEFINED_TCP_TX_WND_AVAILABILITY, 1, [Define to 1 to enable TCP Tx window availability]) + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi + +# Control TSO usage +# +AC_ARG_ENABLE([tso], + AC_HELP_STRING([--enable-tso], + [Enable TSO availability (default=no)])) +AC_MSG_CHECKING( + [for tso support]) +if test "x$enable_tso" = xyes -a "x$vma_cv_attribute_ex_OPCODE_TSO" = xyes; then + AC_DEFINE_UNQUOTED([DEFINED_TSO], [1], [Define to 1 to use TSO]) + AC_MSG_RESULT([yes]) +else + AC_MSG_RESULT([no]) +fi + +# DPCP Library support +# +if test "x$vma_cv_directverbs" == x3; then + DPCP_CAPABILITY_SETUP() +fi + +AC_MSG_CHECKING([for md5 version of VMA statistics is]) +STATS_PROTOCOL_VER=`md5sum ${srcdir}/src/vma/util/vma_stats.h | awk '{ print $1}'` +AC_DEFINE_UNQUOTED(STATS_PROTOCOL_VER, "${STATS_PROTOCOL_VER}", [Stats Protocol Version]) +AC_SUBST(STATS_PROTOCOL_VER) +AC_MSG_RESULT(${STATS_PROTOCOL_VER}) + + +dnl===-----------------------------------------------------------------------=== +dnl=== +dnl=== SECTION 6: Checking for header files +dnl=== +dnl===-----------------------------------------------------------------------=== +show_section_title "Check for header files" + +# Look for Standard headers +# +AC_HEADER_STDC + +AC_CHECK_HEADERS([sys/prctl.h sys/inotify.h sys/fanotify.h sys/capability.h linux/ethtool.h]) + + +dnl===-----------------------------------------------------------------------=== +dnl=== +dnl=== SECTION 7: Checking for libraries +dnl=== +dnl===-----------------------------------------------------------------------=== +show_section_title "Check for libraries" + +AC_CHECK_LIB([stdc++], [atoi]) +AC_CHECK_LIB([dl], [dlsym]) +AC_CHECK_LIB([rt], [clock_gettime]) +AC_CHECK_LIB([pthread], [pthread_create]) + +CHECK_NL_LIB() + 
+dnl===-----------------------------------------------------------------------=== +dnl=== +dnl=== SECTION 8: Checking for types and structures +dnl=== +dnl===-----------------------------------------------------------------------=== +show_section_title "Check for functions, types and structures" + +# Does this compiler have built-in functions for atomic memory access? +AC_MSG_CHECKING([for atomic memory access (__sync_bool_compare_and_swap) support]) +AC_TRY_LINK(, +[ + int variable = 1; + return (__sync_bool_compare_and_swap(&variable, 1, 2) + && __sync_add_and_fetch(&variable, 1)) ? 1 : 0; +], +[ + AC_MSG_RESULT([yes]) + AC_DEFINE(HAVE_BUILTIN_ATOMIC, 1, [Define to 1 if gcc supports __sync_bool_compare_and_swap() a.o.]) +], +[ + AC_MSG_RESULT([no]) +]) + +AC_MSG_CHECKING([for SOF_TIMESTAMPING_SOFTWARE support]) +AC_TRY_LINK( +#include +, +[ + int ts = (int)SOF_TIMESTAMPING_SOFTWARE; + ts = ts; +], +[ + AC_MSG_RESULT([yes]) + AC_DEFINE(DEFINED_MISSING_NET_TSTAMP, 0, [Define to 0 if linux/net_tstamp.h exists]) +], +[ + AC_MSG_RESULT([no]) + AC_DEFINE(DEFINED_MISSING_NET_TSTAMP, 1, [Define to 1 if linux/net_tstamp.h is missing]) +]) + +AC_MSG_CHECKING([for FRA_OIFNAME enum value support]) +AC_TRY_LINK( +#include +, +[ + int oif = (int)FRA_OIFNAME; + oif = oif; +], +[ + AC_MSG_RESULT([yes]) + AC_DEFINE(DEFINED_FRA_OIFNAME, 1, [Define to 1 if enum value FRA_OIFNAME exists in linux/fib_rules.h]) +], +[ + AC_MSG_RESULT([no]) + AC_DEFINE(DEFINED_FRA_OIFNAME, 0, [Define to 0 if enum value FRA_OIFNAME does not exist in linux/fib_rules.h]) +]) + +AC_CHECK_TYPES([struct mmsghdr],[],[],[#include ]) + +AC_MSG_CHECKING([for 'struct timespec' for recvmmsg() const]) +AC_TRY_LINK( +#include +, +[ + const struct timespec ts = {0,0}; + recvmmsg(0,0,0,0,&ts); +], +[ + AC_MSG_RESULT([yes]) + AC_DEFINE(RECVMMSG_WITH_CONST_TIMESPEC, 1, [Define to 1 if 'struct timespec' for recvmmsg() is 'const struct timespec']) +], +[ + AC_MSG_RESULT([no]) +]) + 
+dnl===-----------------------------------------------------------------------=== +dnl=== +dnl=== SECTION 9: Configure makefiles +dnl=== +dnl===-----------------------------------------------------------------------=== +show_section_title "Configure makefiles" +AC_CONFIG_FILES([ + Makefile + src/Makefile + src/vma/Makefile + src/vma/infra/Makefile + src/vma/netlink/Makefile + src/utils/Makefile + src/vlogger/Makefile + src/stats/Makefile + src/state_machine/Makefile + tests/Makefile + tests/timetest/Makefile + tests/gtest/Makefile + tests/pps_test/Makefile + tests/latency_test/Makefile + tests/throughput_test/Makefile + tools/Makefile + tools/daemon/Makefile + contrib/scripts/vma.init + contrib/scripts/vma.service + build/libvma.spec + debian/changelog + VMA_VERSION + ]) + +AC_OUTPUT + +show_summary_title diff --git a/contrib/jenkins_tests/build.sh b/contrib/jenkins_tests/build.sh new file mode 100755 index 0000000..5d0de6f --- /dev/null +++ b/contrib/jenkins_tests/build.sh @@ -0,0 +1,37 @@ +#!/bin/bash -eExl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for building with gcc ..." 
"off" + +cd $WORKSPACE + +rm -rf ${build_dir} +mkdir -p ${build_dir} +cd ${build_dir} + +# Set symbolic links to default build and install +ln -s "${build_dir}/0/install" "${install_dir}" + +build_list="\ +default: \ +opt-log:--enable-opt-log=no" + + +build_tap=${WORKSPACE}/${prefix}/build.tap +echo "1..$(echo $build_list | tr " " "\n" | wc -l)" > $build_tap + +test_id=0 +for build in $build_list; do + IFS=':' read build_name build_option <<< "$build" + mkdir -p ${build_dir}/${test_id} + cd ${build_dir}/${test_id} + test_exec='${WORKSPACE}/configure --prefix=${build_dir}/${test_id}/install $build_option $jenkins_test_custom_configure && make $make_opt install' + do_check_result "$test_exec" "$test_id" "$build_name" "$build_tap" "${build_dir}/build-${test_id}" + cd ${build_dir} + test_id=$((test_id+1)) +done + + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/compiler.sh b/contrib/jenkins_tests/compiler.sh new file mode 100755 index 0000000..1523988 --- /dev/null +++ b/contrib/jenkins_tests/compiler.sh @@ -0,0 +1,35 @@ +#!/bin/bash -eExl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for compiler ..." 
"on" + +do_module "intel/ics-15.0.1" + +cd $WORKSPACE + +rm -rf $compiler_dir +mkdir -p $compiler_dir +cd $compiler_dir + +compiler_list="icc:icpc clang:clang++" + +compiler_tap=${WORKSPACE}/${prefix}/compiler.tap +echo "1..$(echo $compiler_list | tr " " "\n" | wc -l)" > $compiler_tap + +test_id=0 +for compiler in $compiler_list; do + IFS=':' read cc cxx <<< "$compiler" + mkdir -p ${compiler_dir}/${test_id} + cd ${compiler_dir}/${test_id} + test_exec='${WORKSPACE}/configure --prefix=$compiler_dir-$cc CC=$cc CXX=$cxx $jenkins_test_custom_configure && make $make_opt all' + do_check_result "$test_exec" "$test_id" "$compiler" "$compiler_tap" "${compiler_dir}/compiler-${test_id}" + cd ${compiler_dir} + test_id=$((test_id+1)) +done + +module unload intel/ics-15.0.1 + + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/cov.sh b/contrib/jenkins_tests/cov.sh new file mode 100755 index 0000000..fca004b --- /dev/null +++ b/contrib/jenkins_tests/cov.sh @@ -0,0 +1,79 @@ +#!/bin/bash -xeEl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for coverity ..." 
"on" + +do_module "tools/cov-8.7" + +cd $WORKSPACE + +rm -rf $cov_dir +mkdir -p $cov_dir +cd $cov_dir + +cov_exclude_file_list="tests" + +cov_build_id="cov_build_${BUILD_NUMBER}" +cov_build="$cov_dir/$cov_build_id" + +set +eE + +${WORKSPACE}/configure --prefix=${cov_dir}/install $jenkins_test_custom_configure > "${cov_dir}/cov.log" 2>&1 +make clean >> "${cov_dir}/cov.log" 2>&1 +eval "cov-configure --config $cov_dir/coverity_config.xml --gcc" +eval "cov-build --config $cov_dir/coverity_config.xml --dir $cov_build make >> "${cov_dir}/cov.log" 2>&1" +rc=$(($rc+$?)) + +for excl in $cov_exclude_file_list; do + cov-manage-emit --config $cov_dir/coverity_config.xml --dir $cov_build --tu-pattern "file('$excl')" delete + sleep 1 +done + +eval "cov-analyze --config $cov_dir/coverity_config.xml \ + --all --aggressiveness-level low \ + --enable-fnptr --fnptr-models --paths 20000 \ + --disable-parse-warnings \ + --dir $cov_build" +rc=$(($rc+$?)) + +set -eE + +cov_web_path="$(echo $cov_build | sed -e s,$WORKSPACE,,g)" +nerrors=$(cov-format-errors --dir $cov_build | awk '/Processing [0-9]+ errors?/ { print $2 }') +rc=$(($rc+$nerrors)) + +index_html=$(cd $cov_build && find . 
-name index.html | cut -c 3-) +cov_url="$WS_URL/$cov_web_path/${index_html}" +cov_file="$cov_build/${index_html}" + +rm -f jenkins_sidelinks.txt + +coverity_tap=${WORKSPACE}/${prefix}/coverity.tap + +echo 1..1 > $coverity_tap +if [ $rc -gt 0 ]; then + echo "not ok 1 Coverity Detected $nerrors failures # $cov_url" >> $coverity_tap + do_err "coverity" "${cov_build}/output/summary.txt" + info="Coverity found $nerrors errors" + status="error" +else + echo ok 1 Coverity found no issues >> $coverity_tap + info="Coverity found no issues" + status="success" +fi + +if [ -n "$ghprbGhRepository" ]; then + context="MellanoxLab/coverity" + do_github_status "repo='$ghprbGhRepository' sha1='$ghprbActualCommit' target_url='$cov_url' state='$status' info='$info' context='$context'" +fi + +echo Coverity report: $cov_url +printf "%s\t%s\n" Coverity $cov_url >> jenkins_sidelinks.txt + +module unload tools/cov-8.7 + +do_archive "$( find ${cov_build}/output -type f -name "*.txt" -or -name "*.html" -or -name "*.xml" )" + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/cppcheck.sh b/contrib/jenkins_tests/cppcheck.sh new file mode 100755 index 0000000..9cab45e --- /dev/null +++ b/contrib/jenkins_tests/cppcheck.sh @@ -0,0 +1,48 @@ +#!/bin/bash -xeEl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for cppcheck ..." "on" + +# This unit requires cppcheck so check for existence +if [ $(command -v cppcheck >/dev/null 2>&1 || echo $?) 
]; then + echo "[SKIP] cppcheck tool does not exist" + exit 0 +fi + +cd $WORKSPACE + +rm -rf $cppcheck_dir +mkdir -p $cppcheck_dir +cd $cppcheck_dir + +set +eE +eval "find ${WORKSPACE}/src -name '*.h' -o -name '*.cpp' -o -name '*.c' -o -name '*.hpp' -o -name '*.inl' | \ + cppcheck --std=c99 --std=c++11 --language=c++ --force --enable=information \ + --inline-suppr --suppress=memleak:config_parser.y \ + --template='{severity}: {id}: {file}:{line}: {message}' \ + --file-list=- 2> ${cppcheck_dir}/cppcheck.err 1> ${cppcheck_dir}/cppcheck.log" +rc=$(($rc+$?)) +set -eE + +nerrors=$(cat ${cppcheck_dir}/cppcheck.err | grep error | wc -l) +rc=$(($rc+$nerrors)) + +cppcheck_tap=${WORKSPACE}/${prefix}/cppcheck.tap + +echo 1..1 > $cppcheck_tap +if [ $rc -gt 0 ]; then + echo "not ok 1 cppcheck Detected $nerrors failures # ${cppcheck_dir}/cppcheck.err" >> $cppcheck_tap + do_err "cppcheck" "${cppcheck_dir}/cppcheck.err" + info="cppcheck found $nerrors errors" + status="error" +else + echo ok 1 cppcheck found no issues >> $cppcheck_tap + info="cppcheck found no issues" + status="success" +fi + +do_archive "${cppcheck_dir}/cppcheck.err" "${cppcheck_dir}/cppcheck.log" + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/csbuild.sh b/contrib/jenkins_tests/csbuild.sh new file mode 100755 index 0000000..4fc51a0 --- /dev/null +++ b/contrib/jenkins_tests/csbuild.sh @@ -0,0 +1,69 @@ +#!/bin/bash -xeEl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for csbuild ..." "on" + +# This unit requires csbuild so check for existence +if [ $(command -v csbuild >/dev/null 2>&1 || echo $?) 
]; then + echo "[SKIP] csbuild tool does not exist" + exit 0 +fi + +# There is a bug in gcc less than 4.5 +if [ $(echo `gcc -dumpversion | cut -f1-2 -d.` \< 4.5 | bc ) -eq 1 ]; then + echo "[SKIP] csbuild tool can not launch on this gcc" + exit 0 +fi + +cd $WORKSPACE + +rm -rf $csbuild_dir +mkdir -p $csbuild_dir +cd $csbuild_dir + +set +eE + +${WORKSPACE}/configure --prefix=${csbuild_dir}/install $jenkins_test_custom_configure > "${csbuild_dir}/csbuild.log" 2>&1 +make clean + +eval "csbuild --cswrap-timeout=180 --no-clean -c \"make $make_opt \" > \"${csbuild_dir}/csbuild.log\" 2>&1" +rc=$(($rc+$?)) + +eval "csgrep --quiet --event 'error|warning' \ + --path '^${WORKSPACE}' --strip-path-prefix '${WORKSPACE}' \ + --remove-duplicates '${csbuild_dir}/csbuild.log' | \ + csgrep --invert-match --path '^ksh-.*[0-9]+\.c$' | \ + csgrep --invert-match --checker CLANG_WARNING --event error | \ + csgrep --invert-match --checker CLANG_WARNING --msg \"internal warning\" | \ + csgrep --invert-match --checker COMPILER_WARNING --msg \"-Woverloaded-virtual\" | \ + csgrep --invert-match --checker COMPILER_WARNING --msg \"-Wformat-nonliteral\" | \ + csgrep --invert-match --checker CPPCHECK_WARNING --event 'preprocessorErrorDirective|syntaxError' | \ + csgrep --mode=grep --invert-match --event 'internal warning' --prune-events=1 | \ + cssort --key=path > ${csbuild_dir}/csbuild.err 2>&1 \ + " +eval "grep 'timed out' ${csbuild_dir}/csbuild.log >> ${csbuild_dir}/csbuild.err 2>&1" + +set -eE + +nerrors=$(cat ${csbuild_dir}/csbuild.err | grep 'Error:\|error:' | wc -l) +rc=$(($rc+$nerrors)) + +csbuild_tap=${WORKSPACE}/${prefix}/csbuild.tap + +echo 1..1 > $csbuild_tap +if [ $rc -gt 0 ]; then + echo "not ok 1 csbuild Detected $nerrors failures # ${csbuild_dir}/csbuild.err" >> $csbuild_tap + do_err "csbuild" "${csbuild_dir}/csbuild.err" + info="csbuild found $nerrors errors" + status="error" +else + echo ok 1 csbuild found no issues >> $csbuild_tap + info="csbuild found no issues" + 
status="success" +fi + +do_archive "${csbuild_dir}/csbuild.err" "${csbuild_dir}/csbuild.log" + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/globals.sh b/contrib/jenkins_tests/globals.sh new file mode 100755 index 0000000..e7e7a76 --- /dev/null +++ b/contrib/jenkins_tests/globals.sh @@ -0,0 +1,304 @@ +#!/bin/bash + +WORKSPACE=${WORKSPACE:=$PWD} +if [ -z "$BUILD_NUMBER" ]; then + echo Running interactive + BUILD_NUMBER=1 + WS_URL=file://$WORKSPACE + JENKINS_RUN_TESTS=yes +else + echo Running under jenkins + WS_URL=$JOB_URL/ws +fi + +TARGET=${TARGET:=all} +i=0 +if [ "$TARGET" == "all" -o "$TARGET" == "default" ]; then + target_list[$i]="default: " + i=$((i+1)) +fi + +# exit code +rc=0 + +jenkins_test_custom_configure=${jenkins_test_custom_configure:=""} +jenkins_test_custom_prefix=${jenkins_test_custom_prefix:="jenkins"} + +prefix=${jenkins_test_custom_prefix}/${jenkins_target} +build_dir=${WORKSPACE}/${prefix}/build/ +install_dir=${WORKSPACE}/${prefix}/install +compiler_dir=${WORKSPACE}/${prefix}/compiler +test_dir=${WORKSPACE}/${prefix}/test +gtest_dir=${WORKSPACE}/${prefix}/gtest +rpm_dir=${WORKSPACE}/${prefix}/rpm +cov_dir=${WORKSPACE}/${prefix}/cov +cppcheck_dir=${WORKSPACE}/${prefix}/cppcheck +csbuild_dir=${WORKSPACE}/${prefix}/csbuild +vg_dir=${WORKSPACE}/${prefix}/vg +style_dir=${WORKSPACE}/${prefix}/style +tool_dir=${WORKSPACE}/${prefix}/tool + + +nproc=$(grep processor /proc/cpuinfo|wc -l) +make_opt="-j$(($nproc / 2 + 1))" +if [ $(command -v timeout >/dev/null 2>&1 && echo $?) ]; then + timeout_exe="timeout -s SIGKILL 20m" +fi + +trap "on_exit" INT TERM ILL KILL FPE SEGV ALRM + +function on_exit() +{ + rc=$((rc + $?)) + echo "[${0##*/}]..................exit code = $rc" + pkill -9 sockperf + pkill -9 vma +} + +function do_cmd() +{ + cmd="$*" + set +e + eval $cmd >> /dev/null 2>&1 + ret=$? 
+ set -e + if [ $ret -gt 0 ]; then + exit $ret + fi +} + +function do_export() +{ + export PATH="$1/bin:${PATH}" + export LD_LIBRARY_PATH="$1/lib:${LD_LIBRARY_PATH}" + export MANPATH="$1/share/man:${MANPATH}" +} + +function do_archive() +{ + cmd="tar -rvf ${jenkins_test_artifacts}.tar $*" + set +e + eval $cmd >> /dev/null 2>&1 + set -e +} + +function do_github_status() +{ + echo "Calling: github $1" + eval "local $1" + + local token="" + if [ -z "$tokenfile" ]; then + tokenfile="$HOME/.mellanox-github" + fi + + if [ -r "$tokenfile" ]; then + token="$(cat $tokenfile)" + else + echo Error: Unable to read tokenfile: $tokenfile + return + fi + + curl \ + -X POST \ + -H "Content-Type: application/json" \ + -d "{\"state\": \"$state\", \"context\": \"$context\",\"description\": \"$info\", \"target_url\": \"$target_url\"}" \ + "https://api.github.com/repos/$repo/statuses/${sha1}?access_token=$token" +} + +# Test if an environment module exists and load it if yes. +# Otherwise, return error code. +# $1 - module name +# +function do_module() +{ + echo "Checking module $1" + if [[ $(module avail 2>&1 | grep "$1" -q > /dev/null || echo $?) ]]; then + echo "[SKIP] module tool does not exist" + exit 0 + else + module load "$1" + fi +} + +# format text +# +function do_format() +{ + set +x + local is_format=true + if [[ $is_format == true ]] ; then + res="" + for ((i=2; i<=$#; i++)) ; do + case "${!i}" in + "bold" ) res="$res\e[1m" ;; + "underline" ) res="$res\e[4m" ;; + "reverse" ) res="$res\e[7m" ;; + "red" ) res="$res\e[91m" ;; + "green" ) res="$res\e[92m" ;; + "yellow" ) res="$res\e[93m" ;; + esac + done + echo -e "$res$1\e[0m" + else + echo "$1" + fi + set -x +} + +# print error message +# +function do_err() +{ + set +x + echo -e $(do_format "FAILURE: $1" "red" "bold") 2>&1 + if [ -n "$2" ]; then + echo ">>>" + cat $2 + echo ">>>" + fi + set -x +} + +# Verify if current environment is suitable. 
+# +function do_check_env() +{ + echo "Checking system configuration" + if [ $(command -v pkill >/dev/null 2>&1 || echo $?) ]; then + echo "pkill is not found" + echo "environment [NOT OK]" + exit 1 + fi + if [ $(sudo pwd >/dev/null 2>&1 || echo $?) ]; then + echo "sudo does not work" + echo "environment [NOT OK]" + exit 1 + fi + + if [ $(command -v ofed_info >/dev/null 2>&1 || echo $?) ]; then + echo "Configuration: INBOX : ${ghprbTargetBranch}" + export jenkins_ofed=inbox + else + echo "Configuration: MOFED[$(ofed_info -s)] : ${ghprbTargetBranch}" + export jenkins_ofed=$(ofed_info -s | sed 's/.*[l|X]-\([0-9\.]\+\).*/\1/') + fi + + echo "environment [OK]" +} + +# Check if the unit should be proccesed +# $1 - output message +# $2 - [on|off] if on - skip this case if JENKINS_RUN_TESTS variable is OFF +# +function do_check_filter() +{ + local msg=$1 + local filter=$2 + + if [ -n "$filter" -a "$filter" == "on" ]; then + if [ -z "$JENKINS_RUN_TESTS" -o "$JENKINS_RUN_TESTS" == "no" ]; then + echo "$msg [SKIP]" + exit 0 + fi + fi + + echo "$msg [OK]" +} + +# Launch command and detect result of execution +# $1 - test command +# $2 - test id +# $3 - test name +# $4 - test tap file +# $5 - files for stdout/stderr +# +function do_check_result() +{ + set +e + if [ -z "$5" ]; then + eval $timeout_exe $1 + ret=$? + else + eval $timeout_exe $1 2>> "${5}.err" 1>> "${5}.log" + ret=$? 
+ do_archive "${5}.err" "${5}.log" + fi + set -e + if [ $ret -gt 0 ]; then + echo "not ok $2 $3" >> $4 + if [ -z "$5" ]; then + do_err "$1" + else + do_err "$1" "${5}.err" + fi + else + echo "ok $2 $3" >> $4 + fi + rc=$((rc + $ret)) +} + +# Detect interface ip +# $1 - [ib|eth] to select link type or empty to select the first found +# $2 - [empty|mlx4|mlx5] +# $3 - ip address not to get +# +function do_get_ip() +{ + sv_ifs=${IFS} + netdevs=$(ibdev2netdev | grep Up | grep "$2" | cut -f 5 -d ' ') + IFS=$'\n' read -rd '' -a netdev_ifs <<< "${netdevs}" + lnkifs=$(ip -o link | awk '{print $2,$(NF-2)}') + IFS=$'\n' read -rd '' -a lnk_ifs <<< "${lnkifs}" + IFS=${sv_ifs} + ifs_array=() + + for nd_if in "${netdev_ifs[@]}" ; do + found_if='' + for v_if in "${lnk_ifs[@]}" ; do + if [ ! -z "$(echo ${v_if} | grep ${nd_if})" ] ; then + mac=$(echo "${v_if}"| awk '{ print $NF }') #; echo "mac=$mac" + for p_if in "${lnk_ifs[@]}" ; do + if [ ! -z "$(echo ${p_if} | grep -E ${mac} | grep -Ei eth)" ] ; then + if_name=$(echo "${p_if}"| awk '{ print $1}') + ifs_array+=(${if_name::-1}) + #-#echo "${nd_if} --> ${if_name::-1} " + found_if=1 + break 2 + fi + done + fi + done + # use the netdevice if needed + [ -z "${found_if}" ] && { + ifs_array+=(${nd_if}) + } + done + + if [ "${#ifs_array[@]}" -le 1 ] ; then + if (dmesg | grep -i hypervisor > /dev/null 2>&1) ; then + ifs_array=(eth1 eth2) + fi + fi + + for ip in ${ifs_array[@]}; do + if [ -n "$1" -a "$1" == "ib" -a -n "$(ip link show $ip | grep 'link/inf')" ]; then + found_ip=$(ip -4 address show $ip | grep 'inet' | sed 's/.*inet \([0-9\.]\+\).*/\1/') + if [ -n "$(ibdev2netdev | grep $ip | grep mlx5)" ]; then + local ofed_v=$(ofed_info -s | grep OFED | sed 's/.*[l|X]-\([0-9\.]\+\).*/\1/') + if [ $(echo $ofed_v | grep 4.[1-9] >/dev/null 2>&1 || echo $?) 
]; then + echo "$ip is CX4 device that does not support IPoIB in OFED: $ofed_v" + unset found_ip + fi + fi + elif [ -n "$1" -a "$1" == "eth" -a -n "$(ip link show $ip | grep 'link/eth')" ]; then + found_ip=$(ip -4 address show $ip | grep 'inet' | sed 's/.*inet \([0-9\.]\+\).*/\1/') + elif [ -z "$1" ]; then + found_ip=$(ip -4 address show $ip | grep 'inet' | sed 's/.*inet \([0-9\.]\+\).*/\1/') + fi + if [ -n "$found_ip" -a "$found_ip" != "$3" ]; then + echo $found_ip + break + fi + done +} diff --git a/contrib/jenkins_tests/gtest.sh b/contrib/jenkins_tests/gtest.sh new file mode 100755 index 0000000..45875a0 --- /dev/null +++ b/contrib/jenkins_tests/gtest.sh @@ -0,0 +1,54 @@ +#!/bin/bash -eExl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for gtest ..." "on" + +if [ $(command -v ibdev2netdev >/dev/null 2>&1 || echo $?) ]; then + echo "[SKIP] ibdev2netdev tool does not exist" + exit 0 +fi + +cd $WORKSPACE + +rm -rf $gtest_dir +mkdir -p $gtest_dir +cd $gtest_dir + +gtest_app="$PWD/tests/gtest/gtest" +gtest_lib=$install_dir/lib/libvma.so + +gtest_ip_list="" +if [ ! -z $(do_get_ip 'eth') ]; then + gtest_ip_list="$(do_get_ip 'eth')" +fi +if [ ! -z $(do_get_ip 'eth' '' $gtest_ip_list) ]; then + gtest_ip_list="${gtest_ip_list}:$(do_get_ip 'eth' '' $gtest_ip_list)" +else + echo "[SKIP] two eth interfaces are required. 
found: ${gtest_ip_list}" + exit 0 +fi +gtest_opt="--addr=${gtest_ip_list}" + +set +eE + +${WORKSPACE}/configure --prefix=$install_dir +make -C tests/gtest + +eval "sudo pkill -9 vmad" +eval "sudo ${install_dir}/sbin/vmad --console -v5 &" + +eval "$timeout_exe env GTEST_TAP=2 LD_PRELOAD=$gtest_lib $gtest_app $gtest_opt" +rc=$(($rc+$?)) + +eval "sudo pkill -9 vmad" + +set -eE + +for f in $(find $gtest_dir -name '*.tap') +do + cp $f ${WORKSPACE}/${prefix}/gtest-$(basename $f .tap).tap +done + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/rpm.sh b/contrib/jenkins_tests/rpm.sh new file mode 100755 index 0000000..3481e1a --- /dev/null +++ b/contrib/jenkins_tests/rpm.sh @@ -0,0 +1,79 @@ +#!/bin/bash -eExl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for rpm ..." "off" + +cd $WORKSPACE + +rm -rf $rpm_dir +mkdir -p $rpm_dir +cd $rpm_dir + +rpm_tap=${WORKSPACE}/${prefix}/rpm.tap + +cd ${build_dir}/0 + +if [ -x /usr/bin/dpkg-buildpackage ]; then + echo "Build on debian" + set +e + ${WORKSPACE}/build/build_deb.sh 2> "${rpm_dir}/rpm-deb.err" 1> "${rpm_dir}/rpm-deb.log" + rc=$((rc + $?)) + if [ -f "${WORKSPACE}/build_debian/build_debian.log" ]; then + cp ${WORKSPACE}/build_debian/build_debian.log ${rpm_dir}/rpm-deb.out + else + echo "file: ${WORKSPACE}/build_debian/build_debian.log is not found" > ${rpm_dir}/rpm-deb.out + fi + do_archive "${rpm_dir}/*.err" "${rpm_dir}/*.log" "${rpm_dir}/rpm-deb.out" + set -e + echo "1..1" > $rpm_tap + if [ $rc -gt 0 ]; then + echo "not ok 1 Debian package" >> $rpm_tap + else + echo ok 1 Debian package >> $rpm_tap + fi +else + echo "Build rpms" + rpmspec=${build_dir}/0/build/libvma.spec + rpmmacros="--define='_rpmdir ${rpm_dir}/rpm-dist' --define='_srcrpmdir ${rpm_dir}/rpm-dist' --define='_sourcedir ${rpm_dir}' --define='_specdir ${rpm_dir}' --define='_builddir ${rpm_dir}'" + rpmopts="--nodeps --buildroot='${rpm_dir}/_rpm'" + + opt_tarball=1 + opt_srcrpm=1 + opt_binrpm=1 + + 
echo "1..$(($opt_tarball + $opt_srcrpm + $opt_binrpm))" > $rpm_tap + + # SuSE can not create this folder + mkdir -p ${rpm_dir}/rpm-dist + + test_id=0 + if [ $opt_tarball -eq 1 ]; then + # Automake 1.10.1 has a bug https://bugs.debian.org/cgi-bin/bugreport.cgi?bug=456632 + if [ -n "$(automake --version | grep 'automake (GNU automake) 1.10.1')" ]; then + test_exec='make dist' + else + test_exec='make dist && make distcheck' + fi + + do_check_result "$test_exec" "$test_id" "tarball" "$rpm_tap" "${rpm_dir}/rpm-${test_id}" + eval $timeout_exe cp libvma*.tar.gz ${rpm_dir} + test_id=$((test_id+1)) + fi + + if [ $opt_srcrpm -eq 1 ]; then + test_exec="rpmbuild -bs $rpmmacros $rpmopts $rpmspec" + do_check_result "$test_exec" "$test_id" "srcrpm" "$rpm_tap" "${rpm_dir}/rpm-${test_id}" + test_id=$((test_id+1)) + fi + + if [ $opt_binrpm -eq 1 ]; then + test_exec="rpmbuild -bb $rpmmacros $rpmopts $rpmspec" + do_check_result "$test_exec" "$test_id" "binrpm" "$rpm_tap" "${rpm_dir}/rpm-${test_id}" + test_id=$((test_id+1)) + fi +fi + + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/style.sh b/contrib/jenkins_tests/style.sh new file mode 100755 index 0000000..000441e --- /dev/null +++ b/contrib/jenkins_tests/style.sh @@ -0,0 +1,89 @@ +#!/bin/bash -xeEl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for codying style ..." "on" + +cd $WORKSPACE + +rm -rf $style_dir +mkdir -p $style_dir +cd $style_dir + +stoplist_pattern=${stoplist_pattern:="WARNING"} + +checkpatch=/hpc/local/scripts/checkpatch/checkpatch.pl +if [ ! -e $checkpatch ]; then + set +e + eval wget --no-check-certificate https://raw.githubusercontent.com/torvalds/linux/master/scripts/checkpatch.pl + ret=$? + if [ $ret -gt 0 ]; then break; fi + eval wget --no-check-certificate https://github.com/torvalds/linux/blob/master/scripts/spelling.txt + ret=$? 
+ if [ $ret -gt 0 ]; then break; fi + chmod +x checkpatch.pl + set -e + checkpatch=$style_dir/checkpatch.pl +fi + +if [ -e $checkpatch ]; then + + style_tap=${WORKSPACE}/${prefix}/style_test.tap + rm -rf $style_tap + check_files=$(find $WORKSPACE/src/state_machine/ -name '*.c' -o -name '*.cpp' -o -name '*.h') + check_files+=" " + check_files+=$(find $WORKSPACE/src/stats/ -name '*.c' -o -name '*.cpp' -o -name '*.h') + check_files+=" " + check_files+=$(find $WORKSPACE/src/vlogger/ -name '*.c' -o -name '*.cpp' -o -name '*.h') + check_files+=" " + check_files+=$(find $WORKSPACE/src/vma/dev/ -name '*.c' -o -name '*.cpp' -o -name '*.h') + check_files+=" " + check_files+=$(find $WORKSPACE/src/vma/event/ -name '*.c' -o -name '*.cpp' -o -name '*.h') + check_files+=" " + check_files+=$(find $WORKSPACE/src/vma/infra/ -name '*.c' -o -name '*.cpp' -o -name '*.h') + check_files+=" " + check_files+=$(find $WORKSPACE/src/vma/iomux/ -name '*.c' -o -name '*.cpp' -o -name '*.h') + check_files+=" " + check_files+=$(find $WORKSPACE/src/vma/netlink/ -name '*.c' -o -name '*.cpp' -o -name '*.h') + check_files+=" " + check_files+=$(find $WORKSPACE/src/vma/proto/ -name '*.c' -o -name '*.cpp' -o -name '*.h') + check_files+=" " + check_files+=$(find $WORKSPACE/src/vma/sock/ -name '*.c' -o -name '*.cpp' -o -name '*.h') + check_files+=" " + check_files+=$(find $WORKSPACE/src/vma -name '*.c' -o -name '*.cpp' -o -name '*.h') + + echo "1..$(echo $check_files | wc -w)" > $style_tap + i=0 + status="success" + gh_url="$BUILD_URL/console" + nerrors=0 + + for file in $check_files; do + set +e + ret=$(perl -X $checkpatch --file --terse --no-tree $file | grep -v -w $stoplist_pattern| wc -l) + nerrors=$((nerrors+ret)) + set -e + i=$((i+1)) + + fix_file=$(echo $file|sed -e s,$WORKSPACE/,,g) + + if [ $ret -gt 0 ]; then + echo "not ok $i $fix_file # TODO" >> $style_tap + #status="error" + info="checkpatch.pl detected $nerrors style errors" + else + echo "ok $i $fix_file" >> $style_tap + fi + done + + 
rc=$(($rc+$nerrors)) + + if [ -n "$ghprbGhRepository" ]; then + context="MellanoxLab/codestyle" + do_github_status "repo='$ghprbGhRepository' sha1='$ghprbActualCommit' target_url='$gh_url' state='$status' info='$info' context='$context'" + fi + +fi + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/test.sh b/contrib/jenkins_tests/test.sh new file mode 100755 index 0000000..e464ec0 --- /dev/null +++ b/contrib/jenkins_tests/test.sh @@ -0,0 +1,155 @@ +#!/bin/bash -eExl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for test ..." "on" + +if [ $(command -v ibdev2netdev >/dev/null 2>&1 || echo $?) ]; then + echo "[SKIP] ibdev2netdev tool does not exist" + exit 0 +fi + +cd $WORKSPACE + +rm -rf $test_dir +mkdir -p $test_dir +cd $test_dir + +do_cmd "wget -O sockperf_v2.zip https://github.com/Mellanox/sockperf/archive/sockperf_v2.zip && unzip sockperf_v2.zip && mv sockperf-sockperf_v2 sockperf" +cd sockperf + +./autogen.sh +./configure --prefix=$PWD/install CPPFLAGS="-I${install_dir}/include" +make install +test_app="$PWD/install/bin/sockperf" + +if [ $(command -v $test_app >/dev/null 2>&1 || echo $?) ]; then + echo can not find $test_app + exit 1 +fi + +test_ip_list="" +test_list="tcp-pp tcp-tp tcp-ul" +test_lib=$install_dir/lib/libvma.so + +if [ ! -z "${test_remote_ip}" ] ; then + [[ "${test_remote_ip}" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]] || {\ + echo ">> FAIL wrong ip address ${test_remote_ip}" + exit 1 + } + test_ip_list="eth:${test_remote_ip}" + [ -z "${NODE_NAME}" ] && NODE_NAME=$(hostname) + sperf_exec_dir="/tmp/sockperf_exec_${NODE_NAME}" + rmt_user=root + + rmt_os=$(sudo ssh ${rmt_user}@${test_remote_ip} ". /etc/os-release ; echo \${NAME,,} | awk '{print \$1}'") + [ ! -z "${test_remote_rebuild}" ] && rmt_os="rebuld" + local_os=$(. 
/etc/os-release ; echo ${NAME,,} | awk '{print $1}') + + #skip_remote_prep=1 + if [ -z "${skip_remote_prep}" ] ; then + sudo ssh ${rmt_user}@${test_remote_ip} "rm -rf ${sperf_exec_dir} && mkdir ${sperf_exec_dir}" + + if [[ "${rmt_os}" =~ .*"${local_os}".* ]] ; then + sudo scp -q ${test_app} ${rmt_user}@${test_remote_ip}:${sperf_exec_dir} + sudo scp -q ${test_lib} ${rmt_user}@${test_remote_ip}:${sperf_exec_dir} + eval "pid=$(sudo ssh ${rmt_user}@${test_remote_ip} pidof vmad)" + if [ ! -z "${pid}" ] ; then + echo "vmad pid=${pid}" + eval "sudo ssh ${rmt_user}@${test_remote_ip} kill -9 ${pid}" + fi + sudo scp -q ${install_dir}/sbin/vmad ${rmt_user}@${test_remote_ip}:${sperf_exec_dir} + eval "sudo ssh ${rmt_user}@${test_remote_ip} sudo ${sperf_exec_dir}/vmad &" + else + sudo -E rsync -q -I -a -r --exclude jenkins --exclude '*.o' --exclude '.deps' --exclude '*.l*' \ + -e ssh ${WORKSPACE} ${rmt_user}@${test_remote_ip}:${sperf_exec_dir} + sudo scp -q ${test_dir}/sockperf_v2.zip ${rmt_user}@${test_remote_ip}:${sperf_exec_dir} + if [ $? -eq 0 ] ; then + subdir=${WORKSPACE##*/} + cmd="cd ${sperf_exec_dir}/${subdir} && " + cmd+="./autogen.sh && ./configure && make ${make_opt} && " + cmd+="cp src/vma/.libs/*.so ${sperf_exec_dir} &&" + cmd+="cd ${sperf_exec_dir} && " + cmd+="unzip sockperf_v2.zip && cd sockperf-sockperf_v2 && " + cmd+="./autogen.sh && ./configure && make ${make_opt} && cp sockperf ${sperf_exec_dir}" + sudo ssh ${rmt_user}@${test_remote_ip} "${cmd}" + else + exit 1 + fi + fi + fi +else + if [ ! -z $(do_get_ip 'ib') ]; then + test_ip_list="${test_ip_list} ib:$(do_get_ip 'ib')" + fi + if [ ! -z $(do_get_ip 'eth') ]; then + test_ip_list="${test_ip_list} eth:$(do_get_ip 'eth')" + fi +fi + +nerrors=0 + +for test_link in $test_ip_list; do + for test in $test_list; do + IFS=':' read test_in test_ip <<< "$test_link" + test_name=${test_in}-${test} + test_tap=${WORKSPACE}/${prefix}/test-${test_name}.tap + + if [ ! 
-z "${test_remote_ip}" ] ; then + + eval "pid=$(sudo pidof vmad)" + [ ! -z "${pid}" ] && eval "sudo kill -9 ${pid}" + eval "sudo ${install_dir}/sbin/vmad --console -v5 & " + + echo "BUILD_NUMBER=${BUILD_NUMBER}" + eval "pid=$(sudo ssh ${rmt_user}@${test_remote_ip} pidof vmad)" + if [ ! -z "${pid}" ] ; then + echo "vmad pid=${pid}" + eval "sudo ssh ${rmt_user}@${test_remote_ip} kill -9 ${pid}" + fi + sudo scp -q ${install_dir}/sbin/vmad ${rmt_user}@${test_remote_ip}:${sperf_exec_dir} + eval "sudo ssh ${rmt_user}@${test_remote_ip} sudo ${sperf_exec_dir}/vmad &" + + vutil="$(dirname $0)/vutil.sh" + [ ! -e "${vutil}" ] && { echo "error vutil not found" ; exit 1 ; } + + sudo $timeout_exe ${vutil} -a "${test_app}" -x "--load-vma=${test_lib} " -t "${test}:tc[1-9]$" \ + -s "${test_remote_ip}" -p "${test_remote_port}" -l "${test_dir}/${test_name}.log" \ + -e "VMA_TX_BUFS=20000 VMA_RX_BUFS=20000" + + else + $timeout_exe $PWD/tests/verifier/verifier.pl -a ${test_app} -x " --load-vma=$test_lib " \ + -t ${test}:tc[1-9]$ -s ${test_ip} -l ${test_dir}/${test_name}.log \ + -e " VMA_TX_BUFS=20000 VMA_RX_BUFS=20000 " \ + --progress=0 + fi + + cp $PWD/${test_name}.dump ${test_dir}/${test_name}.dump + grep -e 'PASS' -e 'FAIL' ${test_dir}/${test_name}.dump > ${test_dir}/${test_name}.tmp + + do_archive "${test_dir}/${test_name}.dump" "${test_dir}/${test_name}.log" + + echo "1..$(wc -l < ${test_dir}/${test_name}.tmp)" > $test_tap + + v1=1 + while read line; do + if [[ $(echo $line | cut -f1 -d' ') =~ 'PASS' ]]; then + v0='ok' + v2=$(echo $line | sed 's/PASS //') + else + v0='not ok' + v2=$(echo $line | sed 's/FAIL //') + nerrors=$((nerrors+1)) + fi + + echo -e "$v0 ${test_in}: $v2" >> $test_tap + v1=$(($v1+1)) + done < ${test_dir}/${test_name}.tmp + rm -f ${test_dir}/${test_name}.tmp + done +done + +rc=$(($rc+$nerrors)) + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/tool.sh b/contrib/jenkins_tests/tool.sh new file mode 100755 index 
0000000..646a675 --- /dev/null +++ b/contrib/jenkins_tests/tool.sh @@ -0,0 +1,68 @@ +#!/bin/bash -eExl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for tool ..." "off" + +cd $WORKSPACE + +rm -rf $tool_dir +mkdir -p $tool_dir +cd $tool_dir + +tool_list="daemon" + +tool_tap=${WORKSPACE}/${prefix}/tool.tap +echo "1..$(echo $tool_list | tr " " "\n" | wc -l)" > $tool_tap + +function check_daemon() +{ + local ret=0 + local out_log=$1 + local service="vma" + + rm -rf ${out_log} + sudo pkill -9 vmad + + if type systemctl >/dev/null 2>&1; then + service=${install_dir}/sbin/vma + else + service=${install_dir}/etc/init.d/vma + fi + + echo "daemon check output: ${service}" > ${out_log} + if [ $(sudo ${service} start >>${out_log} 2>&1 || echo $?) ]; then + ret=1 + fi + sleep 3 + if [ "0" == "$ret" -a "" == "$(pgrep vmad)" ]; then + ret=1 + fi + if [ $(sudo ${service} status >>${out_log} 2>&1 || echo $?) ]; then + ret=1 + fi + if [ $(sudo ${service} stop >>${out_log} 2>&1 || echo $?) ]; then + ret=1 + fi + sleep 3 + if [ "0" == "$ret" -a "" != "$(pgrep vmad)" ]; then + ret=1 + fi + + sudo pkill -9 vmad + + echo "$ret" +} + +test_id=0 +for tool in $tool_list; do + mkdir -p ${tool_dir}/${tool} + cd ${tool_dir}/${tool} + test_id=$((test_id+1)) + test_exec="[ 0 = $(check_daemon "${tool_dir}/${tool}/output.log") ]" + do_check_result "$test_exec" "$test_id" "$tool" "$tool_tap" "${tool_dir}/tool-${test_id}" + cd ${tool_dir} +done + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/vg.sh b/contrib/jenkins_tests/vg.sh new file mode 100755 index 0000000..23367a3 --- /dev/null +++ b/contrib/jenkins_tests/vg.sh @@ -0,0 +1,123 @@ +#!/bin/bash -eExl + +source $(dirname $0)/globals.sh + +do_check_filter "Checking for valgrind ..." 
"on" + +do_module "tools/valgrind-3.12.0" + +set +eE + +cd $WORKSPACE +rm -rf $vg_dir +mkdir -p $vg_dir +cd $vg_dir + +${WORKSPACE}/configure --prefix=${vg_dir}/install --with-valgrind $jenkins_test_custom_configure + +make $make_opt all +make install +rc=$? + + +test_ip_list="" +#if [ ! -z $(do_get_ip 'ib' 'mlx5') ]; then +# test_ip_list="${test_ip_list} ib:$(do_get_ip 'ib' 'mlx5')" +#fi +if [ ! -z "$(do_get_ip 'eth' 'mlx5')" ]; then + test_ip_list="${test_ip_list} eth:$(do_get_ip 'eth' 'mlx5')" +fi +test_list="tcp:--tcp udp:" +test_lib=${vg_dir}/install/lib/libvma.so +test_app=sockperf +test_app_path=${test_dir}/sockperf/install/bin/sockperf + +if [ $(command -v $test_app_path >/dev/null 2>&1 || echo $?) ]; then + test_app_path=sockperf + if [ $(command -v $test_app_path >/dev/null 2>&1 || echo $?) ]; then + echo can not find $test_app_path + exit 1 + fi +fi + +vg_tap=${WORKSPACE}/${prefix}/vg.tap +v1=$(echo $test_list | wc -w) +v1=$(($v1*$(echo $test_ip_list | wc -w))) +echo "1..$v1" > $vg_tap + +nerrors=0 + +for test_link in $test_ip_list; do + for test in $test_list; do + IFS=':' read test_n test_opt <<< "$test" + IFS=':' read test_in test_ip <<< "$test_link" + test_name=${test_in}-${test_n} + + vg_args="-v \ + --memcheck:leak-check=full --track-origins=yes --read-var-info=yes \ + --errors-for-leak-kinds=definite --show-leak-kinds=definite,possible \ + --undef-value-errors=yes --track-fds=yes --num-callers=32 \ + --fullpath-after=${WORKSPACE} --gen-suppressions=all \ + --suppressions=${WORKSPACE}/contrib/valgrind/valgrind_vma.supp \ + " + eval "LD_PRELOAD=$test_lib \ + valgrind --log-file=${vg_dir}/${test_name}-valgrind-sr.log $vg_args \ + $test_app_path sr ${test_opt} -i ${test_ip} > /dev/null 2>&1 &" + sleep 20 + eval "LD_PRELOAD=$test_lib \ + valgrind --log-file=${vg_dir}/${test_name}-valgrind-cl.log $vg_args \ + $test_app_path pp ${test_opt} -i ${test_ip} -t 10" + + if [ `ps -ef | grep $test_app | wc -l` -gt 1 ]; + then + sudo pkill -SIGINT -f $test_app 
2>/dev/null || true + sleep 10 + # in case SIGINT didn't work + if [ `ps -ef | grep $test_app | wc -l` -gt 1 ]; + then + sudo pkill -SIGTERM -f $test_app 2>/dev/null || true + sleep 3 + fi + if [ `ps -ef | grep $test_app | wc -l` -gt 1 ]; + then + sudo pkill -SIGKILL -f $test_app 2>/dev/null || true + fi + fi + + ret=$(cat ${vg_dir}/${test_name}-valgrind*.log | awk '/ERROR SUMMARY: [0-9]+ errors?/ { sum += $4 } END { print sum }') + + do_archive "${vg_dir}/${test_name}-valgrind*.log" + + if [ $ret -gt 0 ]; then + echo "not ok ${test_name}: valgrind Detected $ret failures # ${vg_dir}/${test_name}-valgrind*.log" >> $vg_tap + grep -A 10 'LEAK SUMMARY' ${vg_dir}/${test_name}-valgrind*.log >> ${vg_dir}/${test_name}-valgrind.err + cat ${vg_dir}/${test_name}-valgrind*.log + do_err "valgrind" "${vg_dir}/${test_name}-valgrind.err" + else + echo ok ${test_name}: Valgrind found no issues >> $vg_tap + fi + nerrors=$(($ret+$nerrors)) + done +done + +if [ $nerrors -gt 0 ]; then + info="Valgrind found $nerrors errors" + status="error" +else + info="Valgrind found no issues" + status="success" +fi + +vg_url="$BUILD_URL/valgrindResult/" + +if [ -n "$ghprbGhRepository" ]; then + context="MellanoxLab/valgrind" + do_github_status "repo='$ghprbGhRepository' sha1='$ghprbActualCommit' target_url='$vg_url' state='$status' info='$info' context='$context'" +fi + +module unload tools/valgrind-3.12.0 + +rc=$(($rc+$nerrors)) +set -eE +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/jenkins_tests/vutil.sh b/contrib/jenkins_tests/vutil.sh new file mode 100755 index 0000000..1a6dc6f --- /dev/null +++ b/contrib/jenkins_tests/vutil.sh @@ -0,0 +1,420 @@ +#!/bin/bash + +HOST=$(hostname) + +server_success_msgs="'Test end', 'interrupted by', 'exit'" +server_failure_msgs="'Segmentation fault', 'Assertion', 'ERROR'" + +client_success_ul_msgs="'Test ended', 'Summary: Latency is'" +client_success_pp_msgs="'Test ended', 'Summary: Latency is'" +client_success_tp_msgs="'Test 
ended', 'Summary: Message Rate'" +client_failure_msgs="'Segmentation fault', 'Assertion', 'ERROR', 'server down'" + +dlm=~ + +############ +ts_tcp_pp() +############ +{ + local sperf=$1 local ipaddr=$2 local opts=$3 + #1 + ts_tcp_pp_tc1="#1 - ping-pong w/o arguments"${dlm} + ts_tcp_pp_tc1+="${sperf} pp -i ${ipaddr} --tcp ${opts}"${dlm} + ts_tcp_pp_tc1+=${server_success_msgs}${dlm} + ts_tcp_pp_tc1+=${server_failure_msgs}${dlm} + ts_tcp_pp_tc1+="client_success='Test ended', 'Summary: Latency is', 'Warmup stage (sending a few dummy messages)...'"${dlm} + ts_tcp_pp_tc1+=${client_failure_msgs} + #2 + ts_tcp_pp_tc2="#2 - ping-pong option --dontwarmup"${dlm} + ts_tcp_pp_tc2+="${sperf} pp -i ${ipaddr} --tcp --dontwarmup ${opts}"${dlm} + ts_tcp_pp_tc2+=${server_success_msgs}${dlm} + ts_tcp_pp_tc2+=${server_failure_msgs}${dlm} + ts_tcp_pp_tc2+=${client_success_pp_msgs}${dlm} + ts_tcp_pp_tc2+="client_failure='Segmentation fault', 'Assertion', 'ERROR', 'server down', 'Warmup stage (sending a few dummy messages)...'" + #3 + ts_tcp_pp_tc3="#3 - ping-pong option -b10"${dlm} + ts_tcp_pp_tc3+="${sperf} pp -i ${ipaddr} --tcp -b10 ${opts}"${dlm} + ts_tcp_pp_tc3+=${server_success_msgs}${dlm} + ts_tcp_pp_tc3+=${server_failure_msgs}${dlm} + ts_tcp_pp_tc3+="client_success='Test ended', 'Summary: Latency of burst of 10 messages'"${dlm} + ts_tcp_pp_tc3+=${client_failure_msgs} + #4 + ts_tcp_pp_tc4="#4 - ping-pong option -b100"${dlm} + ts_tcp_pp_tc4+="${sperf} pp -i ${ipaddr} --tcp -b100 ${opts}"${dlm} + ts_tcp_pp_tc4+=${server_success_msgs}${dlm} + ts_tcp_pp_tc4+=${server_failure_msgs}${dlm} + ts_tcp_pp_tc4+="client_success='Test ended', 'Summary: Latency of burst of 100 messages'"${dlm} + ts_tcp_pp_tc4+=${client_failure_msgs} + #5 + ts_tcp_pp_tc5="#5 - ping-pong option -b1000"${dlm} + ts_tcp_pp_tc5+="${sperf} pp -i ${ipaddr} --tcp -b1000 ${opts}"${dlm} + ts_tcp_pp_tc5+=${server_success_msgs}${dlm} + ts_tcp_pp_tc5+=${server_failure_msgs}${dlm} + ts_tcp_pp_tc5+="client_success='Test ended', 
'Summary: Latency of burst of 1000 messages'"${dlm} + ts_tcp_pp_tc5+=${client_failure_msgs} + #6 + ts_tcp_pp_tc6="#6 - ping-pong option -t10"${dlm} + ts_tcp_pp_tc6+="${sperf} pp -i ${ipaddr} --tcp -t10 ${opts}"${dlm} + ts_tcp_pp_tc6+=${server_success_msgs}${dlm} + ts_tcp_pp_tc6+=${server_failure_msgs}${dlm} + ts_tcp_pp_tc6+="client_success='Test ended', 'Summary: Latency is', 'RunTime=10'"${dlm} + ts_tcp_pp_tc6+=${client_failure_msgs} + #7 + ts_tcp_pp_tc7="#7 - ping-pong option -t30"${dlm} + ts_tcp_pp_tc7+="${sperf} pp -i ${ipaddr} --tcp -t30 ${opts}"${dlm} + ts_tcp_pp_tc7+=${server_success_msgs}${dlm} + ts_tcp_pp_tc7+=${server_failure_msgs}${dlm} + ts_tcp_pp_tc7+="client_success='Test ended', 'Summary: Latency is', 'RunTime=30'"${dlm} + ts_tcp_pp_tc7+=${client_failure_msgs} + #8 + ts_tcp_pp_tc8="#8 - ping-pong option -m32"${dlm} + ts_tcp_pp_tc8+="${sperf} pp -i ${ipaddr} --tcp -m32 ${opts}"${dlm} + ts_tcp_pp_tc8+=${server_success_mss}${dlm} + ts_tcp_pp_tc8+=${server_failure_msgs}${dlm} + ts_tcp_pp_tc8+=${client_success_pp_msgs}${dlm} + ts_tcp_pp_tc8+=${client_failure_msgs} + #9 + ts_tcp_pp_tc9="#9 - ping-pong option -m4096"${dlm} + ts_tcp_pp_tc9+="${sperf} pp -i ${ipaddr} --tcp -m4096 ${opts}"${dlm} + ts_tcp_pp_tc9+=${server_success_msgs}${dlm} + ts_tcp_pp_tc9+=${server_failure_msgs}${dlm} + ts_tcp_pp_tc9+=${client_success_pp_msgs}${dlm} + ts_tcp_pp_tc9+=${client_failure_msgs} +} + +############ +ts_tcp_tp() +############ +{ + local sperf=$1 local ipaddr=$2 local opts=$3 + #1 + ts_tcp_tp_tc1="#1 - throughput w/o arguments"${dlm} + ts_tcp_tp_tc1+="${sperf} tp -i ${ipaddr} --tcp ${opts}"${dlm} + ts_tcp_tp_tc1+=${server_success_msgs}${dlm} + ts_tcp_tp_tc1+=${server_failure_msgs}${dlm} + ts_tcp_tp_tc1+="client_success='Test ended', 'Summary: Message Rate', 'Warmup stage (sending a few dummy messages)...'"${dlm} + ts_tcp_tp_tc1+=${client_failure_msgs} + #2 + ts_tcp_tp_tc2="#2 - throughput option --dontwarmup"${dlm} + ts_tcp_tp_tc2+="${sperf} tp -i ${ipaddr} --tcp 
--dontwarmup ${opts}"${dlm} + ts_tcp_tp_tc2+=${server_success_msgs}${dlm} + ts_tcp_tp_tc2+=${server_failure_msgs}${dlm} + ts_tcp_tp_tc2+="client_success='Test ended', 'Summary: Message Rate'"${dlm} + ts_tcp_tp_tc2+="client_failure='Segmentation fault', 'Assertion', 'ERROR', 'server down', 'Warmup stage (sending a few dummy messages)...'"${dlm} + #3 + ts_tcp_tp_tc3="#3 - throughput option -b10"${dlm} + ts_tcp_tp_tc3+="${sperf} tp -i ${ipaddr} --tcp -b10 ${opts}"${dlm} + ts_tcp_tp_tc3+=${server_success_msgs}${dlm} + ts_tcp_tp_tc3+=${server_failure_msgs}${dlm} + ts_tcp_tp_tc3+=${client_success_tp_msgs}${dlm} + ts_tcp_tp_tc3+=${client_failure_msgs} + #4 + ts_tcp_tp_tc4="#4 - throughput option -b100"${dlm} + ts_tcp_tp_tc4+="${sperf} tp -i ${ipaddr} --tcp -b100 ${opts}"${dlm} + ts_tcp_tp_tc4+=${server_success_msgs}${dlm} + ts_tcp_tp_tc4+=${server_failure_msgs}${dlm} + ts_tcp_tp_tc4+=${client_success_tp_msgs}${dlm} + ts_tcp_tp_tc4+=${client_failure_msgs} + #5 + ts_tcp_tp_tc5="#5 - throughput option -b1000"${dlm} + ts_tcp_tp_tc5+="${sperf} tp -i ${ipaddr} --tcp -b1000 ${opts}"${dlm} + ts_tcp_tp_tc5+=${server_success_msgs}${dlm} + ts_tcp_tp_tc5+=${server_failure_msgs}${dlm} + ts_tcp_tp_tc5+=${client_success_tp_msgs}${dlm} + ts_tcp_tp_tc5+=${client_failure_msgs} + #6 + ts_tcp_tp_tc6="#6 - throughput option -t10"${dlm} + ts_tcp_tp_tc6+="${sperf} tp -i ${ipaddr} --tcp -t10 ${opts}"${dlm} + ts_tcp_tp_tc6+=${server_success_msgs}${dlm} + ts_tcp_tp_tc6+=${server_failure_msgs}${dlm} + ts_tcp_tp_tc6+=${client_success_tp_msgs}${dlm} + ts_tcp_tp_tc6+=${client_failure_msgs} + #7 + ts_tcp_tp_tc7="#7 - throughput option -t30"${dlm} + ts_tcp_tp_tc7+="${sperf} tp -i ${ipaddr} --tcp -t30 ${opts}"${dlm} + ts_tcp_tp_tc7+=${server_success_msgs}${dlm} + ts_tcp_tp_tc7+=${server_failure_msgs}${dlm} + ts_tcp_tp_tc7+=${client_success_tp_msgs}${dlm} + ts_tcp_tp_tc7+=${client_failure_msgs} + #8 + ts_tcp_tp_tc8="#8 - throughput option -m32"${dlm} + ts_tcp_tp_tc8+="${sperf} tp -i ${ipaddr} --tcp -m32 
${opts}"${dlm} + ts_tcp_tp_tc8+=${server_success_msgs}${dlm} + ts_tcp_tp_tc8+=${server_failure_msgs}${dlm} + ts_tcp_tp_tc8+=${client_success_tp_msgs}${dlm} + ts_tcp_tp_tc8+=${client_failure_msgs} + #9 + ts_tcp_tp_tc9="#9 - throughput option -m4096"${dlm} + ts_tcp_tp_tc9+="${sperf} tp -i ${ipaddr} --tcp -m4096 ${opts}"${dlm} + ts_tcp_tp_tc9+=${server_success_msgs}${dlm} + ts_tcp_tp_tc9+=${server_failure_msgs}${dlm} + ts_tcp_tp_tc9+=${client_success_tp_msgs}${dlm} + ts_tcp_tp_tc9+=${client_failure_msgs} +} + +############ +ts_tcp_ul() +############ +{ + local sperf=$1 local ipaddr=$2 local opts=$3 + #1 + ts_tcp_ul_tc1="#1 - under-load w/o arguments"${dlm} + ts_tcp_ul_tc1+="${sperf} ul -i ${ipaddr} --tcp ${opts}"${dlm} + ts_tcp_ul_tc1+=${server_success_msgs}${dlm} + ts_tcp_ul_tc1+=${server_failure_msgs}${dlm} + ts_tcp_ul_tc1+="client_success='Test ended', 'Summary: Latency is', 'Warmup stage (sending a few dummy messages)...'"${dlm} + ts_tcp_ul_tc1+=${client_failure_msgs} + #2 + ts_tcp_ul_tc2="#2 - under-load option --dontwarmup"${dlm} + ts_tcp_ul_tc2+="${sperf} ul -i ${ipaddr} --tcp --dontwarmup ${opts}"${dlm} + ts_tcp_ul_tc2+=${server_success_msgs}${dlm} + ts_tcp_ul_tc2+=${server_failure_msgs}${dlm} + ts_tcp_ul_tc2+=${client_success_ul_msgs}${dlm} + ts_tcp_ul_tc2+="client_failure='Segmentation fault', 'Assertion', 'ERROR', 'server down', 'Warmup stage (sending a few dummy messages)...'"${dlm} + #3 + ts_tcp_ul_tc3="#3 - under-load option -b10"${dlm} + ts_tcp_ul_tc3+="${sperf} ul -i ${ipaddr} --tcp -b10 ${opts}"${dlm} + ts_tcp_ul_tc3+=${server_success_msgs}${dlm} + ts_tcp_ul_tc3+=${server_failure_msgs}${dlm} + ts_tcp_ul_tc3+="client_success='Test ended', 'Summary: Latency of burst of 10 messages'"${dlm} + ts_tcp_ul_tc3+=${client_failure_msgs} + #4 + ts_tcp_ul_tc4="#4 - under-load option -b100"${dlm} + ts_tcp_ul_tc4+="${sperf} ul -i ${ipaddr} --tcp -b100 ${opts}"${dlm} + ts_tcp_ul_tc4+=${server_success_msgs}${dlm} + ts_tcp_ul_tc4+=${server_failure_msgs}${dlm} + 
ts_tcp_ul_tc4+="client_success='Test ended', 'Summary: Latency of burst of 100 messages'"${dlm} + ts_tcp_ul_tc4+=${client_failure_msgs} + #5 + ts_tcp_ul_tc5="#5 - under-load option -b1000"${dlm} + ts_tcp_ul_tc5+="${sperf} ul -i ${ipaddr} --tcp -b1000 ${opts}"${dlm} + ts_tcp_ul_tc5+=${server_success_msgs}${dlm} + ts_tcp_ul_tc5+=${server_failure_msgs}${dlm} + ts_tcp_ul_tc5+="client_success='Test ended', 'Summary: Latency of burst of 1000 messages'"${dlm} + ts_tcp_ul_tc5+=${client_failure_msgs} + #6 + ts_tcp_ul_tc6="#6 - under-load option -t10"${dlm} + ts_tcp_ul_tc6+="${sperf} ul -i ${ipaddr} --tcp -t10 ${opts}"${dlm} + ts_tcp_ul_tc6+=${server_success_msgs}${dlm} + ts_tcp_ul_tc6+=${server_failure_msgs}${dlm} + ts_tcp_ul_tc6+="client_success='Test ended', 'Summary: Latency is', 'RunTime=10'"${dlm} + ts_tcp_ul_tc6+=${client_failure_msgs} + #7 + ts_tcp_ul_tc7="#7 - under-load option -t30"${dlm} + ts_tcp_ul_tc7+="${sperf} ul -i ${ipaddr} --tcp -t10 ${opts}"${dlm} + ts_tcp_ul_tc7+=${server_success_msgs}${dlm} + ts_tcp_ul_tc7+=${server_failure_msgs}${dlm} + ts_tcp_ul_tc7+="client_success='Test ended', 'Summary: Latency is', 'RunTime=30'"${dlm} + ts_tcp_ul_tc7+=${client_failure_msgs} + #8 + ts_tcp_ul_tc8="#8 - under-load option -m32"${dlm} + ts_tcp_ul_tc8+="${sperf} ul -i ${ipaddr} --tcp -m32 ${opts}"${dlm} + ts_tcp_ul_tc8+=${server_success_msgs}${dlm} + ts_tcp_ul_tc8+=${server_failure_msgs}${dlm} + ts_tcp_ul_tc8+=${client_success_ul_msgs}${dlm} + ts_tcp_ul_tc8+=${client_failure_msgs} + #9 + ts_tcp_ul_tc9="#9 - under-load option -m4096"${dlm} + ts_tcp_ul_tc9+="${sperf} ul -i ${ipaddr} --tcp -m4096 ${opts}"${dlm} + ts_tcp_ul_tc9+=${server_success_msgs}${dlm} + ts_tcp_ul_tc9+=${server_failure_msgs}${dlm} + ts_tcp_ul_tc9+=${client_success_ul_msgs}${dlm} + ts_tcp_ul_tc9+=${client_failure_msgs} +} + +server_pid="" + +check_message() +{ + oifs="${IFS}" + IFS=',' read -r -a array <<< $(echo ${1##*=}) + IFS="${oifs}" + for im in "${array[@]}" ; do + im=$(echo ${im} | sed -e 
's/^[[:space:]]*//' -e 's/[[:space:]]*$//' -e 's/^'\''*//' -e 's/'\''*$//') + [[ "${2}" =~ .*"${im}".* ]] && echo "${im}" + done +} + +start_server() +{ + local env=$1 local ipaddr=$2 local port=$3 local proto=$4 local log_file=$5 local tsnv=$6 + + server_cmd="env ${env} ${SERVER_DIR}/sockperf sr -i ${ipaddr} -p ${port} --tcp --load-vma=${SERVER_DIR}/libvma.so" + + server_pid=$(ssh root@${ipaddr} ps -ax | grep -i sockperf | grep ${port} | awk '{print $1}') + [ ! -z "${server_pid}" ] && ssh root@${ipaddr} kill -9 ${server_pid} + + tmp=$(mktemp) + ssh root@${ipaddr} "${server_cmd}" >> ${tmp} 2>&1 >> ${tmp} & + sleep 5 + res=$(cat ${tmp}) + rm -f ${tmp} + echo "${res}" + echo "${res}" >> "${log_file}" + server_fail=$(echo ${tsnv} | awk -F${dlm} '{ print $4 }') + local chk_res=$(check_message "${server_fail}" "${res}") + if [ ! -z "${chk_res}" ] ; then + echo ">> FAIL ${server_cmd}" + else + server_success=$(echo ${tsnv} | awk -F${dlm} '{ print $3 }') + check_message "${server_success}" "${res}" + echo ">> PASS ${server_cmd}" + fi + server_pid=$(tac ${log_file} | grep -m 1 -oP '(?<=Pid: ).*') +} + +stop_server() +{ + local ipaddr=$1 local pid=$2 local log_file=$3 + res=$(ssh root@${ipaddr} kill -9 ${pid} >> ${log_file} 2>&1) + echo ">> Server process ${pid} has finished" >> "${log_file}" +} + +log_st() +{ + echo "${2}" >> "${1}" +} + +perform_ts() +{ + ts=$1 ns=$2 ne=$3 app_env=$4 sperf=$5 ipaddr=$6 port=$7 opts=$8 log_file=$9 + log_st_file="${log_file%.*}.dump" + log_st_file=${log_st_file##*/} + # init ts with params + ts_${ts} ${sperf} ${ipaddr} ${opts} + + log_st "${log_st_file}" "***********" + for ((i = ${ns}; i <= ${ne}; i++)); do + tsn="ts_${ts}_tc${i}" + if [ ! 
-z "${!tsn}" ] ; then + tsnv=${!tsn} + start_server "${app_env}" "${ipaddr}" ${port} "--tcp" "${log_file}" "${tsnv}" + + if [ -z "${dbg_srv}" ] ; then + name=$(echo ${tsnv} | awk -F${dlm} '{ print $1 }') + echo ${name} + st=$(echo ${tsnv} | awk -F${dlm} '{ print $2 }') + cmd_test="env ${app_env} ${st} -p ${port}" + local res=$(${cmd_test} 2>&1) + echo "${res}" + echo "${res}" >> "${log_file}" + client_fail=$(echo ${tsnv} | awk -F${dlm} '{ print $6 }') + chk_res=$(check_message "${client_fail}" "${res}") + if [ ! -z "${chk_res}" ] ; then + test_st="FAIL" + else + client_success=$(echo ${tcnv} | awk -F${dlm} '{ print $5 }') + check_message "${client_success}" "${chk_res}" + test_st="PASS" + fi + echo ">> ${test_st} ${cmd_test}" + log_st "${log_st_file}" "${test_st} ${ts} tc${i} ${name}" + fi + stop_server "${ipaddr}" "${server_pid}" "${log_file}" + else + break + fi + done + log_st "${log_st_file}" "***********" +} + +prepare_perform_ts() +{ + app_env=$1 app=$2 app_args=$3 task=$4 target=$5 port=$6 log_file=$7 + ts=${task%:*} ; ts=${ts//-/_} + num_tests=${TASK#*[} ; num_tests=${num_tests%]*} + start_num=${num_tests%-*} + end_num=${num_tests#*-} + + HOST=${HOST%%.*} + [ -z "${SERVER_DIR}" ] && SERVER_DIR="/tmp/sockperf_exec_${HOST}" + + if [ ! 
-z "${SRV_OPS}" ] ; then + if [ "${SRV_OPS}" == "start" ] ; then + start_server "${app_env}" ${target} ${port} "--tcp" "${log_file}" + stop_server ${target} "${server_pid}" "${log_file}" + fi + [ "${SRV_OPS}" == "stop" ] && stop_server "${target}" "${server_pid}" "${log_file}" + return + fi + perform_ts ${ts} ${start_num} ${end_num} "${app_env}" "${app}" ${target} ${port} "${app_args}" "${log_file}" +} + +usage() +{ +cat << eOm + usage:$0 -a app [-x|--app-arg 'args'] [-e|--app-env 'vars'] [-t|--task test] [-s|--target address] [-p|--port N] + [-l|--log fname] [--server-dir dir] [--dr] [-h] +eOm + exit 0 +} + +[ $# -eq 0 ] && usage + +OPTS=$(getopt -o ha:x:e:t:s:p:l: -l app:,app-arg:,app-env:,task:,target:,port:,log:,server-dir:,srv-start,srv-stop,help -- "$@") +[[ $? -ne 0 ]] && usage +eval set -- "${OPTS}" + +while true ; do + case "$1" in + -a|--app) + APP="$2" + shift 2 + ;; + -x|--app-arg) + APP_ARGS="$2" + shift 2 + ;; + -e|--app-env) + APP_ENV="$2" + shift 2 + ;; + -t|--task) + TASK="$2" + shift 2 + ;; + -s|--target) + [[ "$2" =~ ^[0-9]+\.[0-9]+\.[0-9]+\.[0-9]+$ ]] && TARGET="$2" + shift 2 + ;; + -p|--port) + [[ "$2" =~ ^-?[0-9]+$ ]] && PORT="$2" + shift 2 + ;; + -l|--log) + LOG_FILE="$2" + shift 2 + ;; + --srv-start|--srv-stop) + SRV_OPS=${1##*-} + shift 1 + ;; + --dr) + DRY_RUN=1 + shift 1 + ;; + --srv-ops) + SRV_OPS="$2" + shift 2 + ;; + -h|--help) + shift 1 + ;; + --) + shift + break + ;; + *) + usage + ;; + esac +done + +if [ ! -z "${APP}" ] ; then + prepare_perform_ts "${APP_ENV}" "${APP}" "${APP_ARGS}" "${TASK}" "${TARGET}" "${PORT}" "${LOG_FILE}" +else + usage +fi + +# + diff --git a/contrib/scripts/vma.init.in b/contrib/scripts/vma.init.in new file mode 100644 index 0000000..f440432 --- /dev/null +++ b/contrib/scripts/vma.init.in @@ -0,0 +1,149 @@ +#!/bin/bash +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +# See file LICENSE for terms. 
+# +# vma: Start the VMA Daemon +# +# chkconfig: 345 80 20 +# description: This is a daemon which handles the task of \ +# monitoring processes launched under VMA. +# +### BEGIN INIT INFO +# Provides: vma +# Required-Start: $local_fs $syslog +# Required-Stop: $local_fs $syslog +# Default-Start: 2 3 4 5 +# Default-Stop: 0 1 6 +# Short-Description: Start the VMA Daemon +# Description: This is a daemon which handles the task of +# monitoring processes launched under VMA. +### END INIT INFO + +prefix=@prefix@ +exec_prefix=@exec_prefix@ + +RETVAL=0 +exefile=vmad +options= +pidfile=/var/run/lock/vmad.lock + +# Source function library. +if [[ -s /etc/init.d/functions ]]; then + # RHEL / CentOS / SL / Fedora. + . /etc/init.d/functions + rc_status() { :; } +elif [[ -s /lib/lsb/init-functions ]]; then + # SLES / openSuSE / Debian. + . /lib/lsb/init-functions + success() { log_success_msg; } + failure() { log_failure_msg; } +elif [[ -s /etc/rc.status ]]; then + # Older SuSE systems. + . /etc/rc.status + failure() { rc_status -v; } + success() { rc_status -v; } +fi + + +function check_running() +{ + [ -e ${pidfile} ] && + [ "$(readlink "/proc/$(<${pidfile})/exe")" = "@sbindir@/${exefile}" ] +} + +function check_permission() +{ + [ "$(id -u)" = "0" ] +} + +function do_start() +{ + echo -n "Starting ${exefile}: " + + if check_running; then + RETVAL=0 + else + @sbindir@/${exefile} ${options} > /dev/null 2>&1 + RETVAL=$? + fi + if [[ $RETVAL -eq 0 ]]; then + success + sleep 1 + else + failure + fi + echo +} + +function do_stop() +{ + echo -n "Shutting down ${exefile}: " + + RETVAL=0 + if [ $(command -v pkill >/dev/null 2>&1 && echo $?) ]; then + pkill -INT "${exefile}" + RETVAL=$? + fi + if [[ $RETVAL -ne 0 ]]; then + killproc "${exefile}" -INT + RETVAL=$? + fi + if [[ $RETVAL -eq 0 ]]; then + success + sleep 1 + else + failure + fi + echo +} + +function do_status () +{ + pid="`pidof ${exefile}`" + ret=$? + if [ $ret -eq 0 ] ; then + echo "${exefile} is running... 
pid=$pid" + else + echo "${exefile} is not running." + fi +} + +function do_restart() +{ + do_stop + do_start +} + + +function do_check() +{ + if ! check_permission; then + echo "root permission is required" + exit 1 + fi +} + +case "$1" in + start) + do_check + do_start + ;; + stop) + do_check + do_stop + ;; + status) + do_status + ;; + restart | force-reload) + do_check + do_restart + ;; + *) + echo $"Usage: $0 {start|stop|status|restart|force-reload}" + RETVAL=1 + ;; +esac + +exit $RETVAL diff --git a/contrib/scripts/vma.service.in b/contrib/scripts/vma.service.in new file mode 100644 index 0000000..e268173 --- /dev/null +++ b/contrib/scripts/vma.service.in @@ -0,0 +1,15 @@ +[Unit] +Description=VMA Daemon. Version: @VERSION@-@VMA_LIBRARY_RELEASE@ +After=network.target syslog.target +Requires=network.target + +[Service] +Type=forking +Restart=on-failure +ExecStart=@prefix@/sbin/vma start +ExecStop=@prefix@/sbin/vma stop +ExecReload=@prefix@/sbin/vma restart +RestartForceExitStatus=1 SIGTERM + +[Install] +WantedBy=multi-user.target diff --git a/contrib/test_jenkins.sh b/contrib/test_jenkins.sh new file mode 100755 index 0000000..fb3d9bb --- /dev/null +++ b/contrib/test_jenkins.sh @@ -0,0 +1,224 @@ +#!/bin/bash -El +# +# Testing script for VMA, to run from Jenkins CI +# +# Copyright (C) Mellanox Technologies Ltd. 2001-2020. ALL RIGHTS RESERVED. +# +# See file LICENSE for terms. 
+# +# +# Environment variables set by Jenkins CI: +# - WORKSPACE : path to working directory +# - BUILD_NUMBER : jenkins build number +# - JOB_URL : jenkins job url +# - JENKINS_RUN_TESTS : whether to run unit tests +# - TARGET : target configuration +# + +echo "======================================================" +echo +echo "# starting on host ---------> $(hostname) " +echo "# arguments called with ----> ${@} " +echo "# path to me ---------------> ${0} " +echo "# parent path --------------> ${0%/*} " +echo "# name ---------------------> ${0##*/} " +echo + +PATH=${PATH}:/hpc/local/bin:/hpc/local/oss/vma/ +MODULEPATH=${MODULEPATH}:/hpc/local/etc/modulefiles +env +for f in autoconf automake libtool ; do $f --version | head -1 ; done +echo "======================================================" + +source $(dirname $0)/jenkins_tests/globals.sh + +set -xe +# check go/not go +# +do_check_env + +rel_path=$(dirname $0) +abs_path=$(readlink -f $rel_path) + +# Values: none, fail, always +# +jenkins_opt_artifacts=${jenkins_opt_artifacts:="always"} + +# Values: 0..N test (max 100) +# +jenkins_opt_exit=${jenkins_opt_exit:="6"} + +# Test scenario list +# +jenkins_test_build=${jenkins_test_build:="yes"} + +jenkins_test_compiler=${jenkins_test_compiler:="yes"} +jenkins_test_rpm=${jenkins_test_rpm:="yes"} +jenkins_test_cov=${jenkins_test_cov:="yes"} +jenkins_test_cppcheck=${jenkins_test_cppcheck:="yes"} +jenkins_test_csbuild=${jenkins_test_csbuild:="yes"} +jenkins_test_run=${jenkins_test_run:="yes"} +jenkins_test_gtest=${jenkins_test_gtest:="yes"} +jenkins_test_vg=${jenkins_test_vg:="yes"} +jenkins_test_style=${jenkins_test_style:="no"} +jenkins_test_tool=${jenkins_test_tool:="yes"} + + +echo Starting on host: $(hostname) + +cd $WORKSPACE + +rm -rf ${WORKSPACE}/${prefix} +rm -rf autom4te.cache + +./autogen.sh -s + + +for target_v in "${target_list[@]}"; do + ret=0 + IFS=':' read target_name target_option <<< "$target_v" + + export 
jenkins_test_artifacts="${WORKSPACE}/${prefix}/vma-${BUILD_NUMBER}-$(hostname -s)-${target_name}" + export jenkins_test_custom_configure="${target_option}" + export jenkins_target="${target_name}" + set +x + echo "======================================================" + echo "Jenkins is checking for [${target_name}] target ..." + echo "======================================================" + set -x + + # check building and exit immediately in case failure + # + if [ "$jenkins_test_build" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/build.sh + ret=$? + if [ $ret -gt 0 ]; then + do_err "case: [build: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + + # check other units w/o forcing exiting + # + set +e + if [ 1 -lt "$jenkins_opt_exit" -o "$rc" -eq 0 ]; then + if [ "$jenkins_test_compiler" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/compiler.sh + ret=$? + if [ $ret -gt 0 ]; then + do_err "case: [compiler: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + fi + if [ 2 -lt "$jenkins_opt_exit" -o "$rc" -eq 0 ]; then + if [ "$jenkins_test_rpm" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/rpm.sh + ret=$? + if [ $ret -gt 0 ]; then + do_err "case: [rpm: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + fi + if [ 3 -lt "$jenkins_opt_exit" -o "$rc" -eq 0 ]; then + if [ "$jenkins_test_cov" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/cov.sh + ret=$? + if [ $ret -gt 0 ]; then + do_err "case: [cov: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + fi + if [ 4 -lt "$jenkins_opt_exit" -o "$rc" -eq 0 ]; then + if [ "$jenkins_test_cppcheck" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/cppcheck.sh + ret=$? + if [ $ret -gt 0 ]; then + do_err "case: [cppcheck: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + fi + if [ 5 -lt "$jenkins_opt_exit" -o "$rc" -eq 0 ]; then + if [ "$jenkins_test_csbuild" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/csbuild.sh + ret=$? 
+ if [ $ret -gt 0 ]; then + do_err "case: [csbuild: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + fi + if [ 6 -lt "$jenkins_opt_exit" -o "$rc" -eq 0 ]; then + if [ "$jenkins_test_run" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/test.sh + ret=$? + if [ $ret -gt 0 ]; then + do_err "case: [test: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + fi + if [ 7 -lt "$jenkins_opt_exit" -o "$rc" -eq 0 ]; then + if [ "$jenkins_test_gtest" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/gtest.sh + ret=$? + if [ $ret -gt 0 ]; then + do_err "case: [gtest: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + fi + if [ 8 -lt "$jenkins_opt_exit" -o "$rc" -eq 0 ]; then + if [ "$jenkins_test_vg" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/vg.sh + ret=$? + if [ $ret -gt 0 ]; then + do_err "case: [vg: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + fi + if [ 9 -lt "$jenkins_opt_exit" -o "$rc" -eq 0 ]; then + if [ "$jenkins_test_style" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/style.sh + ret=$? + if [ $ret -gt 0 ]; then + do_err "case: [style: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + fi + if [ 10 -lt "$jenkins_opt_exit" -o "$rc" -eq 0 ]; then + if [ "$jenkins_test_tool" = "yes" ]; then + $WORKSPACE/contrib/jenkins_tests/tool.sh + ret=$? 
+ if [ $ret -gt 0 ]; then + do_err "case: [tool: ret=$ret]" + fi + rc=$((rc + $ret)) + fi + fi + set -e + + # Archive all logs in single file + do_archive "${WORKSPACE}/${prefix}/${target_name}/*.tap" + + if [ "$jenkins_opt_artifacts" == "always" ] || [ "$jenkins_opt_artifacts" == "fail" -a "$rc" -gt 0 ]; then + set +x + gzip "${jenkins_test_artifacts}.tar" + echo "======================================================" + echo "Jenkins result for [${target_name}] target: return $rc" + echo "Artifacts: ${jenkins_test_artifacts}.tar.gz" + echo "======================================================" + set -x + fi + +done + +rm -rf $WORKSPACE/config.cache + +echo "[${0##*/}]..................exit code = $rc" +exit $rc diff --git a/contrib/valgrind/valgrind_vma.supp b/contrib/valgrind/valgrind_vma.supp new file mode 100644 index 0000000..1fac86a --- /dev/null +++ b/contrib/valgrind/valgrind_vma.supp @@ -0,0 +1,523 @@ +########################################################### +# false positive librdmacm.so +{ + rdmacm Cond rdma_get_devices + Memcheck:Cond + ... + fun:rdma_get_devices +} +{ + rdmacm Value8 rdma_get_devices + Memcheck:Value8 + ... + fun:rdma_get_devices +} +{ + rdmacm Param rdma_get_devices + Memcheck:Param + write(buf) + ... + fun:rdma_get_devices +} +{ + rdmacm Cond rdma_create_id + Memcheck:Cond + ... + obj:*/librdmacm.so* + fun:rdma_create_id +} +{ + rdmacm Value8 rdma_create_id + Memcheck:Value8 + ... + obj:*/librdmacm.so* + fun:rdma_create_id +} +{ + rdmacm Value8 rdma_bind_addr + Memcheck:Value8 + ... + fun:rdma_bind_addr +} +{ + rdmacm Cond rdma_bind_addr + Memcheck:Cond + ... + fun:rdma_bind_addr +} +{ + rdmacm Param rdma_bind_addr + Memcheck:Param + write(buf) + ... + fun:rdma_bind_addr +} +{ + rdmacm Cond rdma_destroy_id + Memcheck:Cond + ... + fun:rdma_destroy_id +} +{ + rdmacm Value8 rdma_destroy_id + Memcheck:Value8 + ... + fun:rdma_destroy_id +} +{ + rdmacm Param rdma_destroy_id + Memcheck:Param + write(buf) + ... 
+ fun:rdma_destroy_id +} +{ + rdma_create_event_channel + Memcheck:Cond + obj:/*librdmacm.so* + ... + fun:rdma_create_event_channel + fun:_ZN15neigh_table_mgrC1Ev +} +{ + rdma_create_event_channel + Memcheck:Value8 + obj:/*librdmacm.so* + ... + fun:rdma_create_event_channel + fun:_ZN15neigh_table_mgrC1Ev +} +# false positive verbs +{ + libibverbs Param ibv_exp_cmd_create_qp + Memcheck:Param + write(buf) + obj:*/libibverbs.so* + ... + fun:write + fun:ibv_exp_cmd_create_qp +} +{ + libibverbs Param ibv_exp_cmd_create_flow + Memcheck:Param + write(buf) + ... + obj:*/libibverbs.so* + fun:ibv_exp_cmd_create_flow + fun:ibv_exp_create_flow +} +{ + libibverbs Param ibv_exp_destroy_flow + Memcheck:Param + write(buf) + ... + fun:*cmd_destroy_flow + fun:ibv_exp*_destroy_flow +} +# issue in verbs_exp +{ + verbs resource_domain leak + Memcheck:Leak + match-leak-kinds: definite + fun:calloc + fun:allocate_send_db + fun:mlx5_exp_create_res_domain + fun:ibv_exp_create_res_domain +} +########################################################### +# false positive libmlx5\4 +{ + libmlx5 Param ibv_create_qp + Memcheck:Param + write(buf) + ... + fun:ibv_exp_cmd_create_qp + obj:*/libmlx*.so* + ... + fun:ibv_create_qp +} +{ + libmlx5 Cond ibv_exp_cmd_query_device + Memcheck:Cond + fun:ibv_exp_cmd_query_device + obj:*/libmlx*.so* + ... + fun:ibv_open_device +} +{ + libmlx Cond ibv_exp_modify_qp + Memcheck:Cond + obj:*/libmlx*.so* + ... + fun:ibv_exp_modify_qp +} +{ + libmlx5 Cond ibv_destroy_qp + Memcheck:Cond + obj:*/libmlx*.so* + fun:_ZN6qp_mgrD1Ev +} +{ + libibverbs Cond ibv_cmd_destroy_qp + Memcheck:Cond + fun:ibv_cmd_destroy_qp + obj:*/libmlx*.so* +} +{ + libmlx5 Param ibv_exp_cmd_create_cq + Memcheck:Param + write(buf) + ... + fun:ibv_exp_cmd_create_cq + obj:*/libmlx*.so* + fun:ibv_exp_create_cq +} +{ + libmlx5 Param ibv_cmd_destroy_cq + Memcheck:Param + write(buf) + ... 
+ fun:ibv_cmd_destroy_cq + obj:/*libmlx*.so* + fun:ibv_destroy_cq* +} +{ + libmlx5 Param ibv_cmd_dealloc_pd + Memcheck:Param + write(buf) + ... + fun:ibv_cmd_dealloc_pd + obj:/*libmlx*.so* +} +{ + libmlx5 Cond ibv_destroy_cq + Memcheck:Cond + fun:ibv_cmd_destroy_cq + ... + fun:*_destroy_cq* +} +{ + libmlx5 Param ibv_create_ah + Memcheck:Param + write(buf) + ... + obj:*/libmlx*.so* + fun:ibv_create_ah +} +{ + libmlx5 Value8 ibv_create_ah + Memcheck:Value8 + ... + obj:*/libmlx*.so* + fun:ibv_create_ah +} +{ + libmlx5 Cond ibv_create_ah + Memcheck:Cond + ... + fun:ibv_create_ah +} +{ + libmlx5 Param ibv_cmd_exp_reg_mr + Memcheck:Param + write(buf) + ... + fun:ibv_cmd_exp_reg_mr + obj:*/libmlx*.so* +} +{ + libmlx4\5 Cond + Memcheck:Cond + ... + obj:*/libmlx*.so + ... + fun:_ZN6qp_mgrD1Ev +} +{ + libmlx5 umr ibv_exp_create_mr + Memcheck:Param + write(buf) + ... + fun:write + fun:ibv_exp_cmd_create_mr +} +{ + libmlx5 umr ibv_exp_cmd_create_wq + Memcheck:Param + write(buf) + ... + fun:write + fun:ibv_exp_cmd_create_wq + fun:mlx5_exp_create_wq +} +{ + libmlx5 umr mlx5_poll_one Value8 + Memcheck:Value8 + ... + fun:mlx5_poll_one + ... + fun:_ZN11ring_eth_cbD1Ev +} +{ + libmlx5 umr mlx5_poll_one Cond + Memcheck:Cond + ... + fun:mlx5_poll_one + ... + fun:_ZN11ring_eth_cbD0Ev +} +{ + umr destroy qp + Memcheck:Value8 + fun:get_sw_cqe + fun:__mlx5_cq_clean + ... + fun:_ZN14ib_ctx_handlerD1Ev +} +{ + ring direct destroy + Memcheck:Cond + obj:*/libmlx5.so* + fun:*qp_mgr_eth_direct* + ... 
+ fun:*release_ringEP21ring_alloc_logic_attr* +} +{ + query mlx5 device + Memcheck:Cond + fun:ibv_exp_cmd_query_device + fun:mlx5_exp_query_device +} +{ + mlx5_lock_cqs + Memcheck:Cond + fun:mlx5_lock_cqs + fun:mlx5_destroy_qp +} +{ + mlx5_unlock_cqs + Memcheck:Cond + fun:mlx5_unlock_cqs + fun:mlx5_destroy_qp +} + +########################################################### +##### libmlx4 only +{ + libmlx4 Cond ibv_exp_query_device + Memcheck:Cond + obj:*/libmlx4*.so* + fun:ibv_exp_query_device +} +{ + libmlx4 Param ibv_exp_create_cq + Memcheck:Param + mmap(length) + ... + obj:*/libmlx4*.so* + fun:ibv_exp_create_cq +} +{ + libmlx4 Param ibv_exp_create_cq + Memcheck:Param + mmap(offset) + ... + obj:*/libmlx4*.so* + fun:ibv_exp_create_cq +} +{ + libmlx4 Param ibv_exp_create_cq + Memcheck:Param + madvise(length) + fun:madvise + ... + obj:*/libmlx4*.so* + ... + fun:ibv_exp_create_cq +} +{ + libmlx4 Cond ibv_exp_create_cq + Memcheck:Cond + ... + obj:*/libmlx4*.so* + fun:ibv_exp_create_cq +} +{ + libmlx4 Cond ibv_exp_create_cq + Memcheck:Cond + obj:*/libmlx4*.so* + ... + fun:ibv_exp_create_cq +} +{ + libmlx4 Param ibv_exp_create_cq + Memcheck:Param + write(buf) + ... + fun:ibv_exp_cmd_create_cq +} +{ + libmlx4 Cond ibv_destroy_cq + Memcheck:Cond + ... + obj:*/libmlx4*.so* + ... + fun:ibv_destroy_cq +} +{ + libmlx4 Param ibv_destroy_cq + Memcheck:Param + munmap(length) + fun:munmap + obj:*/libmlx4*.so* + fun:ibv_destroy_cq +} +{ + libmlx4 Cond ibv_create_qp + Memcheck:Cond + ... + obj:*/libmlx4*.so* + ... + fun:ibv_create_qp +} +{ + libmlx4 Param ibv_create_qp + Memcheck:Param + write(buf) + ... + fun:ibv_exp_cmd_create_qp +} +{ + libmlx4 Cond ibv_modify_qp_from_init_to_rts + Memcheck:Cond + obj:*/libmlx4*.so* + obj:*/libibverbs.so* + fun:_Z35priv_ibv_modify_qp_from_init_to_rtsP6ibv_qpj +} +{ + libmlx4 Cond ibv_destroy_qp + Memcheck:Cond + obj:*/libmlx4*.so* + ... 
+ fun:_ZN20net_device_table_mgr22verify_eth_qp_creationEPKch +} +{ + libmlx4 Value8 ibv_destroy_qp + Memcheck:Value8 + obj:*/libmlx4*.so* + ... + fun:_ZN20net_device_table_mgr22verify_eth_qp_creationEPKch +} +{ + libmlx4 Param ibv_cmd_reg_mr + Memcheck:Param + write(buf) + ... + fun:ibv_cmd_reg_mr + obj:*/libmlx4*.so* +} +{ + libmlx4 Cond rdma_create_id + Memcheck:Cond + obj:*/librdmacm.so* + ... + fun:_ZN20net_device_table_mgr15map_net_devices* +} +{ + libmlx4 Value8 rdma_create_id + Memcheck:Value8 + obj:*/librdmacm.so* + ... + fun:_ZN20net_device_table_mgr15map_net_devicesEv +} +{ + libmlx4 Value8 + Memcheck:Value8 + obj:*/libmlx4*.so + ... + fun:_ZN6qp_mgrD1Ev +} +{ + libmlx4 Value8 ibv_create_qp + Memcheck:Value8 + obj:*/libmlx4*.so + ... + fun:ibv_create_qp +} +{ + libmlx4 ibv_exp_poll_cq + Memcheck:Value8 + obj:*/libmlx4*.so + fun:ibv_exp_poll_cq +} +{ + libmlx4 Param ibv_exp_cmd_modify_cq + Memcheck:Param + write(buf) + ... + fun:ibv_exp_cmd_modify_cq + obj:*/libmlx4*.so +} +{ + libmlx4 Cond ibv_exp_poll_cq + Memcheck:Cond + obj:*/libmlx4*.so + fun:ibv_exp_poll_cq +} +{ + libmlx4 Value8 ibv_exp_poll_cq + Memcheck:Value8 + obj:*/libmlx4*.so + ... + fun:ibv_exp_poll_cq +} +{ + libmlx4 Param ibv_destroy_cq + Memcheck:Param + madvise(length) + fun:madvise + ... + obj:*/libmlx4*.so + ... + fun:ibv_destroy_cq +} +########################################################### +# sockperf +{ + sockperf Leak definite + Memcheck:Leak + match-leak-kinds: definite + fun:malloc + fun:_ZN6ServerI10IoRecvfrom9SwitchOffS1_E13server_acceptEi + fun:_ZN6ServerI10IoRecvfrom9SwitchOffS1_E6doLoopEv +} +########################################################### +# libnl1 +{ + ibnl Leak possible + Memcheck:Leak + match-leak-kinds: possible + ... + obj:*libnl.so* + fun:nl_cache_parse +} +{ + ibnl leak definite + Memcheck:Leak + match-leak-kinds: definite + fun:realloc + fun:__vasprintf_chk + fun:__asprintf_chk + ... 
+ fun:nl_cache_pickup +} +{ + ibnl malloc leak definite + Memcheck:Leak + match-leak-kinds: definite + fun:malloc + fun:__vasprintf_chk + fun:__asprintf_chk + fun:__nl_error + fun:nl_recvmsgs +} diff --git a/debian/changelog.in b/debian/changelog.in new file mode 100644 index 0000000..a374280 --- /dev/null +++ b/debian/changelog.in @@ -0,0 +1,5 @@ +libvma (@VERSION@-@VMA_LIBRARY_RELEASE@) release; urgency=low + + * Please refer to journal.txt for full changelog. + + -- Liran Oz @BUILD_DATE_CHANGELOG@ diff --git a/debian/compat b/debian/compat new file mode 100644 index 0000000..7f8f011 --- /dev/null +++ b/debian/compat @@ -0,0 +1 @@ +7 diff --git a/debian/control b/debian/control new file mode 100644 index 0000000..c681258 --- /dev/null +++ b/debian/control @@ -0,0 +1,81 @@ +Source: libvma +Section: net +Priority: optional +Maintainer: Liran Oz +Build-Depends: debhelper (>= 7), + pkg-config, + autoconf, + automake, + libibverbs-dev, + librdmacm-dev, + libnl-route-3-dev | libnl-dev, +Homepage: https://github.com/Mellanox/libvma + +Package: libvma +Architecture: any +Depends: ${shlibs:Depends}, ${misc:Depends} +Description: LD_PRELOAD-able library that boosts performance + libvma is a LD_PRELOAD-able library that boosts performance of TCP and + UDP traffic. It allows application written over standard socket API to + handle fast path data traffic from user space over Ethernet and/or + Infiniband with full network stack bypass and get better throughput, + latency and packets/sec rate. + . + No application binary change is required for that. + libvma is supported by RDMA capable devices that support "verbs" + IBV_QPT_RAW_PACKET QP for Ethernet and/or IBV_QPT_UD QP for IPoIB. + . + This package includes the dynamic library itself. 
+ +Package: libvma-dev +Section: libdevel +Architecture: any +Depends: ${misc:Depends}, libvma (= ${binary:Version}) +Description: Development files for the libvma library + libvma is a LD_PRELOAD-able library that boosts performance of TCP and + UDP traffic. It allows application written over standard socket API to + handle fast path data traffic from user space over Ethernet and/or + Infiniband with full network stack bypass and get better throughput, + latency and packets/sec rate. + . + No application binary change is required for that. + libvma is supported by RDMA capable devices that support "verbs" + IBV_QPT_RAW_PACKET QP for Ethernet and/or IBV_QPT_UD QP for IPoIB. + . + This package includes headers for building programs with libvma's interface + directly, as opposed to loading it dynamically with LD_PRELOAD. + +Package: libvma-dbg +Section: debug +Architecture: any +Depends: ${misc:Depends}, libvma (= ${binary:Version}) +Description: Debugging symbols for the libvma library + libvma is a LD_PRELOAD-able library that boosts performance of TCP and + UDP traffic. It allows application written over standard socket API to + handle fast path data traffic from user space over Ethernet and/or + Infiniband with full network stack bypass and get better throughput, + latency and packets/sec rate. + . + No application binary change is required for that. + libvma is supported by RDMA capable devices that support "verbs" + IBV_QPT_RAW_PACKET QP for Ethernet and/or IBV_QPT_UD QP for IPoIB. + . + This package contains the debugging symbols associated with libvma. + +Package: libvma-utils +Section: utils +Architecture: any +Depends: ${shlibs:Depends}, ${misc:Depends}, libvma (= ${binary:Version}) +Description: Examples and tools for the libvma library + libvma is a LD_PRELOAD-able library that boosts performance of TCP and + UDP traffic. 
It allows application written over standard socket API to + handle fast path data traffic from user space over Ethernet and/or + Infiniband with full network stack bypass and get better throughput, + latency and packets/sec rate. + . + No application binary change is required for that. + libvma is supported by RDMA capable devices that support "verbs" + IBV_QPT_RAW_PACKET QP for Ethernet and/or IBV_QPT_UD QP for IPoIB. + . + This package contains the tool vma_stats for collecting and + analyzing Libvma statistic. diff --git a/debian/copyright b/debian/copyright new file mode 100644 index 0000000..823ad6e --- /dev/null +++ b/debian/copyright @@ -0,0 +1,174 @@ +Format: https://www.debian.org/doc/packaging-manuals/copyright-format/1.0/ +Upstream-Name: libvma +Source: https://github.com/Mellanox/libvma + +Files: * +Copyright: 2001-2020 Mellanox Technologies, Ltd. +License: GPLv2-and-2BSD + +Files: + src/vma/lwip/cc.c + src/vma/lwip/cc.h + src/vma/lwip/cc_cubic.c + src/vma/lwip/cc_cubic.h + src/vma/lwip/cc_lwip.c +Copyright: + 2007-2008, Swinburne University of Technology, Melbourne, Australia. + 2009-2010, Lawrence Stewart + 2010, The FreeBSD Foundation + 2001-2018 Mellanox Technologies, Ltd. +License: GPLv2-and-2BSD + +Files: src/vma/lwip/def.h + src/vma/lwip/err.h + src/vma/lwip/init.c + src/vma/lwip/init.h + src/vma/lwip/ip.h + src/vma/lwip/ip_addr.h + src/vma/lwip/opt.h + src/vma/lwip/pbuf.c + src/vma/lwip/pbuf.h + src/vma/lwip/stats.h + src/vma/lwip/tcp.c + src/vma/lwip/tcp.h + src/vma/lwip/tcp_impl.h + src/vma/lwip/tcp_in.c + src/vma/lwip/tcp_out.c +Copyright: 2001-2004, Swedish Institute of Computer Science. +License: BSD-3-clause + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + +Files: + tests/gtest/common/gtest-all.cc + tests/gtest/common/gtest.h +Copyright: 2005, Google Inc. + 2008, Google Inc. +License: BSD-3-clause-Google + Redistribution and use in source and binary forms, with or without + modification, are permitted provided that the following conditions are + met: + . + * Redistributions of source code must retain the above copyright + notice, this list of conditions and the following disclaimer. + * Redistributions in binary form must reproduce the above + copyright notice, this list of conditions and the following disclaimer + in the documentation and/or other materials provided with the + distribution. + * Neither the name of Google Inc. nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + . 
+ THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS + "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT + LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR + A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT + OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, + SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT + LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +Files: tests/gtest/common/tap.h +Copyright: 2011, Bruno P. Kinoshita +License: Expat + Permission is hereby granted, free of charge, to any person obtaining a copy + of this software and associated documentation files (the "Software"), to deal + in the Software without restriction, including without limitation the rights + to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + copies of the Software, and to permit persons to whom the Software is + furnished to do so, subject to the following conditions: + . + The above copyright notice and this permission notice shall be included in + all copies or substantial portions of the Software. + . + THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + THE SOFTWARE. 
+ +Files: src/vma/config_parser.c + src/vma/config_parser.h +Copyright: 1984, 1989-1990, 2000-2012, Free Software Foundation, Inc. +License: GPL-3+-bison + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + . + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + . + You should have received a copy of the GNU General Public License + along with this program. If not, see . + . + As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + . + On Debian systems, a copy of the General Public License version 3 could + be found at /usr/share/common-licenses/GPL-3 + +License: GPLv2-and-2BSD + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + . 
+ On Debian systems, a copy of the General Public License version 2 could + be found at /usr/share/common-licenses/GPL-2 diff --git a/debian/libvma-dev.install b/debian/libvma-dev.install new file mode 100644 index 0000000..8fa24cf --- /dev/null +++ b/debian/libvma-dev.install @@ -0,0 +1 @@ +usr/include/mellanox/vma_extra.h diff --git a/debian/libvma-utils.install b/debian/libvma-utils.install new file mode 100644 index 0000000..86a9e9c --- /dev/null +++ b/debian/libvma-utils.install @@ -0,0 +1 @@ +usr/bin/vma_stats diff --git a/debian/libvma.install b/debian/libvma.install new file mode 100644 index 0000000..c2dc65b --- /dev/null +++ b/debian/libvma.install @@ -0,0 +1,10 @@ +usr/lib/libvma*.so.* +usr/lib/libvma.so +usr/share/doc/libvma/README.txt +usr/share/doc/libvma/journal.txt +usr/share/doc/libvma/VMA_VERSION +usr/sbin/ +etc/libvma.conf +etc/ +contrib/scripts/vma.service lib/systemd/system +libvma-debug.so usr/lib diff --git a/debian/postinst b/debian/postinst new file mode 100644 index 0000000..bac584a --- /dev/null +++ b/debian/postinst @@ -0,0 +1,21 @@ +#!/bin/bash +if [ `grep memlock /etc/security/limits.conf |grep unlimited |wc -l` -le 0 ]; then + echo "* - memlock unlimited" >> /etc/security/limits.conf + echo "* soft memlock unlimited" >> /etc/security/limits.conf + echo "* hard memlock unlimited" >> /etc/security/limits.conf + echo "- Changing max locked memory to unlimited (in /etc/security/limits.conf)" + echo " Please log out from the shell and login again in order to update this change " + echo " Read more about this topic in the VMA's User Manual" +fi + +/sbin/ldconfig + +if type systemctl >/dev/null 2>&1; then + systemctl --no-reload enable vma.service >/dev/null 2>&1 || true +elif [ -e /sbin/chkconfig ]; then + /sbin/chkconfig --add vma +elif [ -e /usr/sbin/update-rc.d ]; then + /usr/sbin/update-rc.d vma defaults +else + /usr/lib/lsb/install_initd /etc/init.d/vma +fi diff --git a/debian/postrm b/debian/postrm new file mode 100644 index 
0000000..7578c69 --- /dev/null +++ b/debian/postrm @@ -0,0 +1,5 @@ +#!/bin/bash +/sbin/ldconfig +if type systemctl >/dev/null 2>&1; then + systemctl --system daemon-reload >/dev/null 2>&1 || true +fi diff --git a/debian/prerm b/debian/prerm new file mode 100644 index 0000000..bad8013 --- /dev/null +++ b/debian/prerm @@ -0,0 +1,15 @@ +#!/bin/bash + +if type systemctl >/dev/null 2>&1; then + systemctl --no-reload disable vma.service >/dev/null 2>&1 || true + systemctl stop vma.service || true +elif [ -e /sbin/chkconfig ]; then + /etc/init.d/vma stop || true + /sbin/chkconfig --del vma || true +elif [ -e /usr/sbin/update-rc.d ]; then + /etc/init.d/vma stop || true + /usr/sbin/update-rc.d -f vma remove || true +else + /etc/init.d/vma stop || true + /usr/lib/lsb/remove_initd /etc/init.d/vma || true +fi diff --git a/debian/rules b/debian/rules new file mode 100755 index 0000000..70f3114 --- /dev/null +++ b/debian/rules @@ -0,0 +1,24 @@ +#!/usr/bin/make -f +# -*- mode: makefile; coding: utf-8 -*- + +#export DH_VERBOSE=1 + +%: + dh $@ + +build: build-debug + ./configure --with-ofed=/usr --prefix=/usr --libdir=/usr/lib --includedir=/usr/include --sysconfdir=/etc + +build-debug: + ./configure --enable-opt-log=none --with-ofed=/usr --prefix=/usr --libdir=/usr/lib --includedir=/usr/include --sysconfdir=/etc + make + cp -f src/vma/.libs/libvma.so libvma-debug.so + make clean + + +# Commands not to run +override_dh_auto_configure: + +# Workaround for missing dependency information in libmongoclient package +override_dh_shlibdeps: + dh_shlibdeps --dpkg-shlibdeps-params=--ignore-missing-info diff --git a/debian/watch b/debian/watch new file mode 100644 index 0000000..2adb703 --- /dev/null +++ b/debian/watch @@ -0,0 +1,7 @@ +# Compulsory line, this is a version 4 file +version=4 + +# GitHub hosted projects +opts="filenamemangle=s%(?:.*?)?v?(\d[\d.]*)\.tar\.gz%libvma-$1.tar.gz%" \ + https://github.com/mellanox/libvma/tags \ + (?:.*?/)?v?(\d[\d.]*)\.tar\.gz debian uupdate diff --git 
a/install.sh b/install.sh new file mode 100755 index 0000000..0dc23a6 --- /dev/null +++ b/install.sh @@ -0,0 +1,17 @@ +#!/bin/sh + +VMA_LIBRARY_MAJOR=`grep "VMA_LIBRARY_MAJOR=" configure.ac | cut -f2 -d '='` +VMA_LIBRARY_MINOR=`grep "VMA_LIBRARY_MINOR=" configure.ac | cut -f2 -d '='` +VMA_LIBRARY_REVISION=`grep "VMA_LIBRARY_REVISION=" configure.ac | cut -f2 -d '='` +VMA_LIBRARY_RELEASE=`grep "VMA_LIBRARY_RELEASE=" configure.ac | cut -f2 -d '='` +VMA_VERSION="$VMA_LIBRARY_MAJOR.$VMA_LIBRARY_MINOR.$VMA_LIBRARY_REVISION-$VMA_LIBRARY_RELEASE" + +./autogen.sh +#configure without parameters - good if you don't need to install. libvma.so will be in ./src/vma/.libs/libvma.so. +#./configure +#configure with parameters required for install +#example: ./configure --with-ofed=/usr --prefix=/usr --libdir=/usr/lib64 --includedir=/usr/include --docdir=/usr/share/doc/libvma-8.5.2 --sysconfdir=/etc +./configure --with-ofed=/usr --prefix=/usr --libdir=/usr/lib64 --includedir=/usr/include --docdir="/usr/share/doc/libvma-$VMA_VERSION" --sysconfdir=/etc +make +sudo make install + diff --git a/journal.txt b/journal.txt new file mode 100644 index 0000000..b3f78e5 --- /dev/null +++ b/journal.txt @@ -0,0 +1,2766 @@ +Version 9.0.2-0: +Date + Time 2020-02-06 +============================================================= +Fixed: + - RM #2069198 Disable BF usage for Azure + +Version 9.0.1-0: +Date + Time 2020-01-20 +============================================================= +Added: + - RM #2053834 Update License date to 2020 + - RM #1606044 Adapt sendfile() for UDP + - RM #1606044 Optimize sendfile() for TCP + - RM #1606044 Introduce lwip:register_sys_readv() + - RM #1606044 Pass bitwise flag into lwip:tcp_write() + - RM #1606044 Change tx() function prototype + - RM #1971409 Add ring_eth_direct support for upstream + - RM #1971409 Add vma_get_dpcp_devices() extra api + - RM #1971409 Add dpcp adapter initialization + - RM #1971409 Add --with-dpcp configuration option + - RM #1858709 Add 
daemon option to set spoofed SYN retry interval on cleanup + +Fixed: + - RM #2016629 Add setsockopt to shutdown socket's RX side + - RM #1794728 Fix calling unregister_timer_event() twice + - RM #1775713 Remove useless timeout during VMA destructor + - RM #1775713 Remove useless code related m_call_orig_close_on_dtor + - RM #1775713 Return ECONNABORTED on connect() for errorable socket + - RM #1775713 Remove force_close() usage + - RM #1264894 Cleanup socket fd from epoll + - RM #1264894 Resolve fd collection cleanup issues + - RM #1684348 Remove BullseyeCoverage redundant excludes + - RM #1684349 Move timetest to tests dir + +Version 8.9.5-0: +Date + Time 2019-10-28 +============================================================= +Fixed: + - RM #1946579 Fix event notification logic used by extra API + - RM #1909532 Resolve gcc 9.x issues + - RM #1734068 Fix inaccurate timer_handler unregistration + - RM #1930072 Fix send() operations processing + +Version 8.9.4-0: +Date + Time 2019-09-23 +============================================================= +Fixed: + - RM #1898614 Fix bad head tail access in mp_cq + +Version 8.9.3-0: +Date + Time 2019-09-19 +============================================================= +Added: + - RM #1902084 Add deprecated message for Multi Packet RQ + +Fixed: + - RM #1894523 Disable BF usage for KVM VMs + - RM #1908481 Improve error processing tcp_rexmit_segment() + - RM #1859490 Disable migration in case failure + - RM #1843623 Fix tcp_split_segment() + - RM #1903593 Set TSO operation for all segments large MSS + - RM #1890471 Fix retransmission TSO segment with large buffer + +Version 8.9.2-0: +Date + Time 2019-08-26 +============================================================= +Added: + - RM #1074708 Add TSO capability + +Version 8.9.1-0: +Date + Time 2019-08-20 +============================================================= +Added: + - RM #1772199 Add socketxtreme example + +Fixed: + - RM #1718617 Update valgrind suppression file + - 
RM #1772805 Fix return code for non blocking recv()/send() + - RM #1797193 Return ECONNABORTED as error value for accept() + - RM #1798908 Fix daemon agent initialization in case fork() usage + - RM #1699062 Handle RING_LOGIC_PER_USER_ID unsupported optval + - RM #1772199 Cleanup vma_extra.h + - RM #1734397 Fix csbuild 2.3.0 issues + +Version 8.8.3-0: +Date + Time 2019-04-15 +============================================================= +Added: + - RM #1512054 Add warning in case Device Memory can not be used + - RM #1718617 Update valgrind suppression file + - RM #1701456 Enable mlx4 flow steering warning for VM + +Fixed: + - RM #1732395 Detect CPU Frequency for aarch64 + - RM #1564149 Add RTM_DELLINK processing + - RM #1566916 Do not call ibv_get_async_event() after getting IBV_EVENT_DEVICE_FATAL + - RM #1663915 Fix destroy qp collision for ring_eth_cb + - RM #1687458 Fix double free of TCP timer event + +Version 8.8.2-0: +Date + Time 2019-02-20 +============================================================= +Added: + - RM #1684500 Remove useless inlining + - RM #1684500 Remove useless inlining in daemon + - RM #1684500 Remove inlining for vlog_printf() + - RM #1582418 Add Burst-Packet-Pacing support for upstream + - RM #1582417 Add Packet Pacing support for upstream + - RM #1557652 Improve flow processing + - RM #1557652 Add tc_add_filter_dev2tap() operation + - RM #1557652 Add tc_add_filter_tap2dev() operation + - RM #1557652 Add tc_add_filter_link() operation + - RM #1557652 Use netlink API to control TC rules + - RM #1380243 Ignore events of a closed socket for SocketXtreme + - RM #1677457 Add methods mask to VMA extra api + - RM #1671069 Verify mlx4 steering creation for bare-metal + - RM #1426871 Include Debian log file into jenkins artifact + - RM #1426871 Update debian/control file + - RM #1426871 Update debian copyright + - RM #1426871 Enable service status for non root + - RM #1426871 Add debian watch file + - RM #1426871 Improve init script + - RM 
#1426871 Add service mandatory option as force-reload + - RM #1426871 Improve vma service support on different systems + - RM #1661438 General redundant deadcode cleanup + - RM #1628881 Add ring modification function to vma_extra api + - RM #1631600 Cleanup socketxtreme and vmapoll configuration parameters + - RM #1651457 Add burst capabilty check to get_ring_descriptors + +Fixed: + - RM #1682300 Fix double free in free_libvma_resources + - RM #931123 Fix filter multicast packets after bind + - RM #1087459 Fix VMA_EXCEPTION_HANDLING=0/1 generates core + - RM #1679709 Fix missing update of CQ consumer index + - RM #1653789 Fix Rx and Tx migrations thread safety + - RM #1426871 Fix root detection + - RM #1668925 Fix RoCE lag warning is not presented + - RM #1599446 Fix RX TAP migration failure for 3t connections + - RM #1073005 Fix g_tcp_timers_collection destructor cleanup + +Version 8.7.7-0: +Date + Time 2019-01-30 +============================================================= +Added: + - RM #1647737 Modify Packet Pacing capability calculation + - RM #1605819 Add ring allocation logic control for all socket types + - RM #1576346 Modify RX HW timestamp capability calculation + - RM #1614459 Suppress pclose() ECHILD error + - RM #1610832 Cleanup socketxtreme in Jenkins + - RM #1450183 General code cleanups in ring headers + - RM #1450183 Move request_buffers_tx logic to ring_slave + - RM #1450183 Move rx_process_buffer logic to ring_slave + - RM #1450183 Move detach_flow logic to ring_slave + - RM #1450183 Move attach_flow logic to ring_slave + - RM #1450183 Move common fields and methods to ring_slave + - RM #1582189 Add Device Memory (Memic) support for upstream + - RM #1582188 Replace direct memcpy in dm_mgr + - RM #1574870 Print warning while RoCE LAG is enabled + - RM #1614436 Remove RoCE LAG warning duplications + - RM #1617358 Update License date to 2019 + - RM #1592783 Add mlx5 PTP support for upstream + - RM #1608765 Clean redundant dead code from vma_lwip 
class + +Fixed: + - RM #1649370 Fix crash during Plugin event + - RM #1646743 Fix get_cqe64() processing + - RM #1649972 Fix incorrect TCP snd_wnd statistic + - RM #1522964 Fix can't be interrupted while VMA_SELECT_POLL=-1 + +Version 8.8.1-0: +Date + Time 2018-12-26 +============================================================= +Added: + - RM #1075188 Simplify is_mp_ring() method + - RM #1075188 Cleanup performance counters + - RM #1075188 Improve UDP performance + - RM #1075188 Improve rx ring creation flow + - RM #1075188 Unify poll_and_process_element_rx() and drain_and_process() + - RM #1075188 Unify rfs_uc::rx_dispatch_packet() + - RM #1075188 Use single event update method + - RM #1075188 Disable draining for SocketXtreme + - RM #1075188 Add SocketXtreme API usage check + - RM #1075188 Set default parameters for SocketXtreme mode + - RM #1075188 Add VMA_SOCKETXTREME environment variable + - RM #1075188 Unify clean_cq() function + - RM #1075188 Add sanity check for correct SocketXtreme mode usage + - RM #1075188 Unify setting rx.context + - RM #1075188 Optimize get_cqe64() + - RM #1075188 Remove socketxtreme specific methods in cq_mgr_mlx5 + - RM #1075188 Use m_mlx5_cq fields + - RM #1075188 Remove socketXtreme specific fields in cq_mgr_mlx5 + - RM #1075188 Improve performance of cqe processing + - RM #1075188 Move socketxtreme cq processing to cq_mgr_mlx5 + - RM #1075188 Use post_recv() in socketxtreme mode from qp_mgr_mlx5 + - RM #1075188 Unify sockinfo code + - RM #1075188 Enable socketxtreme_poll() for all modes + - RM #1075188 Unify reclaim rx buffers + +Fixed: + - RM #1603531 Fix pad length in umr mode + +Version 8.8.0-0: +Date + Time 2018-12-16 +============================================================= +Added: + - RM #1592040 Add mlx5 Dummy-send support for upstream + - RM #1435293 Do not start daemon during package installation + - RM #1584343 Disable vlan stripping in MP_RQ + - RM #1072426 Update valgrind suppression file + - RM #1575618 Modify 
add_conf_rule() input param type +Fixed: + - RM #1600817 Resolve --enable-opt-log=high compilation errors + - RM #1574908 Fix buffer overflow in epfd_info::statistics_print() + - RM #1575056 Fix infinite loop in TCP is_readable() + - RM #1073005 Fix memory leak in g_tcp_timers_collection + - RM #1546495 Fix VMA_RING_LIMIT_PER_INTERFACE segmentation fault + - RM #1537051 Fix VMA_RING_LIMIT_PER_INTERFACE VMA ERROR + - RM #1583027 Fix crash while "vma_stats -S" is used + - RM #1580794 Fix EPOLL events for RST packets + - RM #1566648 Fix missing initialization of poll_sn variable + +Version 8.7.5-0: +Date + Time 2018-11-15 +============================================================= +Added: + - RM #1571657 Suppress failure error of ibv_get_async_event +Fixed: + - RM #1571899 Fix a typo in bond warning message + - RM #1566916 Fix CQ cleanup ordering issue + +Version 8.7.4-0: +Date + Time 2018-11-13 +============================================================= +Added: + - RM #1438405 Parse vlan priority mapping file + - RM #1438405 Allow dynamic updating header bits after ring creation + - RM #1557652 BF is not supported for upstream on VMs + - RM #1557652 Check MLX5_SHUT_UP_BF env + - RM #1178933 Print warning while mlx5 RoCE lag is enabled + - RM #1540213 Print warning while mlx4 RoCE lag is enabled + - RM #1521396 Reduce MP_RQ allocated size when stripping + - RM #1563104 Print error while No IB capable devices found + - RM #1554637 Modify NetVSC device detection + - RM #1072998 Update valgrind suppression file + - RM #1523707 Improve plugout processing + - RM #1552382 Redirect ioctl SIOCGIFVLAN +Fixed: + - RM #1557668 Fix infinite polling loop during plugout + - RM #1564259 Fix IP_TTL socket option inheritance + - RM #1557681 Fix timer converter Segmentation fault + - RM #1521396 Fix user memory passing to socket + - RM #1557786 Protect vma_ib_mlx5_get_cq() + - RM #1556067 Add sanity check in clean_cq() + - RM #1554167 Fix symbol lookup error for mlx5dv + - RM 
#1523707 Fix deadlock collision for ring_tap + +Version 8.7.3-0: +Date + Time 2018-11-01 +============================================================= +Added: + - RM #1471639 Add TCP Rx Timestamp support + - RM #1471639 Move Rx timestamp handling to sockinfo + - RM #1541702 Change the visibility of global TCP names + - RM #1513181 Increased FID_MAX for daemon + - RM #1521601 Allow user to change the flow_id + - RM #1542628 Do not use memic on devices w/o blue flame + - RM #1512054 Enable direct mlx5 processing on VMs + - RM #1540484 Add UDP Rx Hw timestamp support for upstream + - RM #1540484 Add ibv_device_attr_ex support + - RM #1540483 Add flow tag support for upstream + - RM #1536822 Add Daemon version to help output + - RM #1521396 Add external memory for MP_RQ ring + - RM #1536838 Enable plugin on upstream after reboot +Fixed: + - RM #1521601 Fix user defined flow tag handling + - RM #1541163 Fix return value from sockinfo:setsockopt() + - RM #1526810 Fix stride calculation when using KLM + - RM #1541768 Fix bad operator== in ring_profile + - RM #1498311 Fix crash during CQ creation on upstream + +Version 8.7.2-0: +Date + Time 2018-10-18 +============================================================= +Added: + - RM #1537043 Improve qp_mgr_mlx5 class + - RM #1537043 Add mlx5 req_notify_cq() + - RM #1537043 Enable socketxtreme mode for rdma-core (upstream) + - RM #1537043 Support direct verbs + - RM #1537043 Detect direct ring support during configure + - RM #1537043 Remove MLX5_CQE_SIG_ERR usage + - RM #1537043 Introduce access to head and tail of qp.rq + - RM #1537043 Introduce vma_ib_mlx5_qp_t + - RM #1537043 Introduce vma_ib_mlx5_cq_t + - RM #1537043 Use to_mqp() to convert ibv_qp to mlx5_qp + - RM #1537043 Introduce ib layer + - RM #1537043 Start direct mlx5 support since MOFED 3.4 + - RM #1537043 Add direct verbs support for upstream OFED + - RM #1537043 Cleanup hw/mlx5 folder + - RM #1537043 Cleanup MLX5_HW_ETH_WQE_HEADER definition + - RM #915305 Add 
connection with daemon on the fly + - RM #807132 Clean unused code from lwip pbuf.c and tcp.c + - RM #807132 Remove all LWIP Non TCP related options + - RM #807132 Remove LWIP BACKLOG handling which is not in use + - RM #807132 Remove LWIP pcbs lists which are not in use + - RM #1522890 Update valgrind suppression file + +Version 8.7.1-0: +Date + Time 2018-09-12 +============================================================= +Added: + - RM #1510731 Enhance IPoIB devices validation process + - RM #1390052 Add Windows Hypervisor VMs to Jenkins + - RM #1441462 Add mlx5 IPoIB support for upstream + - RM #1417620 Enable UDP/TCP connection on same VM w/o SRIOV + - RM #1507004 General code cleanup + - RM #1498330 Update driver restart recommendations + - RM #1469556 Add sendfile/sendfile64 support + - RM #1491767 Increase SO_RCVTIMEO for agent socket + - RM #1471640 Add inbox OFED detection during VMA configuration + - RM #1471640 Improve OFED location detection + - RM #1498320 Set RDMAV_ALLOW_DISASSOC_DESTROY env for upstream + - RM #1489419 Use macro to detect exp arguments in verbs.m4 + - RM #1480982 Add cq_moderation support for upstream +Fixed: + - RM #1516726 Fix bond traffic not offloaded over IPoIB + - RM #1175479 Inherit missing properties from the parent + - RM #1087466 Fix VMA_EXCEPTION_HANDLING=2 mode handling + +Version 8.7.0-0: +Date + Time 2018-07-31 +============================================================= +Added: + - RM #1465058 Introduce ring per ip-address logic + - RM #1427836 Register memory of single context + - RM #1427835 Update update_netvsc_slaves() by parameter + - RM #1427833 Update ib_ctx collection table by parameter + - RM #1394737 Remove mem_buf_desc_owner + - RM #1475467 Remove virtual functions from mem_buf_desc_owner + - RM #1384362 Report version mismatch between vmad and agent + - RM #1469960 Add iterator shortcut to vma_list_t + - RM #1463833 Improve peer notification capability + - RM #1388840 Support ring allocation methods for 
ring tap + - RM #1467248 Remove ah_cleaner class + - RM #1322101 Add support for IP_TTL socket option + - RM #1450325 Enhance vma_stats output to monitor ring_type + - RM #1178933 Print warning while RoCE lag is enabled + - RM #1449421 Suppress warning in case flow attach is interrupted +Fixed: + - RM #1480511 Fix TCP client hangs while connecting to local ip + - RM #1477680 Fix issue with loopback detection + - RM #1475443 Fix getsockname/getpeername returned data + - RM #1452014 Fix gcc8 stringop-truncation warning + +Version 8.6.10-0: +Date + Time 2018-06-27 +============================================================= +Fixed: + - RM #1387232 Fix issue with default path mismatch + +Version 8.6.9-0: +Date + Time 2018-06-27 +============================================================= +Added: + - RM #1436435 Remove duplicate Hugepages warning + - RM #1433896 Enhance ib_ctx handler debug information +Fixed: + - RM #1435682 Fix incorrect TCP seqno after zero-wnd-probe + - RM #1432927 Fix traffic stop after second fail over + +Version 8.6.8-0: +Date + Time 2018-06-21 +============================================================= +Added: + - RM #1431683 Add retransmissions counter to ring_tap + - RM #1178926 Add HW RX checksum support for SocketXtreme + - RM #1431297 Remove VMA_RX_SW_CSUM control + - RM #1284069 Set MLX5_DEVICE_FATAL_CLEANUP env for mlx5 device + - RM #1387232 Add vmad configurable notify dir + - RM #1428737 Add pending to remove list to daemon + - RM #1284069 Improve device removing in plugout case + - RM #1392361 Improve daemon logging verbosity + - RM #1424159 Modify TC rule to offload only non-frag packets + - RM #1424157 Modify daemon flow structure + - RM #1423874 Offload UDP traffic to TAP + - RM #1423819 Enable Blue Flame send for NetVSC interfaces + - RM #1420228 Modify mem_buf_desc_t constructor + - RM #1365423 Use mmap to allocate huge pages + - RM #1413354 Add Ring tap enhancements + +Fixed: + - RM #1431269 Fix ring_bond_netvsc with more 
than 2 resources error + - RM #1365423 Fix fallback memory allocation logic + - RM #1418682 Fix TCP on ring tap from multiple processes on the same IP + - RM #1424602 Fix crash while valgrind is loaded without SRIOV + - RM #1405641 Fix crash when m_net_device_map_index is empty + - RM #1403118 Fix incorrect release memory of Tx buffers + - RM #1417243 Fix gcc8 build warnings + - RM #1418379 Fix incorrect handling of IF_DOWN event + - RM #1417077 Fix TCP bind to device same interface failure + +Version 8.6.7-0: +Date + Time 2018-06-06 +============================================================= +Added: + - RM #1383463 Report warning if there is no connection with daemon + - RM #1405041 Improve hypervisor type detection + - RM #1348872 Remove bad pointer check when getting memory info + - RM #1413587 Modify warning while log_num_mgm_entry_size != -1 + - RM #1402512 Daemon is able to detect VMA zombie process + - RM #1403631 Add support master slave for qp direct + +Fixed: + - RM #1416053 Fix valgrind invalid read warning + - RM #1403304 Fix duplicate UDP Tx packets w/o SRIOV + - RM #1404279 Fix TCP sockets of secondary IPs are not offloaded + - RM #1405898 Fix get_net_device_val search method + - RM #1405113 Fix VMA crash if qp creation failed + - RM #1405111 When using tx prm verify cq_size is power of 2 + +Version 8.6.6-0: +Date + Time 2018-05-29 +============================================================= +Added: + - RM #1284069 Calculate csum at the ring level + - RM #1284069 Reclaim not owned buffers on socket close + - RM #1284069 Verify ib device before vf creation + - RM #1284069 Add plugout counter into statistic + - RM #1284069 Add IF_VERBS_FAILURE_EX macro + - RM #1284069 Pass if_index to slave_data_t constructor + - RM #1284069 Introduce plugin support under Hypervisor + - RM #1284069 Introduce register_memory() for buffer_pool + - RM #1284069 Improve code readability + - RM #1284069 Add ibv_exp_get_device_list usage + - RM #1284069 Introduce plugout 
support under Hypervisor + - RM #1284069 Add extra RX buffers reclaim + - RM #1284069 Validate memory allocation type for VM + - RM #1284069 Suppress error message for update_epfd() + - RM #1284069 Optimize m_p_tx_mem_buf_desc_list releasing + - RM #1284069 Improve ib_ctx_handler_collection::update_tbl() + - RM #1284069 Improve net_device_table_mgr processing + - RM #1284069 Introduce update_netvsc_slaves() + - RM #1284069 Introduce allocator::deregister_memory() + - RM #1284069 Improve ib memory registration + - RM #1284069 Change ring::restart() prototype + +Fixed: + - RM #1284069 Fix issue in UDP checksum calculation + - RM #1284069 Fix issue with csum ability detection + - RM #1284069 Fix issue in dst_entry::return_buffers_pool() + - RM #1398946 Fix issue with incorrect ring_eth_cb creation + +Version 8.6.5-0: +Date + Time 2018-05-24 +============================================================= +Added: + - RM #1284069 Support outcome UDP traffic in ring_tap + - RM #1284069 Support outcome TCP traffic in ring_tap + - RM #1284069 Store flow:sink pair in ring_bond + - RM #1284069 Redirect traffic from tap to netvsc + - RM #1284069 Use m_neigh_cma_event_channel in case IB exists only + - RM #1284069 Allow memory allocation w/o registering + - RM #1284069 Remove m_p_cma_event_channel from fd_collection + - RM #1284069 Display message in case there is no IB devices + - RM #1284069 Ring tap support rfs creation + - RM #1284069 Use ring_slave in rfs classes + - RM #1284069 Introduce ring_slave::m_type + - RM #1284069 Introduce ring_tap + - RM #1338066 Disable flow tag for unicast flows + - RM #1382889 Add Tx statistics to ring + +Fixed: + - RM #1387232 Fix creating write permission files + +Version 8.6.4-0: +Date + Time 2018-05-22 +============================================================= +Added: + - RM #1284069 Update valgrind suppression file + - RM #1284069 Add print_val() for ring + - RM #1284069 Use if_index only to create any ring + - RM #1284069 Add special 
methods to add/remove slave ring + - RM #1284069 Change type of m_bond_rings to vector + - RM #1284069 Reduce m_active_rings usage from ring_bond + - RM #1284069 Remove active parameter from all ring CTORs + - RM #1284069 Pass if_index into ring_eth/ib constructor + - RM #1284069 Pass if_index into ring_bond constructor + - RM #1284069 Use ifindex in slave_data + - RM #1284069 Set MLX4_DEVICE_FATAL_CLEANUP env for mlx4 device + - RM #1284069 Set device type in net_device_val() + - RM #1284069 Enable RTM_NEWLINK, RTM_DELLINK notification + - RM #1284069 Move inc_tx_retransmissions to ring_slave + - RM #1284069 Use slave_ring in ring_bond + - RM #1284069 Move few methods to ring_slave + - RM #1284069 Move statistic to slave ring + - RM #1284069 Introduce ring slave + - RM #1331580 Remove redundant variables from ring + - RM #1331577 Remove dead code from_ring simple + - RM #1331574 Move flow spec declarations to ring_simple + +Fixed: + - RM #1284069 Fix memory leak for ring_bond::m_p_n_rx_channel_fds + +Version 8.6.3-0: +Date + Time 2018-05-14 +============================================================= +Added: + - RM #1284069 Modify get_ib_ctx() to process bonding mode=4 + - RM #1284069 Update valgrind suppression file + - RM #1284069 Add ib_ctx_handle::get_ibname() + - RM #1284069 Modify operations with lkey + - RM #1284069 Remove rdma cm from vet_device_val() + - RM #1284069 Cleanup ib_ctx_handler_collection + - RM #1284069 Use Verbs API to create ib devices + - RM #1284069 Get ib_ctx_handler using ifname + +Fixed: + - RM #1284069 Valgrind strtok fix + - RM #1284069 Fix issue in deregestering memory in ring_eth_cb + +Version 8.6.2-0: +Date + Time 2018-05-10 +============================================================= +Added: + - RM #1284069 Unify flow_spec_udp + - RM #1284069 Improve code readability + - RM #1284069 Optimize storage for access net_device_val + - RM #1284069 Remove net_device_val::local_addr + - RM #1284069 Use netlink during device mapping + - 
RM #1284069 Net device enhancements + - RM #1284069 Add m_ip to net_dev_val + - RM #1284069 Add set_ip_array() + - RM #1284069 Improve net_dev_val verification functions + - RM #1284069 Modify net_dev_val parameters assignment + - RM #1284069 Modify map_device logic + +Version 8.6.1-0: +Date + Time 2018-05-06 +============================================================= +Added: + - RM #1348872 Add capability mask to device data + - RM #1348872 Add support retrieving the memory used by RING_CB + - RM #1348872 Add support to get RING_CB HW descriptors + - RM #1348872 add external mem support for RING_CB + - RM #1376089 Always use at least four WQEs with MP_RQ + - RM #1376088 use DUMP_LKEY to strip down network + - RM #1182150 Support UDP HW RX timestamp in SocketXtreme (#605) + - RM #1353974 Use ib_ctx inside ring to convert CQE TS to timespec (#599) + - RM #1332983 Add some debug prints to ring_cb (#595) + - RM #1075188 Reduce unnecessary socketxtreme code + - RM #1075188 Unify setsockopt/getsockopt +Fixed: + - RM #1375982 Modify libnl3-route-dev error message (#608) + - RM #1357010 Fix setsockopt(IP_MULTICAST_IF) with struct mreq (#613) + - RM #1354251 Fix statistics for socketXtreme income data (#611) + +Version 8.6.0-0: +Date + Time 2018-03-22 +============================================================= +Added: + - RM #issue: 1330652 Use kbps value for setting enhanced rate limit (#591) +Fixed: + - RM #1322084 Fix TCP MSS incompatibility with RFC-793 + - RM #1262560 Fix zero-window ack probe + - RM #1190638 Align lwip timer interval to VMA tcp timer interval + - RM #1311399 Remove cq_type parameter from ring functions + +Version 8.5.7-0: +Date + Time 2018-02-19 +============================================================= +Added: + - RM #1284081 Extend gtest with vmad_flow tests + - RM #1284081 Extend gtest with vmad_state tests + - RM #1284081 Extend gtest with vmad_init tests + - RM #1284081 Extend gtest with vmad_hash tests + - RM #1284081 Extend gtest with 
vmad_bitmap tests + - RM #1284081 Create gtest/vmad unit + - RM #1284081 Extend gtest with mix_list tests +Fixed: + - RM #1308755 Fix no traffic for Alias NetVSC interfaces + - RM #1284081 Fix issues found by gtest/vmad_hash + - RM #1284081 Fix issues found by gtest/vmad_bitmap + +Version 8.5.6-0: +Date + Time 2018-02-08 +============================================================= +Added: + - RM #1276626 Add support for padded stridingRQ +Fixed: + - RM #1254846 Fix rate limit if no connection established yet + - RM #1149532 Fix Deadlock on blocking TCP send + - RM #1262610 Changed bucket node choice for 5t flow + - RM #1284253 Move dm_context class from util to dev + - RM #1172298 Rename jenkins target vmapoll with socketxtreme + - RM #1280902 Fix issues in tcp_split_segment function + +Version 8.5.5-0: +Date + Time 2018-02-01 +============================================================= +Added: + - RM #1254846 Align burst control with libibverbs new API + - RM #1279684 Modify TAP interface name + - RM #1226478 Add valgrind detection macro to ring_eth_direct + - RM #1279599 Update License date to 2018 +Fixed: + - RM #1280126 Inline private function in ring_bond_eth_netvsc + - RM #1279340 Fix Ubuntu 17.10 issue in build_deb.sh + +Version 8.5.4-0: +Date + Time 2018-01-28 +============================================================= +Fixed: + - RM #1279344 Fix gcc7 format-truncation warning + +Version 8.5.3-0: +Date + Time 2018-01-27 +============================================================= +Added: + - RM #1262610 Improve handle for 5t rule + - RM #1262610 Add free htid detection + - RM #1262610 Add VMA_MSG_FLOW processing + - RM #1262610 Inform daemon about flows + - RM #1272182 Add TAP statistics + - RM #1160029 Add ring_bond_eth_netvsc + - RM #1160029 Detect NetVSC interfaces + - RM #1277704 Cleanup redundant poll_sn occurrences + - RM #1277085 Update recommended installation command + - RM #1265099 Add get ib_ctx object from interface name + - RM #1272179 
Arrange Ring statistics + - RM #1272183 Clean stats_printer.cpp + - RM #1248968 Add support for IP_TOS in setsockopt + - RM #1254846 Add support for packet packing burst control + - RM #1254784 Add support for pcp setting in setsockopt + - RM #1254782 Merge kernel setsockopt from TCP and UDP sockets + - RM #1254780 Minor changes in flow_table operators + - RM #1264431 Detect device port without rdma_cm + - RM #1264919 Improve vma.service unit for inbox + - RM #1259552 Optimize global get buffers procedure + - RM #1248705 Add extra api to get ring TX fd from socket +Fixed: + - RM #1262610 Resolve issue from static analysis tools + - RM #1262610 Workaround for tc u32 table deletion issue + - RM #1262198 Fix epoll_wait returns fd EPOLLOUT event twice + - RM #1226478 Fix direct QP and CQ destruction + - RM #1263120 Fix compilation if libibverbs doesn't use libnl + - RM #1172298 Compile Sockperf for Jenkins against actual libvma + - RM #1226478 Fix SQ buf pointer + +Version 8.5.2-0: +Date + Time 2018-01-11 +============================================================= +Added: + - RM #1262144 Print warning when extra api functions are not supported + - RM #1256248 Improve debian package build script + - RM #1226478 Add direct-ring creation and HW descriptors logic + - RM #1226473 Expose network header using new extra_api + - RM #1248697 Make profiles unique in the map + - RM #1178934 Add new structs for external ring type + - RM #1250147 Reorder netlink event registration at startup + - RM #1055916 Add support for Pattern Klm in MP-RQ + - RM #1151606 Modify valgrind suppression file + - RM #1069965 Remove ldconfig from installation + - RM #1171143 Add write combining barrier for Blue Flame send +Fixed: + - RM #1256248 Fix systemd mode in debian package + - RM #1247358 Fix crash when VMA could not resolve neighbor + - RM #1258245 Fix SYN messages are sent to invalid peer + +Version 8.5.1-0: +Date + Time 2017-12-24 
+============================================================= +Added: + - RM #1172298 Use --enable-socketxtreme configuration in jenkins + - RM #1172298 Add failure message for --enable-vmapoll + - RM #1172298 Rename inside tests + - RM #1172298 Rename vmapoll internal functions + - RM #1172298 Rename vmapoll interfaces + - RM #1172298 Use --enable-socketxtreme option + - RM #1214066 Add support for route and rule extended table id + - RM #1242606 Disable TCP congestion control using VMA parameter + - RM #1148076 Remove volatile when handling CQE in MP_RQ + - RM #1214451 Add systemd support for daemon + - RM #1056823 Improve build configure structure + - RM #1214453 Improve selection of filesystem monitor method + - RM #1235810 Allow to override build date + - RM #1168028 Add support for static ARP entries + - RM #1080998 Optimize ib_ctx_handler class +Fixed: + - RM #1248185 Revert some vma_extra sockeXtreme names change + - RM #1247022 Fix typo in CQ struct debth->debt + - RM #1235797 Not respond to incoming FIN in the listen state + - RM #1229443 Not respond to incoming RST in the listen state + - RM #1214453 Fix daemon for powerpc + - RM #1214453 Fix issue in inotify processing + - RM #1222660 Correct warning check for flow steering + - RM #1224020 Fix typos in README.txt + +Version 8.4.10-0: +Date + Time 2017-11-23 +============================================================= +Added: + - RM #1207424 Send quickack depending on the payload size + - RM #1179641 Add MTU find logic to ring + - RM #1190606 VMA stats - Remove timewait interval in last cycle +Fixed: + - RM #1192017 Fix ibv_destroy_cq() failure while VMA_TCP_3T_RULES=1 + - RM #1213984 Use dst_enty to find the route MTU + - RM #1175479 Inherit missing properties from the parent + - RM #1175657 Fix ring statistics are not updated was flowtag is enabled + - RM #1201040 Fix VMA_BUFFER_BATCHING_MODE is ignored while set from spec + +Version 8.4.9-0: +Date + Time 2017-11-13 
+============================================================= +Added: + - RM #1179641 Override MTU from route over interface MTU + - RM #1179641 Add parsing route table metrics + - RM #1190054 Rename VMA_SPEC parameter 6973 to 7750 + - RM #1176937 Add VMA_TCP_QUICKACK environment variable + - RM #1176937 Add TCP_QUICKACK option support + - RM #1176937 Add delayed ack control into LWIP + - RM #1151606 Modify valgrind suppression file + - RM #1182826 Check if the module parameters exists + - RM #847360 Add Coverity badge to README.md + - RM #1172255 Disable parser warning coverity checker +Fixed: + - RM #1182981 Fix TCP zero-window probe message sequence number + - RM #1181379 Fix TCP zero window probe when there is data in-flight + - RM #1185978 Fix compilation for --enable-opt-log=high + - RM #1164732 Fix possible deadlock during connect + - RM #1185372 Fix traffic is not offloaded for high VMA_RX_WRE values + +Version 8.4.8-0: +Date + Time 2017-10-10 +============================================================= +Added: + - RM #1163086 Remove pkg.m4 from sources + - RM #1156840 Add TCP Tx window availability +Fixed: + - RM #1160823 Fix a crash when reg_mr fails + +Version 8.4.7-0: +Date + Time 2017-10-08 +============================================================= +Added: + - RM #1151343 Update MP RQ dropped packet statistics in case of bad CSUM + - RM #1150137 Remove VMA_THROW from functions signature +Fixed: + - RM #1150011 Fix ioctl() FIONREAD return value + - RM #1154650 Fix IP fragmentation threshold + - RM #1152517 Fix flags order to compile VMA with O3 + +Version 8.4.6-0: +Date + Time 2017-09-27 +============================================================= +Fixed: + - RM #1153175 Modify On Device Memory reg_mr API according to libmlx5/libibverbs + - RM #1150366 Align memory barrier to verbs's one + - RM #1151606 Modify valgrind suppression file + - RM #1147574 Fix missing argument in modify_qp_to_ready_state() + - RM #1148076 Remove volatile when 
handling CQE + +Version 8.4.5-0: +Date + Time 2017-09-19 +============================================================= +Added: + - RM #1065079 Add On Device Memory support + - RM #1117626 Move epoll's poll_os logic to the internal thread + - RM #1073223 Enable Debian package timestamp +Fixed: + - RM #1088208 consider cache_line when getting CQE + - RM #1137302 Solve gcc7.x [deprecated] + - RM #1137302 Solve gcc7.x [format-overflow] + - RM #1137302 Solve gcc7.x [array-bounds] + - RM #1137302 Solve gcc7.x [implicit-fallthrough] + - RM #1062367 Fix TCP window deadlock in case of a small window + - RM #1134718 Fix lack of interface reinitialization (Tx PRM implementation) + - RM #1141586 Fix Valgrind issue - uninitialized byte(s) + - RM #1041212 Fix wrong handling of UDP setsockopt() using UDP_MAP_REMOVE + +Version 8.4.4-0: +Date + Time 2017-09-10 +============================================================= +Added: + - RM #1081001 Add netmap API VMA wrapper + - RM #1089826 Add statistics to Multi-Packet rings +Fixed: + - RM #1131365 Update output example and default parameters in README.txt + - RM #1096849 Fix for the first packet checksum in case VLAN trunk + - RM #1127632 Remove ambiguous line from insufficient hugepage warning + - RM #1079751 Fix possible crash when migration occurs + - RM #1114239 Fix compilation error when configuring using disable-exp-cq + - RM #1094542 Reduce interval for syncing time_converter clock parameters + - RM #1117819 Nullify application name after freeing its pointer + +Version 8.4.3-0: +Date + Time 2017-08-14 +============================================================= +Fixed: + - RM #1115999 Fix DUMMY_SEND packets in new PostSend PRM + +Version 8.4.2-0: +Date + Time 2017-08-08 +============================================================= +Added: + - RM #1097641 Add a dummy packet send during getsockname() + - RM #1055917 Enable dynamic signaling for Tx PostSend PRM + - RM #1055917 Enable dynamic WQE size inlining in BlueFlame + 
- RM #1055917 Amend VMAPOLL Tx to use MLX5 PostSend PRM + - RM #1055917 Enable TX Post-Send PRM for non-VMAPOLL + - RM #1055917 Add scatter/gather array helper + - RM #933949 Add KVM virtualization detection + - RM #1049758 Support ioctl() with FIONREAD flag + - RM #1061103 Improve performance of epoll implementation + - RM #1080207 Add an option to gracefully exit() on startup failure + - RM #1063479 Add valgrind detection macros + - RM #1078695 Disable ring migration for RING_LOGIC_PER_USER_ID + - RM #1049980 Eliminate coverity-8.7 failures + - RM #1063164 Update License date to 2017 + +Fixed: + - RM #884440 Fix handling of SO_LINGER in case l_linger is zero + - RM #863457 Change logic of interface selection in attach receiver + - RM #1097116 Reduce amount of log messaging for DBG level + - RM #1085670 Fix VMA_POLL crash when exiting + - RM #1043830 Add set of gtest cases + - RM #1043830 Fix flags parameter verification for UDP flow + - RM #1043830 Fix incorrect setting EEXIST in global_ring_wakeup + - RM #1043830 Fix error processing for send operations + - RM #1079098 Fix array out of range in vma_extra api + - RM #1063479 Fix some Valgrind issues in VMA + - RM #1075811 Fix csbuild reported errors (one error is suppressed) + +Version 8.4.1-0: +Date + Time 2017-07-04 +============================================================= +Added: + - RM #959307 Link VMA with libnl1 or libnl3 based on libibverbs + - RM #1050652 Improve Jenkins - increase csbuild detection timeout, upgrade to cppcheck ver 1.79 +Fixed: + - RM #1065276 Fix incorrect logging strings + +Version 8.3.7-0: +Date + Time 2017-06-19 +============================================================= +Fixed: + - RM #1055915 Fix tcp performance related to MP_RQ + +Version 8.3.6-0: +Date + Time 2017-06-14 +============================================================= +Added: + - RM #953382 Support Precision Time Protocol (PTP) + - RM #788404 Support Packet Pacing (socket option SO_MAX_PACING_RATE) + - RM 
#1048224 Improve performance while searching for non-offloaded data + - RM #1055915 Improve performance under MP-RQ mode + - RM #1052605 Increase monitored epoll fds, CQs and RINGs in vma_stats + - RM #1061898 Disable flow tag when TCP GRO is used + +Fixed: + - RM #1041107 Fix flow_tag handling on VLAN interfaces + - RM #1031172 Fix Checkmarx security application reported issues + +Version 8.3.5-0: +Date + Time 2017-05-25 +============================================================= +Fixed: + - RM #1050059 Fix corruption in the net device ring map + +Version 8.3.4-0: +Date + Time 2017-05-24 +============================================================= +Added: + - RM #1014125 Support cyclic buffer handling of multi packets based on HW Striding-RQ + - RM #1014125 Separate buffer memory allocation from buffer pool logic + - RM #968024 Improve latency with PRM implementation for poll cq + - RM #1037898 Reduce latency max numbers (high percentile) by avoiding malloc calls at runtime + - RM #1026406 Add libvma-debug.so installation (supporting max log level FUNC_ALL) + - RM #998528 Verify --enable-vmapoll configuration is supported by OFED + - RM #959833 Do not offload ConnectX-3 interfaces in vmapoll mode + - RM #1023040 Show VMA Git number in VMA banner at startup + - RM #1025933 Add -Wundef flag to VMA compilation + - RM #1036427 Print driver (OFED) version in VMA banner + - RM #847360 Remove Coverity badge from README.md file + - RM #1010036 Update VMA_TCP_TIMESTAMP_OPTION description in README.txt + - RM #1042006 Delete redundant files from libvma home directory + +Fixed: + - RM #962481 Fix cqe error under vmapoll mode + - RM #1008712 Fix mutual exclusion access in ring bond + - RM #1027871 Fix valgrind uninitialized struct epoll_event + - RM #1037215 Fix autotool detection of rdma_lib_reset + - RM #1020667 Fix VMA_CQ_POLL_BATCH_MAX incorrect value in README.txt + - RM #1043382 Fix VMA_PROGRESS_ENGINE_WCE_MAX value in README.txt + +Version 8.3.3-0: +Date + Time 
2017-04-25 +============================================================= +Added: + - RM #1030331 Update global_sn update in cq_mgr_mlx5 class + +Fixed: + - RM #1030299 Fix wrong consumer index in request notification + - RM #1027659 Fix timestamp cqe conversion in prm poll + - RM #1030134 Fix RX UDP hardware timestamp calculation + - RM #1029468 Fix crash in cq_mgr_mlx5 destructor + - RM #1029287 Fix RX UDP local if in buffer descriptor + +Version 8.3.2-0: +Date + Time 2017-04-20 +============================================================= +Added: + - RM #968024 Improve latency by direct cq polling instead of interfacing libibverbs + - RM #1006374 Optimize Tx data path - prepare in advance TCP next buffer and segment + - RM #1006374 Optimize Tx data path - remove redundant variable + - RM #1006374 Optimize Tx data path - reorder ring and qp_mgr::send() operations + - RM #1025129 Reduce TCP lock contention in case of high rate polling + - RM #1023714 Remove redundant if statement from dst_entry_udp fast path + - RM #1023040 Show VMA Git number in VMA banner at startup + - RM #996086 Add configuration option --enable-opt-log levels (max VMA log level is DEBUG) + - RM #1011829 Add VMA_TCP_NODELAY parameter (Nagle algorithm) + +Fixed: + - RM #1005856 Fix MC flow_tag id when flow_tag is not supported + +Version 8.3.1-0: +Date + Time 2017-03-28 +============================================================= +Added: + - RM #933410 Support IPoIB on ConnectX-4 + - RM #968200 Support Flow tag for vmapoll mode + - RM #968200 Support flow tag for multicast flows + - RM #987802 Support Flow tag for non-VMAPOLL modes + - RM #996018 Add README file for GTEST suite + +Version 8.2.10-0: +Date + Time 2017-03-22 (branch: fast_update_release_20032017) +============================================================= +Fixed: + - RM #922411 Fix usage of AC_SUBST in rdma_lib_reset() detection + +Version 8.2.9-0: +Date + Time 2017-03-21 (branch: fast_update_release_20032017) 
+============================================================= +Added: + - RM #964022 Optimize vma_list.h for performance + - RM #972524 Optimize data-path by removing unnecessary if statements + - RM #973211 Optimize buffer pool code + - RM #981372 Optimize mem_buf_desc_t class + - RM #987687 Request buffer for next packet after send_ring_buffer() + - RM #965720 Add new specification VMA_SPEC=multi_ring_latency + - RM #972524 Protect pointer access to OS API + - RM #981342 Remove buffer disorder statistics + - RM #964360 Reduce page fault events in mlx5_send() call + - RM #968751 Add boolean configuration parameter VMA_RX_POLL_ON_TX_TCP + - RM #969414 Set log severity to ERROR (was DEBUG) in case of insufficient buffers in pool + - RM #953380 Support dynamic routing in case of "add route" event + - RM #910917 Return error on socket API if exception occurred + +Fixed: + - RM #922411 Fix fork() in child process + - RM #922411 Fix rdma_lib_reset() auto detection + - RM #957729 Fix potential race condition in vma_stats + - RM #906042 Fix VMA errno when reaching TCP max backlog to match OS + - RM #973194 Fix configure on parameters that caused compilation errors + - RM #987687 Fix request buffer for next packet after send_ring_buffer() + - RM #1003524 Fix fanotify_init() failure on aarch64 architecture + +Version 8.2.8-0: +Date + Time 2017-01-25 +======================= +Added: + - RM #954841 Update README.txt file with last modification date + +Fixed: + - RM #962718 Fix peer-notification daemon removal on Ubuntu 14.10 + - RM #955209 Fix vma_extra.h zero-copy example + +Version 8.2.7-0: +Date + Time 2017-01-18 +======================= +Added: + - RM #954841 Document VMA_SPEC "latency" in README.txt file + +Fixed: + - RM #946914 Fix epoll_create() should return an error if size is 0 + +Version 8.2.6-0: +Date + Time 2017-01-08 +======================= +Added: + - RM #850311 Update log info when link is down on VMA bringup + - RM #802922 Added TCP server-client window 
size exerciser test + +Fixed: + - RM #943551 Fix received UDP vlan over bond (fail_over_mac=1) prints an error + +Version 8.2.5-0: +Date + Time 2016-12-28 +======================= +Added: + - RM #784338 Add VMA gtest (vmapoll API) + - RM #784338 Add testbed application (vmapoll API) + - RM #784338 Extend jenkins tests (vmapoll API) + - RM #865066 Use one vma_extra.h file for both vmapoll and legacy socket API + - RM #850311 Add support for socket offload even if link is down on VMA bringup + - RM #850270 Add VMA_TCP_CTL_THREAD=2 parameter under spec 6973 + - RM #925571 Add textual vma spec support and a new latency spec + - RM #865066 Merge experimental branch (vmapoll API) into master branch + +Fixed: + - RM #924683 Fix src addr when sending MCAST with IP_MULTICAST_IF option + - RM #896067 Fix remove ring event during socket closure (vmapoll API) + +Version 8.2.4-0: +Date + Time 2016-12-27 +======================= +Added: + - RM #933236 Improve peer-notification debian package removal procedure + +Version 8.2.3-0: +Date + Time 2016-12-26 +======================= +Fixed: + - RM #768398 Fix peer-notification vma daemon package removal under debian + +Version 8.2.2-0: +Date + Time 2016-12-20 +======================= +Added: + - RM #768398 Peer-notification daemon implementation (on half-open TCP connections) + - RM #842842 Move udp rx hw timestamp conversion from cq_mgr to sockinfo_udp + - RM #825055 Implement "Dummy Send" for improving low msg rate latency + +Fixed: + - RM #910917 Fix vma crash after installing libvma from GitHub on RH7.3 inbox drivers + +Version 8.2.1-0: +Date + Time 2016-12-14 +======================= +Added: + - RM #897296 Advanced routing update + - RM #866429 Add HW and SW timestamps to user callback data + +Fixed: + - RM #898505 Fix connecting twice to dest IP through non-exiting GW causes seg fault + - RM #897296 Fix resolving the correct outgoing interface for route decision + - RM #888475 Fix core dump for IB pkeys under bond + - RM #888475 
Fix IB not offloaded in case of two bonds over pkeys + - RM #865172 Fix illegal memory access by netlink wrapper. + - RM #911076 Fix rcvmsg failure with MSG_VMA_ZCOPY_FORCE on OS socket + - RM #824880 Fix gtest compilation for gcc 4.4.7 + +Version 8.1.7-0: +Date + Time 2016-09-21 +======================= +Added: + - RM #824880 Add Google Test based infrastructure + +Fixed: + - RM #857641 Fix setsockopt with IGMPv3 IP_DROP_SOURCE_MEMBERSHIP option + +Version 8.1.6-0: +Date + Time 2016-09-16 +======================= +Fixed: + - RM #864201 Fix double freeing of pbuf in LWIP split function + +Version 8.1.5-0: +Date + Time 2016-09-08 +======================= +Added: + - RM #749133 IGMPv3 - add support for 'Source Specific Multicast' + - RM #770746 Add csbuild support into jenkins + - RM #770746 Update jenkins with valgrind support + - RM #848172 Code updates to comply with Coverity 8.5 version + +Fixed: + - RM #846042 Fix failure to receive fragmented multicast packets over ConnectX3-IB + - RM #843662 Fix log WARNING to DEBUG in case not an mlx4 driver on IPoIB interface + - RM #844374 Fix error handling while registering a socket with plural epoll instances + - RM #847025 Fix extra API tests failing on make with vlogger.h not found + +Version 8.1.4-0: +Date + Time 2016-08-24 +======================= +Added: + - RM #847360 update github's README.md file + - RM #848239 Remove unused utility functions + +Fixed: + - RM #848172 Fix multiple Coverity warnings (potential leaks, dead code, potential garbage values) + +Version 8.1.3-0: +Date + Time 2016-08-18 +======================= +Added: + - RM #786030 Revise RDTSC logic to consider CPU speeds taken from /proc/cpuinfo + - RM #801695 VMA out-of-box sockperf tuning for low latency (VMA_SPEC=10) + - RM #825799 Add locking to setsockopt() TCP to protect against pcb changes + - RM #847904 On bringup check if an interface is a bond by looking at its base name + +Version 8.1.2-0: +Date + Time 2016-08-15 +======================= 
+Added: + - RM #826700 Act gracefully when running on upstream drivers for non supported features + - RM #802922 LWIP - Split head of TCP unsent segment when bigger than send window size and NODELAY flag is set + -RM #825888 Dump fd statistics into VMA logger using vma_stats + - RM #840733 Avoid using mce_sys global variable in fast path to minimize cache hit and improve performance + - RM #822441 Support SW checksum validation for ingress TCP/UDP IP packets + - RM #803783 Allow local TCP connections for offloaded bounded listen sockets + - RM #828809 Control the number of post sends until requesting completion SIGNAL + - RM #825842 Add option for LWIP debug printing + - RM #841302 Coverity enhancements. Add support for ccpcheck and clang + - RM #827622 Add jenkins support for inbox + +Fixed: + - RM #822562 Fix segmentation fault in vma_stats when exceeding fds string length + - RM #817046 Coverity fixes + - RM #836374 Fix exp-cq help message when executing "./configure --help" + +Version 8.1.1-0: +Date + Time 2016-07-19 +======================= +Added: + - RM #775233 Allow excluding logging levels to improve performance + - RM #781540 Wait for Rx WQE's when closing RING (QP) + - RM #777358 Un-offload UDP socket that is bound to any non-offloaded interfaces (e.g. loopback 127.0.0.1) + - RM #816006 TCP send flows for non-blocked socket - add Rx CQ poll + - RM #820855 Explicitly set QP init attr cap of max send SG to 2 + - RM #820858 Update message regarding no steering rules in mlx4 to adapt to upstream drivers + - RM #804206 DNS server (UDP,port 53) to be handled by OS by default (libvma.conf) + - RM #786406 Jenkins - add building of source RPM + - RM #823037 Replace deprecated API (e.g. 
inet_addr) with ipv6 supporting API (e.g inet_pton) +Fixed: + - RM #783976 Fix VMA not able to receive data on epoll_wait() + - RM #806219 Fix segfault when running with CTL_THREAD_NO_WAKEUP + - RM #793030 Fix rpm rebuild failure + - RM #784234 Fix epoll is not cleared from vma_stats + +Version 8.0.4-0: +Date + Time 2016-05-29 +======================= +Added: + - RM #781384 add log details on UDP setsockopt(SO_BINDTODEVICE) and add python test code + +Fixed: + - RM #781967 Fix ./configure failures on some distros - add pkg.m4 into distribution + - RM #775440 Fix 'make distclean' breakage + +Version 8.0.3-0: +Date + Time 2016-05-18 +======================= +Added: + - RM #770746 Jenkins - Modify valgrind suppresion file + - RM #770746 Jenkins - Extend jenkins check + +Fixed: + - RM #778470 Fix segfault in sockperf pingpong, TCP socket on management interface, epoll + - RM #778287 Fix usage of get_env_params() method in mce_sys_var struct (broke version 8.0.2-0) + +Version 8.0.2-0: +Date + Time 2016-05-15 +======================= +Added: + - RM #763018 Search neigh table using both dst addr and ifindex + - RM #682675 Convert m_ready_fds (epoll_info.h) from std::tr1::unordered_map to vma_list + - RM #768358 Convert socket_fd_vec (epoll_wait_call.cpp) from std::vector into vma_list + - RM #776821 Support multiple node names in vma_list + - RM #764912 Expand vma_stats to include Send-Q & Recv-Q + - RM #737508 Add internal thread TCP timer handling mode: immediate/deferred + - RM #764913 Add vma_stats -v 5 PID column (netstat like) + - RM #736343 Add "--disable-exp-cq" parameter to control CQ polling using experimental verbs API + - RM #381520 Disable VMA's CQ interrupt moderation logic if missing MLNX_OFED's experimental CQ moderiation API + - RM #770746 Add jenkins check support on github pull request + - RM #773091 Add Intel compiler support + - RM #777322 Update logging on iomux flows + - RM #777958 Document limitation of using VMA_RING_ALLOCATION_LOGIC_RX|TX with 
select()/poll()
+ - RM #735940 Skip 'OFED Version' logging in VMA banner if ofed_info is not supported
+
+Fixed:
+ - RM #756851 Fix TCP WIN SCALE incompatibility with RFC-1323, section 2.2/2.3
+ - RM #774912 Fix realloc size parameter in event_handler_manager class
+ - RM #777063 Fix usage of safe_mce_sys() calls instead of unsafe mce_sys calls
+
+Version 8.0.1-0:
+Date + Time 2016-03-10
+=======================
+Added:
+ - RM #747132 Change VMA license from "GPLv2 or commercial" to "GPLv2 or BSD" (dual license).
+ - RM #736034 Prepare Fedora 24 RPM spec file
+ - RM #748216 remove (old) non-scaled rcv window support from VMA
+ - RM #675823 return ENOMEM if select() uses 'nfds' larger than the system's FD_SETSIZE
+ - RM #736029 Add VMA limits file "30-libvma-limits.conf" to make install
+ - RM #688734 BuildRequires should be flexible for environments with parallel versions of libnl-devel
+ - RM #682871 specify rpm build dependency on devel packages
+
+Fixed:
+ - RM #692387 Fix vma_stats does not use the right shared memory for cleaning. 
+ - RM #690836 Fix ib1 (second IB interface, port 2) is not recognized + - RM #739323 Fix crash when running in DEBUG and interface does not have an ip address + - RM #737218 Fix zero_copy_rx() handling of iov scattered data + - RM #740071 Fix TCP crash when executing setsockopt() SO_RCVBUF on listening socket + - RM #688876 fix misleading user message about libnl package + - RM #688877 fix wrong package name librdma_cm-devel in error message + - RM #742951 Fix GCC 6.0.0-0.13 compilation errors + +Version 7.0.14-0: +Date + Time 2016-01-14 +======================= +Added: + - RM #682804 full support for IP/UDP/TCP software checksum in environments where HW checksum offloading is not supported + - RM #678967 rpms updates to comply with rpmlint + - RM #568607 remove config_parser.y (in addition to removal of config_scanner.l) from compilation + - RM #541581 removed debug log that created false alarm at exit of application + +Version 7.0.13-0: +Date + Time 2016-01-09 +======================= +Added: + - RM #678967 Updates based on Fedora and Redhat submission guidelines + - RM #682804 Make VMA compile with infiniband group upstream where TX/RX checksum offload is missing (Fedora 23) + - RM #635537 Restore default of VMA_EXCEPTION_HANDLING=-1 (log debug) to + increase cases of offloaded traffic + - RM #541581 add debug log of wqe RX checksum errors +Fixed: + - RM #568607 Avoid lex warning: push config_scanner.c based on Flex 2.5.39 and remove config_scanner.l from compilation + - RM #651946 Fix releasing of used RX buffers without socket lock + +Version 7.0.12-0: +Date + Time 2015-12-24 +======================= +Added: + - RM #635537 Change default of VMA_EXCEPTION_HANDLING=0 - log debug and try recovery + - RM #651946 improve RX/TX stddev and latency for UDP and TCP sockets + - RM #661421 support log level input as strings + - RM #652089 Add FORTIFY definition to CFLAGS/CXXFLAGS in configure and makefile + - RM #668321 avoid lex -Werror issues by generating correct C 
source files in advance
+ - RM #631197 create log severity "details" (between info and debug)
+
+Fixed:
+ - RM #623252 Fix make dist/clean for lex files errors (due to FORTIFY)
+ - RM #659575 Fix a crash when closing the process while other threads
+ are waiting on some sockets
+
+Version 7.0.11-0:
+Date + Time 2015-11-17
+=======================
+Fixed:
+ - RM #647577 global ctors are called after library constructor results
+ in unsafe values
+ - RM #618523 LAG bond receive double amount of multicast traffic if
+ LACP not working
+ - RM #647962 Fix a crash when all bond's slave ports are taken down
+ and then up
+ - RM #649148 Fix a segfault in vma_stats -v 4
+ - RM #648405 Issue WARNING - vlan over bond while fail_over_mac=1 is
+ not supported, fallback to OS
+
+Version 7.0.10-0:
+Date + Time 2015-11-08
+=======================
+Fixed:
+ - RM #623139 In neighbor rdma_resolve_addr, pass src if multicast
+ - RM #641777 fix compilation issue on ubuntu 15.10
+ - RM #553389 handle failures due to "no file descriptors"
+ - RM #618620 Bond LAG - Wrong hash with xmit_hash_policy=1 (L3+L4)
+
+Version 7.0.9-0:
+Date + Time 2015-10-26
+=======================
+Added:
+ - RM #591755 Handling of socket API exception flows
+ - RM #630037 Build RPM/DEB support
+
+Fixed:
+ - RM #623139 handle addr_resolve() in neighbor
+ - RM #632455 Handle correctly msg_flag returned by recvmmsg()
+ - RM #VMA error when running getsockopt with an invalid level and an
+ invalid optlen
+ - RM #632654 should initialize all globals in fork()/daemon()
+ - RM #Fix make dist
+
+Version 7.0.8-0:
+Date + Time 2015-10-15
+=======================
+Added:
+ - RM #591741 Support bonding LAG
+ - RM #591741 support vlan over bond
+ - RM #591738 Support UDP RX HW time stamping
+ - RM #591764 Make VMA_MTU not needed
+ - RM #610758 optimize UDP TX
+ - RM #611344 Align checksum offload with latest libibverbs 1.2.0
+ - RM #612658 Replace PANIC at VMA startup with graceful resource releasing
+ - RM #591747 
remove sockperf from VMA rpm/deb packaging + - RM #618636 Support TCP Timestamp option + +Fixed: + - RM #606490 possible race in neigh_entry state machine + - RM #612231 TCP accept() call does not exit when socket is closed + - RM #592216 Fix a crash when FD limit is less than required for initialization + - RM #612841 calculate MTU of VLAN interface rather than VLAN base interface + - RM #618624 TCP recv() might keep spining due to uninitialized member + +Version 7.0.7-0: +Date + Time 2015-9-8 +======================= +Added: + - RM #591755 infrastructure for exception flow in vma + - RM #601270 optimize accept() call + +Fixed: + - RM #559317 duplicate tcp port when binding to INPORT_ANY while REUSEADDR is on + - RM #590813 MC ADD_MEMBERSHIP status to follow OS failure + - RM #599332 Fix install libvma.conf for PREFIX directory + - RM #599814 deadlock between epoll_wait and socket add/del from epfd + - RM #601200 use static cpuset allocation instead of dynamic + +Version 7.0.6-0: +Date + Time 2015-9-2 +======================= +Added: + - RM #541581 Support RX CSUM verification offload - align to MLNX_OFED 3.1 + - RM #597700 Handle VMA exception flows - patch 1 + +Fixed: + - RM #577677 backlog params will now be taken from sysctl_reader.h + - RM #589972 provide info to the user if ulimit -n is too small + - RM #563682 fix a crash when ibv_create_flow failed + - RM #596462 fallback to OS in case ibv_create_flow failed + - RM #597449 Do not load vma resources if not required + +Version 7.0.5-0: +Date + Time 2015-8-11 +======================= +Added: + - RM #561857 Support non-standard vlan names + - RM #564158 provide 'sysctl' services for the rest of VMA objects to access + - RM #557948 Option to avoid system calls on tcp fd + - RM #562770 improve connection establishment rate + - RM #582394 Support flow-steering priority scheme on ConnectX-4 + - RM #587515 Adapt vma to ConnectX-4 send inline demands + - RM #531820 Modify the syn throttling feature + +Fixed: + - RM 
#565962 Wrong TCP backlog handling + - RM #349852 fix cubic congestion control algorithm + - RM #549313 neigh post_send_tcp, fix check buffer validity + - RM #559589 Close on exec for netlink sockets, to fix crash on exec + - RM #554834 Fix issue when tcp header bigger than packet length + - RM #575582 Wrong handling of TCP window scale option + - RM #575697 VMA can't send FIN while snd_buf is full + - RM #552441 fix a possible crash in case of multithread udp recvfrom + - RM #565588 fix SO_LINGER functionality different than OS + - RM #576497 for upstream RH7.2 look at interface dev_port and not dev_id + - RM #559317 Continue processing CQ for ready FD after failure on one of them + - RM #560899 tcp socket hang in syn_recv state + - RM #501206 fix SO_RCVBUF and SO_SNDBUF for TCP + - RM #581915 incorrect errno when giving getsockopt an invalid level value + - RM #588042 Adjust tcp receive window to the socket receive buffer size + +Version 7.0.4-0: +Date + Time 2015-7-8 +======================= +Fixed: + - RM #550714 fix rx buffer leak in case packet is coming when tcp state is "closed" + - RM #550714 enable reclaim buffers for tcp socket from internal thread in "no batching" mode + +Version 7.0.3-0: +Date + Time 2015-7-8 +======================= +Added: + - RM #561831 add a script for testing epoll and poll returned events + - RM #561831 Support returning error events for poll + - RM #560658 support SO_LINGER socket option + +Fixed: + - RM #560210 periodic monitoring of active slave changes on bond interfaces + to support proper device removal and bond events without rdma_cm + - RM #561831 fix error events for epoll + - RM #545457 fix libnl rpm&deb dependency + - RM #560940 fix compilation warnings + - RM #547669 "netstat like" vma_stats to work when no process running + +Version 7.0.2-0: +Date + Time 2015-6-26 +======================= +Fixed: + - RM #545457 libnl dependency - fix headers mismatch + - RM #560835 crash in tcp socket when calling recv with 0 length 
buffer + - RM #557948 Avoid system calls on tcp fd - fixes + +Version 7.0.1-0: +Date + Time 2015-6-24 +======================= +Added: + - RM #557948 Option to avoid system calls on tcp fd + - RM #549313 Improve latency spikes in connection establishment - continue + - RM #58689 Add buffer pool statistics to vma_stats + - RM #555850 keep hash on vma_stats header and check for version + compatibility based on it + - RM #Convert std::deque into an improved version of Linux kernel list + - RM #545457 libnl dependency - add libnl3 + - RM #50714 expand vma batching mode to reclaim unused buffers + +Fixed: + - RM #559303 Wrong handling with the reference count of the memory buffers + - RM #543246 Clean VMA Checkmarx vulnerability report + - RM #558710 Non-blocked TCP connection attempt to non-existing server + does not wake up with failure + - RM #550714 VMA stops accepting connections after large number of connections + +Version 7.0.0-0: +Date + Time 2015-6-09 +======================= +Added: + - RM #549313 Improve latency spikes in connection establishment + - RM #553380 Improve high jitter on close + +Fixed: + - RM #550706 vlan not using the naming conventions + - RM #548134 error: ~sockinfo() not all buffers were freed for TCP + - RM #554592 connection hangs in closed state when using epoll + - RM #548131 cqe status is Local Protection Error + - RM #551630 VMA not reading routing tables properly + +Version 6.9.1-0: +Date + Time 2015-5-21 +======================= +Added: + - RM #546272 - vma_stats (-v 5) shows netstat like view + (particularly similar to 'netstat -tunaep') + +Version 6.9.0-0: +Date + Time 2015-5-10 +======================= +Added: + - RM #543022 handle DEVICE_FATAL event to support hot-unplug + - RM #541581 Support RX CSUM verification offload + - RM #533217 Support creating vma_stats shared memory files in a given + directory + - RM #531820 Add syn/fin throttling support + - RM #501215 let the tcp timer run from the user thread when possible + - RM 
#501210 Add retransmission counters
+
+Fixed:
+ - RM #535204 fix crash when there is no route back to syn sender
+ - RM #531810 address resolution for new neighbor block for 1 msec
+ - RM #509794 Don't wait in tcp prepare_to_close for last ack
+ it will be handled by the tcp timer.
+ - RM #515391 if destroy flow failed, continue destroying the rfs
+ object
+ - RM #501206 fix SO_RCVBUF and SO_SNDBUF for TCP
+ - RM #491134 Available send work requests are affected by return time
+ of buffers to pool
+
+Version 6.8.4-0:
+Date + Time 2015-3-24
+=======================
+Fixed:
+ - RM #515391 if destroy flow failed - continue destroying the rfs object
+ - RM #509794 Don't wait in tcp prepare_to_close for last ack it will be
+ handled by the tcp timer
+ - RM #501206 fix SO_RCVBUF for TCP #2
+ - RM #501215 let the tcp timer run from the user thread when possible
+ - RM #501210 Add retransmission counters
+
+ - RM #501206 fix SO_RCVBUF and SO_SNDBUF for TCP
+ - RM #491134 Available send work requests are affected by return time
+ of buffers to pool
+
+Version 6.8.3-0:
+Date + Time 2015-1-6
+=======================
+Fixed:
+ - RM #429310 Fix bug in TCP zero callback DROP case
+
+Version 6.8.2-0:
+Date + Time 2014-12-29
+=======================
+Added:
+ - RM #327504 update sockperf version 2.5.254
+
+Version 6.8.1-0:
+Date + Time 2014-12-29
+=======================
+Added:
+ - RM #327504 update sockperf version 2.5.253
+
+Fixed:
+ - RM #429310 Modify udp_lat test to latest VMA extra API
+
+Version 6.8.0-0:
+Date + Time 2014-12-19
+=======================
+Added:
+ - RM #429310 Support TCP zero copy
+ - RM #434108 extract lwip patches and backport some of latest lwip
+ changes
+
+Version 6.7.2-0:
+Date + Time 2014-8-21
+=======================
+Fixed:
+ - RM #408235 Support for PPC64 on older GCC
+
+Version 6.7.1-0:
+Date + Time 2014-8-6
+=======================
+Added:
+ - RM #408075 Add support for ARM64
+ - RM #408235 Add support for PPC64
+ - RM #408072 Align to upstream 
ETH flow steering + - RM #407928 Add support for Ubuntu14.04 + - RM #407742 Add support for routing rules and secondary tables + +Fixed: + - RM #401603 VMA_SPEC=29 should not set tx bufs + - RM #390002 loading libhugetlbfs with libvma crash on startup + - RM #390019 unclosed listen socket become ready at process shutdown + - RM #389691 shutdown listen socket - freeing a lock by mistake + +Version 6.6.4-0: +Date + Time 2014-4-23 +======================= +Fixed: + - RM #387249 wrong tcp packet count for vma_stats + - RM #387745 VMA_INTERNAL_THREAD_CPUSET causes seg-fault + +Version 6.6.3-0: +Date + Time 2014-4-10 +======================= +Added: + - RM #386387 add vma_spec parameter for memcached + +Fixed: + - RM #375244 Align VMA with latest MLNX_OFED 2.2 experimental verbs - round #2 + - RM #363428 TCP Listen port not seen in vma_stats + - RM #386389 print monitoring warnings only once, and in INFO log level + - RM #385473 Epoll, process interrupts per ring, instead of globaly + for all rings + +Version 6.6.2-0: +Date + Time 2014-3-31 +======================= +Added: + - RM #379150 support for accept4 + - RM #381520 Support adaptive interrupt moderation per CQ + - RM #381708 Add ring statistics to vma_stats + - RM #378076 Allow L2 only rules for ETH MC + - RM #375244 Align VMA with latest MLNX_OFED 2.2 experimental verbs + +Fixed: + - RM #384372 lwip unsent queue might be released to the RX buffer pool + instead of TX + - RM #384370 TCP TX buffer management issues - multiple parallel usage + of the mem_buf_desc->next pointer + - RM #381917 lacking consideration of igmp_max_memberships parameter + - RM #380783 Truncate existing VMA logs + - RM #379169 VMA ERROR is raised when trying to get IPV4 of non + existing interface + - RM #379080 rfs access is not protected in ring dtor + - RM #378103 filter MC loopback on RX flow + - RM #34322 No support for SO_BINDTODEVICE on already offloaded connected TCP socket + +Version 6.6.1-0: +Date + Time 2014-2-25 
+=======================
+Added:
+ - RM #375244 Align VMA with latest MLNX_OFED 2.2 experimental verbs
+ - RM #374399 Add udp_connect rule to libvma.conf
+ - RM #372972 handle SOCK_NONBLOCK and SOCK_CLOEXEC socket() flags
+Fixed:
+ - RM #376409 bind to MC address might lead to not-offloaded TX traffic
+ - RM #372976 multicast dest should ignore gateway when registering neigh
+ - RM #372792 net_device_table_mgr: Remove unnecessary epoll_wait
+ - RM #371374 small modification to UDP RX software packet timestamp support
+
+Version 6.6.0-0:
+Date + Time 2014-2-2
+=======================
+Added:
+ - RM #371374 Add software UDP RX packet timestamp support
+ - RM #371550 improve wakeup mechanism
+ - RM #371337 support multiple cmsg messages
+ - RM #371081 vma stats: add a parameter for number of print cycles
+ - RM #34322 Add support for SO_BINDTODEVICE
+ - RM #368408 allow 3 tuple rules for TCP (dst ip, dst port), instead
+ of 5 tuple rules
+ - RM #368154 enlarge hash-map bucket numbers to 4096 (from 256)
+ - RM #367852 minor performance improvement in flow tuple comparison
+ - RM #367852 minor performance improvement in epoll/epfd
+ - RM #363758 Add the low_pps_tcp_send and time_measurements tests to
+ VMA repo
+Fixed:
+ - RM #360776 epoll_pwait/epoll_wait with bad flow return wrong errno
+ - RM #371207 Seg-fault as a result of a race in ONESHOT timer removal
+ - RM #369921 route table is too small, new limit is 4096
+ - RM #368906 VMA_INTERNAL_THREAD_CPUSET does not work correctly
+ - RM #368905 local loopback (127.0.0.1) try to follow route table and
+ might reach an offloaded interface
+ - RM #368597 calling listen() for the second time when the socket is
+ in ready to listen state, generate VMA PANIC
+ - RM #367849 flow-tuple second ctor parameter list - wrong order
+ - RM #367857 tcp is_readable, ring_map_lock might be unlocked without taking the lock
+ - RM #367755 using "VMA_TIMER_RESOLUTION_MSEC=1000" with TCP causes
+ seg-fault
+ - RM #367744 route entry might 
not have valid route val, which can + lead to seg-fault + - RM #367697 Delete rx channel from global fd collection + - RM #365650 Issues with MLX4_SINGLE_THREADED=1 + - RM #367389 IB pkey interface MTU is not read correctly + - RM #365538 Loading VMA with Redis server give a seg-fault + - RM #364303 bounded socket should send with bounded interface src ip, + even if routing table lead to different one + - RM #364174 vlogger: pass buffer as string in fprintf + - RM #364015 Seg-fault, missing initialization of a key for an + hash-map + - RM #363470 DEADLOCK in TX ring migration + - RM #362475 In ring migration, with limited number of rings, there + might be infinite loop + - RM #362368 In ring migration, buffers from previous rings might be + lost. + +Version 6.5.9-0: +Date + Time 2013-12-23 +======================= +Fixed: + - RM #361117 fixed a typo causing contiguous pages allocation issues + +Version 6.5.8-0: +Date + Time 2013-12-22 +======================= +Fixed: + - RM #361121 Expand raw packet QP error message to include alternatives + - RM #361117 Add configure check for IBV_ACCESS_ALLOCATE_MR + - RM #360776 epoll_pwait/epoll_wait with bad flow generates an error + - RM #360770 route gateway which lead to the same route entry does not work + +Version 6.5.7-0: +Date + Time 2013-12-18 +======================= +Fixed: + - RM #360098 fix coverity errors + - RM #359537 IB, non-blocking TCP connect take long time + +Version 6.5.6-0: +Date + Time 2013-12-11 +======================= +Fixed: + - RM #359314 TCP migration + new TX buffer management - segfault + - RM #359330 seg-fault when trying to close tcp socket in the middle of connect. 
+
+Version 6.5.5-0:
+Date + Time 2013-12-11
+=======================
+Added:
+ - RM #100993 New TX Buffers Management Logic
+ - RM #355605 create timers groups to lower the load of handling many timers
+ - RM #317 Loopback support on Ethernet (Multicast & Unicast)
+Fixed:
+ - RM #357115 VMA does not consider gateway information in route table
+ in case there are multiple source ip for the interface
+
+Version 6.5.4-0:
+Date + Time 2013-12-02
+=======================
+Added:
+ - RM #349852 Add congestion control algorithms to LWIP
+Fixed:
+ - RM #349737 ubuntu13.10 compilation errors
+ - RM #349765 VMA does not support select with more than 1024 sockets
+ - RM #349767 internal thread is not able to handle thousands of requests
+ - RM #349924 wakeup mechanism - race condition when going to sleep
+ - RM #349769 separate TX buffer management from QP WR credits
+ - RM #351353 wakeup mechanism is not fit for multithread access to the same fd
+ - RM #352158 logical resource deadlock - multithread access to the same ring
+ - RM #355154 TCP recv part of sent data and close the socket raises "VMA ERROR"
+
+Version 6.5.3-0:
+Date + Time 2013-11-07
+=======================
+Fixed:
+ - RM #348637 IP_PKTINFO doesn't work properly with multicast
+ - RM #348638 Adding membership for the same socket twice should fail
+ - RM #347702 TCP listen socket accepts socket on offloaded interface, but the
+ accepted socket is not offloaded
+ - RM #340484 Extra API libvma.conf rules - required to end with "\n"
+ - RM #343162 Extra API offload_thread should not affect accepted sockets
+
+Version 6.5.2-0:
+Date + Time 2013-10-21
+=======================
+Added:
+ - RM #343162 Support creating all sockets on a given thread as offloaded/not-offloaded
+ - RM #342335 Support debian packaging from source tarball
+
+Version 6.5.1-0:
+Date + Time 2013-10-15
+=======================
+Added:
+ - RM #327504 update sockperf version 2.5.233
+ - RM #216808 Support VMA compilation on Ubuntu
+ - RM #340484 
improve VMA blacklist - libvma.conf + - RM #339564 support IP_PKTINFO in UDP recvmsg + - RM #333651 improve TCP recv latency +Fixed: + - RM #333393 Multithread, fix IGMP handling + +Version 6.5.0-0: +Date + Time 2013-9-12 +======================= +Added: + - RM #327504 update sockperf version 2.5.232 + +Fixed: + - RM #332451 wrong MACRO for printing IP address + +Version 6.4.11-0: +Date + Time 2013-9-8 +======================= +Added: + - RM #322103 Add GRO support in VMA + - RM #330660 improve low pps latency by prefetching the next packet buffer before poll + - RM #327504 update sockperf version 2.5.231 +Fixed: + - RM #326897 ADD/DEL descriptor many times on same efd failed + - RM #331250 segmentation fault when using VMA_INTERNAL_THREAD_ARM_CQ=1 + +Version 6.4.10-0: +Date + Time 2013-8-20 +======================= +Fixed: + - compatability with new MLNX_OFED 2.0-2.6.8 ABI + - RM #306332 ioct with null parameter generates a core dumped + +Version 6.4.9-0: +Date + Time 2013-8-5 +======================= +Added: + - enable fork support by default (VMA_FORK=1) + +Fixed: + - RM #276253 segmentation fault in igmp handler + +Version 6.4.8-0: +Date + Time 2013-7-29 +======================= +Added: + - RM #100793 Support for getsockopt() SOL_SOCKET-SO_ERROR + - RM #227794 add TCP remote side ip+port to vma_stats -v 3 + - RM #216735 OFED 2.0 flow steering ABI change - pass parameters as big endian + - RM #213355 add support for 'make install' + - RM #214079 add "max locked memory" to limits.conf by rpm/deb installation + - RM #213361 Add VMA_RX_WRE_BATCHING parameter + +Fixed: + - RM #229405 return ECONNREFUSED when writing on "failed to connect" TCP socket, instead of ECONNRESET + - RM #227821 IGMP packets are not freed - buffer leak when having IB MC over IPR to a router + - RM #226212 misssing reset send in lwip + - RM #226017 memaslap - segmentation fault when exiting with ctrl+c + - RM #120784 sendto() flag "MSG_OOB " works with VMA on a UDP socket,which shouldn't work 
+ - RM #221942 TCP close socket - buffer leak + - RM #216807 Ubuntu: Two threads, epoll with EPOLLIN that is doesn't triggered on some connections even if there is a data + - RM #216614 ring migration with ring limiter - VMA PANIC when trying to release non-existing key + - RM #216612 segmentation fault when registering more than 32 sockets to the same MC group (ip+port) + - RM #91672 Support for qpid-latency-test (VMA get segmentation fault) + - RM #215175 [BGATE 2408] Shutdown UDP socket returns VMA ERROR + - fixed SLES11.1 compilation issue + +Version 6.4.7-0: +Date + Time 2013-6-24 +======================= +Added: + - RM #202651 support epoll_pwait, pselect and ppoll + - imported VMA_TX_NONBLOCKED_EAGAINS from 6.1 + - RM #206064 - Start multithread support + - RM #100988 Dynamic ring migration + - RM #206655 Ring amount limiter + - RM #212516 remove VMA license + - RM #212407 performance improvment for apps with #threads > #cores + +Fixed: + - RM #94303 reduce our default inline to 220 + - RM #206247 fail to read a big routing table + - RM #199075 TX sendmsg UDP, address struct len not equal to + sizeof(struct sockaddr_in) - VMA crash + - RM #206261 Epoll - epfd_info DEADLOCK + - RM #209004 lock reordering + - RM #6961 VMA does not handle MSG_TRUNC correctly + - RM #122619 error in getsockname + +Version 6.4.6-0: +Date + Time 2013-5-23 +======================= +Added: + - RM #200376 accepted sockets should inherit TCP_NODELAY from listen socket + - RM #198501 Warn the user when there is a mismatch between interface + MTU and VMA_MTU + +Fixed: + - RM #199075 TX sendmsg UDP, IPV6 address struct len - VMA crash + - RM #198500 epoll ctl ADD/MOD epfd with events without EPOLLIN/OUT + fails + - RM #197925 set recv timeout to zero (blocking), recv do + not get blocking and return -1 EAGAIN. 
+ - RM #123608 TCP EPOLL on not offloaded listen socket + - RM #97050 in epoll, poll only the relevent rings instead of polling + all using the global functions + - RM #199819 TCP trying to read/write while asyc-connect should return EAGAIN and not EPIPE/ENOTCONN + - RM #200374 TX while async connect should poll the CQ + +Version 6.4.5-0: +Date + Time 2013-5-09 +======================= +Added: + - RM #195165 Support sendmmsg + - Changed epoll events handling to be able to handle all events and + not only EPOLLIN/OUT. + - RM #122159 Added support for TCP send with MSG_DONTWAIT + - RM #864 add wakeup mechanism to epoll + - RM #101647 - Support Memcached UDP PORT MAP + - RM #4608 vma_stats - add socket type (TCP/UDP) to description + (vma_stats -v 3) + +Fixed: + - RM #190075 ibv_destroy_cq fails with EBUSY (errno=-16) when working + with multicast rules of same ip diff port + - RM #195755 Multicast, buffer panic + - RM #195157 Sockperf end with connection refused when + running client with 1000 TCP sockets for the same interface + - RM #195079 IB, Sockperf PP with 1000 multicast sockets + block when loading VMA in client side + - RM #194815 epoll with EPOLLIN after shutdown RD returns timeout instead of EOF + - RM #191882 multiple non blocking connects + - RM #191428 Multicast, VMA couldn't send data on + Non-offloaded interface when set the outgoing interface + +Version 6.4.4-0: +Date + Time 2013-4-14 +======================= +Added: + - RM #51147 - Added support for IB BC ARP on deafult pkey + - RM #30701 - TCP window scaling + - RM #10449 - Support TCP MSG_PEEK recv flag + - New sockperf 2.5.208 - which can run multiple sockperf MC server to receive an ingress MC stream on single machine + - RM #165464 - implement new flow steering API + - RM #86123 - insert VLAN to rfs + - RM #101288 - tcp closure - many bug fixes - see full list in RM + - RM #51148 - support neigh sends packets from his own QP, also when dst_entry was removed + - Support runtime configuration of 
TCP MSS and LWIP TX buffer pool
+ - Increase TCP_SND_BUF for better throughput
+ - RM #190096 - support ip_mreqn in setsockopt ADD/DROP_MEMBERSHIP and MULTICAST_IF
+ - RM #100700 - Ubuntu installer support
+Fixed:
+ - Disable lwip_buf_pool that is not in use and consumes a lot of memory
+ - RM #185253 - Segfault during MC ETH neigh removal
+
+Version 6.4.3-0:
+Date + Time 2013-2-07
+=======================
+Added:
+ - New sockperf 2.5.200 - new options: '--client_port' and '--client_ip'. Allow loopback tests.
+ - RM #169015: Feature - first step - internal thread progress engine with interrupt coalescing - feasibility study
+Fixed:
+ - RM #168879 - TCP, not-offloaded interface, bind()+connect() - connect failed
+
+Version 6.4.2-0:
+Date + Time 2013-1-28
+=======================
+Added:
+ - Redirection of vfork() to fork()
+Fixed:
+ - RM #131373 - epoll_wait_call:37:epoll_wait_call() epfd 148 not found
+ - RM #130599 - vma_stats -v 4 is not working
+
+
+Version 6.4.1-0:
+Date + Time 2013-1-21
+=======================
+Added:
+ - Added VMA time measuring code (instrumental VMA)
+ - RM #130201: Auto route Eth loopback traffic to/from OS without dedicated libvma.conf rule
+Fixed:
+ - RM #125279: libvma.conf: Multicast tx traffic should go through OS but it is going through VMA
+ - RM #130192: Fixed IB UC loopback
+
+
+Version 6.4.0-0:
+Date + Time 2013-1-3
+=======================
+ - First VMA 6.4 build
+ - Copy of VMA 6.3.28-0
+ - Moved to git: /.autodirect/mswg/git/accl/vma.git/
+
+
+Version 6.3.25-0:
+Date + Time 2012-12-25
+=======================
+Fixed:
+ -RM #30235: Async connect is broken
+ -RM #120561: VMA_ERROR when running on alias interface
+
+
+Version 6.3.24-0:
+Date + Time 2012-12-24
+=======================
+Added:
+ - RM #104427: Added support for host report to igmp query coming from router (IB only)
+Fixed:
+ - RM #111761: ETH unicast. 
No traffic to host behind router over VMA
+ - RM #111816: Iomux: Low polling works as high polling
+ - RM #112805, #111738, #4497: epoll performance degradation
+ - BGATE RM #969: Epoll TCP with EPOLLIN, one shot and edge trigger, triggered only once even if the event is modified by epoll_ctl()
+ - RM #111750 and #112038: IB- PKEY, UC&MC - no traffic while using vma/nonvma server/client
+ - RM #112891: LBM is not working when running with VMA_SPEC=29
+ - RM #112530: Untagged interface receives tagged traffic while it shouldn't.
+
+
+Version 6.3.23-0:
+Date + Time 2012-12-15
+=======================
+Added:
+ - Support for epoll EPOLLOUT ET.
+ Known limitation: In socket UDP, both the OS and VMA will announce an fd as ready to write for the first time.
+ - RM #109300: Add environment variable VMA_TCP_TIMER_RESOLUTION_MSEC, to allow control of TCP timer rate.
+
+Changed:
+ - RM #101701:
+ 1. United VMA params: VMA_RX_POLL_OS_RATIO and VMA_RX_SKIP_OS into one
+ parameter: VMA_RX_UDP_POLL_OS_RATIO
+ 2. VMA_RX_UDP_POLL_OS_RATIO value will determine after how many CQ polls will
+ OS fd be polled. No matter if hit or miss. No matter if socket is blocking or non-blocking.
+
+Fixed:
+ - RM #104860: Fixed TCP socket connect hang in case connection has failed
+ - RM #106790: Fixed VMA_INTERNAL_THREAD_AFFINITY option generates a core dump, when it's set by a hexadecimal value
+ - RM #95010, #95010 : Fixed TCP retransmission mechanism.
+ - RM 101701: non blocking iomux should only poll once
+ - RM #110853: Fixed panic in case of drop_membership without add_membership first
+ - RM #91299: Fixed OS stats. We counted cases of EAGAIN as os_errors instead of os_eagain
+
+
+Version 6.3.22-0:
+Date + Time 2012-11-30
+=======================
+Added:
+ -Enabled fragmentation handling
+ -Added environment variable flag: VMA_SUPPRESS_IGMP_WARNING
+
+Fixed:
+ -RM #101397: Pure virtual function was called in: return_buffers_to_owners, because mem_buf_desc_owner was freed earlier. 
+ + +Version 6.3.21-0: +Date + Time 2012-11-29 +======================= +Added: + -Enabled Loopback support in IB MC. + -RM #101284: Added warning print in case umcast is enabled + -RM #102181: Added support for MSG_DONTWAIT with TCP socket + -New Sockperf - 2.5.193 + Major Changes: + - Bug fix: warmup messages were counted as duplicated/out-of-order messages. + - Bug fix: sockperf flags (-p -m -r) parsing + - Bug fix: negative valid run time of dropped packet + +Fixed: + -RM #101405: Fixed libvma.conf bug: Changed the multicast rule check to mc ip address instead of local interface ip address. + The rules shoudl address the packet's dest ip address. + -RM #94993: Prioritizing OS in non-blocking sockets that were mapped in select/poll/epoll. + This will fix the following bug (example scenario): + 1. Non-blocking UDP socket binds to INADDR_ANY. + 2. Socket is mapped in iomux. + 3. Not-offloaded traffic arrives. iomux notifies that fd is ready + 4. Because the fd is mapped as offloaded, in rx flow, we check the OS once + each 'rx_poll_os_ratio' times, and sampling it once each 'rx_skip_os' times. + Because it's a non-blocking fd, we'll get EAGAIN although iomux already + notified this fd as ready. 
(Very problematic when using EPOLLET) + + -RM #101397:Removed the code that deals with fragmentation and added a warning printout and drop packet in case of a fragmented packet + Added warning printout with packet info in case of drop due to fragmentation + -RM #102178: Fixed UDP MSG_DONTWAIT flag support in VMA + -RM #102182: Fixed non blocking logic in TCP socket + -RM #101874: Fix to perforomnce bug in select and poll that coused by latest commit that fixes functionality bug + +Version 6.3.20-0: +Date + Time 2012-11-18 +======================= +Added: + -RM #100602: Support for LD_BIND_NOW=1 + -RM #100485: Added printout for user to correctly use Rx buffs env param in case not enough available Rx buffs for cq allocation +Removed: + -VMA mechanism of connect timeout that was based on measuring time, and rely only on the mechanism of lwip which count SYN retransmissions + -RM#100754: VMA_IGMP support.Not relevant for this version and when disabled, prevents receiving traffic. + +Fixed: + -RM #97234, RM #101069: Fixed coverity error in vlogger.h + -RM #94996 (RM BGATE #1217): Avoid binding twice in case of calling connect again after previous call failed. + -RM #4547: Partial fix for segfault in accept. 
+
+Version 6.3.19-0:
+Date + Time 2012-11-11
+=======================
+Added:
+ -RM #31007: change ARP related neigh_entry members m_uc_arp_quata, m_wait_till_send_arp_msec to be VMA params
+
+Changed:
+ New sockperf version 2.5.190
+
+ Major Changes:
+ - Bug fix to enable running with feed file containing multiple multicast groups
+ - Bug fix of memory leaks
+ - Added few initializations of errno in order to discard previous value
+ - RM #96487: VPI, Sockperf, MC feed file IB, big drops - MC statistics bug
+
+Fixed:
+ -RM #97098: VPI, UC , TCP+ UDP, sockperf over VMA, msg size > MTU , Client run more than one time --> Traffic stops
+ -RM #97043: Deadlock in code
+ -RM #94487: Wrong VMA warning log when checking interface mode in case of bonding over ib
+ -RM #99187: libvma.conf rules- no matching APPLICATION_ID in libvma.conf caused traffic to go via os
+ -RM #97096, #31008: IB, Segmentation fault during HA event, call for pure virtual function
+ -RM #97408: VMA Error that occurs because of Multicast ARP that shouldn't be sent
+ -RM #79770: Eth, UC- TCP, sockperf over VMA, Big spikes - performance
+ -RM #97226: Coverity bug: Added initialization of members in the CTOR of ib_ctx_handler
+
+Version 6.3.18-0:
+=================
+Date + Time 2012-10-31
+
+Fixed:
+ -RM #97108: Segmentation fault during HA event
+ -RM #97420, #97238: TCP traffic on IB wasn't resumed after HA event
+ -RM #97776 (BGATE #1537) : TCP ACK wasn't resent - causing connection hang
+
+Version 6.3.17-0:
+=================
+Date + Time 2012-10-30
+
+Added: - RM #93517, #96182: Manual route support for IB
+ - RM : Support for UC IB ARP (internal implementation)
+ - QPN and block loopback in ibv_attach_flow() (internal implementation)
+Fixed:
+ - RM #94694: Segmentation fault in select
+ - RM #90795, #31702: sfnt-pingpong app: client get segfault when loading
+ - RM #96824: Fixed debug print of IPoIB MAC address
+ - RM #97101: IB MC traffic - low performance
+
+Version 6.3.16-0: 
+================= +Date + Time 2112-10-28 + +Removed: Disabling MC loopback for IB. MC loopback is not supported both over + IB and over ETH. + +Fixed: + - RM #96292: Segmentation fault when running traffic via bond interface over vlan + - RM #96396: TCP traffic over 10 nonblocked sockets via IB is stacked + - RM #96162, #94455: There is VMA_ERROR and traffic stops after ETH HA event. + - RM #94455, RM#96922 : Fixed IB HA and potential ETH HA bug. + - RM #7880: Rsync basic test get VMA error + - RM #96822: Added ring release in neigh_entry DTOR. + - Changed prefix_dir to /usr/ so that the installation of libvma will always be under /usr/lib64 + - Use same resource_allocation_key (=0) for neigh, dst_entry and sockinf(rfs) + so we don't create more then a single ring per interface (local ip) + - Fixed IPoIB L2 address print + - Internal logic fixes for clean_obj(), that can cause to segmentation fault + - Update README.txt with latest VMA_PARAMETERS + +Version 6.3.15-0: +================= +Date + Time 2112-10-25 + +Added: + - Support for SLES11 SP1 (RM#96188) + +Fixed: + - RM #40904: Now traffic after HA event when running VMA over bond interface over 2 HCAs + - RM #96266 /Bgate #1518: Segmentation fault in timer handler in multiple scenarious + - RM #95007 / Bgate #1479, RM #95006 / Bgate #1493: No traffic and TCP error when using select with mixed TCP sockets over offloaded and non-offloaded interfaces. + fds were mistakingly left out of the fd_set. + - Several internal fixes to qp_mngr and cq_mngr + +Version 6.3.14-0: +================ +Date + Time 2112-10-24 + +Removed: Umcast validation logic, since umcast is not relevant anymore + +Fixed: + - Bgate #1310: Sockperf client hangs when running MC traffic, throughput or latency tests, with 1000 sockets + Now VMA supports 2000 sockets + - RM #93425: Netperf server stuck and gets YO!TIMESUP! + - RM #90701: Traffic over more then 2 TCP sockets after HA event from server side will not resume. 
+ Bug in TCP retransmit mechanism
+ - RM #93293, #93425: The handle_timer_expired function was called after deleting the object.
+ - RM #94263: valgrind: invalid read after free
+ - RM #51113: epoll with more than 1024 sockets crashes with segfault
+ - RM #33909: Sockperf over VMA with mixed feed file, traffic stuck in the middle of the run
+ Bug: Sometimes packet will not be sent because of the wrong lkey
+ -RM #94572: valgrind
+ -RM #93244: When using libvma.conf to redirect listen socket to OS, traffic goes through VMA
+ -RM #95058 (Bgate #1501): Broken TCP because of memory corruption.
+
+
+Version 6.3.13-0:
+================
+Date + Time 2012-10-17
+
+Added:
+ - Enabling IPOIB support by default, compatible with MLNX_OFED_LINUX-1.8.6-0.0.8
+
+Fixed:
+ - Fixing IPOIB support
+
+Version 6.3.12-0:
+================
+Date + Time 2012-10-16
+
+Fixed:
+ - Bgate RM #1235: select() returns more than one socket ready to write even if one socket is monitored by select().
+ - Bgate RM #935 where epoll returns with ready events that weren't added in epoll_ctl
+ - Fix race between new TCP connection on server and quick FIN from client: RM #51103, #89799, #89788.
+ - RM #89714 - can't run vma server over vlan interface.
+ - RM #26237 - receiving segmentation fault when running long command line (over 512 bytes)
+ - RM #33104 - ETH UC TCP - no connection when using nonblocked traffic
+ - RM #89795 - Sockperf over VMA with low polling gives ring VMA error on both sides
+ - RM #89853 - possible invalid free of memory in TCP server
+ - RM #31492 - Add IGMP protocol logging to ring::rx_process_buffer()
+ - RM #30419 - Eth, MC, 2 HCA, sockperf over VMA- client gets VMA ERROR
+
+Internal:
+ - Reduce ERROR and WARN log level of ring in cases we handle it properly
+ - Add netlink periodic cache update with notification for neigh entries to support unreported kernel events when neigh state changes directly from STALE to REACHABLE. 
+ +Version 6.3.11-0: +================ +Date + Time 2112-10-09 + +Added: + - Updated new sockperf 2.5.186 that fixes redmine issue #34180 + +Fixed: + - Disabled IPOIB by default + - Completed fix for redmine issue #51103 - TCP - 1 server 2 clients recvfrom, server gives core dump + +Version 6.3.10-0: +================ +Date + Time 2112-10-04 + +Fixed: + -Fixed TCP ETH HA + -Fixed HA over more then one NIC + -Fix for redmine issue #85559 - sfnt-pingpong failed when running the client first + -Fix for redmine issue #51103 - TCP - 1 server 2 clients recvfrom, server gives core dump. + +Version 6.3.9-0: +================ +Date + Time 2112-09-27 + +Added: + -Updated new sockperf 2.5.185 that fixes + bug: #34180: Eth, Sockperf over VMA, UL muti-thread, 5 MC&5 TCP sockets, Close server before client gives + Segmentation fault + +Fixed: + -IB wasn't enabled be default + +Version 6.3.8-0 +================ +Date + Time: 2012-09-27 +Added: + - Support for IB. +Fixes: + - Bug #30038: Eth, TCP, Sockperf over VMA with libvma.conf: server over VMA and client over OS, client failes to connect + The Fix: Changed server to skip checking transport rules for accepted sockets + - Bugs: #30057, #33911: VMA_ERROR on application exit "Error when blocking for next tx buffer" + - Fix for Tx buffer cleanup logic + - Bugs #31491, #29983: VMA ERROR "setsockopt(DROP_MEMBERSHIP) failed (errno=99 Cannot assign requested address)" when running VMA + with VMA_IGMP=0 (IGMP disabled) + + +Version 6.3.7-0 +=============== +Date + Time: 2012-09-23 +Added: + - Added support for MSG_WAITALL flag of tcp recvfrom(). 
+ Bug fix #31708: sfnt-pingpong failed for message sizes larger then MTU + - Update to sockperf_2.5.184 + +Fixed: + - Bug #65022: Fix iomux TCP support + - Bug Fix: (redmine #78428, bgate: #1320) : Eth, UC, TCP, sockperf over VMA, server stuck after client's second run + - Changing dst_entry prepare_to_send logic to support lockless notify_cb() + - Changed fd_collection lock to be recursive do to a potential deadlock on + The Fix: handling fd_collection timer on fd close + - Reduce the VMA license + details log output on default run (for RPM + releases) + - Bug #85398: Eth, MC, sockperf TP, nonblocked, Client gets Segmentation fault + - Bug #33104: ETH UC TCP- no connection when using nonblocked traffic (happened with more than 1 socket). + The Fix: Non blocking socket should return with error and set errno when out of buffer space. + - Bgate Redmine bug #1239 where recvfrom with MSG_DONTWAIT for not offloaded sockets failed + +================ +Date + Time: 2012-09-13 +Added: + - Check for bonding mode. If the mode is + not active-backup or failover_mac!=1, VMA prints warning + - Added support for contiguous pages. + Changed memory allocation logic: + + VMA_MEM_ALLOC_TYPE will now replace VMA_HUGETBL: + 0 - "ANON" -using malloc. + 1 - "CONTIG" -using contiguous pages. + 2 - "HUGEPAGE" -using huge pages. 
+ The default is 1
+
+ - Increase number of CQ in vma_stats to 8 (used to be 2)
+ Now we can monitor 8 rings (which is 2 ports x 4 NICs) without a warning
+
+Removed:
+ - Disable ip frag periodic timer until HW/SW supports ip frag offload
+ in (Rx path)
+Fixed:
+ - Fixed segmentation fault in ring during HA event
+ - Bgate Redmine bug #1239: recvfrom with MSG_DONTWAIT for not offloaded sockets failed
+ - Bug #47527: Deadlock between dst_entry and route_entry
+ - Fixed bug in iomux - this caused a slight performance degradation, Ported from VMA_6.1
+ - Fixed several iperf bugs: #33905
+ - Fixed global buffer pool thread safe usage
+ - Bgate Redmine bug #1251: Multicast: sendto() through specific interface returns number of sent data
+ but actually it doesn't send data
+ - Bug #29996: VMA WARNING: "Buffer CQ owner not found"
+ - 2~3% PPS improvement in small msg sizes
+ - Bug #51073: Error in case of IPv6 interface
+ - Fixing lock and resource free issues in dst_entry
+ - Bug: #34427: Netperf Server gets segmentation fault
+ - Fixed a lot of valgrind issues that cause to glibc crash
+ - Bgate Redmine bug #1305: TCP over non offloaded interface doesn't work
+ - Removed several locks in dest_entry to prevent deadlocks
+
+Version 6.3.5-0
+=================
+Date + Time: 2012-09-06
+Fixed:
+ - Improved send performance by fixing double_lock occurrence in tx flow (ring tx_lock and buffer_pool_tx_lock)
+ - Improved TCP throughput performance
+ - Redmine bug #33866: Eth, Netperf, Server gets VMA WARNING and there is no traffic
+ - Redmine (bgate) bug #1286: Fixed vma panic when opening more than 32 sockets on the same IP+PORT
+ - bug #31281: No traffic over VLAN-tagged interface
+ - Fixed HA bugs that fix the following Redmine bugs:
+ #31422: HA event on client side during UDP traffic - the traffic is not coming back
+ #31421: HA event on server side during UDP traffic - the traffic is going through OS
+
+
+Version 6.3.4-0
+=================
+Date + Time: 2012-09-03
+Changes: 
+ - Added check for port status, if port (IB) status is not active - net_device_val_ib object will not be created + - Fixed bug when handling IPv6 address in sockinfo::getsockname(), this is part of fix to bug #30632 (netserver side) + - In case of non offloaded device- create_new_entry for net_device returns NULL + - Removed handling interface down/up events + - Fixed a bug in tx-buffers release when there are still unsent + packets in the qp + - Fixed deadlock in ring by changing func all to direct cq funcs + - Fixed access to resolve_ring() calls from dst_entry + - Fixed vargrind warnings about uninitialized values, mismatched new [] / delete + - Unregister sockinfo_tcp timer at destruction + - Added lock on send function + - Fixed net_device_entry registration to event_handler_manager to enable multiple events + - Added ring locks around tx buffers get and release to prevent a missing buffer while draining buffers on ring->restart + - Added a direct call to poll_and_process_element through the cq + - Added inline flag to the dummy packet in the qp triger_completion_for_all_sent_packets() + - Fixed coverity errors + - Make p_buffer and sz_buffer constant in mem_buf_desc_t, and add a constructor to initialize them + - Move lwip_pbuf initialization from sockinfo_tcp to ring.cpp, avoid changing sz_data because we don't change p_buffer any more + - Fixed bug in ring that accurs during local bonding event + - Remove Rx CQ size configuration (Default is now like in Tx which is equal to size of QP). + In new design we have CQ (Rx + Tx) per single QP inside the ring. There is no point having a CQ larger then the QP (RQ or SQ) + - Remove qp_mgr locks. All locks are in ring. + - Func set_default_lkey(ib_ctx_handler* p_ib_ctx_h) was added to + buffer pool. The purpose is to allow change of lkey by ring in case of local bonding event. 
+ - Fixed dead lock in dest-entry - removed lock in route_entry + - Added support for tx buffers in case of local bonding event between different devices + - Fixed L2_address::compare() function + - Fixed bug in UC traffic in case of remote bonging event + - Fixed broken tcp flow because of wrong lwip_buffer_tx_lkey + - Force TCP data out before going to wait(). Fixes #30632 + - Bind can be called twice since we silently call bind on connect. Fixes: #30632 + - Fixed deadlock in local bonding event in tcp + - Added ethertype fiels assignment in ibv_flow_spec struct, before call to ibv_attach_flow() + +Version 6.3.3-0 +================== +Date + Time: 2012-08-21 +Added: + - Added verification logic that IPoIB is running in datagram mode on all IB interfaces + - Moved TX buffers management from qp_mgr to ring + - Added support to interface up/down events +Fixed: + - Fix bug #25890. + - fix bug #30140. + - fix for bugs: # 30110, 30121 + - Fix IB MC send + - Fix race conditions in dst_entry/neigh send logic + - fix: #30608, #30237 + + +Version 6.3.1-0 +================== +Date + Time: 2012-08-02 +Added: + - First 6.3 version to QA and verification + + +Version 6.1.8-0 +================== +Date + Time: 2012-05-14 +Added: + - journal.txt added to VMA rpm + - Added new parameter VMA_MAX_INLINE (replacing VMA_TX_SGE) it will be + used to set max message size that can be send using inline. + Default 224 bytes + - Added new parameter VMA_BF it enables / disables Blue Flame usage of + the ConnectX. The default is on + - Added support for REUSEADDR in TCP socket + - Added support for push flag in last packet of TCP segment. 
(When not + doing so can cause to delays when working with systems that relay on this) + +Fixed: + - Fixed epoll behavior in case we got FIN on one of the TCP sockets + - Fixes to TCP socket close logic + - Fixed bug: If we accept connection with OS, this doesn't mean we won't accept + the next connection with VMA + - Fixed hang when SO_RCVTIMEO was used :Added a missing taking lock action when SO_RCVTIMEO timed out. + - Fixed: ibcm getting deleted while having a rcv sink (TCP only). + Added two protections: + 1. Deleting ibcm from remove list when adding a sink. + 2. Confirming there are no sinks just before deleting the ibcm. + - Fixed vma_stats prints: Need to print RX statistics in case that sockinfo queue + has packets but there was no recv on this socket +Changed: + - VMA will be compiled with debug symbols, by default + - New sockperf version + - Removed rx (VMA_RX_SGE) and tx sge (VMA_TX_SGE) user parameters, rx_sge will be set to 1 and + can't be changed by user any more + - Changed default of VMA_RX_BYTE_MIN_LIMIT from 2000000 to 64K + +Version 6.1.7-0 +================== +Date + Time: 2012-04-16 +Added: + - Add support for TCP ioctl FIONBIO. + - Add Minshall algorithm to tcp_out. + - Support udp port mapping - redirecting UDP traffic to other sockets + by round-robin - special setsockopt. + - HW handles tx checksum for unfragmented traffic. +Fixed: + - fixed alias devices handling. + - Thread wakeup cleanup and fixes. + - Remove socket from fd collection when it fallbacks to OS(server side) + - FIX: vma uses select with more then 1024 fd's, change to poll. + - Set lwip mss according to net device mtu. + - Remove unneeded locking from iomux. 
+ + +Version 6.1.6-0 +================== +Date + Time: 2012-03-18 +Added: - Support setting a different cpuset for vma internal thread + +Fixed: - TCP bind that was broken in a previous build + - Fix missing lock in stat_publisher for select and poll creation + +Version 6.1.5-0 +================== +Date + Time: 2012-03-15 +Added: - Support for loopback (still will not be offloaded but + redirected to the OS) + - Support SO_RCVTIMEO + - Add support for QP per application thread (VMA_QP_LOGIC=2) + This will work good when combined with VMA_CQ_LOGIC=2 + Only relevant for application that have dedicated thread that + handling specific set of sockets + - Option to run without VMA internal thread affinity + (VMA_INTERNAL_THREAD_AFFINITY -1) + - Backlogging UDP packets when connection is not ready , instead + of dropping them +Fixes: - Minor fixes in statistics + - Setsockopt should fail with bad address in case of NULL optval + - Segfault when calling setsockopt on non connected socket + - Several segmentation faults with multithreaded applications + - Bug in epoll - number of return fds sometimes was bigger then + the actual number of readt fds + - epoll statistics (Weren't initialized right therefore + presented garbage info) + - Fix segmentation fault when run application with cset by + adding fall back mode if CPU Thread affinity value is not in the + cpu-set of the process + - Blocking TCP socket should return only when sent all data + - Alias route resolution + + +Version 6.1.4-0 +================ +Date + Time: 2012-02-16 +Added: - Async Connect/Listen support + - Support for TCP listen() on both offloaded and not-offloaded interfaces. +Fixes: - Fixed broken M/C support +Changes:- Removed VMA_TX_IP_CHECKSUM logic + + +Version 6.1.3-0 +================ +Date + Time: 2012-02-07 +Added: + - IPoIB support + - TCP linger timeout +Fixes: + - Fixed bug occured when VMA was loaded with dlopen (instead of LD_PRELOAD) + - Fixed hang on exit. 
The issue was related to TCP cleanup. + - Optimization of UDP TX flow +Changes: + - Optimization of vma_stats mechanism to reduce outliers. + + +Version: 6.1.2-0 +================ +Date + Time: 2012-01-30 +Fixes: + - Fixed VMA_RX_POLL=-1 behavior for non-blocking sockets - it should not loop forever + - Added rules check in register_as_uc_receiver() when binding to INADDR_ANY. (#4085) + - If the first call to listen succeeded then the second call should succeed as well + - Fixed: If libvma.conf didn't exist nothing was offloaded. In case that libvma.conf doesn't exist target_family = TRANS_VMA). Means that everything will be offloaded (#5000) + - Fixed UDP connect. In case that UDP connect has been called by the app, VMA should search at 5 tuple level first. + + +Changes: + - Added support for vma_stats history. When setting VMA_STATS_FILE flag, + VMA will dump each socket's statistics into a file just before closing that socket + - VMA handles L3, LWIP only does TCP - this allows VMA to support + Multiple Default Gateways + - Removed igmp logic except for warning message regarding igmpV2 over IB - + this is not needed with ConnectX3 Flow Steering + - Added limited support for SO_RCVTIMEO socket option + - Added support for async connect/listen + - Added Internal Thread Affinity support + - Added support for RDMA_CM_EVENT_ADDR_CHANGE event in UC conn_mgr - + Support for bonding Active / Passive (no mac - default mode) in Ethernet. 
+ - Updated sockperf to 2.5.156 + +Version: 6.0.7-0 +================= +Date + Time: 2011-12-19 +Fixes: + - Segmentation fault when running multy threaded server with iomux (#4607) + - Fixed several bugs in VMA conf file (libvma.conf) (#4087, #4611) +Changes: + - Update sockperf version to 2.5.149 + +Version: 6.0.6-0 +================= +Date + Time: 2011-12-15 +Fixes: + - Resolves number of multithreaded issues + - Issues with libvma.conf file parsing + - Add support of large TCP send (bigger than 64k) + - Fix failure related to async connect (handled by os) + - Drop packet in case of buffer shortage in TX flow +Changes: + - Add 32 bit support + - Increase the default value of VMA_TX_WRE parameter to 16000 + - Removed 'udp_lat' and 'tcp_lat' from rpm installation + - Update sockperf version to 2.5.148 + +Version: 6.0.5-0 +================= +Date + Time: 2011-12-08 +Fixes: + - Removed VMA panic raised in accept_clone() if parent's socket local address is NULL. + - BugFix #3720 - set epoll ready events of a newly added fd. + - BugFixes #(4468, 4225,3419) - seg fault during vma process exit (main_destroy()). + - Ported the recmmsg/recmsg fix from 4.5. + +Changes: + + - Added support for SUSE (Sles) operating system. + - Increased RX_POLL and SELECT_POLL values to 100000. + - Moved from MAC validation license mechanism to Customer details logging. + - Added statistics details per epfd on vma_stats. + - Reversed umcast logic. Removed preload scripts. + - Updated sockperf version to 2.5.146. + - Added support for different file systems (xfs, reiserfs, etc ..) when reading license file. + (Also fixes #4276) + - Added tcp socket to server test. + - Enhanced lock statistics: add (optional) name to each lock, and measure average lock wait time. 
+ +Version: 6.0.4-0 +================= +Date + Time: 2011-11-28 +Fixes: + - QP attach/detach error flow - drop lock on error + - Removed empty VMA_WARNING in case there is a connection error + - Compilation error of VMA on SUSE 11 (#3807) + - Compilation error of sockperf on SUSE 11 and RH5 ( + - Application hang in case that connect has failed (#4007) + - ERROR log replaced with DBG log if UDP socket is ruled to use OS instead of VMA (#4052) + - Removed unnecessary unlock /lock to p_cq_mgr lock in vma_outpout() + This fixes segfault in several case when stopping application with (#3999) + - Moved several internal errors prints to debug + - Fixed application hang in iomuxes (relevant to all) in case it got FIN from the other end (#3721) + - Fixed vma_recvfom_zcopy documentation (#612) + - Return ENOTCONN instead of ECONNRESET, on attemp to write on a socket which is still is not connected (#4029) + - listen without bind / bind to port 0 caused to appl hang (#3996) + - getsockname and getpeername functions (#4199) + +Changes: + + - Added support for 32bit + - Replace clock_gettime by rdtsc in internal VMA code this improves + spikes in latency + - Added error message when flow steering option is disabled + - VMA license modification process - phase 1: + Disabling mac validation in rpm installation + - Added implemenation of __read_chk() which is required for netcat support (#4066) + - Added implemenation of __recv_chk() and __recvfrom_chk() + - More scalable select and poll (#3327) + + +Version: 6.0.3-0 +================= +Date + Time: 2011-11-09 +Fixes: + - Spelling mistake in libvma.conf. + - Tcp socket close hangs since it fails to flush the unset queue on exit. + - ibv_post_receive() returns with EAGAIN. + - An attemp to establsih a connection over not supported (not + ConnectX) interface failed. Added mechanism which fallbacks the connection to OS + if VMA fails to handlle it by itself. 
+ - Multi threaded issues that was reproduced with iperf tcp multi + threaded : hangs, errors, segmenation faults...: + a. Change rt_table_handler to be thread safe + b. Change the state of the ibcm to UC_ST_TX_READY and only then empty the unsent queue + c. Put under lwip lock new pcb creation + d. Add wakeup() to accept and conect cb + - Segmentation fault which occurs when application is closed + while it receives traffic + -VMA ERROR when running application over multiple interfaces / vlan + interface + -Deadlock that was reproduced by running sockperf pp test with 50 sockets + By remove unlock and lock of lwip lock in vma_output() + +Changes: + - Incsreased timeout of rdma_cm to 3500 to allow kernel + retransmission mechanism to be carried out before rdma_cm raises an error + event, for example to resend an ARP query + +Version: 6.0.2-0 +================= +Date + Time: 2011-10-27 +Changes: + - Added: progress engine logic, which allows VMA to progress the stack + even when no context is provided by the application. + - BugFix: Fixed segfault occured on epoll notification attemp + done for a closed socket. + - BugFix: Fixed segmentation fault occured in infinite polling + mode. + - BugFix: When VMA fails to read the configuration file (libvma.conf) + it just prints a warning message instead of aborting. + + +================= +Date + Time: 2011-10-18 +Changes: + - Added Poll support. + - Added Vlan and alias support. + - Added new Epoll infrastructure: + 1. Edge triggering support. + 2. One shot support. + 3. Better scalability for larger numbers of file descriptors. + 4. Fairness between registered sockets when receiving events. + 5. Epoll_create1() support. + - Added OS Eagain support to vma_stats. + - Added support for setsockopt SO_RCVBUF. + - Added ttl support in TX according to user request. + - Added possibility to change qp lock (spin,mutex) with env flag. + - Disabled CQ drain thread by default. + - Changed epoll_create() size to match userspace max. 
+ - Changed wakeup mechanism for TCP. + - Updated sockperf 2.5.137. + - BugFix: Fixed bug when enlarging message size using INLINE. + - BugFix: Fixed segfault caused by calling epoll (NULL,NULL,NULL). + - BugFix: Fixed bug when posting a SR without any s/g to a RAW QP. + - BugFix: Close stderr on child process. + - BugFix: Fixed segfault in select when readfds = NULL. + - BugFix: VMA warning regarding IGMP version of aliased/Vlan interface. + +Version: 6.0.0-0 +================= +Date + Time: 2011-09-19 +Added: + - First VMA version which supports the new flow steering over CX3. + - Alias interface support. + + +Version: 5.0.9-0 +================= +Date + Time: 2011-06-19 +Changes: + - Added: a new environment variable - VMA_CLOSE_ON_DUP2. + This will treat all of the dup2() calls as close calls, internally. + This ONLY deals with the case of using dup2() for closing files. + Enabled by default. + - BugFix: Fixed issue which caused VMA to drop in some scenarios the + ARP packets. + - BugFix: When VMA is interrupted on accept() call instead of raising + an abort it sets the errno to EINTR and returns with -1. + - BugFix: Fixed segfault occured on TCP listen socket close(). + +Version: 5.0.8-0 +================= +Date + Time: 2011-06-05 +Changes: + - Added: Throughput enhacment. + - Changed: sockperf version. The version was upgraded from 2.5.30 + to 2.5.58- this new version resolves the TCP partial send bug. + - BugFix: Fixed core dump which occured on lwip's buffers allocation + failure. + - BugFix: Before closing the socket VMA sends all the unsent segments, + this mechanism allows VMA to send the FIN and to close the connection + properly. + - BugFix: Fixed bug in CQ statistics managment. + +Version: 5.0.7-0 +================= +Date + Time: 2011-05-31 +Changes: + - Added: New benchmark utility for measuring performance- sockperf, + which will be installed as part of the VMA package. 
+ - Added: New VMA environmen parameter: VMA_STATS_FD_NUM- which allows + to modify the max number of file descriptors which are monitored by + VMA statistics. + - BugFix: Fixed bug in sendmsg() socket call. + - BugFix: Fixed failure which has occured when an applciation called to listen + socket call multiple times, from the same socket. + - Changed: Default multicast TTL fomr 32 to 64. + +Version: 5.0.6-0 +================= +Date + Time: 2011-04-14 +Changes: + - BugFix: [#107131, #107080, #106197] We have enlarged the VMA buffers + pools in order to avoid core dumps in throughput tests with multiple + sockets. + - BugFix: [#106593] Enabled periodic CQ drain mechanism in VMA which + wake ups each 10 msecs and polls the CQ- this allows VMA to progress + the stack even when applciation doesn't call to recive. + - BugFix: Setting the port to INPORT_ANY while calling to rdma_bind + has resolved the failure of calling to rdma_bind multiple times. + - BugFix: Fixed issue which caused to segmentation fault on application + attemp to monitor more than 50 sockets by means of epoll mechanism. + - BugFix: Fixed bug related to VMA internal hash table cache + managment. + - Added: Zero copy API support for UDP socket. + + +Version: 5.0.5-0 +================= +Date + Time: 2011-04-05 +Version: 5.0.5-0 + - BugFix: [#106055] Fixed issue that caused to, application which + listens in the same time to kernel's and VMA virtual interfaces, + to hang. + - BugFix: Fix of the epoll event mask validation, which VMA performs + in epoll_ctl(EPOLL_CTL_MOD) function. + - BugFix: [#105923] Fixed UDP connect flow. + - BugFix: [#104678, 104511] Fix in the the mechanism which queries the + interface bonding configuration. + - BugFix: [#106027, 105664] Forked VMA image havent inherited the + LD_PRELOAD environment parameter of its parent, because it was + unsetted and never reconstructed in parent's VMA constructor. + VMA was modified to restore the LD_PRELOAD environment parameter. 
+ - BugFix: [#104688] Fix of VMA socket's multicast ingress/outgress intreface + update done by means of socket calls (setsockopts). + - BugFix: [#1046107] VMA was modified to put the VMA interface IP + and MAC in transmitted, over its virtual inerface, multicast packet, + instead of IP and MAC of the "base" interface. + + +================= +Date + Time: 2011-03-27 +Changes: + - Added: Support of UDP offloading policy configuration, in the + libvma.comf file (this is relevant both for unicast & + multicast). + - Added: Initial VMA TCP statistic managment. + - Changes: VMA_UC_OFFLOAD environment parameter was removed. VMA will + offload the UDP unicast traffic by default. VMA offload policy + can be configured now by means of VMA configuration file. + - Changes: Merge of various commits from VMA 4.5 branch. + - BugFix: [#105612, 104750] Call to select/epoll has caused to segmentation + fault.Correction of, the sanity check in select flow and the initialization + of the VMA listen socket on listen call, has resolved the issue. + - BugFix: [#105371] Bind of UDP socket to local VMA interface was fixed. + - BugFix: [#104682] Fix of the rx multicast statistic counters managment. + +Version: 5.0.3-1 +================= +Date + Time: 2011-03-20 +Changes: + - BugFix: [Bug #104540] Various error/warning message printed by vma on + startup/exit were removed. + - Changed: Documentation in libvma.conf was adjusted to the new + configure file format. + +Version: 5.0.3-0 +================= +Date + Time: 2011-03-16 +Changes: + - Added: Full UDP unicast support- both, the transmit and the receive path, are + offloaded by VMA. + - Added: Support of asyncronous TCP connect. + - Added: Support of TCP connect on already bounded TCP socket. + - Added: Default gateway support in TCP and UC UDP flows. + - Added: Epoll support in TCP. + - Added: VMA rpm installs the libvma.conf file in /etc/. + - Added: Environment parameter- VMA_CONFIG_FILE, which allows + migration of the libvma.conf. 
- Changed: Enhancement of the VMA buffer management mechanism.
$(top_builddir)/src/vlogger/libvlogger.la diff --git a/src/state_machine/main.cpp b/src/state_machine/main.cpp new file mode 100644 index 0000000..eaf7228 --- /dev/null +++ b/src/state_machine/main.cpp @@ -0,0 +1,325 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include +#include +#include "sm.h" +#include "sm_fifo.h" +#include + +#define MODULE_NAME "SM_TEST: " + +#define NOT_IN_USE(a) ((void)(a)) + +/* SM example */ + +typedef enum { + SM_ST_A = 0, + SM_ST_B, + SM_ST_C, + SM_ST_LAST +} sm_state_e; + + + +typedef enum { + SM_EV_1 = 0, + SM_EV_2, + SM_EV_3, + SM_EV_4, + SM_EV_LAST +} sm_event_e; + +// Debug functions definitions +const char *state_num_to_str_func(int state); +const char* event_num_to_str_func(int event); +void print_event_info(int state, int event, void* app_hndl); + +void sm_st_entry(const sm_info_t& info); +void sm_st_leave(const sm_info_t& info); +void sm_st_A_ev_1(const sm_info_t& info); +void sm_st_A_ev_2(const sm_info_t& info); +void sm_st_A_ev_3(const sm_info_t& info); +void sm_st_B_ev_1(const sm_info_t& info); +void sm_st_B_ev_2(const sm_info_t& info); +void sm_st_B_ev_3(const sm_info_t& info); +void sm_st_C_ev_1(const sm_info_t& info); +void sm_st_C_ev_2(const sm_info_t& info); +void sm_st_C_ev_3(const sm_info_t& info); + +void sm_default_trans_func(const sm_info_t& info); + + +//// The short table +sm_short_table_line_t sm_short_table[] = { +// {curr state, event, next state, action func } + { SM_ST_A, SM_STATE_ENTRY, SM_NO_ST, sm_st_entry}, + { SM_ST_A, SM_EV_1, SM_ST_A, sm_st_A_ev_1}, + { SM_ST_A, SM_EV_2, SM_ST_B,sm_st_A_ev_2}, + { SM_ST_A, SM_EV_3, SM_ST_C, sm_st_A_ev_3}, + { SM_ST_A, SM_STATE_LEAVE, SM_NO_ST, sm_st_leave}, + + { SM_ST_B, SM_STATE_ENTRY, SM_NO_ST, sm_st_entry}, + { SM_ST_B, SM_EV_1, SM_ST_B, sm_st_B_ev_1}, + { SM_ST_B, SM_EV_2, SM_ST_C, sm_st_B_ev_2}, + { SM_ST_B, SM_EV_3, SM_ST_A, sm_st_B_ev_3}, + { SM_ST_B, SM_STATE_LEAVE, SM_NO_ST, sm_st_leave}, + + { SM_ST_C, SM_STATE_ENTRY, SM_NO_ST, sm_st_entry}, + { SM_ST_C, SM_EV_1, SM_ST_C, sm_st_C_ev_1}, + { SM_ST_C, SM_EV_2, SM_ST_A, sm_st_C_ev_2}, + { SM_ST_C, SM_EV_3, SM_ST_B, sm_st_C_ev_3}, + { SM_ST_C, SM_STATE_LEAVE, SM_NO_ST, sm_st_leave}, + + SM_TABLE_END +}; + +#if 0 + +typedef struct { + int event; + 
char* name; +} test_entry; + +void fifo_test() +{ + sm_fifo my_fifo; + int i=0; + fifo_entry_t ret; + + test_entry arr_num[] = { + {1, "one"}, + {2, "two"}, + {3, "three"}, + {4, "four"}, + {5, "five"}, + {6, "six"}, + {7, "seven"}, + {8, "eight"}, + {9, "nine"}, + {10,"ten"} + }; + + + vlog_printf(VLOG_INFO, "fifo test\n"); + + while (i<10) { + my_fifo.push_back(arr_num[i].event, (void *) arr_num[i].name ); + vlog_printf(VLOG_ERROR, "element %d was inserted\n", arr_num[i]); + my_fifo.debug_print_fifo(); + ret = my_fifo.get_front(); + vlog_printf(VLOG_ERROR, "element %d was removed (%s)\n", ret.event, ret.ev_data); + my_fifo.debug_print_fifo(); + i++; + } + /*while (i>0) { + ret = my_fifo.get_element(); + vlog_printf(VLOG_ERROR, "element %d was removeded\n", ret); + my_fifo.debug_print_fifo(); + i--; + }*/ +} + +#endif + + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + + +state_machine* g_sm; + +int main(int argc, char *argv[]) +{ + vlog_levels_t log_level = VLOG_DETAILS; + + if (argc > 1) { + log_level = log_level::from_str(argv[1], VLOG_INIT); + if (log_level == VLOG_INIT ) { + printf("illegal log level %s\n", argv[1]); + return -1; + } + } + vlog_start("SM_TEST", log_level, NULL, 0); + //fifo_test(); + + g_sm = new state_machine(NULL, + SM_ST_A, + SM_ST_LAST, + SM_EV_LAST, + sm_short_table, + sm_default_trans_func, + NULL, + NULL, + print_event_info); + + g_sm->process_event(SM_EV_2,(void *)"event 2"); + + delete g_sm; +} + + + +//// Debug functions definitions +const char* state_num_to_str_func(int state) +{ + switch (state) { + case SM_ST_A: + return "SM_ST_A"; + case SM_ST_B: + return "SM_ST_B"; + case SM_ST_C: + return "SM_ST_C"; + default: + return "Undefined"; + } + +} + +const char* event_num_to_str_func(int event) +{ + switch (event) { + case SM_EV_1: + return "SM_EV_1"; + case SM_EV_2: + return "SM_EV_2"; + case SM_EV_3: + return "SM_EV_3"; + case SM_EV_4: + return "SM_EV_4"; + default: + return "Undefined"; + } +} + +void 
print_event_info(int state, int event, void* app_hndl) +{ + NOT_IN_USE(app_hndl); + printf(MODULE_NAME "Got event %s (%d) in state %s (%d)\n", + event_num_to_str_func(event), event, state_num_to_str_func(state), state); +} + +//////////////////////////////////////// +// SM Entry Function +void sm_st_entry(const sm_info_t& info) +{ + printf(MODULE_NAME "State changed %s (%d) => %s (%d)\n", + state_num_to_str_func(info.old_state), info.old_state, + state_num_to_str_func(info.new_state), info.new_state); +} + +//////////////////////////////////////// +// SM Leave Function +void sm_st_leave(const sm_info_t& info) +{ + printf(MODULE_NAME "State changing %s (%d) => %s (%d)\n", + state_num_to_str_func(info.old_state), info.old_state, + state_num_to_str_func(info.new_state), info.new_state); +} + +//////////////////////////////////////// +// SM Transition Functions +void sm_default_trans_func(const sm_info_t& info) +{ + printf(MODULE_NAME "Default Transition: Handle event %s (%d) in state %s (%d)\n", + event_num_to_str_func(info.event), info.event, + state_num_to_str_func(info.old_state), info.old_state); + if (info.new_state != SM_ST_STAY) { + printf(MODULE_NAME "Default Transition: Moving to state %s (%d)\n", state_num_to_str_func(info.new_state), info.new_state); + + } +} + +void sm_st_A_ev_1(const sm_info_t& info) +{ + printf(MODULE_NAME "Got event %s in state A\n", (char*)info.ev_data); +} + +void sm_st_A_ev_2(const sm_info_t& info) +{ + printf(MODULE_NAME "Got event %s in state A\n", (char*)info.ev_data); + g_sm->process_event(SM_EV_4, (void*)"event 4"); + g_sm->process_event(SM_EV_1, (void*)"event 1"); + g_sm->process_event(SM_EV_2, (void*)"event 2"); + g_sm->process_event(SM_EV_3, (void*)"event 3"); + g_sm->process_event(SM_EV_4, (void*)"event 4"); + //g_sm->m_sm_fifo.debug_print_fifo(); +} + +void sm_st_A_ev_3(const sm_info_t& info) +{ + printf(MODULE_NAME "Got event %s\n", (char*)info.ev_data); +} + +void sm_st_B_ev_1(const sm_info_t& info) +{ + NOT_IN_USE(info); 
+ printf(MODULE_NAME "Got event %s\n", event_num_to_str_func(SM_EV_1)); +} + +void sm_st_B_ev_2(const sm_info_t& info) +{ + printf(MODULE_NAME "Got event %s\n", (char*)info.ev_data); + g_sm->process_event(SM_EV_1, (void*)"event 1"); +} + +void sm_st_B_ev_3(const sm_info_t& info) +{ + NOT_IN_USE(info); + printf(MODULE_NAME "Got event %s\n", event_num_to_str_func(SM_EV_3)); +} + +void sm_st_C_ev_1(const sm_info_t& info) +{ + NOT_IN_USE(info); + printf(MODULE_NAME "Got event %s\n", event_num_to_str_func(SM_EV_1)); +} + +void sm_st_C_ev_2(const sm_info_t& info) +{ + NOT_IN_USE(info); + printf(MODULE_NAME "Got event %s\n", event_num_to_str_func(SM_EV_2)); + g_sm->process_event(SM_EV_4, (void*)"event 4"); +} + +void sm_st_C_ev_3(const sm_info_t& info) +{ + NOT_IN_USE(info); + printf(MODULE_NAME "Got event %s\n", event_num_to_str_func(SM_EV_3)); +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif diff --git a/src/state_machine/sm.cpp b/src/state_machine/sm.cpp new file mode 100644 index 0000000..da2520c --- /dev/null +++ b/src/state_machine/sm.cpp @@ -0,0 +1,281 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "sm.h" + +#include +#include +#include "vlogger/vlogger.h" +#include "utils/bullseye.h" +#include "sm_fifo.h" + +#undef MODULE_NAME +#define MODULE_NAME "sm" + +#define sm_logpanic __log_info_panic +#define sm_logerr __log_info_err +#define sm_logdbg __log_info_dbg +#define sm_logfunc __log_info_func + +#define SM_ASSERT_POINTER(__ptr) { if (__ptr == NULL) sm_logpanic("problem with memory allocation"); } + +state_machine::state_machine(void* app_hndl, + int start_state, + int max_states, + int max_events, + sm_short_table_line_t* short_table, + sm_action_cb_t default_entry_func, + sm_action_cb_t default_leave_func, + sm_action_cb_t default_trans_func, + sm_new_event_notify_cb_t new_event_notify_func + ) : + m_max_states(max_states), m_max_events(max_events), + m_new_event_notify_func(new_event_notify_func), + m_b_is_in_process(false) +{ +BULLSEYE_EXCLUDE_BLOCK_START + if (start_state < 0 || start_state >= m_max_states) + sm_logpanic("SM start state out of range for app_hndl %p (min=%d, max=%d, start=%d)", app_hndl, 0, m_max_states, start_state); +BULLSEYE_EXCLUDE_BLOCK_END + + m_info.old_state = start_state; + m_info.new_state = -1; + m_info.event = -1; + m_info.ev_data = NULL; + m_info.app_hndl = app_hndl; + + m_sm_fifo = new sm_fifo; + SM_ASSERT_POINTER(m_sm_fifo); + + int ret = process_sparse_table(short_table, default_entry_func, default_leave_func, default_trans_func); +BULLSEYE_EXCLUDE_BLOCK_START + if (ret) { + // TODO - check status + } 
+BULLSEYE_EXCLUDE_BLOCK_END +} + +state_machine::~state_machine() +{ + for (int st=0; st= m_max_states) { + sm_logerr("ERROR on line [%d]: STATE bad value!! St[%d], Ev[%d] (nextSt[%d], action func[%p])", + line+1, st, ev, next_state, action_func); + return ERROR; + } +BULLSEYE_EXCLUDE_BLOCK_END + switch (ev) { + case SM_STATE_ENTRY: + sm_logfunc("line %d: St[%d], Ev[ENTRY] (action func[%p])", line+1, st, action_func); + m_p_sm_table[ st ].entry_func = action_func; + break; + + case SM_STATE_LEAVE: + sm_logfunc("line %d: St[%d], Ev[LEAVE] (action func[%p])", line+1, st, action_func); + m_p_sm_table[ st ].leave_func = action_func; + break; + + default: + { + sm_logfunc("line %d: St[%d], Ev[%d] (nextSt[%d], action func[%p])", line+1, st, ev, next_state, action_func); +BULLSEYE_EXCLUDE_BLOCK_START + if (ev < 0 || ev >= m_max_events) { + sm_logerr("ERROR on line [%d]: EVENT bad value!! St[%d], Ev[%d] (nextSt[%d], action func[%p])", + line+1, st, ev, next_state, action_func); + return ERROR; + } + + if (next_state >= m_max_states) { + sm_logerr("ERROR on line [%d]: next state bad value!! St[%d], Ev[%d] (nextSt[%d], action func[%p])", + line+1, st, ev, next_state, action_func); + return ERROR; + } + + SM_ASSERT_POINTER(m_p_sm_table[st].event_info); + + if (m_p_sm_table[ st ].event_info[ ev ].trans_func != default_trans_func) { + sm_logerr("ERROR on line [%d]: St+Ev entry re-use error!!! St[%d], Ev[%d] (nextSt[%d], action func[%p])", + line+1, st, ev, next_state, action_func); + return ERROR; + } +BULLSEYE_EXCLUDE_BLOCK_END + m_p_sm_table[ st ].event_info[ ev ].next_state = next_state; + m_p_sm_table[ st ].event_info[ ev ].trans_func = action_func; + } + break; + } + + // Continue with next line in users short table + line++; + } + + sm_logdbg("SM full table processing done. 
Allocated memory size of %d bytes", sm_table_entries_size); + return 0; +} + + + +int state_machine::lock_in_process(int event, void* ev_data) +{ + if (!m_b_is_in_process) { + m_b_is_in_process = 1; + sm_logfunc("lock_in_process: critical section free. Locking it"); + } + else { + m_sm_fifo->push_back(event, ev_data); + sm_logfunc("lock_in_process: critical section is in use"); + return -1; + } + return 0; +} + + +void state_machine::unlock_in_process() +{ + m_b_is_in_process = 0; + if (m_sm_fifo->is_empty()) { + sm_logfunc("unlock_in_process: there are no pending events"); + } + else { + sm_logfunc("unlock_in_process: there are pending events"); + sm_fifo_entry_t ret = m_sm_fifo->pop_front(); + process_event(ret.event, ret.ev_data); + } +} + + +int state_machine::process_event(int event, void* ev_data) +{ + if (lock_in_process(event, ev_data) == -1) { + return 0; + } + +BULLSEYE_EXCLUDE_BLOCK_END + // if we got here: State machine is free + if ((event > m_max_events) || (event < 0)) { + sm_logdbg("ERROR: illegal event num %d", event); + unlock_in_process(); + return -1; + } +BULLSEYE_EXCLUDE_BLOCK_END + sm_state_info_t* p_sm_state_info = &m_p_sm_table[get_curr_state()]; + int next_state = p_sm_state_info->event_info[event].next_state; + m_info.new_state = next_state; + m_info.event = event; + m_info.ev_data = ev_data; + + // Run print event info function + if (m_new_event_notify_func) { + m_new_event_notify_func(get_curr_state(), event, m_info.app_hndl); + } + + // Run leave function + if ((next_state != get_curr_state()) && (next_state != SM_ST_STAY) && p_sm_state_info->leave_func) { + p_sm_state_info->leave_func(m_info); + } + + // Run the action function + if (p_sm_state_info->event_info[event].trans_func) { + p_sm_state_info->event_info[event].trans_func(m_info); + } + + // Move to next state + if ((next_state != get_curr_state()) && (next_state != SM_ST_STAY)) { + + // Run entry function + if (m_p_sm_table[next_state].entry_func) { + 
m_p_sm_table[next_state].entry_func(m_info); + } + + // Update current state + m_info.old_state = next_state; + } + + unlock_in_process(); + return 0; +} + diff --git a/src/state_machine/sm.h b/src/state_machine/sm.h new file mode 100644 index 0000000..94ee26c --- /dev/null +++ b/src/state_machine/sm.h @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef SM_H +#define SM_H + +#include "sm_fifo.h" +#include + +class sm_fifo; + + +#define ERROR (-1) +#define SM_NO_ST (-2) +#define SM_NO_EV (-2) +#define SM_ST_STAY (-3) +#define SM_STATE_ENTRY (-4) +#define SM_STATE_LEAVE (-5) +#define SM_TABLE_END { SM_NO_ST, SM_NO_EV, SM_NO_ST, NULL} + + +typedef struct { + int old_state; + int new_state; + int event; + void* ev_data; + void* app_hndl; +} sm_info_t; + +// SM Callback prototypes +typedef void (*sm_action_cb_t)(const sm_info_t& info); +typedef void (*sm_new_event_notify_cb_t)(int state, int event, void* app_hndl); + + +// Short table line +typedef struct { + int state; // State to handle event + int event; // Event to handle + int next_state; // New state to move to + sm_action_cb_t action_func; // Do-function +} sm_short_table_line_t; + + +// sparse (big) table event entry +typedef struct { + int next_state; // New state to move to + sm_action_cb_t trans_func; // Do-function +} sm_event_info_t; + + + +// sparse (big) table state entry (including all events) +typedef struct sm_state_info{ + sm_action_cb_t entry_func; // Entry function + sm_action_cb_t leave_func; // Leave function + sm_event_info_t* event_info; // Event -> Transition function +} sm_state_info_t; + + + + +class state_machine +{ +public: + // get short matrix and build the sparse matrix + state_machine(void* app_hndl, + int start_state, + int max_states, + int max_events, + sm_short_table_line_t* short_table, + sm_action_cb_t default_entry_func, + sm_action_cb_t default_leave_func, + sm_action_cb_t default_trans_func, + sm_new_event_notify_cb_t new_event_notify_func + ); + ~state_machine(); + + int process_event(int event, void* ev_data); + int get_curr_state(); + +private: + // convert function (from short to sparse matrix) + int process_sparse_table(sm_short_table_line_t* short_table, + sm_action_cb_t default_entry_func, + sm_action_cb_t default_leave_func, + sm_action_cb_t default_trans_func + ); + + // warp internal fifo in 
lock/unlock logic + int lock_in_process(int event, void* ev_data); + void unlock_in_process(); + + int m_max_states; // MAX state + int m_max_events; // MAX event + sm_state_info_t* m_p_sm_table; // pointer to full SM table + sm_new_event_notify_cb_t m_new_event_notify_func; + sm_fifo* m_sm_fifo; // fifo queue for the events + bool m_b_is_in_process; + + sm_info_t m_info; // SM info to provide user in all CB functions +}; + +#endif //SM_H diff --git a/src/state_machine/sm_fifo.cpp b/src/state_machine/sm_fifo.cpp new file mode 100644 index 0000000..f469b85 --- /dev/null +++ b/src/state_machine/sm_fifo.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "sm_fifo.h" + +bool sm_fifo::is_empty() +{ + return m_sm_event_fifo.empty(); +} + +void sm_fifo::push_back(int element, void* ev_data) +{ + sm_fifo_entry_t fe; + fe.ev_data = ev_data; + fe.event = element; + m_sm_event_fifo.push_back(fe); +} + +// Return the first element in the fifo. +// in case the fifo is empty: ret.event = -1 +sm_fifo_entry_t sm_fifo::pop_front() +{ + sm_fifo_entry_t ret; + ret.event = -1; + ret.ev_data = NULL; + if (!m_sm_event_fifo.empty()) { + ret = m_sm_event_fifo.front(); + m_sm_event_fifo.pop_front(); + } + return ret; +} + +//code coverage +#if 0 +void sm_fifo::debug_print_fifo() +{ + int i = 1; + sm_event_list_t::iterator tmp = m_sm_event_fifo.begin(); + for (sm_event_list_t::iterator tmp = m_sm_event_fifo.begin(); tmp != m_sm_event_fifo.end(); tmp++) { + printf("element num %d is %d\n",i , tmp->event); + i++; + } +} +#endif + diff --git a/src/state_machine/sm_fifo.h b/src/state_machine/sm_fifo.h new file mode 100644 index 0000000..b4ddaa8 --- /dev/null +++ b/src/state_machine/sm_fifo.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef V_SM_FIFO_H +#define V_SM_FIFO_H + +#include +#include + +typedef struct { + int event; + void* ev_data; +} sm_fifo_entry_t; + +typedef std::deque sm_event_list_t; + + +class sm_fifo +{ +public: + bool is_empty(); + void push_back(int element, void* ev_data); + sm_fifo_entry_t pop_front(); + void debug_print_fifo(); + +private: + sm_event_list_t m_sm_event_fifo; +}; + +#endif diff --git a/src/stats/Makefile.am b/src/stats/Makefile.am new file mode 100755 index 0000000..03359ef --- /dev/null +++ b/src/stats/Makefile.am @@ -0,0 +1,18 @@ +AM_CPPFLAGS := -I$(top_srcdir)/src ${LIBNL_CFLAGS} + +noinst_LTLIBRARIES = libstats.la +libstats_la_LDFLAGS = -static +libstats_la_SOURCES = \ + stats_printer.cpp \ + stats_publisher.cpp \ + stats_data_reader.h + +bin_PROGRAMS = vma_stats +vma_stats_LDADD= -lrt \ + libstats.la \ + $(top_builddir)/src/utils/libutils.la \ + $(top_builddir)/src/vlogger/libvlogger.la +vma_stats_SOURCES = stats_reader.cpp +vma_stats_DEPENDENCIES = \ + libstats.la \ + $(top_builddir)/src/vlogger/libvlogger.la diff --git a/src/stats/stats_data_reader.h b/src/stats/stats_data_reader.h new file mode 100644 index 0000000..6f72fed --- /dev/null +++ b/src/stats/stats_data_reader.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef STATS_DATA_READER_H +#define STATS_DATA_READER_H + +#include +#include "utils/lock_wrapper.h" +#include "vma/event/timer_handler.h" + +typedef std::map< void*, std::pair > stats_read_map_t; + +typedef struct { + int size; + void* shm_addr; +} data_addr_and_size_t; + +class stats_data_reader : public timer_handler +{ + public: + stats_data_reader(); + void handle_timer_expired(void *ctx); + void register_to_timer(); + void add_data_reader(void* local_addr, void* shm_addr, int size); + void* pop_data_reader(void* local_addr); + + private: + void* m_timer_handler; + stats_read_map_t m_data_map; + lock_spin m_lock_data_map; +}; + +#endif //STATS_DATA_READER_H diff --git a/src/stats/stats_printer.cpp b/src/stats/stats_printer.cpp new file mode 100644 index 0000000..ac0baa7 --- /dev/null +++ b/src/stats/stats_printer.cpp @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "vma/util/utils.h" +#include "vma/util/vma_stats.h" +#include "vma/lwip/tcp.h" +#include "vma/vma_extra.h" +#include "vma/util/sys_vars.h" + +typedef enum { + e_K = 1024, + e_M = 1048576 +} units_t; + +user_params_t user_params; + +#define BYTES_TRAFFIC_UNIT e_K + +const char* to_str_socket_type(int type) +{ + switch (type) { + case SOCK_STREAM: return "TCP"; + case SOCK_DGRAM: return "UDP"; + case SOCK_RAW: return "RAW"; + default: + break; + } + return "???"; +} + +const char* to_str_socket_type_netstat_like(int type) +{ + switch (type) { + case SOCK_STREAM: return "tcp"; + case SOCK_DGRAM: return "udp"; + case SOCK_RAW: return "raw"; + default: + break; + } + return "???"; +} + +// Print statistics for offloaded sockets +void print_full_stats(socket_stats_t* p_si_stats, mc_grp_info_t* p_mc_grp_info, FILE* filename) +{ + + if (!filename) return; + + bool b_any_activiy = false; + char post_fix[3] = ""; + + if (user_params.print_details_mode == e_deltas) + strcpy(post_fix, "/s"); + fprintf(filename, "======================================================\n"); + fprintf(filename, "\tFd=[%d]\n", p_si_stats->fd); + + // + // Socket information + // + fprintf(filename, "- %s", to_str_socket_type(p_si_stats->socket_type)); + fprintf(filename, ", %s", p_si_stats->b_blocking?"Blocked":"Non-blocked"); + + // + // Multicast information + // + if (p_si_stats->socket_type == SOCK_DGRAM) { + fprintf(filename, ", MC Loop %s", p_si_stats->b_mc_loop?"Enabled ":"Disabled"); + if (p_si_stats->mc_tx_if) { + /* cppcheck-suppress wrongPrintfScanfArgNum */ + fprintf(filename, ", MC IF = [%d.%d.%d.%d]", NIPQUAD(p_si_stats->mc_tx_if)); + } + } + fprintf(filename, "\n"); + + // + // Bounded + Connected information + // + if (p_si_stats->bound_if || p_si_stats->bound_port) { + /* cppcheck-suppress wrongPrintfScanfArgNum */ + fprintf(filename, "- Local Address = [%d.%d.%d.%d:%d]\n", NIPQUAD(p_si_stats->bound_if), 
ntohs(p_si_stats->bound_port)); + } + if (p_si_stats->connected_ip || p_si_stats->connected_port) { + /* cppcheck-suppress wrongPrintfScanfArgNum */ + fprintf(filename, "- Foreign Address = [%d.%d.%d.%d:%d]\n", NIPQUAD(p_si_stats->connected_ip), ntohs(p_si_stats->connected_port)); + } + if (p_mc_grp_info){ + for (int grp_idx = 0; grp_idx < p_mc_grp_info->max_grp_num; grp_idx++) { + if (p_si_stats->mc_grp_map.test(grp_idx)) { + /* cppcheck-suppress wrongPrintfScanfArgNum */ + fprintf(filename, "- Member of = [%d.%d.%d.%d]\n", NIPQUAD(p_mc_grp_info->mc_grp_tbl[grp_idx].mc_grp)); + } + } + } + if ((p_si_stats->threadid_last_rx != 0) || (p_si_stats->threadid_last_tx != 0)) { + fprintf(filename, "- Thread Id Rx: %5u, Tx: %5u\n", p_si_stats->threadid_last_rx, p_si_stats->threadid_last_tx); + } + + // + // Ring Allocation Logic information + // + // + if (p_si_stats->ring_alloc_logic_rx == RING_LOGIC_PER_USER_ID) + fprintf(filename, "- RX: Ring User ID = %lu\n", p_si_stats->ring_user_id_rx); + if (p_si_stats->ring_alloc_logic_tx == RING_LOGIC_PER_USER_ID) + fprintf(filename, "- TX: Ring User ID = %lu\n", p_si_stats->ring_user_id_tx); + + // + // Socket statistics + // + if (p_si_stats->counters.n_tx_sent_byte_count || p_si_stats->counters.n_tx_sent_pkt_count || p_si_stats->counters.n_tx_drops || p_si_stats->counters.n_tx_errors) + { + fprintf(filename, "Tx Offload: %u / %u / %u / %u [kilobytes/packets/drops/errors]%s\n", p_si_stats->counters.n_tx_sent_byte_count/BYTES_TRAFFIC_UNIT,p_si_stats->counters.n_tx_sent_pkt_count, p_si_stats->counters.n_tx_drops, p_si_stats->counters.n_tx_errors, post_fix); + b_any_activiy = true; + } + if (p_si_stats->counters.n_tx_os_bytes || p_si_stats->counters.n_tx_os_packets || p_si_stats->counters.n_tx_os_eagain || p_si_stats->counters.n_tx_os_errors) + { + fprintf(filename, "Tx OS info: %u / %u / %u / %u [kilobytes/packets/eagains/errors]%s\n", p_si_stats->counters.n_tx_os_bytes/BYTES_TRAFFIC_UNIT, p_si_stats->counters.n_tx_os_packets, 
p_si_stats->counters.n_tx_os_eagain, p_si_stats->counters.n_tx_os_errors, post_fix); + b_any_activiy = true; + } + if (p_si_stats->counters.n_tx_dummy) { + fprintf(filename, "Tx Dummy messages : %d\n", p_si_stats->counters.n_tx_dummy); + b_any_activiy = true; + } + if (p_si_stats->counters.n_rx_bytes || p_si_stats->counters.n_rx_packets || p_si_stats->counters.n_rx_eagain || p_si_stats->counters.n_rx_errors) + { + fprintf(filename, "Rx Offload: %u / %u / %u / %u [kilobytes/packets/eagains/errors]%s\n", p_si_stats->counters.n_rx_bytes/BYTES_TRAFFIC_UNIT, p_si_stats->counters.n_rx_packets, p_si_stats->counters.n_rx_eagain, p_si_stats->counters.n_rx_errors, post_fix); + b_any_activiy = true; + } + if (p_si_stats->counters.n_rx_os_bytes || p_si_stats->counters.n_rx_os_packets || p_si_stats->counters.n_rx_os_eagain || p_si_stats->counters.n_rx_os_errors) + { + fprintf(filename, "Rx OS info: %u / %u / %u / %u [kilobytes/packets/eagains/errors]%s\n", p_si_stats->counters.n_rx_os_bytes/BYTES_TRAFFIC_UNIT, p_si_stats->counters.n_rx_os_packets, p_si_stats->counters.n_rx_os_eagain, p_si_stats->counters.n_rx_os_errors, post_fix); + b_any_activiy = true; + } + if (p_si_stats->counters.n_rx_packets || p_si_stats->n_rx_ready_pkt_count) + { + fprintf(filename, "Rx byte: cur %u / max %u / dropped%s %u / limit %u\n", p_si_stats->n_rx_ready_byte_count, p_si_stats->counters.n_rx_ready_byte_max, post_fix,p_si_stats->counters.n_rx_ready_byte_drop, p_si_stats->n_rx_ready_byte_limit); + fprintf(filename, "Rx pkt : cur %u / max %u / dropped%s %u\n", p_si_stats->n_rx_ready_pkt_count, p_si_stats->counters.n_rx_ready_pkt_max, post_fix,p_si_stats->counters.n_rx_ready_pkt_drop); + b_any_activiy = true; + } + if (p_si_stats->n_rx_zcopy_pkt_count) + { + fprintf(filename, "Rx zero copy buffers: cur %u\n", p_si_stats->n_rx_zcopy_pkt_count); + b_any_activiy = true; + } + if (p_si_stats->counters.n_rx_poll_miss || p_si_stats->counters.n_rx_poll_hit) + { + double rx_poll_hit = 
(double)p_si_stats->counters.n_rx_poll_hit; + double rx_poll_hit_percentage = (rx_poll_hit / (rx_poll_hit + (double)p_si_stats->counters.n_rx_poll_miss)) * 100; + fprintf(filename, "Rx poll: %u / %u (%2.2f%%) [miss/hit]\n", p_si_stats->counters.n_rx_poll_miss, p_si_stats->counters.n_rx_poll_hit, rx_poll_hit_percentage); + b_any_activiy = true; + } + + if (p_si_stats->counters.n_rx_migrations || p_si_stats->counters.n_tx_migrations) + { + fprintf(filename, "Ring migrations Rx: %u, Tx: %u\n", p_si_stats->counters.n_rx_migrations, p_si_stats->counters.n_tx_migrations); + } + + if (p_si_stats->counters.n_tx_retransmits) + { + fprintf(filename, "Retransmissions: %u\n", p_si_stats->counters.n_tx_retransmits); + } + + if (b_any_activiy == false) { + fprintf(filename, "Rx and Tx where not active\n"); + } +} + +// Print statistics headers for all sockets - used in case view mode is e_netstat_like +void print_netstat_like_headers(FILE* file) +{ + static bool already_printed = false; + if(!already_printed) fprintf(file, "Proto Offloaded Recv-Q Send-Q Local Address Foreign Address State Inode PID/Program name\n"); + already_printed = true; +} + +// Print statistics of a single socket - used in case view mode is e_netstat_like +void print_netstat_like(socket_stats_t* p_si_stats, mc_grp_info_t* , FILE* file, int pid) +{ + static const int MAX_ADDR_LEN = strlen("123.123.123.123:12345"); // for max len of ip address and port together + char process[PATH_MAX + 1]; + + if(! p_si_stats->inode) return; // shmem is not updated yet + + fprintf(file, "%-5s %-9s ", to_str_socket_type_netstat_like(p_si_stats->socket_type), p_si_stats->b_is_offloaded ? 
"Yes" : "No"); + fprintf(file, "%-6d %-6d ", (int)p_si_stats->n_rx_ready_byte_count, (int)p_si_stats->n_tx_ready_byte_count); + + // + // Bounded + Connected information + // + int len = 0; + if (p_si_stats->bound_if || p_si_stats->bound_port) { + /* cppcheck-suppress wrongPrintfScanfArgNum */ + len = fprintf(file, "%d.%d.%d.%d:%-5d", NIPQUAD(p_si_stats->bound_if), ntohs(p_si_stats->bound_port)); + if (len < 0) len = 0; // error + } + if (len < MAX_ADDR_LEN )fprintf(file, "%*s ", MAX_ADDR_LEN-len, ""); // pad and delimiter + + fprintf(file, " "); + + if (p_si_stats->connected_ip || p_si_stats->connected_port) { + /* cppcheck-suppress wrongPrintfScanfArgNum */ + len = fprintf(file, "%d.%d.%d.%d:%-5d", NIPQUAD(p_si_stats->connected_ip), ntohs(p_si_stats->connected_port)); + } + else { + len = fprintf(file, "0.0.0.0:*"); + } + if (len < 0) len = 0; // error + if (len < MAX_ADDR_LEN )fprintf(file, "%*s ", MAX_ADDR_LEN-len, ""); // pad and delimiter + + const char * tcp_state = ""; + if (p_si_stats->socket_type == SOCK_STREAM) { + tcp_state = tcp_state_str[((enum tcp_state)p_si_stats->tcp_state)]; + } + + fprintf(file, "%-11s %-10lu %d/%s\n", + tcp_state, (u_long)p_si_stats->inode, pid, + (get_procname(pid, process, sizeof(process)) == 0 ? process : "-")); // max tcp state len is 11 characters = ESTABLISHED +} + + diff --git a/src/stats/stats_publisher.cpp b/src/stats/stats_publisher.cpp new file mode 100644 index 0000000..372d24d --- /dev/null +++ b/src/stats/stats_publisher.cpp @@ -0,0 +1,652 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "stats/stats_data_reader.h" +#include "vma/util/vma_stats.h" +#include "vma/sock/sock-redirect.h" +#include "vma/event/event_handler_manager.h" + +#define MODULE_NAME "STATS: " + +static lock_spin g_lock_mc_info("g_lock_mc_info"); +static lock_spin g_lock_skt_inst_arr("g_lock_skt_inst_arr"); +static lock_spin g_lock_ring_inst_arr("g_lock_ring_inst_arr"); +static lock_spin g_lock_cq_inst_arr("g_lock_cq_inst_arr"); +static lock_spin g_lock_bpool_inst_arr("g_lock_bpool_inst_arr"); +static lock_spin g_lock_iomux("g_lock_iomux"); + +static sh_mem_info_t g_sh_mem_info; +static sh_mem_t* g_sh_mem; +static sh_mem_t g_local_sh_mem; + +//statistic file +FILE* g_stats_file = NULL; +stats_data_reader* g_p_stats_data_reader = NULL; + + +// keep writing statistics after a request for "duration" with "interval" +#define STATS_PUBLISH_DURATION (10*1000) // 10 sec +#define STATS_PUBLISH_INTERVAL 500 // 500 msec + +#define TIMERS_IN_STATS_PUBLISH_DURATION (STATS_PUBLISH_DURATION/STATS_PUBLISHER_TIMER_PERIOD) +#define TIMERS_IN_STATS_PUBLISH_INTERVAL (STATS_PUBLISH_INTERVAL/STATS_PUBLISHER_TIMER_PERIOD) + +bool printed_sock_limit_info = false; +bool printed_ring_limit_info = false; +bool printed_cq_limit_info = false; +bool printed_bpool_limit_info = false; + +stats_data_reader::stats_data_reader() : m_timer_handler(NULL), m_lock_data_map("m_lock_data_map") +{ +} + + +#define LOCAL_OBJECT_DATA iter->first +#define SHM_DATA_ADDRESS iter->second.first +#define COPY_SIZE iter->second.second + +bool should_write() +{ + // initial value that will prevent write to shmem before an explicit request + static int timers_counter = TIMERS_IN_STATS_PUBLISH_DURATION + 1; + + static int reader_counter = 0; + int prev_reader_counter = reader_counter; + reader_counter = g_sh_mem->reader_counter; + + if (prev_reader_counter != reader_counter) { + timers_counter = 0; // will allow writing without new request for "duration" + return 
true; + } + + if (timers_counter > TIMERS_IN_STATS_PUBLISH_DURATION) + return false; // don't write until we'll see explicit request + + ++timers_counter; + + return (timers_counter % TIMERS_IN_STATS_PUBLISH_INTERVAL == 0); // write once in interval +} + + +void stats_data_reader::handle_timer_expired(void *ctx) +{ + NOT_IN_USE(ctx); + + if (!should_write()) { + return; + } + + if (g_sh_mem->fd_dump != STATS_FD_STATISTICS_DISABLED) { + if (g_p_event_handler_manager) { + g_p_event_handler_manager->statistics_print(g_sh_mem->fd_dump, g_sh_mem->fd_dump_log_level); + } + g_sh_mem->fd_dump = STATS_FD_STATISTICS_DISABLED; + g_sh_mem->fd_dump_log_level = STATS_FD_STATISTICS_LOG_LEVEL_DEFAULT; + } + stats_read_map_t::iterator iter; + m_lock_data_map.lock(); + for (iter = m_data_map.begin(); iter != m_data_map.end(); iter++) { + memcpy(SHM_DATA_ADDRESS, LOCAL_OBJECT_DATA, COPY_SIZE); + } + m_lock_data_map.unlock(); + +} + +void stats_data_reader::register_to_timer() +{ + m_timer_handler = g_p_event_handler_manager->register_timer_event(STATS_PUBLISHER_TIMER_PERIOD, g_p_stats_data_reader, PERIODIC_TIMER, 0); +} + +void stats_data_reader::add_data_reader(void* local_addr, void* shm_addr, int size) +{ + m_lock_data_map.lock(); + m_data_map[local_addr] = std::make_pair(shm_addr, size); + m_lock_data_map.unlock(); +} + +void* stats_data_reader::pop_data_reader(void* local_addr) +{ + void* rv = NULL; + m_lock_data_map.lock(); + stats_read_map_t::iterator iter = m_data_map.find(local_addr); + if (iter != m_data_map.end()) {//found + rv = SHM_DATA_ADDRESS; + m_data_map.erase(local_addr); + } + m_lock_data_map.unlock(); + return rv; +} + +void write_version_details_to_shmem(version_info_t* p_ver_info) +{ + p_ver_info->vma_lib_maj = VMA_LIBRARY_MAJOR; + p_ver_info->vma_lib_min = VMA_LIBRARY_MINOR; + p_ver_info->vma_lib_rev = VMA_LIBRARY_REVISION; + p_ver_info->vma_lib_rel = VMA_LIBRARY_RELEASE; +} + +void vma_shmem_stats_open(vlog_levels_t** p_p_vma_log_level, uint8_t** 
p_p_vma_log_details) +{ + void *buf = NULL; + void *p_shmem = NULL; + int ret; + size_t shmem_size = 0; + mode_t saved_mode; + + g_p_stats_data_reader = new stats_data_reader(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (NULL == g_p_stats_data_reader) { + vlog_printf(VLOG_ERROR,"%s:%d: Can't allocate g_p_stats_data_reader \n", __func__, __LINE__); + goto shmem_error; + } + BULLSEYE_EXCLUDE_BLOCK_END + + shmem_size = SHMEM_STATS_SIZE(safe_mce_sys().stats_fd_num_max); + buf = malloc(shmem_size); + if (buf == NULL) + goto shmem_error; + memset(buf, 0, shmem_size); + + p_shmem = buf; + + if (strlen(safe_mce_sys().stats_shmem_dirname) <= 0) + goto no_shmem; + + g_sh_mem_info.filename_sh_stats[0] = '\0'; + g_sh_mem_info.p_sh_stats = MAP_FAILED; + ret = snprintf(g_sh_mem_info.filename_sh_stats, sizeof(g_sh_mem_info.filename_sh_stats), "%s/vmastat.%d", safe_mce_sys().stats_shmem_dirname, getpid()); + if (!((0 < ret) && (ret < (int)sizeof(g_sh_mem_info.filename_sh_stats)))) { + vlog_printf(VLOG_ERROR, "%s: Could not create file under %s %m\n", __func__, safe_mce_sys().stats_shmem_dirname, errno); + goto no_shmem; + } + saved_mode = umask(0); + g_sh_mem_info.fd_sh_stats = open(g_sh_mem_info.filename_sh_stats, O_CREAT|O_RDWR, + S_IRUSR | S_IWUSR | S_IRGRP | S_IROTH); + umask(saved_mode); + + BULLSEYE_EXCLUDE_BLOCK_START + if (g_sh_mem_info.fd_sh_stats < 0) { + vlog_printf(VLOG_ERROR, "%s: Could not open %s %m\n", __func__, g_sh_mem_info.filename_sh_stats, errno); + goto no_shmem; + } + BULLSEYE_EXCLUDE_BLOCK_END + + ret = write(g_sh_mem_info.fd_sh_stats, buf, shmem_size); + + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + vlog_printf(VLOG_ERROR, "%s: Could not write to %s - %m\n", __func__, g_sh_mem_info.filename_sh_stats, errno); + goto no_shmem; + } + BULLSEYE_EXCLUDE_BLOCK_END + + g_sh_mem_info.p_sh_stats = mmap(0, shmem_size, PROT_WRITE|PROT_READ, MAP_SHARED, g_sh_mem_info.fd_sh_stats, 0); + + BULLSEYE_EXCLUDE_BLOCK_START + if (g_sh_mem_info.p_sh_stats == MAP_FAILED) { + 
vlog_printf(VLOG_ERROR, "%s: MAP_FAILED for %s - %m\n", __func__, g_sh_mem_info.filename_sh_stats); + goto no_shmem; + } + BULLSEYE_EXCLUDE_BLOCK_END + + p_shmem = g_sh_mem_info.p_sh_stats; + + free(buf); + buf = NULL; + + goto success; + + no_shmem: + if (g_sh_mem_info.p_sh_stats == MAP_FAILED) { + if (g_sh_mem_info.fd_sh_stats > 0) { + close(g_sh_mem_info.fd_sh_stats); + unlink(g_sh_mem_info.filename_sh_stats); + } + } + + g_sh_mem_info.p_sh_stats = 0; + + success: + + MAP_SH_MEM(g_sh_mem, p_shmem); + + write_version_details_to_shmem(&g_sh_mem->ver_info); + memcpy(g_sh_mem->stats_protocol_ver, STATS_PROTOCOL_VER, min(sizeof(g_sh_mem->stats_protocol_ver), sizeof(STATS_PROTOCOL_VER))); + g_sh_mem->max_skt_inst_num = safe_mce_sys().stats_fd_num_max; + g_sh_mem->reader_counter = 0; + __log_dbg("file '%s' fd %d shared memory at %p with %d max blocks\n", g_sh_mem_info.filename_sh_stats, g_sh_mem_info.fd_sh_stats, g_sh_mem_info.p_sh_stats, safe_mce_sys().stats_fd_num_max); + + // Update the shmem initial log values + g_sh_mem->log_level = **p_p_vma_log_level; + g_sh_mem->log_details_level = **p_p_vma_log_details; + + // Update the shmem with initial fd dump values + g_sh_mem->fd_dump = STATS_FD_STATISTICS_DISABLED; + g_sh_mem->fd_dump_log_level = STATS_FD_STATISTICS_LOG_LEVEL_DEFAULT; + + // ReMap internal log level to ShMem area + *p_p_vma_log_level = &g_sh_mem->log_level; + *p_p_vma_log_details = &g_sh_mem->log_details_level; + + g_p_stats_data_reader->register_to_timer(); + + return; + + shmem_error: + + BULLSEYE_EXCLUDE_BLOCK_START + g_sh_mem_info.fd_sh_stats = -1; + g_sh_mem_info.p_sh_stats = MAP_FAILED; + g_sh_mem = &g_local_sh_mem; + g_sh_mem->reset(); + *p_p_vma_log_level = &g_sh_mem->log_level; + *p_p_vma_log_details = &g_sh_mem->log_details_level; + BULLSEYE_EXCLUDE_BLOCK_END +} + +void vma_shmem_stats_close() +{ + if (g_sh_mem_info.p_sh_stats && g_sh_mem_info.p_sh_stats != MAP_FAILED) { + __log_dbg("file '%s' fd %d shared memory at %p with %d max blocks\n", 
g_sh_mem_info.filename_sh_stats, g_sh_mem_info.fd_sh_stats, g_sh_mem_info.p_sh_stats, safe_mce_sys().stats_fd_num_max); + + BULLSEYE_EXCLUDE_BLOCK_START + if (munmap(g_sh_mem_info.p_sh_stats, SHMEM_STATS_SIZE(safe_mce_sys().stats_fd_num_max)) != 0) { + vlog_printf(VLOG_ERROR, "%s: file [%s] fd [%d] error while unmap shared memory at [%p]\n", __func__, g_sh_mem_info.filename_sh_stats, g_sh_mem_info.fd_sh_stats, g_sh_mem_info.p_sh_stats); + } + BULLSEYE_EXCLUDE_BLOCK_END + + g_sh_mem_info.p_sh_stats = MAP_FAILED; + + if (g_sh_mem_info.fd_sh_stats) + close(g_sh_mem_info.fd_sh_stats); + + if(!g_is_forked_child) + unlink(g_sh_mem_info.filename_sh_stats); + } else if (g_sh_mem_info.p_sh_stats != MAP_FAILED) { + free(g_sh_mem); + } + g_sh_mem = NULL; + g_p_vlogger_level = NULL; + g_p_vlogger_details = NULL; + delete g_p_stats_data_reader; + g_p_stats_data_reader = NULL; +} + +void vma_stats_instance_create_socket_block(socket_stats_t* local_stats_addr) +{ + socket_stats_t* p_skt_stats = NULL; + g_lock_skt_inst_arr.lock(); + + //search the first free sh_mem block + for (uint32_t i = 0; i < g_sh_mem->max_skt_inst_num; i++) { + if (g_sh_mem->skt_inst_arr[i].b_enabled == false) { + // found free slot ,enabled and returning to the user + p_skt_stats = &g_sh_mem->skt_inst_arr[i].skt_stats; + g_sh_mem->skt_inst_arr[i].b_enabled = true; + goto out; + } + + } + if (g_sh_mem->max_skt_inst_num + 1 < safe_mce_sys().stats_fd_num_max) { + // allocate next sh_mem block + p_skt_stats = &g_sh_mem->skt_inst_arr[g_sh_mem->max_skt_inst_num].skt_stats; + g_sh_mem->skt_inst_arr[g_sh_mem->max_skt_inst_num].b_enabled = true; + g_sh_mem->max_skt_inst_num++; + goto out; + } + else { + if (!printed_sock_limit_info) { + printed_sock_limit_info = true; + vlog_printf(VLOG_INFO, "VMA Statistics can monitor up to %d sockets - increase VMA_STATS_FD_NUM\n", safe_mce_sys().stats_fd_num_max); + } + goto out; + } + + out: + if (p_skt_stats) { + p_skt_stats->reset(); + 
g_p_stats_data_reader->add_data_reader(local_stats_addr, p_skt_stats, sizeof(socket_stats_t)); + } + g_lock_skt_inst_arr.unlock(); +} + +void vma_stats_instance_remove_socket_block(socket_stats_t* local_addr) +{ + + g_lock_skt_inst_arr.lock(); + + print_full_stats(local_addr, NULL, g_stats_file); + socket_stats_t* p_skt_stats = (socket_stats_t*)g_p_stats_data_reader->pop_data_reader(local_addr); + + if (p_skt_stats == NULL) { + __log_dbg("application vma_stats pointer is NULL\n"); + g_lock_skt_inst_arr.unlock(); + return; + } + + //coverity - g_sh_mem->skt_inst_arr cannot be null + /*BULLSEYE_EXCLUDE_BLOCK_START + if (g_sh_mem->skt_inst_arr == NULL) { + vlog_printf(VLOG_ERROR,"%s:%d: g_sh_mem->instances_arr not init\n", __func__, __LINE__); + g_lock_skt_stats.unlock(); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END*/ + + // Search sh_mem block to release + for (uint32_t i = 0; i < g_sh_mem->max_skt_inst_num; i++) { + if (&g_sh_mem->skt_inst_arr[i].skt_stats == p_skt_stats) { + g_sh_mem->skt_inst_arr[i].b_enabled = false; + g_lock_skt_inst_arr.unlock(); + return; + } + } + + vlog_printf(VLOG_ERROR, "%s:%d: Could not find user pointer (%p)\n", __func__, __LINE__, p_skt_stats); + g_lock_skt_inst_arr.unlock(); +} + +void vma_stats_mc_group_add(in_addr_t mc_grp, socket_stats_t* p_socket_stats) +{ + int empty_entry = -1; + int index_to_insert = -1; + + g_lock_mc_info.lock(); + for (int grp_idx = 0; grp_idx < g_sh_mem->mc_info.max_grp_num && index_to_insert == -1; grp_idx++) { + if (g_sh_mem->mc_info.mc_grp_tbl[grp_idx].sock_num == 0 && empty_entry == -1) + empty_entry = grp_idx; + else if (g_sh_mem->mc_info.mc_grp_tbl[grp_idx].sock_num && g_sh_mem->mc_info.mc_grp_tbl[grp_idx].mc_grp == mc_grp) + index_to_insert = grp_idx; + } + + if (index_to_insert == -1 && empty_entry != -1) + index_to_insert = empty_entry; + else if (index_to_insert == -1 && g_sh_mem->mc_info.max_grp_num < MC_TABLE_SIZE) { + index_to_insert = g_sh_mem->mc_info.max_grp_num; + 
g_sh_mem->mc_info.mc_grp_tbl[index_to_insert].mc_grp = mc_grp; + g_sh_mem->mc_info.max_grp_num++; + } + + if (index_to_insert != -1) { + g_sh_mem->mc_info.mc_grp_tbl[index_to_insert].sock_num++; + p_socket_stats->mc_grp_map.set((size_t)index_to_insert, 1); + } + g_lock_mc_info.unlock(); + if (index_to_insert == -1) + vlog_printf(VLOG_INFO, "VMA Statistics can monitor up to %d mc groups\n", MC_TABLE_SIZE); +} + +void vma_stats_mc_group_remove(in_addr_t mc_grp, socket_stats_t* p_socket_stats) +{ + g_lock_mc_info.lock(); + for (int grp_idx = 0; grp_idx < g_sh_mem->mc_info.max_grp_num; grp_idx++) { + if (g_sh_mem->mc_info.mc_grp_tbl[grp_idx].sock_num && g_sh_mem->mc_info.mc_grp_tbl[grp_idx].mc_grp == mc_grp) { + p_socket_stats->mc_grp_map.set((size_t)grp_idx, 0); + g_sh_mem->mc_info.mc_grp_tbl[grp_idx].sock_num--; + if (!g_sh_mem->mc_info.mc_grp_tbl[grp_idx].sock_num) + g_sh_mem->mc_info.max_grp_num--; + } + } + g_lock_mc_info.unlock(); +} + +void vma_stats_instance_create_ring_block(ring_stats_t* local_stats_addr) +{ + ring_stats_t* p_instance_ring = NULL; + g_lock_ring_inst_arr.lock(); + for (int i=0; i < NUM_OF_SUPPORTED_RINGS; i++) { + if (!g_sh_mem->ring_inst_arr[i].b_enabled) { + g_sh_mem->ring_inst_arr[i].b_enabled = true; + p_instance_ring = &g_sh_mem->ring_inst_arr[i].ring_stats; + memset(p_instance_ring, 0, sizeof(*p_instance_ring)); + break; + } + } + if (p_instance_ring == NULL) { + if (!printed_ring_limit_info) { + printed_ring_limit_info = true; + vlog_printf(VLOG_INFO, "VMA Statistics can monitor up to %d ring elements\n", NUM_OF_SUPPORTED_RINGS); + } + } + else { + g_p_stats_data_reader->add_data_reader(local_stats_addr, p_instance_ring, sizeof(ring_stats_t)); + __log_dbg("Added ring local=%p shm=%p\n", local_stats_addr, p_instance_ring); + } + g_lock_ring_inst_arr.unlock(); +} + +void vma_stats_instance_remove_ring_block(ring_stats_t* local_stats_addr) +{ + g_lock_ring_inst_arr.lock(); + __log_dbg("Remove ring local=%p\n", local_stats_addr); + + 
ring_stats_t* p_ring_stats = (ring_stats_t*)g_p_stats_data_reader->pop_data_reader(local_stats_addr); + + if (p_ring_stats == NULL) { // happens on the tx cq (why don't we keep tx cq stats?) + __log_dbg("application vma_stats pointer is NULL\n"); + g_lock_ring_inst_arr.unlock(); + return; + } + + //coverity - g_sh_mem->ring_inst_arr cannot be null + /*BULLSEYE_EXCLUDE_BLOCK_START + if (g_sh_mem->ring_inst_arr == NULL) { + vlog_printf(VLOG_ERROR,"%s:%d: g_sh_mem->instances_arr not init\n", __func__, __LINE__); + g_lock_skt_stats.unlock(); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END*/ + + // Search sh_mem block to release + for (int i=0; iring_inst_arr[i].ring_stats == p_ring_stats) { + g_sh_mem->ring_inst_arr[i].b_enabled = false; + g_lock_ring_inst_arr.unlock(); + return; + } + } + + vlog_printf(VLOG_ERROR, "%s:%d: Could not find user pointer (%p)", __func__, __LINE__, p_ring_stats); + g_lock_ring_inst_arr.unlock(); +} + +void vma_stats_instance_create_cq_block(cq_stats_t* local_stats_addr) +{ + cq_stats_t* p_instance_cq = NULL; + g_lock_cq_inst_arr.lock(); + for (int i=0; i < NUM_OF_SUPPORTED_CQS; i++) { + if (!g_sh_mem->cq_inst_arr[i].b_enabled) { + g_sh_mem->cq_inst_arr[i].b_enabled = true; + p_instance_cq = &g_sh_mem->cq_inst_arr[i].cq_stats; + memset(p_instance_cq, 0, sizeof(*p_instance_cq)); + break; + } + } + if (p_instance_cq == NULL) { + if (!printed_cq_limit_info) { + printed_cq_limit_info = true; + vlog_printf(VLOG_INFO, "VMA Statistics can monitor up to %d cq elements\n", NUM_OF_SUPPORTED_CQS); + } + } + else { + g_p_stats_data_reader->add_data_reader(local_stats_addr, p_instance_cq, sizeof(cq_stats_t)); + __log_dbg("Added cq local=%p shm=%p\n", local_stats_addr, p_instance_cq); + } + g_lock_cq_inst_arr.unlock(); +} + +void vma_stats_instance_remove_cq_block(cq_stats_t* local_stats_addr) +{ + g_lock_cq_inst_arr.lock(); + __log_dbg("Remove cq local=%p\n", local_stats_addr); + + cq_stats_t* p_cq_stats = 
(cq_stats_t*)g_p_stats_data_reader->pop_data_reader(local_stats_addr); + + if (p_cq_stats == NULL) { // happens on the tx cq (why don't we keep tx cq stats?) + __log_dbg("application vma_stats pointer is NULL\n"); + g_lock_cq_inst_arr.unlock(); + return; + } + + //coverity - g_sh_mem->cq_inst_arr cannot be null + /*BULLSEYE_EXCLUDE_BLOCK_START + if (g_sh_mem->cq_inst_arr == NULL) { + vlog_printf(VLOG_ERROR,"%s:%d: g_sh_mem->instances_arr not init\n", __func__, __LINE__); + g_lock_skt_stats.unlock(); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END*/ + + // Search sh_mem block to release + for (int i=0; icq_inst_arr[i].cq_stats == p_cq_stats) { + g_sh_mem->cq_inst_arr[i].b_enabled = false; + g_lock_cq_inst_arr.unlock(); + return; + } + } + + vlog_printf(VLOG_ERROR, "%s:%d: Could not find user pointer (%p)", __func__, __LINE__, p_cq_stats); + g_lock_cq_inst_arr.unlock(); +} + +void vma_stats_instance_create_bpool_block(bpool_stats_t* local_stats_addr) +{ + bpool_stats_t* p_instance_bpool = NULL; + g_lock_bpool_inst_arr.lock(); + for (int i=0; i < NUM_OF_SUPPORTED_BPOOLS; i++) { + if (!g_sh_mem->bpool_inst_arr[i].b_enabled) { + g_sh_mem->bpool_inst_arr[i].b_enabled = true; + p_instance_bpool = &g_sh_mem->bpool_inst_arr[i].bpool_stats; + memset(p_instance_bpool, 0, sizeof(bpool_stats_t)); + break; + } + } + if (p_instance_bpool == NULL) { + if (!printed_bpool_limit_info) { + printed_bpool_limit_info = true; + vlog_printf(VLOG_INFO, "VMA Statistics can monitor up to %d buffer pools\n", NUM_OF_SUPPORTED_BPOOLS); + } + } + else { + g_p_stats_data_reader->add_data_reader(local_stats_addr, p_instance_bpool, sizeof(bpool_stats_t)); + __log_dbg("Added bpool local=%p shm=%p\n", local_stats_addr, p_instance_bpool); + } + g_lock_bpool_inst_arr.unlock(); +} + +void vma_stats_instance_remove_bpool_block(bpool_stats_t* local_stats_addr) +{ + g_lock_bpool_inst_arr.lock(); + __log_dbg("Remove bpool local=%p\n", local_stats_addr); + + bpool_stats_t* p_bpool_stats = 
(bpool_stats_t*)g_p_stats_data_reader->pop_data_reader(local_stats_addr); + + if (p_bpool_stats == NULL) { + __log_dbg("application vma_stats pointer is NULL\n"); + g_lock_bpool_inst_arr.unlock(); + return; + } + + // Search sh_mem block to release + for (int i=0; ibpool_inst_arr[i].bpool_stats == p_bpool_stats) { + g_sh_mem->bpool_inst_arr[i].b_enabled = false; + g_lock_bpool_inst_arr.unlock(); + return; + } + } + + vlog_printf(VLOG_ERROR, "%s:%d: Could not find user pointer (%p)", __func__, __LINE__, p_bpool_stats); + g_lock_bpool_inst_arr.unlock(); +} + +void vma_stats_instance_get_poll_block(iomux_func_stats_t* local_stats_addr) +{ + g_p_stats_data_reader->add_data_reader(local_stats_addr, &g_sh_mem->iomux.poll, sizeof(iomux_func_stats_t)); +} + +void vma_stats_instance_get_select_block(iomux_func_stats_t* local_stats_addr) +{ + g_p_stats_data_reader->add_data_reader(local_stats_addr, &g_sh_mem->iomux.select, sizeof(iomux_func_stats_t)); +} + +void vma_stats_instance_create_epoll_block(int fd, iomux_func_stats_t* local_stats_addr) +{ + g_lock_iomux.lock(); + + for (unsigned i = 0; i < NUM_OF_SUPPORTED_EPFDS; ++i) { + epoll_stats_t* ep_stats = &g_sh_mem->iomux.epoll[i]; + if (!ep_stats->enabled) { + ep_stats->enabled = true; + ep_stats->epfd = fd; + g_p_stats_data_reader->add_data_reader(local_stats_addr, &ep_stats->stats, sizeof(iomux_func_stats_t)); + g_lock_iomux.unlock(); + return; + } + } + + vlog_printf(VLOG_INFO, "VMA Statistics can monitor up to %d epoll fds", NUM_OF_SUPPORTED_EPFDS); + g_lock_iomux.unlock(); + return; +} + +void vma_stats_instance_remove_epoll_block(iomux_func_stats_t* local_stats_addr) +{ + g_lock_iomux.lock(); + iomux_func_stats_t* ep_func_stats = (iomux_func_stats_t*)g_p_stats_data_reader->pop_data_reader(local_stats_addr); + + if (NULL == ep_func_stats) { + __log_dbg("application vma_stats pointer is NULL\n"); + g_lock_iomux.unlock(); + return; + } + + // Search ep_mem block to release + for (int i=0; iiomux.epoll[i].stats == 
ep_func_stats) { + g_sh_mem->iomux.epoll[i].enabled = false; + g_lock_iomux.unlock(); + return; + } + } + + vlog_printf(VLOG_ERROR, "%s:%d: Could not find user pointer (%p)", __func__, __LINE__, ep_func_stats); + g_lock_iomux.unlock(); + return; +} diff --git a/src/stats/stats_reader.cpp b/src/stats/stats_reader.cpp new file mode 100644 index 0000000..3091ed8 --- /dev/null +++ b/src/stats/stats_reader.cpp @@ -0,0 +1,1789 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* getopt()*/ +#include +#include +#include +#include +#include +#include +#include +#include +#include "utils/rdtsc.h" +#include "vma/util/utils.h" +#include "vma/util/vma_stats.h" +#include "vma/util/sys_vars.h" + +using namespace std; + +typedef std::list fd_list_t; + + +typedef struct { + in_addr_t mc_grp; + fd_list_t fd_list; +} mc_group_fds_t; + +typedef enum { + e_K = 1024, + e_M = 1048576 +} units_t; + +#define MODULE_NAME "vmastat" +#define log_msg(log_fmt, log_args...) printf(MODULE_NAME ": " log_fmt "\n", ##log_args) +#define log_err(log_fmt, log_args...) fprintf(stderr,MODULE_NAME ": " log_fmt "\n", ##log_args) +#define log_system_err(log_fmt, log_args...) fprintf(stderr,MODULE_NAME ": " log_fmt " (errno=%d %s)\n", ##log_args, errno, strerror(errno)) +#define log_dbg(log_fmt, log_args...) printf(MODULE_NAME ": " log_fmt "\n", ##log_args) + +#define BASE_HEADERS_NUM 2 +#define BASIC_STATS_LINES_NUM 2 +#define UPPER_SHORT_VIEW_HEADER " %-7s %42s %31s\n" +#define LOWER_SHORT_VIEW_HEADER " %-7s %10s %7s %8s %7s %6s %7s %7s %7s %7s\n" +#define RX_SHORT_VIEW " %-3d %-3s %10u %7u %8u %7u %6.1f %7u %7u %7u %7u\n" +#define TX_SHORT_VIEW " %-3s %-3s %10u %7u %8u %7u %-6s %7u %7u %7u %7u\n" +#define IOMUX_FORMAT "%-8s%-2s %-9s%u%-1s%u %-12s %-9s%-5u %-7s%-4u %-5s%-2.2f%-3s %-5s%d%-1s\n" + +#define MEDIUM_HEADERS_NUM 3 +#define MEDIUM_STATS_LINES_NUM 2 +#define UPPER_MEDIUM_VIEW_HEADER " %-7s %65s %31s\n" +#define MIDDLE_MEDIUM_VIEW_HEADER " %-7s %10s %7s %8s %7s %6s%23s %7s %7s %7s %7s\n" +#define LOWER_MEDIUM_VIEW_HEADER " %50s %6s %6s %6s \n" +#define RX_MEDIUM_VIEW " %-3d %-3s %10u %7u %8u %7u %6.1f %6u %6u %6u %7u %7u %7u %7u\n" +#define TX_MEDIUM_VIEW " %-3s %-3s %10u %7u %8u %7u %29s %7u %7u %7u %7u\n" +#define CYCLES_SEPARATOR "-------------------------------------------------------------------------------\n" 
+#define FORMAT_STATS_32bit "%-20s %u\n" +#define FORMAT_STATS_64bit "%-20s %llu %-3s\n" +#define FORMAT_RING_PACKETS "%-20s %zu / %zu [kilobytes/packets] %-3s\n" +#define FORMAT_RING_INTERRUPT "%-20s %zu / %zu [requests/received] %-3s\n" +#define FORMAT_RING_MODERATION "%-20s %u / %u [frames/usec period] %-3s\n" +#define FORMAT_RING_DM_STATS "%-20s %zu / %zu / %zu [kilobytes/packets/oob] %-3s\n" +#define FORMAT_RING_TAP_NAME "%-20s %s\n" +#define FORMAT_RING_MASTER "%-20s %p\n" + +#define INTERVAL 1 +#define BYTES_TRAFFIC_UNIT e_K +#define SCREEN_SIZE 24 +#define MAX_BUFF_SIZE 256 +#define PRINT_DETAILS_MODES_NUM 2 +#define VIEW_MODES_NUM 5 +#define DEFAULT_DELAY_SEC 1 +#define DEFAULT_CYCLES 0 +#define DEFAULT_VIEW_MODE e_basic +#define DEFAULT_DETAILS_MODE e_totals +#define DEFAULT_PROC_IDENT_MODE e_by_runn_proccess +#define VLOG_DETAILS_NUM 4 +#define INIT_VMA_LOG_DETAILS -1 +#define NANO_TO_MICRO(n) (((n) + 500) / 1000) +#define SEC_TO_MICRO(n) ((n) * 1000000) +#define TIME_DIFF_in_MICRO(start,end) (SEC_TO_MICRO((end).tv_sec-(start).tv_sec) + \ + (NANO_TO_MICRO((end).tv_nsec-(start).tv_nsec))) +// printf formating when IP is in network byte ordering (for LITTLE_ENDIAN) +#define NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) (uint8_t)((ip)&0xff), (uint8_t)(((ip)>>8)&0xff),(uint8_t)(((ip)>>16)&0xff),(uint8_t)(((ip)>>24)&0xff) + +// printf formating when IP is in host byte ordering (for LITTLE_ENDIAN) +#define HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) (uint8_t)(((ip)>>24)&0xff),(uint8_t)(((ip)>>16)&0xff),(uint8_t)(((ip)>>8)&0xff),(uint8_t)((ip)&0xff) + + +#if __BYTE_ORDER == __LITTLE_ENDIAN +/* The host byte order is the same as network byte order, so these functions are all just identity. 
*/ +# define NIPQUAD(ip) NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) +#else +# if __BYTE_ORDER == __BIG_ENDIAN +# define NIPQUAD(ip) HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) +# endif +#endif + +bool g_b_exit = false; +struct sigaction g_sigact; +uint8_t* g_fd_mask; +uint32_t g_fd_map_size = e_K; + +//statistic file +FILE* g_stats_file = stdout; + +void usage(const char *argv0) +{ + printf("\nVMA Statistics\n"); + printf("Usage:\n"); + printf("\t%s [-p pid] [-k directory] [-v view] [-d details] [-i interval] \n", argv0); + printf("\n"); + printf("Defaults:\n"); + printf("\tfind_pid=enabled, directory=\"%s\", view=1, details=1, interval=1, \n", MCE_DEFAULT_STATS_SHMEM_DIR); + printf("\n"); + printf("Options:\n"); + printf(" -p, --pid=\t\tShow VMA statistics for process with pid: \n"); + printf(" -k, --directory=\tSet shared memory directory path to \n"); + printf(" -n, --name=\tShow VMA statistics for application: \n"); + printf(" -f, --find_pid\t\tFind and show statistics for VMA instance running (default)\n"); + printf(" -F, --forbid_clean\t\tBy setting this flag inactive shared objects would not be removed\n"); + printf(" -i, --interval=\t\tPrint report every seconds\n"); + printf(" -c, --cycles=\t\tDo report print cycles and exit, use 0 value for infinite (default)\n"); + printf(" -v, --view=<1|2|3|4|5>\tSet view type:1- basic info,2- extra info,3- full info,4- mc groups,5- similar to 'netstat -tunaep'\n"); + printf(" -d, --details=<1|2>\t\tSet details mode:1- to see totals,2- to see deltas\t\t\n"); + printf(" -z, --zero\t\t\tZero counters\n"); + printf(" -l, --log_level=\tSet VMA log level to (one of: none/panic/error/warn/info/details/debug/fine/finer/all)\n"); + printf(" -S, --fd_dump= []\tDump statistics for fd number using log level . 
use 0 value for all open fds.\n"); + printf(" -D, --details_level=\tSet VMA log details level to (0 <= level <= 3)\n"); + printf(" -s, --sockets=\tLog only sockets that match or , format: 4-16 or 1,9 (or combination)\n"); + printf(" -V, --version\t\t\tPrint version\n"); + printf(" -h, --help\t\t\tPrint this help message\n"); +} + +void update_delta_stat(socket_stats_t* p_curr_stat, socket_stats_t* p_prev_stat) +{ + int delay = INTERVAL; + p_prev_stat->counters.n_tx_sent_byte_count = (p_curr_stat->counters.n_tx_sent_byte_count - p_prev_stat->counters.n_tx_sent_byte_count) / delay; + p_prev_stat->counters.n_tx_sent_pkt_count = (p_curr_stat->counters.n_tx_sent_pkt_count - p_prev_stat->counters.n_tx_sent_pkt_count) / delay; + p_prev_stat->counters.n_tx_drops = (p_curr_stat->counters.n_tx_drops - p_prev_stat->counters.n_tx_drops) / delay; + p_prev_stat->counters.n_tx_errors = (p_curr_stat->counters.n_tx_errors - p_prev_stat->counters.n_tx_errors) / delay; + p_prev_stat->counters.n_tx_dummy = (p_curr_stat->counters.n_tx_dummy - p_prev_stat->counters.n_tx_dummy) / delay; + p_prev_stat->counters.n_tx_os_bytes = (p_curr_stat->counters.n_tx_os_bytes - p_prev_stat->counters.n_tx_os_bytes) / delay; + p_prev_stat->counters.n_tx_os_packets = (p_curr_stat->counters.n_tx_os_packets - p_prev_stat->counters.n_tx_os_packets) / delay; + p_prev_stat->counters.n_tx_os_eagain = (p_curr_stat->counters.n_tx_os_eagain - p_prev_stat->counters.n_tx_os_eagain) / delay; + p_prev_stat->counters.n_tx_os_errors = (p_curr_stat->counters.n_tx_os_errors - p_prev_stat->counters.n_tx_os_errors) / delay; + p_prev_stat->counters.n_rx_bytes = (p_curr_stat->counters.n_rx_bytes - p_prev_stat->counters.n_rx_bytes) / delay; + p_prev_stat->counters.n_rx_packets = (p_curr_stat->counters.n_rx_packets - p_prev_stat->counters.n_rx_packets) / delay; + p_prev_stat->counters.n_rx_eagain = (p_curr_stat->counters.n_rx_eagain - p_prev_stat->counters.n_rx_eagain) / delay; + p_prev_stat->counters.n_rx_errors = 
(p_curr_stat->counters.n_rx_errors - p_prev_stat->counters.n_rx_errors) / delay; + p_prev_stat->counters.n_rx_os_bytes = (p_curr_stat->counters.n_rx_os_bytes - p_prev_stat->counters.n_rx_os_bytes) / delay; + p_prev_stat->counters.n_rx_os_packets = (p_curr_stat->counters.n_rx_os_packets - p_prev_stat->counters.n_rx_os_packets) / delay; + p_prev_stat->counters.n_rx_os_eagain = (p_curr_stat->counters.n_rx_os_eagain - p_prev_stat->counters.n_rx_os_eagain) / delay; + p_prev_stat->counters.n_rx_os_errors = (p_curr_stat->counters.n_rx_os_errors - p_prev_stat->counters.n_rx_os_errors) / delay; + p_prev_stat->counters.n_rx_poll_miss = (p_curr_stat->counters.n_rx_poll_miss - p_prev_stat->counters.n_rx_poll_miss) / delay; + p_prev_stat->counters.n_rx_poll_hit = (p_curr_stat->counters.n_rx_poll_hit - p_prev_stat->counters.n_rx_poll_hit) / delay; + p_prev_stat->n_rx_ready_byte_count = p_curr_stat->n_rx_ready_byte_count; + p_prev_stat->n_tx_ready_byte_count = p_curr_stat->n_tx_ready_byte_count; + p_prev_stat->n_rx_ready_byte_limit = p_curr_stat->n_rx_ready_byte_limit; + p_prev_stat->counters.n_rx_ready_byte_max = p_curr_stat->counters.n_rx_ready_byte_max; + p_prev_stat->counters.n_rx_ready_byte_drop = (p_curr_stat->counters.n_rx_ready_byte_drop - p_prev_stat->counters.n_rx_ready_byte_drop) / delay; + p_prev_stat->counters.n_rx_ready_pkt_drop = (p_curr_stat->counters.n_rx_ready_pkt_drop - p_prev_stat->counters.n_rx_ready_pkt_drop) / delay; + p_prev_stat->n_rx_ready_pkt_count = p_curr_stat->n_rx_ready_pkt_count; + p_prev_stat->counters.n_rx_ready_pkt_max = p_curr_stat->counters.n_rx_ready_pkt_max; + p_prev_stat->n_rx_zcopy_pkt_count = p_curr_stat->n_rx_zcopy_pkt_count; + + p_prev_stat->threadid_last_rx = p_curr_stat->threadid_last_rx; + p_prev_stat->threadid_last_tx = p_curr_stat->threadid_last_tx; + + p_prev_stat->counters.n_rx_migrations = (p_curr_stat->counters.n_rx_migrations - p_prev_stat->counters.n_rx_migrations) / delay; + p_prev_stat->counters.n_tx_migrations = 
(p_curr_stat->counters.n_tx_migrations - p_prev_stat->counters.n_tx_migrations) / delay; + p_prev_stat->counters.n_tx_retransmits = (p_curr_stat->counters.n_tx_retransmits - p_prev_stat->counters.n_tx_retransmits) / delay; +} + +void update_delta_iomux_stat(iomux_func_stats_t* p_curr_stats, iomux_func_stats_t* p_prev_stats) +{ + int delay = INTERVAL; + if (p_curr_stats && p_prev_stats) { + p_prev_stats->n_iomux_errors = (p_curr_stats->n_iomux_errors - p_prev_stats->n_iomux_errors) / delay; + p_prev_stats->n_iomux_os_rx_ready = (p_curr_stats->n_iomux_os_rx_ready - p_prev_stats->n_iomux_os_rx_ready) / delay; + p_prev_stats->n_iomux_poll_hit = (p_curr_stats->n_iomux_poll_hit - p_prev_stats->n_iomux_poll_hit) / delay; + p_prev_stats->n_iomux_poll_miss = (p_curr_stats->n_iomux_poll_miss - p_prev_stats->n_iomux_poll_miss) / delay; + p_prev_stats->n_iomux_rx_ready = (p_curr_stats->n_iomux_rx_ready - p_prev_stats->n_iomux_rx_ready) / delay; + p_prev_stats->n_iomux_timeouts = (p_curr_stats->n_iomux_timeouts - p_prev_stats->n_iomux_timeouts) / delay; + p_prev_stats->threadid_last = p_curr_stats->threadid_last; + } +} + +void update_delta_ring_stat(ring_stats_t* p_curr_ring_stats, ring_stats_t* p_prev_ring_stats) +{ + int delay = INTERVAL; + if (p_curr_ring_stats && p_prev_ring_stats) { + p_prev_ring_stats->n_rx_byte_count = (p_curr_ring_stats->n_rx_byte_count - p_prev_ring_stats->n_rx_byte_count) / delay; + p_prev_ring_stats->n_rx_pkt_count = (p_curr_ring_stats->n_rx_pkt_count - p_prev_ring_stats->n_rx_pkt_count) / delay; + p_prev_ring_stats->n_tx_byte_count = (p_curr_ring_stats->n_tx_byte_count - p_prev_ring_stats->n_tx_byte_count) / delay; + p_prev_ring_stats->n_tx_pkt_count = (p_curr_ring_stats->n_tx_pkt_count - p_prev_ring_stats->n_tx_pkt_count) / delay; + p_prev_ring_stats->n_tx_retransmits = (p_curr_ring_stats->n_tx_retransmits - p_prev_ring_stats->n_tx_retransmits) / delay; + if (p_prev_ring_stats->n_type == RING_TAP) { + memcpy(p_prev_ring_stats->tap.s_tap_name, 
p_curr_ring_stats->tap.s_tap_name, sizeof(p_curr_ring_stats->tap.s_tap_name)); + p_prev_ring_stats->tap.n_tap_fd = p_curr_ring_stats->tap.n_tap_fd; + p_prev_ring_stats->tap.n_rx_buffers = p_curr_ring_stats->tap.n_rx_buffers; + p_prev_ring_stats->tap.n_vf_plugouts = (p_curr_ring_stats->tap.n_vf_plugouts - p_prev_ring_stats->tap.n_vf_plugouts); + } else { + p_prev_ring_stats->simple.n_rx_interrupt_received = (p_curr_ring_stats->simple.n_rx_interrupt_received - p_prev_ring_stats->simple.n_rx_interrupt_received) / delay; + p_prev_ring_stats->simple.n_rx_interrupt_requests = (p_curr_ring_stats->simple.n_rx_interrupt_requests - p_prev_ring_stats->simple.n_rx_interrupt_requests) / delay; + p_prev_ring_stats->simple.n_rx_cq_moderation_count = p_curr_ring_stats->simple.n_rx_cq_moderation_count; + p_prev_ring_stats->simple.n_rx_cq_moderation_period = p_curr_ring_stats->simple.n_rx_cq_moderation_period; + p_prev_ring_stats->simple.n_tx_dev_mem_allocated = p_curr_ring_stats->simple.n_tx_dev_mem_allocated; + p_prev_ring_stats->simple.n_tx_dev_mem_byte_count = (p_curr_ring_stats->simple.n_tx_dev_mem_byte_count - p_prev_ring_stats->simple.n_tx_dev_mem_byte_count) / delay; + p_prev_ring_stats->simple.n_tx_dev_mem_pkt_count = (p_curr_ring_stats->simple.n_tx_dev_mem_pkt_count - p_prev_ring_stats->simple.n_tx_dev_mem_pkt_count) / delay; + p_prev_ring_stats->simple.n_tx_dev_mem_oob = (p_curr_ring_stats->simple.n_tx_dev_mem_oob - p_prev_ring_stats->simple.n_tx_dev_mem_oob) / delay; + } + } +} + +void update_delta_cq_stat(cq_stats_t* p_curr_cq_stats, cq_stats_t* p_prev_cq_stats) +{ + int delay = INTERVAL; + if (p_curr_cq_stats && p_prev_cq_stats) { + p_prev_cq_stats->n_rx_drained_at_once_max = p_curr_cq_stats->n_rx_drained_at_once_max; + p_prev_cq_stats->n_rx_pkt_drop = (p_curr_cq_stats->n_rx_pkt_drop - p_prev_cq_stats->n_rx_pkt_drop) / delay; + p_prev_cq_stats->n_rx_sw_queue_len = p_curr_cq_stats->n_rx_sw_queue_len; + p_prev_cq_stats->n_buffer_pool_len = 
p_curr_cq_stats->n_buffer_pool_len; + } +} + +void update_delta_bpool_stat(bpool_stats_t* p_curr_bpool_stats, bpool_stats_t* p_prev_bpool_stats) +{ + int delay = INTERVAL; + if (p_curr_bpool_stats && p_prev_bpool_stats) { + p_prev_bpool_stats->n_buffer_pool_size = p_curr_bpool_stats->n_buffer_pool_size; + p_prev_bpool_stats->n_buffer_pool_no_bufs = (p_curr_bpool_stats->n_buffer_pool_no_bufs - p_prev_bpool_stats->n_buffer_pool_no_bufs) / delay; + } +} + +void print_ring_stats(ring_instance_block_t* p_ring_inst_arr) +{ + ring_stats_t* p_ring_stats = NULL; + char post_fix[3] = ""; + + if (user_params.print_details_mode == e_deltas) + strcpy(post_fix, "/s"); + + for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { + if (p_ring_inst_arr[i].b_enabled) { + p_ring_stats = &p_ring_inst_arr[i].ring_stats; + printf("======================================================\n"); + + printf("\t%s=[%u]\n", ring_type_str[p_ring_stats->n_type], i); + + if (p_ring_stats->p_ring_master) { + printf(FORMAT_RING_MASTER, "Master:", p_ring_stats->p_ring_master); + } + + printf(FORMAT_RING_PACKETS, "Tx Offload:", p_ring_stats->n_tx_byte_count/BYTES_TRAFFIC_UNIT, p_ring_stats->n_tx_pkt_count, post_fix); + printf(FORMAT_RING_PACKETS, "Rx Offload:", p_ring_stats->n_rx_byte_count/BYTES_TRAFFIC_UNIT, p_ring_stats->n_rx_pkt_count, post_fix); + + if (p_ring_stats->n_tx_retransmits) { + printf(FORMAT_STATS_64bit, "Retransmissions:", (unsigned long long int)p_ring_stats->n_tx_retransmits, post_fix); + } + + if (p_ring_stats->n_type == RING_TAP) { + printf(FORMAT_STATS_32bit, "Rx Buffers:", p_ring_stats->tap.n_rx_buffers); + if (p_ring_stats->tap.n_vf_plugouts) { + printf(FORMAT_STATS_32bit, "VF Plugouts:", p_ring_stats->tap.n_vf_plugouts); + } + printf(FORMAT_STATS_32bit, "Tap fd:", p_ring_stats->tap.n_tap_fd); + printf(FORMAT_RING_TAP_NAME, "Tap Device:", p_ring_stats->tap.s_tap_name); + } else { + if (p_ring_stats->simple.n_rx_interrupt_requests || p_ring_stats->simple.n_rx_interrupt_received) { + 
printf(FORMAT_RING_INTERRUPT, "Interrupts:", p_ring_stats->simple.n_rx_interrupt_requests, p_ring_stats->simple.n_rx_interrupt_received, post_fix); + } + if (p_ring_stats->simple.n_rx_cq_moderation_count || p_ring_stats->simple.n_rx_cq_moderation_period) { + printf(FORMAT_RING_MODERATION, "Moderation:", p_ring_stats->simple.n_rx_cq_moderation_count, p_ring_stats->simple.n_rx_cq_moderation_period, post_fix); + } + if (p_ring_stats->simple.n_tx_dev_mem_allocated) { + printf(FORMAT_STATS_32bit, "Dev Mem Alloc:", p_ring_stats->simple.n_tx_dev_mem_allocated); + printf(FORMAT_RING_DM_STATS, "Dev Mem Stats:", p_ring_stats->simple.n_tx_dev_mem_byte_count/BYTES_TRAFFIC_UNIT, p_ring_stats->simple.n_tx_dev_mem_pkt_count, p_ring_stats->simple.n_tx_dev_mem_oob, post_fix); + } + } + } + } + printf("======================================================\n"); +} + +void print_cq_stats(cq_instance_block_t* p_cq_inst_arr) +{ + cq_stats_t* p_cq_stats = NULL; + char post_fix[3] = ""; + + if (user_params.print_details_mode == e_deltas) + strcpy(post_fix, "/s"); + + for (int i = 0; i < NUM_OF_SUPPORTED_CQS; i++) { + if (p_cq_inst_arr[i].b_enabled) { + p_cq_stats = &p_cq_inst_arr[i].cq_stats; + printf("======================================================\n"); + printf("\tCQ=[%u]\n", i); + printf(FORMAT_STATS_64bit, "Packets dropped:", (unsigned long long int)p_cq_stats->n_rx_pkt_drop, post_fix); + printf(FORMAT_STATS_32bit, "Packets queue len:",p_cq_stats->n_rx_sw_queue_len); + printf(FORMAT_STATS_32bit, "Drained max:", p_cq_stats->n_rx_drained_at_once_max); + printf(FORMAT_STATS_32bit, "Buffer pool size:",p_cq_stats->n_buffer_pool_len); + } + } + printf("======================================================\n"); +} + +void print_bpool_stats(bpool_instance_block_t* p_bpool_inst_arr) +{ + bpool_stats_t* p_bpool_stats = NULL; + char post_fix[3] = ""; + + if (user_params.print_details_mode == e_deltas) + strcpy(post_fix, "/s"); + + for (int i = 0; i < NUM_OF_SUPPORTED_BPOOLS; i++) { + if 
(p_bpool_inst_arr && p_bpool_inst_arr[i].b_enabled) { + p_bpool_stats = &p_bpool_inst_arr[i].bpool_stats; + printf("======================================================\n"); + if (p_bpool_stats->is_rx) + printf("\tBUFFER_POOL(RX)=[%u]\n", i); + else if (p_bpool_stats->is_tx) + printf("\tBUFFER_POOL(TX)=[%u]\n", i); + else + printf("\tBUFFER_POOL=[%u]\n", i); + printf(FORMAT_STATS_32bit, "Size:", p_bpool_stats->n_buffer_pool_size); + printf(FORMAT_STATS_32bit, "No buffers error:", p_bpool_stats->n_buffer_pool_no_bufs); + } + } + printf("======================================================\n"); +} + +void print_basic_stats(socket_stats_t* p_stats) +{ + // + // Socket statistics + // + double rx_poll_hit_percentage = 0; + + if (p_stats->counters.n_rx_poll_hit) { + double rx_poll_hit = (double)p_stats->counters.n_rx_poll_hit; + rx_poll_hit_percentage = (rx_poll_hit / (rx_poll_hit + (double)p_stats->counters.n_rx_poll_miss)) * 100; + } + printf(RX_SHORT_VIEW,p_stats->fd,"Rx:",p_stats->counters.n_rx_packets, + p_stats->counters.n_rx_bytes/BYTES_TRAFFIC_UNIT,p_stats->counters.n_rx_eagain, + p_stats->counters.n_rx_errors,rx_poll_hit_percentage, + p_stats->counters.n_rx_os_packets,p_stats->counters.n_rx_os_bytes / BYTES_TRAFFIC_UNIT, + p_stats->counters.n_rx_os_eagain,p_stats->counters.n_rx_os_errors); + + printf(TX_SHORT_VIEW," ", "Tx:",p_stats->counters.n_tx_sent_pkt_count, + p_stats->counters.n_tx_sent_byte_count/BYTES_TRAFFIC_UNIT,p_stats->counters.n_tx_drops, + p_stats->counters.n_tx_errors," ", + p_stats->counters.n_tx_os_packets,p_stats->counters.n_tx_os_bytes / BYTES_TRAFFIC_UNIT, + p_stats->counters.n_tx_os_eagain,p_stats->counters.n_tx_os_errors); + +} + +void print_medium_total_stats(socket_stats_t* p_stats) +{ + // + // Socket statistics + // + double rx_poll_hit_percentage = 0; + + if (p_stats->counters.n_rx_poll_hit) { + double rx_poll_hit = (double)p_stats->counters.n_rx_poll_hit; + rx_poll_hit_percentage = (rx_poll_hit / (rx_poll_hit + 
(double)p_stats->counters.n_rx_poll_miss)) * 100; + } + printf(RX_MEDIUM_VIEW,p_stats->fd,"Rx:",p_stats->counters.n_rx_packets, + p_stats->counters.n_rx_bytes/BYTES_TRAFFIC_UNIT,p_stats->counters.n_rx_eagain, + p_stats->counters.n_rx_errors,rx_poll_hit_percentage, + p_stats->n_rx_ready_pkt_count, p_stats->counters.n_rx_ready_pkt_max, + p_stats->counters.n_rx_ready_pkt_drop,p_stats->counters.n_rx_os_packets,p_stats->counters.n_rx_os_bytes / BYTES_TRAFFIC_UNIT, + p_stats->counters.n_rx_os_eagain,p_stats->counters.n_rx_os_errors); + + printf(TX_MEDIUM_VIEW," ", "Tx:",p_stats->counters.n_tx_sent_pkt_count, + p_stats->counters.n_tx_sent_byte_count/BYTES_TRAFFIC_UNIT,p_stats->counters.n_tx_drops, + p_stats->counters.n_tx_errors," ", + p_stats->counters.n_tx_os_packets,p_stats->counters.n_tx_os_bytes / BYTES_TRAFFIC_UNIT, + p_stats->counters.n_tx_os_eagain,p_stats->counters.n_tx_os_errors); +} + +void print_basic_delta_stats(socket_stats_t* p_curr_stat, socket_stats_t* p_prev_stat) +{ + update_delta_stat(p_curr_stat, p_prev_stat); + print_basic_stats(p_prev_stat); +} + +void print_medium_delta_stats(socket_stats_t* p_curr_stat, socket_stats_t* p_prev_stat) +{ + update_delta_stat(p_curr_stat, p_prev_stat); + print_medium_total_stats(p_prev_stat); +} + +void print_full_delta_stats(socket_stats_t* p_curr_stat, socket_stats_t* p_prev_stat, mc_grp_info_t* p_mc_grp_info) +{ + update_delta_stat(p_curr_stat, p_prev_stat); + print_full_stats(p_prev_stat, p_mc_grp_info, g_stats_file); +} + +void print_basic_mode_headers() +{ + switch (user_params.print_details_mode) { + case e_totals: + printf(UPPER_SHORT_VIEW_HEADER,"fd","------------ total offloaded -------------","--------- total os ----------"); + printf(LOWER_SHORT_VIEW_HEADER," ","pkt","Kbyte","eagain","error","poll%","pkt","Kbyte","eagain","error"); + break; + case e_deltas: + printf(UPPER_SHORT_VIEW_HEADER,"fd","--------------- offloaded ----------------","---------- os ---------"); + printf(LOWER_SHORT_VIEW_HEADER," 
","pkt/s","Kbyte/s","eagain/s","error/s","poll%","pkt/s","Kbyte/s","eagain/s","error/s"); + break; + default: + break; + } +} + +void print_medium_mode_headers() +{ + switch (user_params.print_details_mode) { + case e_totals: + printf(UPPER_MEDIUM_VIEW_HEADER,"fd", "----------------------- total offloaded -------------------------", "--------- total os ----------"); + printf(MIDDLE_MEDIUM_VIEW_HEADER," ","pkt","Kbyte","eagain","error","poll%","---- queue pkt -----", "pkt", "Kbyte","eagain", "error"); + printf(LOWER_MEDIUM_VIEW_HEADER," ", "cur","max","drop"); + break; + case e_deltas: + printf(UPPER_MEDIUM_VIEW_HEADER,"fd", "---------------------------- offloaded --------------------------", "---------- os ---------"); + printf(MIDDLE_MEDIUM_VIEW_HEADER," ","pkt/s","Kbyte/s","eagain/s","error/s","poll%","----- queue pkt ------", "pkt/s", "Kbyte/s", "eagain/s", "error/s"); + printf(LOWER_MEDIUM_VIEW_HEADER," ", "cur","max","drop/s"); + break; + default: + break; + } +} + +void print_headers() +{ + switch (user_params.view_mode) { + case e_basic: + print_basic_mode_headers(); + break; + case e_medium: + print_medium_mode_headers(); + break; + case e_netstat_like: + print_netstat_like_headers(g_stats_file); + break; + default: + break; + } +} + +void show_basic_stats(socket_instance_block_t* p_instance,socket_instance_block_t* p_prev_instance_block) +{ + switch (user_params.print_details_mode) { + case e_totals: + print_basic_stats(&p_instance->skt_stats); + break; + case e_deltas: + print_basic_delta_stats(&p_instance->skt_stats, &p_prev_instance_block->skt_stats); + break; + default: + break; + } +} + +void print_medium_stats(socket_instance_block_t* p_instance, socket_instance_block_t* p_prev_instance_block) +{ + switch (user_params.print_details_mode) { + case e_totals: + print_medium_total_stats(&p_instance->skt_stats); + break; + case e_deltas: + print_medium_delta_stats(&p_instance->skt_stats, &p_prev_instance_block->skt_stats); + break; + default: + break; + } 
+} + +void show_full_stats(socket_instance_block_t* p_instance, socket_instance_block_t* p_prev_instance_block, mc_grp_info_t* p_mc_grp_info) +{ + switch (user_params.print_details_mode) { + case e_totals: + print_full_stats(&p_instance->skt_stats, p_mc_grp_info, g_stats_file); + break; + case e_deltas: + print_full_delta_stats(&p_instance->skt_stats, &p_prev_instance_block->skt_stats, p_mc_grp_info); + break; + default: + break; + } +} + +int show_socket_stats(socket_instance_block_t* p_instance, socket_instance_block_t* p_prev_instance_block,uint32_t num_of_obj, int* p_printed_lines_num, mc_grp_info_t* p_mc_grp_info, int pid) +{ + int num_act_inst = 0; + + if (*p_printed_lines_num >= SCREEN_SIZE && user_params.view_mode != e_full) { + print_headers(); + switch (user_params.view_mode) { + case e_basic: + *p_printed_lines_num = BASE_HEADERS_NUM; + break; + case e_medium: + *p_printed_lines_num = MEDIUM_HEADERS_NUM; + break; + default: + break; + } + } + + for (uint32_t i=0; i < num_of_obj; i++) { + size_t fd = (size_t)p_instance[i].skt_stats.fd; + if (p_instance[i].b_enabled && g_fd_mask[fd]) { + num_act_inst++; + switch (user_params.view_mode) { + case e_basic: + show_basic_stats(&p_instance[i], &p_prev_instance_block[i]); + *p_printed_lines_num += BASIC_STATS_LINES_NUM; + break; + case e_medium: + print_medium_stats(&p_instance[i], &p_prev_instance_block[i]); + *p_printed_lines_num += MEDIUM_STATS_LINES_NUM; + break; + case e_full: + show_full_stats(&p_instance[i], &p_prev_instance_block[i], p_mc_grp_info); + break; + case e_netstat_like: + print_netstat_like(&p_instance[i].skt_stats, p_mc_grp_info, g_stats_file, pid); + break; + default: + break; + } + } + } + return num_act_inst; +} + +// Print statistics for select(), poll(), epoll() +void print_full_iomux_stats(const char* func_name, iomux_func_stats_t* p_iomux_stats) +{ + char post_fix[3] = ""; + + if (user_params.print_details_mode == e_deltas) + strcpy(post_fix, "/s"); + + if (p_iomux_stats && 
(p_iomux_stats->n_iomux_os_rx_ready || p_iomux_stats->n_iomux_rx_ready || + p_iomux_stats->n_iomux_timeouts || p_iomux_stats->n_iomux_errors || + p_iomux_stats->n_iomux_poll_miss || p_iomux_stats->n_iomux_poll_hit)) { + + printf("======================================================\n"); + printf("\t%s\n", func_name); + printf("Polling CPU%s:%d%%\n", post_fix, p_iomux_stats->n_iomux_polling_time); + if (p_iomux_stats->threadid_last != 0) + printf("- Thread Id: %5u\n", p_iomux_stats->threadid_last); + if (p_iomux_stats->n_iomux_os_rx_ready || p_iomux_stats->n_iomux_rx_ready) + printf("Rx fds ready: %u / %u [os/offload]%s\n", p_iomux_stats->n_iomux_os_rx_ready, p_iomux_stats->n_iomux_rx_ready, post_fix); + if (p_iomux_stats->n_iomux_poll_miss + p_iomux_stats->n_iomux_poll_hit) { + double iomux_poll_hit = (double)p_iomux_stats->n_iomux_poll_hit; + double iomux_poll_hit_percentage = (iomux_poll_hit / (iomux_poll_hit + (double)p_iomux_stats->n_iomux_poll_miss)) * 100; + printf("Polls [miss/hit]%s: %u / %u (%2.2f%%)\n", post_fix,p_iomux_stats->n_iomux_poll_miss, p_iomux_stats->n_iomux_poll_hit, iomux_poll_hit_percentage); + if (p_iomux_stats->n_iomux_timeouts) + printf("Timeouts%s: %u\n",post_fix, p_iomux_stats->n_iomux_timeouts); + if (p_iomux_stats->n_iomux_errors) + printf("Errors%s: %u\n", post_fix, p_iomux_stats->n_iomux_errors); + printf("======================================================\n"); + } + } +} + +void print_basic_iomux_stats(const char* func_name, iomux_func_stats_t* p_iomux_stats, int* p_printed_lines_num) +{ + double rx_poll_hit_percentage = 0; + char post_fix[3] = ""; + + if (user_params.print_details_mode == e_deltas) + strcpy(post_fix, "/s"); + + if (p_iomux_stats->n_iomux_poll_hit) { + double iomux_poll_hit = (double)p_iomux_stats->n_iomux_poll_hit; + rx_poll_hit_percentage = (iomux_poll_hit / (iomux_poll_hit + (double)p_iomux_stats->n_iomux_poll_miss)) * 100; + } + + if (p_iomux_stats->n_iomux_os_rx_ready || p_iomux_stats->n_iomux_rx_ready || 
+ p_iomux_stats->n_iomux_timeouts || p_iomux_stats->n_iomux_errors || + p_iomux_stats->n_iomux_poll_miss || p_iomux_stats->n_iomux_poll_hit) { + printf(IOMUX_FORMAT,func_name, post_fix,"Rx Ready:", p_iomux_stats->n_iomux_os_rx_ready, + "/", p_iomux_stats->n_iomux_rx_ready, + "[os/offload]", "Timeouts:", p_iomux_stats->n_iomux_timeouts, + "Errors:", p_iomux_stats->n_iomux_errors, + "Poll:", rx_poll_hit_percentage, "%", + "Polling CPU:", p_iomux_stats->n_iomux_polling_time, "%"); + (*p_printed_lines_num)++; + } +} + +void print_iomux_totals(iomux_stats_t* p_iomux_stats, int* p_printed_lines_num) +{ + if (p_printed_lines_num) { + print_basic_iomux_stats("poll", &p_iomux_stats->poll, p_printed_lines_num); + print_basic_iomux_stats("select", &p_iomux_stats->select, p_printed_lines_num); + } else { + print_full_iomux_stats("poll", &p_iomux_stats->poll); + print_full_iomux_stats("select", &p_iomux_stats->select); + } + for (int i = 0; i < NUM_OF_SUPPORTED_EPFDS; i++) { + epoll_stats_t *ep_stats = &p_iomux_stats->epoll[i]; + if (ep_stats->enabled) { + char epfd_name[20]; + snprintf(epfd_name, sizeof(epfd_name), "epoll[%d]", ep_stats->epfd); + if (p_printed_lines_num) { + print_basic_iomux_stats(epfd_name, &ep_stats->stats, p_printed_lines_num); + } else { + print_full_iomux_stats(epfd_name, &ep_stats->stats); + } + } + } +} + +void update_iomux_deltas(iomux_stats_t* p_curr_iomux_stats, iomux_stats_t* p_prev_iomux_stats) +{ + update_delta_iomux_stat(&p_curr_iomux_stats->poll, &p_prev_iomux_stats->poll); + update_delta_iomux_stat(&p_curr_iomux_stats->select, &p_prev_iomux_stats->select); + for (int i = 0; i < NUM_OF_SUPPORTED_EPFDS; i++) { + if (p_curr_iomux_stats->epoll[i].enabled && p_prev_iomux_stats->epoll[i].enabled) { + update_delta_iomux_stat(&p_curr_iomux_stats->epoll[i].stats, + &p_prev_iomux_stats->epoll[i].stats); + } + } +} + +void print_full_iomux_deltas(iomux_stats_t* p_curr_iomux_stats, iomux_stats_t* p_prev_iomux_stats) +{ + 
update_iomux_deltas(p_curr_iomux_stats, p_prev_iomux_stats); + print_iomux_totals(p_prev_iomux_stats, NULL); +} + +void print_basic_iomux_deltas(iomux_stats_t* p_curr_stats, iomux_stats_t* p_prev_stats, int* p_printed_lines_num) +{ + update_iomux_deltas(p_curr_stats, p_prev_stats); + print_iomux_totals(p_prev_stats, p_printed_lines_num); +} + +void print_full_iomux_stats(iomux_stats_t* p_curr_stats, iomux_stats_t* p_prev_stats) +{ + switch (user_params.print_details_mode) { + case e_totals: + print_iomux_totals(p_curr_stats, NULL); + break; + default: + print_full_iomux_deltas(p_curr_stats, p_prev_stats); + break; + } +} + +void print_ring_deltas(ring_instance_block_t* p_curr_ring_stats, ring_instance_block_t* p_prev_ring_stats) +{ + for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { + update_delta_ring_stat(&p_curr_ring_stats[i].ring_stats,&p_prev_ring_stats[i].ring_stats); + } + print_ring_stats(p_prev_ring_stats); +} + +void print_cq_deltas(cq_instance_block_t* p_curr_cq_stats, cq_instance_block_t* p_prev_cq_stats) +{ + for (int i = 0; i < NUM_OF_SUPPORTED_CQS; i++) { + update_delta_cq_stat(&p_curr_cq_stats[i].cq_stats,&p_prev_cq_stats[i].cq_stats); + } + print_cq_stats(p_prev_cq_stats); +} + +void print_bpool_deltas(bpool_instance_block_t* p_curr_bpool_stats, bpool_instance_block_t* p_prev_bpool_stats) +{ + for (int i = 0; i < NUM_OF_SUPPORTED_BPOOLS; i++) { + update_delta_bpool_stat(&p_curr_bpool_stats[i].bpool_stats,&p_prev_bpool_stats[i].bpool_stats); + } + print_bpool_stats(p_prev_bpool_stats); +} + +void show_ring_stats(ring_instance_block_t* p_curr_ring_blocks, ring_instance_block_t* p_prev_ring_blocks) +{ + switch (user_params.print_details_mode) { + case e_totals: + print_ring_stats(p_curr_ring_blocks); + break; + default: + print_ring_deltas(p_curr_ring_blocks, p_prev_ring_blocks); + break; + } +} + +void show_cq_stats(cq_instance_block_t* p_curr_cq_blocks, cq_instance_block_t* p_prev_cq_blocks) +{ + switch (user_params.print_details_mode) { + case 
e_totals: + print_cq_stats(p_curr_cq_blocks); + break; + default: + print_cq_deltas(p_curr_cq_blocks, p_prev_cq_blocks); + break; + } +} + +void show_bpool_stats(bpool_instance_block_t* p_curr_bpool_blocks, bpool_instance_block_t* p_prev_bpool_blocks) +{ + switch (user_params.print_details_mode) { + case e_totals: + print_bpool_stats(p_curr_bpool_blocks); + break; + default: + print_bpool_deltas(p_curr_bpool_blocks, p_prev_bpool_blocks); + break; + } +} + +void show_basic_iomux_stats(iomux_stats_t* p_curr_stats, iomux_stats_t* p_prev_stats, int* p_printed_lines_num) +{ + switch (user_params.print_details_mode) { + case e_totals: + print_iomux_totals(p_curr_stats, p_printed_lines_num); + break; + default: + print_basic_iomux_deltas(p_curr_stats, p_prev_stats, p_printed_lines_num); + break; + } +} + +void show_iomux_stats(iomux_stats_t* p_curr_stats, iomux_stats_t* p_prev_stats, int* p_printed_lines_num) +{ + switch (user_params.view_mode) { + case e_basic: + case e_medium: + show_basic_iomux_stats(p_curr_stats, p_prev_stats, p_printed_lines_num); + break; + case e_full: + print_full_iomux_stats(p_curr_stats, p_prev_stats); + break; + default: + break; + } +} + +// Find mc_grp in mc_group_fds array. +// if exist: add the fd to the list. 
+// if not: add the mc group to the array and the fd to the list +void add_fd_to_array(int fd, in_addr_t mc_grp, mc_group_fds_t * mc_group_fds, int * array_size) +{ + // Go over the mc_group_fds array + int i=0; + for (i=0; i < *array_size; i++) { + if (mc_grp == mc_group_fds[i].mc_grp) { + //add fd to the list + mc_group_fds[i].fd_list.push_back(fd); + return; + } + } + // the mc_group wasnt found + // Add this mc group to the array + mc_group_fds[i].mc_grp=mc_grp; + int fd1=fd; + mc_group_fds[i].fd_list.push_back(fd1); + (*array_size)++; +} + +void print_mc_group_fds(mc_group_fds_t * mc_group_fds, int array_size) +{ + printf("\n"); + printf("VMA Group Memberships Information\n"); + printf("Group fd number\n"); + printf("------------------------------\n"); + for (int i=0; i< array_size; i++) { + char mcg_str[256]; + /* cppcheck-suppress wrongPrintfScanfArgNum */ + sprintf(mcg_str, "[%d.%d.%d.%d]", NIPQUAD(mc_group_fds[i].mc_grp)); + printf("%-22s", mcg_str); + for (fd_list_t::iterator iter = mc_group_fds[i].fd_list.begin(); iter != mc_group_fds[i].fd_list.end(); iter++) { + printf("%d ", *iter); + } + printf("\n"); + } +} + +void show_mc_group_stats(mc_grp_info_t* p_mc_grp_info , socket_instance_block_t* p_instance, uint32_t num_of_obj) +{ + // keep array for all the mc addresses and their fds. 
+ int array_size=0; + mc_group_fds_t *mc_group_fds = new mc_group_fds_t[num_of_obj*MC_TABLE_SIZE]; + if (!mc_group_fds) { + printf(CYCLES_SEPARATOR); + printf("Could not allocate enough memory\n"); + printf(CYCLES_SEPARATOR); + return; + } + // go over all the fds and fill the array + for (uint32_t i=0; i < num_of_obj; i++) { + size_t fd = (size_t)p_instance[i].skt_stats.fd; + if (p_instance[i].b_enabled && g_fd_mask[fd]) { + socket_stats_t* p_si_stats = &p_instance[i].skt_stats; + for (int grp_idx = 0; grp_idx < p_mc_grp_info->max_grp_num; grp_idx++) { + if (p_si_stats->mc_grp_map.test(grp_idx)) { + //printf("fd %d Member of = [%d.%d.%d.%d]\n",p_si_stats->fd, NIPQUAD(p_si_stats->mc_grp[grp_idx])); + add_fd_to_array(p_si_stats->fd, p_mc_grp_info->mc_grp_tbl[grp_idx].mc_grp, mc_group_fds, &array_size); + } + } + } + } + if (array_size > 0) + print_mc_group_fds(mc_group_fds, array_size); + printf(CYCLES_SEPARATOR); + + delete [] mc_group_fds; +} + +int print_app_name(int pid) +{ + char app_base_name[FILE_NAME_MAX_SIZE]; + + if (get_procname(pid, app_base_name, sizeof(app_base_name)) < 0) { + return -1; + } + printf("application: %s ", app_base_name); + + return 0; +} + +void print_version(int pid) +{ + if (pid == -1) { + log_msg("Linked with VMA version: %d.%d.%d.%d", VMA_LIBRARY_MAJOR, VMA_LIBRARY_MINOR, VMA_LIBRARY_REVISION, VMA_LIBRARY_RELEASE); + #ifdef VMA_SVN_REVISION + log_msg("Revision: %d", VMA_SVN_REVISION); + #endif + #ifdef VMA_DATE_TIME + log_msg("Build Date: %s", VMA_DATE_TIME); + #endif + } + else { + printf(MODULE_NAME ": stats for "); + if (print_app_name(pid) < 0) + printf("proccess "); + printf("with pid: %d\n", pid); + } +} + +int check_vma_ver_compatability(version_info_t* p_stat_ver_info) +{ + return (p_stat_ver_info->vma_lib_maj == VMA_LIBRARY_MAJOR && + p_stat_ver_info->vma_lib_min == VMA_LIBRARY_MINOR && + p_stat_ver_info->vma_lib_rel == VMA_LIBRARY_RELEASE && + p_stat_ver_info->vma_lib_rev == VMA_LIBRARY_REVISION); +} + +void 
cleanup(sh_mem_info* p_sh_mem_info) +{ + if (p_sh_mem_info == NULL) + return; + if (p_sh_mem_info->p_sh_stats != MAP_FAILED) + { + if (munmap(p_sh_mem_info->p_sh_stats, p_sh_mem_info->shmem_size) != 0) { + log_system_err("file='%s' sh_mem_info.fd_sh_stats=%d; error while munmap shared memory at [%p]\n", p_sh_mem_info->filename_sh_stats, p_sh_mem_info->fd_sh_stats, p_sh_mem_info->p_sh_stats); + } + } + close(p_sh_mem_info->fd_sh_stats); +} + +void stats_reader_sig_handler(int signum) +{ + switch (signum) { + case SIGINT: + log_msg("Got Ctrl-C (interrupted by user)"); + break; + default: + log_msg("Got signal %d - exiting", signum); + break; + } + g_b_exit = true; +} + +void set_signal_action() +{ + g_sigact.sa_handler = stats_reader_sig_handler; + sigemptyset(&g_sigact.sa_mask); + g_sigact.sa_flags = 0; + + sigaction(SIGINT, &g_sigact, NULL); +} + +void alloc_fd_mask() +{ + struct rlimit rlim; + if ((getrlimit(RLIMIT_NOFILE, &rlim) == 0) && ((uint32_t)rlim.rlim_max > g_fd_map_size)) + g_fd_map_size = rlim.rlim_max; + g_fd_mask = (uint8_t*)malloc(g_fd_map_size * sizeof(uint8_t)); + if (!g_fd_mask) + log_err("Failed to malloc g_fd_mask var\n"); +} + +void inc_read_counter(sh_mem_t* p_sh_mem) +{ + p_sh_mem->reader_counter++; +} + +void set_defaults() +{ + user_params.interval = DEFAULT_DELAY_SEC; + user_params.view_mode = DEFAULT_VIEW_MODE; + user_params.print_details_mode = DEFAULT_DETAILS_MODE; + user_params.proc_ident_mode = DEFAULT_PROC_IDENT_MODE; + user_params.vma_log_level = VLOG_INIT; + user_params.vma_details_level = INIT_VMA_LOG_DETAILS; + user_params.forbid_cleaning = false; + user_params.zero_counters = false; + user_params.write_auth = true; //needed to set read flag on + user_params.cycles = DEFAULT_CYCLES; + user_params.fd_dump = STATS_FD_STATISTICS_DISABLED; + user_params.fd_dump_log_level = STATS_FD_STATISTICS_LOG_LEVEL_DEFAULT; + user_params.vma_stats_path = MCE_DEFAULT_STATS_SHMEM_DIR; + + alloc_fd_mask(); + if (g_fd_mask) + memset((void*)g_fd_mask, 
1, sizeof(uint8_t) * g_fd_map_size); +} + +bool check_if_process_running(char* pid_str) +{ + char proccess_proc_dir[FILE_NAME_MAX_SIZE] = {0}; + struct stat st; + int n = -1; + + n = snprintf(proccess_proc_dir, sizeof(proccess_proc_dir), "/proc/%s", pid_str); + if (likely((0 < n) && (n < (int)sizeof(proccess_proc_dir)))) { + return stat(proccess_proc_dir, &st) == 0; + } + return false; +} + +bool check_if_process_running(int pid) +{ + char pid_str[MAX_BUFF_SIZE] = {0}; + int n = -1; + + n = snprintf(pid_str, sizeof(pid_str), "%d", pid); + if (likely((0 < n) && (n < (int)sizeof(pid_str)))) { + return check_if_process_running(pid_str); + } + return false; +} + +void stats_reader_handler(sh_mem_t* p_sh_mem, int pid) +{ + int ret; + int num_act_inst = 0; + int cycles = user_params.cycles ? user_params.cycles : -1; + int printed_line_num = SCREEN_SIZE; + struct timespec start, end; + bool proc_running = true; + socket_instance_block_t *prev_instance_blocks; + socket_instance_block_t *curr_instance_blocks; + cq_instance_block_t prev_cq_blocks[NUM_OF_SUPPORTED_CQS]; + cq_instance_block_t curr_cq_blocks[NUM_OF_SUPPORTED_CQS]; + ring_instance_block_t prev_ring_blocks[NUM_OF_SUPPORTED_RINGS]; + ring_instance_block_t curr_ring_blocks[NUM_OF_SUPPORTED_RINGS]; + bpool_instance_block_t prev_bpool_blocks[NUM_OF_SUPPORTED_BPOOLS]; + bpool_instance_block_t curr_bpool_blocks[NUM_OF_SUPPORTED_BPOOLS]; + iomux_stats_t prev_iomux_blocks; + iomux_stats_t curr_iomux_blocks; + + if (user_params.fd_dump != STATS_FD_STATISTICS_DISABLED) { + if (user_params.fd_dump) + log_msg("Dumping Fd %d to VMA using log level = %s...", user_params.fd_dump, log_level::to_str(user_params.fd_dump_log_level)); + else + log_msg("Dumping all Fd's to VMA using log level = %s...", log_level::to_str(user_params.fd_dump_log_level)); + return; + } + + prev_instance_blocks = (socket_instance_block_t*)malloc(sizeof(*prev_instance_blocks) * p_sh_mem->max_skt_inst_num); + if (NULL == prev_instance_blocks) { + return ; 
+ } + curr_instance_blocks = (socket_instance_block_t*)malloc(sizeof(*curr_instance_blocks) * p_sh_mem->max_skt_inst_num); + if (NULL == curr_instance_blocks) { + free(prev_instance_blocks); + return ; + } + + memset((void*)prev_instance_blocks,0, sizeof(socket_instance_block_t) * p_sh_mem->max_skt_inst_num); + memset((void*)curr_instance_blocks,0, sizeof(socket_instance_block_t) * p_sh_mem->max_skt_inst_num); + memset((void*)prev_cq_blocks,0, sizeof(cq_instance_block_t) * NUM_OF_SUPPORTED_CQS); + memset((void*)curr_cq_blocks,0, sizeof(cq_instance_block_t) * NUM_OF_SUPPORTED_CQS); + memset((void*)prev_ring_blocks,0, sizeof(ring_instance_block_t) * NUM_OF_SUPPORTED_RINGS); + memset((void*)curr_ring_blocks,0, sizeof(ring_instance_block_t) * NUM_OF_SUPPORTED_RINGS); + memset((void*)prev_bpool_blocks,0, sizeof(bpool_instance_block_t) * NUM_OF_SUPPORTED_BPOOLS); + memset((void*)curr_bpool_blocks,0, sizeof(bpool_instance_block_t) * NUM_OF_SUPPORTED_BPOOLS); + memset(&prev_iomux_blocks,0, sizeof(prev_iomux_blocks)); + memset(&curr_iomux_blocks,0, sizeof(curr_iomux_blocks)); + + if (user_params.print_details_mode == e_deltas) { + memcpy((void*)prev_instance_blocks,(void*)p_sh_mem->skt_inst_arr, p_sh_mem->max_skt_inst_num * sizeof(socket_instance_block_t)); + memcpy((void*)prev_cq_blocks,(void*)p_sh_mem->cq_inst_arr, NUM_OF_SUPPORTED_CQS * sizeof(cq_instance_block_t)); + memcpy((void*)prev_ring_blocks,(void*)p_sh_mem->ring_inst_arr, NUM_OF_SUPPORTED_RINGS * sizeof(ring_instance_block_t)); + memcpy((void*)prev_bpool_blocks,(void*)p_sh_mem->bpool_inst_arr, NUM_OF_SUPPORTED_BPOOLS * sizeof(bpool_instance_block_t)); + prev_iomux_blocks = curr_iomux_blocks; + uint64_t delay_int_micro = SEC_TO_MICRO(user_params.interval); + if (!g_b_exit && check_if_process_running(pid)){ + usleep(delay_int_micro); + } + } + + set_signal_action(); + + while (!g_b_exit && proc_running && cycles) + { + --cycles; + + if (gettime(&start)) { + log_system_err("gettime()"); + goto out; + } + + if 
(user_params.print_details_mode == e_deltas) { + memcpy((void*)curr_instance_blocks,(void*)p_sh_mem->skt_inst_arr, p_sh_mem->max_skt_inst_num * sizeof(socket_instance_block_t)); + memcpy((void*)curr_cq_blocks,(void*)p_sh_mem->cq_inst_arr, NUM_OF_SUPPORTED_CQS * sizeof(cq_instance_block_t)); + memcpy((void*)curr_ring_blocks,(void*)p_sh_mem->ring_inst_arr, NUM_OF_SUPPORTED_RINGS * sizeof(ring_instance_block_t)); + memcpy((void*)curr_bpool_blocks,(void*)p_sh_mem->bpool_inst_arr, NUM_OF_SUPPORTED_BPOOLS * sizeof(bpool_instance_block_t)); + curr_iomux_blocks = p_sh_mem->iomux; + } + switch (user_params.view_mode) { + case e_full: + ret = system("clear"); + NOT_IN_USE(ret); + break; + case e_mc_groups: + show_mc_group_stats(&p_sh_mem->mc_info, p_sh_mem->skt_inst_arr, p_sh_mem->max_skt_inst_num); + goto out; + break; + default: + break; + } + switch (user_params.print_details_mode) { + case e_totals: + num_act_inst = show_socket_stats(p_sh_mem->skt_inst_arr, NULL, p_sh_mem->max_skt_inst_num, &printed_line_num, &p_sh_mem->mc_info, pid); + show_iomux_stats(&p_sh_mem->iomux, NULL, &printed_line_num); + if (user_params.view_mode == e_full) { + show_cq_stats(p_sh_mem->cq_inst_arr,NULL); + show_ring_stats(p_sh_mem->ring_inst_arr,NULL); + show_bpool_stats(p_sh_mem->bpool_inst_arr,NULL); + } + break; + case e_deltas: + num_act_inst = show_socket_stats(curr_instance_blocks, prev_instance_blocks, p_sh_mem->max_skt_inst_num, &printed_line_num, &p_sh_mem->mc_info, pid); + show_iomux_stats(&curr_iomux_blocks, &prev_iomux_blocks, &printed_line_num); + if (user_params.view_mode == e_full) { + show_cq_stats(curr_cq_blocks, prev_cq_blocks); + show_ring_stats(curr_ring_blocks, prev_ring_blocks); + show_bpool_stats(curr_bpool_blocks, prev_bpool_blocks); + } + memcpy((void*)prev_instance_blocks,(void*)curr_instance_blocks, p_sh_mem->max_skt_inst_num * sizeof(socket_instance_block_t)); + memcpy((void*)prev_cq_blocks,(void*)curr_cq_blocks, NUM_OF_SUPPORTED_CQS * sizeof(cq_instance_block_t)); + 
memcpy((void*)prev_ring_blocks,(void*)curr_ring_blocks, NUM_OF_SUPPORTED_RINGS * sizeof(ring_instance_block_t)); + prev_iomux_blocks = curr_iomux_blocks; + break; + default: + break; + } + if (user_params.view_mode == e_netstat_like) + break; + if (num_act_inst) { + printf(CYCLES_SEPARATOR); + printed_line_num++; + } + if (gettime(&end)) { + log_system_err("gettime()"); + goto out; + } + uint64_t delay_int_micro = SEC_TO_MICRO(user_params.interval); + uint64_t adjasted_delay = delay_int_micro - TIME_DIFF_in_MICRO(start, end); + if (!g_b_exit && proc_running){ + if (cycles) { + usleep(adjasted_delay); + } + inc_read_counter(p_sh_mem); + } + proc_running = check_if_process_running(pid); + } + if (!proc_running) + log_msg("Proccess %d ended - exiting", pid); + +out: + free(prev_instance_blocks); + free(curr_instance_blocks); +} + +bool check_if_app_match(char* app_name, char* pid_str) +{ + char app_full_name[PATH_MAX] = {0}; + char proccess_proc_dir[FILE_NAME_MAX_SIZE] = {0}; + char* app_base_name = NULL; + int n = -1; + + n = snprintf(proccess_proc_dir, sizeof(proccess_proc_dir), "/proc/%s/exe", pid_str); + if (likely((0 < n) && (n < (int)sizeof(proccess_proc_dir)))) { + n = readlink(proccess_proc_dir, app_full_name, sizeof(app_full_name) - 1); + if (n > 0) { + app_full_name[n] = '\0'; + app_base_name = strrchr(app_full_name, '/'); + if (app_base_name) { + return strcmp((app_base_name + 1), app_name) == 0; + } + } + } + + return false; +} + +void clean_inactive_sh_ibj() +{ + DIR *dir; + struct dirent *dirent; + int module_name_size = strlen(MODULE_NAME); + int pid_offset = module_name_size + 1; + + dir = opendir(user_params.vma_stats_path.c_str()); + if (dir == NULL){ + log_system_err("opendir %s failed\n", user_params.vma_stats_path.c_str()); + return; + } + dirent = readdir(dir); + while (dirent != NULL && !user_params.forbid_cleaning) { + if(!strncmp("vmastat.", dirent->d_name, module_name_size)) { + bool proccess_running = false; + proccess_running = 
check_if_process_running(dirent->d_name + pid_offset); + if (!proccess_running) { + char to_delete[PATH_MAX + 1] = {0}; + int n = -1; + + n = snprintf(to_delete, sizeof(to_delete), "%s/%s", user_params.vma_stats_path.c_str(), dirent->d_name); + if (likely((0 < n) && (n < (int)sizeof(to_delete)))) { + unlink(to_delete); + } + } + } + dirent = readdir(dir); + } + closedir(dir); +} + +char* look_for_vma_stat_active_sh_obj(char* app_name) +{ + DIR *dir; + struct dirent *dirent; + bool found = false; + char* sh_file_name = NULL; + int module_name_size = strlen(MODULE_NAME); + int pid_offset = module_name_size + 1; + + dir = opendir(user_params.vma_stats_path.c_str()); + if (dir == NULL){ + log_system_err("opendir %s failed\n", user_params.vma_stats_path.c_str()); + return NULL; + } + dirent = readdir(dir); + + while (dirent != NULL && !found) { + if(!strncmp("vmastat.", dirent->d_name, module_name_size)) { + found = check_if_process_running(dirent->d_name + pid_offset); + if (app_name && found) + found = check_if_app_match(app_name, dirent->d_name + pid_offset); + if (found) { + sh_file_name = (char*)calloc(FILE_NAME_MAX_SIZE,sizeof(char)); + if (!sh_file_name) { + log_err("Failed to malloc sh_file_name var\n"); + closedir(dir); + return NULL; + } + strcpy(sh_file_name,dirent->d_name + pid_offset); + } + } + dirent = readdir(dir); + } + closedir(dir); + return sh_file_name; +} + +int update_range_of_fds(char* left_str, char* right_str) +{ + int left = 0; + int right = 0; + + errno = 0; + left = strtol(left_str, NULL, 0); + if (errno != 0 || left < 0 || (uint32_t)left > g_fd_map_size) { + log_err("Invalid fd val: %s", left_str); + return 1; + } + + if (right_str) { + right = strtol(right_str, NULL, 0); + if (errno != 0 || right < 0 || (uint32_t)right > g_fd_map_size) { + log_err("Invalid fd val: %s", right_str); + return 1; + } + } + else { + right = left; + } + + if ( right < left) { + swap(right, left); + } + + for (int i = left; i <= right; i++) + g_fd_mask[i] = 1; + 
+ return 0;
+}
+
+int analize_fds_range(char* range)
+{
+ char* left = range;
+ char* right = NULL;
+ char* delim_loc = NULL;
+ char range_copy[101];
+
+ if (strlen(range) + 1 > sizeof(range_copy)) {
+ log_err("Invalid fd val size : %zu, cannot exceed %zu", strlen(range), sizeof(range_copy) - 1);
+ return 1;
+ }
+
+ strncpy(range_copy, range, sizeof(range_copy) - 1);
+ range_copy[sizeof(range_copy) - 1] = '\0';
+ delim_loc = strchr(range_copy, '-');
+
+ if (delim_loc != NULL) {
+ right = delim_loc + 1;
+ *delim_loc = '\0';
+ left = range;
+ }
+ return update_range_of_fds(left, right);
+}
+
+int update_fds_mask(char* fds_list)
+{
+ memset((void*)g_fd_mask, 0 , sizeof(uint8_t) * g_fd_map_size);
+ char delims[] = ",";
+ char *curr_fds_range = NULL;
+ curr_fds_range = strtok(fds_list, delims);
+ while( curr_fds_range != NULL ) {
+ if (analize_fds_range(curr_fds_range))
+ return 1;
+ curr_fds_range = strtok(NULL, delims);
+ }
+ return 0;
+}
+
+void zero_socket_stats(socket_stats_t* p_socket_stats)
+{
+ memset((void*)&p_socket_stats->counters, 0, sizeof(socket_counters_t));
+}
+
+void zero_iomux_stats(iomux_stats_t* p_iomux_stats)
+{
+ memset(&p_iomux_stats->select, 0, sizeof(iomux_func_stats_t));
+ memset(&p_iomux_stats->poll, 0, sizeof(iomux_func_stats_t));
+ for (int i=0; i<NUM_OF_SUPPORTED_EPFDS; i++) {
+ if (p_iomux_stats->epoll[i].enabled)
+ memset((&p_iomux_stats->epoll[i].stats), 0, sizeof(iomux_func_stats_t));
+ }
+
+ //memset(p_iomux_stats, 0, sizeof(*p_iomux_stats));
+}
+
+void zero_ring_stats(ring_stats_t* p_ring_stats)
+{
+ p_ring_stats->n_rx_pkt_count = 0;
+ p_ring_stats->n_rx_byte_count = 0;
+ p_ring_stats->n_tx_pkt_count = 0;
+ p_ring_stats->n_tx_byte_count = 0;
+ p_ring_stats->n_tx_retransmits = 0;
+ if (p_ring_stats->n_type == RING_TAP) {
+ p_ring_stats->tap.n_vf_plugouts = 0;
+ }
+ else {
+ p_ring_stats->simple.n_rx_interrupt_received = 0;
+ p_ring_stats->simple.n_rx_interrupt_requests = 0;
+ p_ring_stats->simple.n_tx_dev_mem_byte_count = 0;
+ p_ring_stats->simple.n_tx_dev_mem_pkt_count = 0;
+ 
p_ring_stats->simple.n_tx_dev_mem_oob = 0; + } +} + +void zero_cq_stats(cq_stats_t* p_cq_stats) +{ + p_cq_stats->n_rx_pkt_drop = 0; + p_cq_stats->n_rx_drained_at_once_max = 0; +} + +void zero_bpool_stats(bpool_stats_t* p_bpool_stats) +{ + p_bpool_stats->n_buffer_pool_size = 0; + p_bpool_stats->n_buffer_pool_no_bufs = 0; +} + +void zero_counters(sh_mem_t* p_sh_mem) +{ + log_msg("Zero counters..."); + for (size_t i=0; i < p_sh_mem->max_skt_inst_num; i++) { + size_t fd = (size_t)p_sh_mem->skt_inst_arr[i].skt_stats.fd; + if (p_sh_mem->skt_inst_arr[i].b_enabled && g_fd_mask[fd]){ + zero_socket_stats(&p_sh_mem->skt_inst_arr[i].skt_stats); + } + } + zero_iomux_stats(&p_sh_mem->iomux); + + for (int i = 0; i < NUM_OF_SUPPORTED_CQS; i++) { + zero_cq_stats(&p_sh_mem->cq_inst_arr[i].cq_stats); + } + for (int i = 0; i < NUM_OF_SUPPORTED_RINGS; i++) { + zero_ring_stats(&p_sh_mem->ring_inst_arr[i].ring_stats); + } + for (int i = 0; i < NUM_OF_SUPPORTED_BPOOLS; i++) { + zero_bpool_stats(&p_sh_mem->bpool_inst_arr[i].bpool_stats); + } +} + +int get_pid(char* proc_desc, char* argv0) +{ + char* app_name = NULL; + int pid = -1; + + if (NULL == proc_desc) { + return -1; + } + + if (user_params.proc_ident_mode == e_by_pid_str) { + errno = 0; + pid = strtol(proc_desc, NULL, 0); + if (errno != 0 || pid < 0) { + log_err("'-p' Invalid pid val: %s", proc_desc); + usage(argv0); + cleanup(NULL); + pid = -1; + } + } + else { + if (user_params.proc_ident_mode == e_by_app_name) + app_name = proc_desc; + + char* pid_str = look_for_vma_stat_active_sh_obj(app_name); + if (pid_str) { + errno = 0; + pid = strtol(pid_str, NULL, 0); + if (errno != 0) { + log_system_err("Failed to convert:%s", pid_str); + cleanup(NULL); + pid = -1; + }; + free(pid_str); + } + else { + log_err("Failed to identify process please provide pid of active proccess...\n"); + } + } + + return pid; +} + +void set_dumping_data(sh_mem_t* p_sh_mem) +{ + p_sh_mem->fd_dump = user_params.fd_dump; + p_sh_mem->fd_dump_log_level = 
user_params.fd_dump_log_level;
+}
+
+void set_vma_log_level(sh_mem_t* p_sh_mem)
+{
+ p_sh_mem->log_level = user_params.vma_log_level;
+}
+
+
+void set_vma_log_details_level(sh_mem_t* p_sh_mem)
+{
+ p_sh_mem->log_details_level = (int)user_params.vma_details_level;
+}
+
+//////////////////forward declarations /////////////////////////////
+void get_all_processes_pids(std::vector<int> &pids);
+int print_processes_stats(const std::vector<int> &pids);
+
+////////////////////////////////////////////////////////////////////
+int main (int argc, char **argv)
+{
+ char proc_desc[MAX_BUFF_SIZE] = {0};
+
+ set_defaults();
+ if (!g_fd_mask) return 1;
+
+ while (1) {
+ int c = 0;
+
+ static struct option long_options[] = {
+ {"interval", 1, NULL, 'i'},
+ {"cycles", 1, NULL, 'c'},
+ {"view", 1, NULL, 'v'},
+ {"details", 1, NULL, 'd'},
+ {"pid", 1, NULL, 'p'},
+ {"directory", 1, NULL, 'k'},
+ {"sockets", 1, NULL, 's'},
+ {"version", 0, NULL, 'V'},
+ {"zero", 0, NULL, 'z'},
+ {"log_level", 1, NULL, 'l'},
+ {"fd_dump", 1, NULL, 'S'},
+ {"details_level", 1, NULL, 'D'},
+ {"name", 1, NULL, 'n'},
+ {"find_pid", 0, NULL, 'f'},
+ {"forbid_clean", 0, NULL, 'F'},
+ {"help", 0, NULL, 'h'},
+ {0,0,0,0}
+ };
+
+ if ((c = getopt_long(argc, argv, "i:c:v:d:p:k:s:Vzl:S:D:n:fFh?", long_options, NULL)) == -1)
+ break;
+
+ switch (c) {
+ case 'i': {
+ errno = 0;
+ int interval = strtol(optarg, NULL, 0);
+ if (errno != 0 || interval < 0) {
+ log_err("'-%c' Invalid interval val: %s", c,optarg);
+ usage(argv[0]);
+ cleanup(NULL);
+ return 1;
+ }
+ user_params.interval = interval;
+ }
+ break;
+ case 'c': {
+ errno = 0;
+ int cycles = strtol(optarg, NULL, 0);
+ if (errno != 0 || cycles < 0) {
+ log_err("'-%c' Invalid cycles val: %s", c,optarg);
+ usage(argv[0]);
+ cleanup(NULL);
+ return 1;
+ }
+ user_params.cycles = cycles;
+ }
+ break;
+ case 'v': {
+ errno = 0;
+ int view_mod = 0;
+ view_mod = strtol(optarg, NULL, 0);
+ if (errno != 0 || view_mod < 1 || view_mod > VIEW_MODES_NUM) {
+ log_err("'-%c' Invalid 
view val: %s", c,optarg); + usage(argv[0]); + cleanup(NULL); + return 1; + } + user_params.view_mode = (view_mode_t)view_mod; + } + break; + case 'd': { + errno = 0; + int detail_mode = strtol(optarg, NULL, 0); + if (errno != 0 || detail_mode < 1 || detail_mode > PRINT_DETAILS_MODES_NUM) { + log_err("'-%c' Invalid details val: %s", c,optarg); + usage(argv[0]); + cleanup(NULL); + return 1; + } + user_params.print_details_mode = (print_details_mode_t)detail_mode; + } + break; + case 'p': + user_params.proc_ident_mode = e_by_pid_str; + strncpy(proc_desc, optarg, sizeof(proc_desc) - 1); + proc_desc[sizeof(proc_desc) - 1] = '\0'; + break; + case 'k': + user_params.vma_stats_path = std::string((char*)optarg); + break; + case 's': { + if (update_fds_mask(optarg)) { + usage(argv[0]); + cleanup(NULL); + return 1; + } + } + break; + case 'V': + print_version(-1); + cleanup(NULL); + return 0; + case 'z': + user_params.write_auth = true; + user_params.zero_counters = true; + break; + case 'l': { + vlog_levels_t log_level = log_level::from_str(optarg, VLOG_INIT); + if (log_level == VLOG_INIT) { + log_err("'-%c' Invalid log level val: %s", c,optarg); + usage(argv[0]); + cleanup(NULL); + return 1; + } + user_params.write_auth = true; + user_params.vma_log_level = log_level; + } + break; + case 'S': { + errno = 0; + optind--; + int fd_to_dump = strtol(argv[optind], NULL, 0); + if (errno != 0 || fd_to_dump < 0) { + log_err("'-%c' Invalid fd val: %s", c, argv[optind]); + usage(argv[0]); + cleanup(NULL); + return 1; + } + user_params.fd_dump = fd_to_dump; + if (++optind < argc && *argv[optind] != '-') { + vlog_levels_t dump_log_level = log_level::from_str(argv[optind], VLOG_INIT); + if (dump_log_level == VLOG_INIT) { + log_err("'-%c' Invalid log level val: %s", c, argv[optind]); + usage(argv[0]); + cleanup(NULL); + return 1; + } + user_params.fd_dump_log_level = dump_log_level; + } + } + break; + case 'D': { + errno = 0; + int details_level = 0; + details_level = strtol(optarg, NULL, 
0); + if (errno != 0 || details_level < 0 || details_level >= VLOG_DETAILS_NUM) { + log_err("'-%c' Invalid details level val: %s", c,optarg); + usage(argv[0]); + cleanup(NULL); + return 1; + } + user_params.write_auth = true; + user_params.vma_details_level = details_level; + } + break; + case 'n': + user_params.proc_ident_mode = e_by_app_name; + strncpy(proc_desc, optarg, sizeof(proc_desc) - 1); + proc_desc[sizeof(proc_desc) - 1] = '\0'; + break; + case 'f': + user_params.proc_ident_mode = e_by_runn_proccess; + break; + case 'F': + user_params.forbid_cleaning = true; + break; + case '?': + case 'h': + usage(argv[0]); + return 0; + break; + default: + usage(argv[0]); + cleanup(NULL); + return 1; + } + } + + clean_inactive_sh_ibj(); + + std::vector pids; + if(user_params.view_mode == e_netstat_like) { + get_all_processes_pids(pids); + } + else { + int pid = get_pid(proc_desc, argv[0]); + if (pid != -1) pids.push_back(pid); + } + + if ( pids.size() == 0 ){ + free(g_fd_mask); + if(user_params.view_mode == e_netstat_like) { + print_headers(); + return 0; + } + else { + usage(argv[0]); + return 1; + } + } + + if(user_params.view_mode == e_netstat_like) + user_params.cycles =1;// print once and exit + + int ret = print_processes_stats(pids); + + free(g_fd_mask); + return ret; +} + +///////////////////////////////// +int init_print_process_stats(sh_mem_info_t & sh_mem_info) +{ + sh_mem_t* sh_mem; + int pid = sh_mem_info.pid; + + sprintf(sh_mem_info.filename_sh_stats, "%s/vmastat.%d", user_params.vma_stats_path.c_str(), pid); + + if (user_params.write_auth)//S_IRUSR | S_IWUSR | S_IRGRP + sh_mem_info.fd_sh_stats = open(sh_mem_info.filename_sh_stats, + O_RDWR, __S_IREAD | __S_IWRITE| S_IROTH); + else + sh_mem_info.fd_sh_stats = open(sh_mem_info.filename_sh_stats, O_RDONLY); + + if (sh_mem_info.fd_sh_stats < 0) { + log_err("VMA statistics data for process id %d not found\n", pid); + return 1; + } + sh_mem_info.p_sh_stats = mmap(0, sizeof(sh_mem_t), PROT_READ, MAP_SHARED, 
sh_mem_info.fd_sh_stats, 0); + MAP_SH_MEM(sh_mem,sh_mem_info.p_sh_stats); + if (sh_mem_info.p_sh_stats == MAP_FAILED) { + log_system_err("MAP_FAILED - %s\n", strerror (errno)); + close(sh_mem_info.fd_sh_stats); + return 1; + } + + int version_check = 1; + if (sizeof(STATS_PROTOCOL_VER) > 1) { + if (memcmp(sh_mem->stats_protocol_ver, STATS_PROTOCOL_VER, min(sizeof(sh_mem->stats_protocol_ver), sizeof(STATS_PROTOCOL_VER)))) { + log_err("Version %s is not compatible with stats protocol version %s\n", + STATS_PROTOCOL_VER, sh_mem->stats_protocol_ver); + version_check = 0; + } + } else { + if (!check_vma_ver_compatability(&sh_mem->ver_info)) { + log_err("Version %d.%d.%d.%d is not compatible with VMA version %d.%d.%d.%d\n", + VMA_LIBRARY_MAJOR, VMA_LIBRARY_MINOR, + VMA_LIBRARY_REVISION, VMA_LIBRARY_RELEASE, + sh_mem->ver_info.vma_lib_maj, sh_mem->ver_info.vma_lib_min, + sh_mem->ver_info.vma_lib_rev, sh_mem->ver_info.vma_lib_rel); + version_check = 0; + } + } + if (!version_check) { + if (munmap(sh_mem_info.p_sh_stats, sizeof(sh_mem_t)) != 0) { + log_system_err("file='%s' sh_mem_info.fd_sh_stats=%d; error while munmap shared memory at [%p]\n", sh_mem_info.filename_sh_stats, sh_mem_info.fd_sh_stats, sh_mem_info.p_sh_stats); + } + close(sh_mem_info.fd_sh_stats); + return 1; + } + + sh_mem_info.shmem_size = SHMEM_STATS_SIZE(sh_mem->max_skt_inst_num); + if (munmap(sh_mem_info.p_sh_stats, sizeof(sh_mem_t)) != 0) { + log_system_err("file='%s' sh_mem_info.fd_sh_stats=%d; error while munmap shared memory at [%p]\n", sh_mem_info.filename_sh_stats, sh_mem_info.fd_sh_stats, sh_mem_info.p_sh_stats); + } + if (user_params.write_auth) + sh_mem_info.p_sh_stats = mmap(0, sh_mem_info.shmem_size, PROT_WRITE|PROT_READ, MAP_SHARED, sh_mem_info.fd_sh_stats, 0); + else + sh_mem_info.p_sh_stats = mmap(0, sh_mem_info.shmem_size, PROT_READ, MAP_SHARED, sh_mem_info.fd_sh_stats, 0); + + if (sh_mem_info.p_sh_stats == MAP_FAILED) { + log_system_err("MAP_FAILED - %s\n", strerror (errno)); + 
close(sh_mem_info.fd_sh_stats); + return 1; + } + MAP_SH_MEM(sh_mem,sh_mem_info.p_sh_stats); + if(user_params.view_mode != e_netstat_like) + print_version(pid); + if (user_params.zero_counters == true) + zero_counters(sh_mem); + if (user_params.vma_log_level != VLOG_INIT) + set_vma_log_level(sh_mem); + if (user_params.vma_details_level != INIT_VMA_LOG_DETAILS) + set_vma_log_details_level(sh_mem); + if (user_params.fd_dump != STATS_FD_STATISTICS_DISABLED) + set_dumping_data(sh_mem); + + + // here we indicate VMA to write to shmem + inc_read_counter(sh_mem); + return 0; +} + +//////////////////////////////////////////////////////////////////// +int complete_print_process_stats(sh_mem_info_t & sh_mem_info) +{ + sh_mem_t* sh_mem; + MAP_SH_MEM(sh_mem,sh_mem_info.p_sh_stats); + + stats_reader_handler(sh_mem, sh_mem_info.pid); + cleanup(&sh_mem_info); + return 0; +} + + +/////////////////////////// +void get_all_processes_pids(std::vector &pids) +{ + const int MODULE_NAME_SIZE = strlen(MODULE_NAME); + const int PID_OFFSET = MODULE_NAME_SIZE + 1; + + DIR *dir = opendir(user_params.vma_stats_path.c_str()); + if (dir == NULL){ + log_system_err("opendir %s failed\n", user_params.vma_stats_path.c_str()); + return; + } + + for( struct dirent *dirent = readdir(dir); dirent != NULL ; dirent = readdir(dir) ) { + if(!strncmp("vmastat.", dirent->d_name, MODULE_NAME_SIZE)) { + char* pid_str = dirent->d_name + PID_OFFSET; + if (check_if_process_running(pid_str)) { + errno = 0; + int pid = strtol(pid_str, NULL, 0); + if (errno == 0) { + pids.push_back(pid); + } + else { + log_system_err("Failed to convert:%s", pid_str); + } + } + } + } + closedir(dir); +} + +/////////////////////////// +int print_processes_stats(const std::vector &pids) +{ + const int SIZE = pids.size(); + + int num_instances = 0; + sh_mem_info_t sh_mem_info[SIZE]; + + // 1. 
N * prepare shmem and indicate VMA to update shmem + for (int i = 0; i < SIZE; ++i){ + sh_mem_info[num_instances].pid = pids[i]; + if (0 == init_print_process_stats(sh_mem_info[num_instances])) + ++num_instances; + } + + // 2. one sleep to rule them all + usleep(STATS_READER_DELAY * 1000);// After 'init_print_process_stats' we wait for VMA publisher to recognize + // that we asked for statistics, otherwise, the first read will be zero + + // 3. N * read from shmem, write to user, and shmem cleanup + for (int i = 0; i < num_instances; ++i) + complete_print_process_stats(sh_mem_info[i]); + + return 0; +} diff --git a/src/utils/Makefile.am b/src/utils/Makefile.am new file mode 100644 index 0000000..f6412c9 --- /dev/null +++ b/src/utils/Makefile.am @@ -0,0 +1,16 @@ +AM_CPPFLAGS := -I$(top_srcdir)/src + +noinst_LTLIBRARIES = libutils.la +libutils_la_LDFLAGS = -static +libutils_la_LIBADD = -lrt +libutils_la_SOURCES = \ + asm-arm64.h \ + asm-ppc64.h \ + asm-x86.h \ + asm.h \ + atomic.h \ + bullseye.h \ + clock.h \ + lock_wrapper.h \ + rdtsc.h \ + types.h diff --git a/src/utils/asm-arm64.h b/src/utils/asm-arm64.h new file mode 100644 index 0000000..7d2d7bd --- /dev/null +++ b/src/utils/asm-arm64.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef ASMARM64_H_ +#define ASMARM64_H_ + +#include +#include + +#define COPY_64B_NT(dst, src) \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++ + +#define mb() asm volatile("dsb sy" ::: "memory") +#define rmb() asm volatile("dsb ld" ::: "memory") +#define wmb() asm volatile("dsb st" ::: "memory") +#define wc_wmb() wmb() + +/** + * Add to the atomic variable. + * @param i integer value to add. + * @param v pointer of type atomic_t. + * @return Value before add. + */ +static inline int atomic_fetch_and_add(int i, volatile int *ptr) +{ + return __atomic_fetch_add(ptr, i, __ATOMIC_ACQUIRE); +} + +/** + * Read RDTSC register + */ +static inline void gettimeoftsc(unsigned long long *p_tscval) +{ + // Read Time Stamp Counter + asm volatile("isb" : : : "memory"); + asm volatile("mrs %0, cntvct_el0" : "=r" ((unsigned long long)*p_tscval)); +} + +/** + * Cache Line Prefetch - Arch specific! 
+ */ +#ifndef L1_CACHE_BYTES +#define L1_CACHE_BYTES 64 +#endif + +static inline void prefetch(void *x) +{ + //__builtin_prefetch(); + asm volatile("prfm pldl1keep, %a0\n" : : "p" (x)); +} + +static inline void prefetch_range(void *addr, size_t len) +{ + char *cp = (char*)addr; + char *end = (char*)addr + len; + for (; cp < end; cp += L1_CACHE_BYTES) + prefetch(cp); +} + + + +#endif diff --git a/src/utils/asm-ppc64.h b/src/utils/asm-ppc64.h new file mode 100644 index 0000000..9ed5802 --- /dev/null +++ b/src/utils/asm-ppc64.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef ASMPPC64_H_ +#define ASMPPC64_H_ + +#include +#include + +#define COPY_64B_NT(dst, src) \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++; \ + *dst++ = *src++ + +#define mb() asm volatile("sync" ::: "memory") +#define rmb() asm volatile("lwsync" ::: "memory") +#define wmb() rmb() +#define wc_wmb() mb() + +/** + * Add to the atomic variable. + * @param i integer value to add. + * @param v pointer of type atomic_t. + * @return Value before add. + */ +static inline int atomic_fetch_and_add(int i, volatile int *ptr) +{ +#ifdef __ATOMIC_ACQUIRE + return __atomic_fetch_add(ptr, i, __ATOMIC_ACQUIRE); +#else + return __sync_fetch_and_add(ptr, i); +#endif +} + + +/** + * Read RDTSC register + */ +static inline void gettimeoftsc(unsigned long long *p_tscval) +{ + asm volatile ("mftb %0" : "=r" (*p_tscval) : ); +} + +/** + * Cache Line Prefetch - Arch specific! + */ +#ifndef L1_CACHE_BYTES +#define L1_CACHE_BYTES 128 +#endif + +static inline void prefetch(void *x) +{ + //__builtin_prefetch(); + __asm__ __volatile__ ("dcbt 0,%0,1" : : "r" (x)); +} + +static inline void prefetch_range(void *addr, size_t len) +{ + char *cp = (char*)addr; + char *end = (char*)addr + len; + for (; cp < end; cp += L1_CACHE_BYTES) + prefetch(cp); +} + + + +#endif diff --git a/src/utils/asm-x86.h b/src/utils/asm-x86.h new file mode 100644 index 0000000..5dfb6f4 --- /dev/null +++ b/src/utils/asm-x86.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef ASMX86_H_ +#define ASMX86_H_ + +#include +#include +#include "utils/bullseye.h" + +#define __xg(x) ((volatile long *)(x)) + +#define mb() asm volatile("" ::: "memory") +#define rmb() mb() +#define wmb() asm volatile("" ::: "memory") +#define wc_wmb() asm volatile("sfence" ::: "memory") + +#define COPY_64B_NT(dst, src) \ + __asm__ __volatile__ ( \ + " movdqa (%1),%%xmm0\n" \ + " movdqa 16(%1),%%xmm1\n" \ + " movdqa 32(%1),%%xmm2\n" \ + " movdqa 48(%1),%%xmm3\n" \ + " movntdq %%xmm0, (%0)\n" \ + " movntdq %%xmm1, 16(%0)\n" \ + " movntdq %%xmm2, 32(%0)\n" \ + " movntdq %%xmm3, 48(%0)\n" \ + : : "r" (dst), "r" (src) : "memory"); \ + dst += 8; \ + src += 8 + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif +/** + * Atomic swap + */ +static inline unsigned long xchg(unsigned long x, volatile void *ptr) +{ + __asm__ __volatile__("xchg %0,%1" + :"=r" (x) + :"m" (*__xg(ptr)), "0" (x) + :"memory"); + return x; +} + +/** + * Atomic compare-and-swap + */ +static inline bool cmpxchg(unsigned long old_value, unsigned long new_value, volatile void *ptr) +{ + unsigned long prev_value = old_value; + __asm__ __volatile__("lock; cmpxchg %1,%2" + : "=a"(prev_value) + : "r"(new_value), "m"(*__xg(ptr)), "0"(old_value) + : "memory"); + return prev_value == old_value; +} +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + +/** + * Add to the atomic variable. + * @param i integer value to add. + * @param v pointer of type atomic_t. + * @return Value before add. 
+ */ +static inline int atomic_fetch_and_add(int x, volatile int *ptr) +{ + __asm__ __volatile__("lock; xaddl %0,%1" + : "=r"(x) + : "m"(*ptr), "0"(x) + : "memory"); + return x; +} + +/** + * Read RDTSC register + */ +static inline void gettimeoftsc(unsigned long long *p_tscval) +{ + register uint32_t upper_32, lower_32; + + // ReaD Time Stamp Counter (RDTCS) + __asm__ __volatile__("rdtsc" : "=a" (lower_32), "=d" (upper_32)); + + // Copy to user + *p_tscval = (((unsigned long long)upper_32) << 32) | lower_32; +} + +/** + * Cache Line Prefetch - Arch specific! + */ +#ifndef L1_CACHE_BYTES +#define L1_CACHE_BYTES 64 +#endif + +static inline void prefetch(void *x) +{ + #if defined __i386__ || defined __x86_64__ + asm volatile("prefetcht0 %0" :: "m" (*(unsigned long *)x)); + #else + { + // Use simple memcpy to get data into cache + char temp_prefetch_block[L1_CACHE_BYTES]; + memcpy(temp_prefetch_block, x, L1_CACHE_BYTES); + } + #endif +} + +static inline void prefetch_range(void *addr, size_t len) +{ + char *cp = (char*)addr; + char *end = (char*)addr + len; + for (; cp < end; cp += L1_CACHE_BYTES) + prefetch(cp); +} + + + +#endif diff --git a/src/utils/asm.h b/src/utils/asm.h new file mode 100644 index 0000000..513a6e3 --- /dev/null +++ b/src/utils/asm.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef ASM_H_ +#define ASM_H_ + +#if defined(__aarch64__) +#include "asm-arm64.h" +#elif defined(__powerpc64__) +#include "asm-ppc64.h" +#elif defined(__x86_64__) +#include "asm-x86.h" +#else +#error No architecture specific memory barrier definitions found! +#endif + +#endif diff --git a/src/utils/atomic.h b/src/utils/atomic.h new file mode 100644 index 0000000..8dc695c --- /dev/null +++ b/src/utils/atomic.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef ATOMIC_H_ +#define ATOMIC_H_ + +#include "asm.h" +#include "utils/bullseye.h" + +struct atomic_t { + __volatile__ int counter; +}; + +#define ATOMIC_INIT(i) { (i) } + +/** + * Read atomic variable. + * @param v pointer of type atomic_t + * @return Value of the atomic. + * + * Atomically reads the value of @v. + */ +#define atomic_read(v) ((v)->counter) + +/** + * Set atomic variable. + * @param v pointer of type atomic_t. + * @param i required value. + */ +#define atomic_set(v,i) (((v)->counter) = (i)) + +#if 0 + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + +/** + * Returns current contents of addr and replaces contents with value. + * @param value Values to set. + * @param addr Address to set. + * @return Previous value of *addr. + */ +template +static inline T atomic_swap(T new_value, T *addr) +{ + return (T)xchg((unsigned long)new_value, (void*)addr); +} + +/** + * Replaces *addr with new_value if it equals old_value. + * @param old_value Expected value. + * @param new_value Value to set. + * @param addr Address to set. + * @return true if was set, false if not. 
+ */ +template +static bool atomic_cas(T old_value, T new_value, T *addr) +{ + return cmpxchg((unsigned long)old_value, (unsigned long)new_value, (void*)addr); +} +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + +#endif + +/** + * Add to the atomic variable. + * @param i integer value to add. + * @param v pointer of type atomic_t. + * @return Value before add. + */ +static inline int atomic_fetch_and_inc(atomic_t *v) +{ + return atomic_fetch_and_add(1, &v->counter); +} + +/** + * Add to the atomic variable. + * @param i integer value to add. + * @param v pointer of type atomic_t. + * @return Value before add. + */ +static inline int atomic_fetch_and_dec(atomic_t *v) +{ + return atomic_fetch_and_add(-1, &v->counter); +} + +#endif /* ATOMIC_H_ */ diff --git a/src/utils/bullseye.h b/src/utils/bullseye.h new file mode 100644 index 0000000..dc5f7a8 --- /dev/null +++ b/src/utils/bullseye.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/* + * Bullseye Coverage Definitions +*/ +#ifndef BULLSEYE_H_ +#define BULLSEYE_H_ + +#ifndef _BullseyeCoverage +#define _BullseyeCoverage 0 +#endif + +#if _BullseyeCoverage +#define BULLSEYE_EXCLUDE_BLOCK_START "BullseyeCoverage save off"; +#define BULLSEYE_EXCLUDE_BLOCK_END "BullseyeCoverage restore"; +#else +#define BULLSEYE_EXCLUDE_BLOCK_START +#define BULLSEYE_EXCLUDE_BLOCK_END +#endif + + +#endif /* BULLSEYE_H_ */ diff --git a/src/utils/clock.h b/src/utils/clock.h new file mode 100644 index 0000000..67e55d2 --- /dev/null +++ b/src/utils/clock.h @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef CLOCK_H +#define CLOCK_H + +#include + + + +/* + * Parameters used to convert the time values: + */ +#define MSEC_PER_SEC 1000L +#define USEC_PER_MSEC 1000L +#define NSEC_PER_USEC 1000L +#define NSEC_PER_MSEC 1000000L +#define USEC_PER_SEC 1000000L +#define NSEC_PER_SEC 1000000000L +#define FSEC_PER_SEC 1000000000000000L + + +/* + * Convenience macros for operations on timevals + */ +#define TIMEVAL_INITIALIZER {0,0} + +#define tv_to_sec(tvp) ( (tvp)->tv_sec) +#define tv_to_msec(tvp) ((int64_t((tvp)->tv_sec) * MSEC_PER_SEC) + (int64_t((tvp)->tv_usec) / USEC_PER_MSEC)) +#define tv_to_usec(tvp) ((int64_t((tvp)->tv_sec) * USEC_PER_SEC) + (int64_t((tvp)->tv_usec) )) +#define tv_to_nsec(tvp) ((int64_t((tvp)->tv_sec) * NSEC_PER_SEC) + (int64_t((tvp)->tv_usec) * NSEC_PER_USEC)) + +#define tv_isset(tvp) timerisset(tvp) +#define tv_clear(tvp) timerclear(tvp) + +#define tv_cmp(a, b, CMP) timercmp(a, b, CMP) +#define tv_add(a, b, result) timeradd(a, b, result) +#define tv_sub(a, b, result) timersub(a, b, result) + + +/* Convenience macros for operations on timespecs */ +#define TIMESPEC_INITIALIZER {0,0} + +#define ts_to_sec(tsp) ( (tsp)->tv_sec) +#define ts_to_msec(tsp) ((int64_t((tsp)->tv_sec) * MSEC_PER_SEC) + (int64_t((tsp)->tv_nsec) / NSEC_PER_MSEC)) +#define ts_to_usec(tsp) ((int64_t((tsp)->tv_sec) * USEC_PER_SEC) + (int64_t((tsp)->tv_nsec) / NSEC_PER_USEC)) +#define ts_to_nsec(tsp) ((int64_t((tsp)->tv_sec) * NSEC_PER_SEC) + (int64_t((tsp)->tv_nsec) )) + +#define 
ts_isset(tvp) ((tvp)->tv_sec || (tvp)->tv_nsec) +#define ts_clear(tvp) ((tvp)->tv_sec = (tvp)->tv_nsec = 0) + +#define ts_cmp(a, b, CMP) \ + (((a)->tv_sec == (b)->tv_sec) ? \ + ((a)->tv_nsec CMP (b)->tv_nsec) : \ + ((a)->tv_sec CMP (b)->tv_sec)) + +#define ts_add(a, b, result) \ + do { \ + (result)->tv_sec = (a)->tv_sec + (b)->tv_sec; \ + (result)->tv_nsec = (a)->tv_nsec + (b)->tv_nsec; \ + if ((result)->tv_nsec >= NSEC_PER_SEC) \ + { \ + ++(result)->tv_sec; \ + (result)->tv_nsec -= NSEC_PER_SEC; \ + } \ + } while (0) + +#define ts_sub(a, b, result) \ + do { \ + (result)->tv_sec = (a)->tv_sec - (b)->tv_sec; \ + (result)->tv_nsec = (a)->tv_nsec - (b)->tv_nsec; \ + if ((result)->tv_nsec < 0) { \ + --(result)->tv_sec; \ + (result)->tv_nsec += NSEC_PER_SEC; \ + } \ + } while (0) + + +#endif //CLOCK_H diff --git a/src/utils/lock_wrapper.h b/src/utils/lock_wrapper.h new file mode 100644 index 0000000..58f976d --- /dev/null +++ b/src/utils/lock_wrapper.h @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#ifndef LOCK_WRAPPER_H
+#define LOCK_WRAPPER_H
+
+/* NOTE(review): the five system include targets below were lost when this
+ * patch was flattened (angle-bracket text eaten); restore from the pristine
+ * tree — pthread.h and assert.h are clearly among them given the code below. */
+#include
+#include
+#include
+#include
+#include
+#include "types.h"
+#include "utils/bullseye.h"
+#include "utils/rdtsc.h"
+
+//todo disable assert
+#define ASSERT_LOCKED(lock)	assert((lock).is_locked_by_me())
+#define ASSERT_NOT_LOCKED(lock)	assert(!(lock).is_locked_by_me())
+
+/* When DEFINED_NO_THREAD_LOCK is set, every lock method short-circuits to a
+ * constant return (all locking compiled out). */
+#ifdef DEFINED_NO_THREAD_LOCK
+	#define DEFINED_NO_THREAD_LOCK_RETURN_0    return 0;
+	#define DEFINED_NO_THREAD_LOCK_RETURN_1    return 1;
+#else
+	#define DEFINED_NO_THREAD_LOCK_RETURN_0
+	#define DEFINED_NO_THREAD_LOCK_RETURN_1
+#endif
+
+/* NO_LOCK_STATS is defined unconditionally here: the statistics-gathering
+ * lock_base below is compiled out by default. */
+#define NO_LOCK_STATS
+
+#ifdef NO_LOCK_STATS
+	#define LOCK_BASE_LOCK
+	#define LOCK_BASE_TRYLOCK
+	#define LOCK_BASE_UNLOCK
+	#define LOCK_BASE_START_LOCK_WAIT
+	#define LOCK_BASE_END_LOCK_WAIT
+#else
+	#define LOCK_BASE_LOCK               lock_base::lock();
+	#define LOCK_BASE_TRYLOCK            lock_base::trylock();
+	#define LOCK_BASE_UNLOCK             lock_base::unlock();
+	#define LOCK_BASE_START_LOCK_WAIT    tscval_t timeval = start_lock_wait();
+	#define LOCK_BASE_END_LOCK_WAIT      end_lock_wait(timeval);
+#endif
+
+#ifdef NO_LOCK_STATS
+
+// pthread lock stats counter for debugging
+// NOTE(review): this branch is the *stats-free* abstract base; the comment
+// above appears to belong to the #else branch — verify against upstream.
+
+class lock_base
+{
+public:
+	lock_base(const char *_lock_name = NULL) : m_lock_name(_lock_name) {};
+	virtual ~lock_base() {};
+	virtual int lock() = 0;
+	virtual int trylock() = 0;
+	virtual int unlock() = 0;
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage off
+#endif
+	const char* to_str() { return m_lock_name; }
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage on
+#endif
+private:
+	const char* m_lock_name;
+};
+
+#else //NO_LOCK_STATS
+
+
+/* NOTE(review): two more system include targets lost in flattening. */
+#include
+#include
+
+//
+// pthread counting mutex
+//
+// Debug variant: counts acquisitions and accumulates TSC-based wait time,
+// printing the stats every m_print_interval ticks and at destruction.
+class lock_base
+{
+public:
+	lock_base(const char *name) {
+		m_lock_count = 0;
+		m_lock_wait_time = 0;
+		m_lock_name = name;
+		m_prev_print_time = 0;
+		m_print_interval = get_tsc_rate_per_second() * 5;	// report every ~5 seconds of TSC time
+	};
+
+	virtual ~lock_base() {
+		if (m_lock_count > 1000) {	// only report locks that saw real traffic
+			print_stats();
+		}
+	};
+
+	virtual inline int lock() {
+		m_lock_count++;
+		return 0;
+	};
+
+	virtual inline int trylock() {
+		m_lock_count++;
+		return 0;
+	}
+
+	virtual inline int unlock() {
+		return 0;
+	};
+
+	const char* to_str() { return m_lock_name; }
+
+private:
+	void print_stats() {
+		printf("[lock %s %p] --- locked %d times average wait %.2f us ---\n",
+				to_str(), this, m_lock_count, avg_lock_wait() * 1000000.0);
+	}
+
+	const char*	m_lock_name;
+	int		m_lock_count;		// number of lock()/trylock() calls
+	tscval_t	m_lock_wait_time;	// accumulated wait, in TSC ticks
+	tscval_t	m_prev_print_time;
+	tscval_t	m_print_interval;
+
+protected:
+	tscval_t start_lock_wait() {
+		tscval_t t;
+		gettimeoftsc(&t);
+		return t;
+	}
+
+	void end_lock_wait(tscval_t start_time) {
+		tscval_t t;
+		gettimeoftsc(&t);
+		m_lock_wait_time += (t - start_time);
+		if (t - m_prev_print_time > m_print_interval) {
+			print_stats();
+			m_prev_print_time = t;
+		}
+	}
+
+	double avg_lock_wait() {
+		// seconds of wait per acquisition
+		return (m_lock_wait_time /
+				static_cast<double>(get_tsc_rate_per_second())) / m_lock_count ;
+	}
+};
+#endif //NO_LOCK_STATS
+
+
+/**
+ * pthread spinlock
+ */
+class lock_spin : public lock_base
+{
+public:
+	lock_spin(const char *name = "lock_spin") : lock_base(name) {
+		pthread_spin_init(&m_lock, 0);	// 0 = PTHREAD_PROCESS_PRIVATE
+	};
+	~lock_spin() {
+		pthread_spin_destroy(&m_lock);
+	};
+	inline int lock() {
+		DEFINED_NO_THREAD_LOCK_RETURN_0
+		LOCK_BASE_START_LOCK_WAIT
+		int ret = pthread_spin_lock(&m_lock);
+		LOCK_BASE_LOCK
+		LOCK_BASE_END_LOCK_WAIT
+		return ret;
+	};
+	inline int trylock() {
+		DEFINED_NO_THREAD_LOCK_RETURN_0
+		int ret = pthread_spin_trylock(&m_lock);
+		LOCK_BASE_TRYLOCK
+		return ret;
+	};
+	inline int unlock() {
+		DEFINED_NO_THREAD_LOCK_RETURN_0
+		LOCK_BASE_UNLOCK
+		return pthread_spin_unlock(&m_lock);
+	};
+
+protected:
+	pthread_spinlock_t m_lock;
+};
+
+/**
+ * pthread spinlock
+ * Recursive variant: tracks the owning thread so re-entrant lock() calls
+ * only bump a counter. unlock() assumes the caller holds the lock
+ * (no owner check before the decrement).
+ */
+class lock_spin_recursive : public lock_spin
+{
+public:
+	lock_spin_recursive(const char *name = "lock_spin_recursive") :
+		lock_spin(name), m_lock_count(0) {
+		// 0xff-filled pthread_t acts as the "no owner" sentinel
+		memset(&m_invalid_owner, 0xff, sizeof(m_invalid_owner));
+		m_owner = m_invalid_owner;
+	};
+	~lock_spin_recursive() {};
+
+	inline int lock() {
+		DEFINED_NO_THREAD_LOCK_RETURN_0
+		pthread_t self = pthread_self();
+		if (m_owner == self) {
+			++m_lock_count;
+			return 0;
+		}
+		LOCK_BASE_START_LOCK_WAIT
+		int ret = lock_spin::lock();
+		if (likely(ret == 0)) {
+			++m_lock_count;
+			m_owner = self;
+		}
+		LOCK_BASE_END_LOCK_WAIT
+		return ret;
+	};
+	inline int trylock() {
+		DEFINED_NO_THREAD_LOCK_RETURN_0
+		pthread_t self = pthread_self();
+		if (m_owner == self) {
+			++m_lock_count;
+			return 0;
+		}
+		int ret = lock_spin::trylock();
+		if (ret == 0) {
+			++m_lock_count;
+			m_owner = self;
+		}
+		return ret;
+	};
+	inline int unlock() {
+		DEFINED_NO_THREAD_LOCK_RETURN_0
+		if (--m_lock_count == 0) {
+			// release ownership before dropping the underlying spinlock
+			m_owner = m_invalid_owner;
+			return lock_spin::unlock();
+		}
+		return 0;
+	};
+	inline int is_locked_by_me() {
+		DEFINED_NO_THREAD_LOCK_RETURN_1
+		pthread_t self = pthread_self();
+		return ((m_owner == self && m_lock_count) ? m_lock_count : 0);
+	};
+
+protected:
+	pthread_t m_owner;
+	pthread_t m_invalid_owner;
+	int m_lock_count;	// recursion depth held by m_owner
+};
+
+/**
+ * pthread mutex
+ */
+class lock_mutex : public lock_base
+{
+public:
+	lock_mutex(const char *name = "lock_mutex",
+		   int mtx_type = PTHREAD_MUTEX_DEFAULT) : lock_base(name) {
+		pthread_mutexattr_t mtx_attr;
+		pthread_mutexattr_init(&mtx_attr);
+		pthread_mutexattr_settype(&mtx_attr, mtx_type);
+		pthread_mutex_init(&m_lock, &mtx_attr);
+	};
+	~lock_mutex() {
+		pthread_mutex_destroy(&m_lock);
+	};
+	inline int lock() {
+		DEFINED_NO_THREAD_LOCK_RETURN_0
+		LOCK_BASE_START_LOCK_WAIT
+		int ret = pthread_mutex_lock(&m_lock);
+		LOCK_BASE_LOCK
+		LOCK_BASE_END_LOCK_WAIT
+		return ret;
+	};
+	inline int trylock() {
+		DEFINED_NO_THREAD_LOCK_RETURN_0
+		int ret = pthread_mutex_trylock(&m_lock);
+		LOCK_BASE_TRYLOCK
		return ret;
+	};
+	inline int unlock() {
+		DEFINED_NO_THREAD_LOCK_RETURN_0
+		LOCK_BASE_UNLOCK
+		return pthread_mutex_unlock(&m_lock);
+	};
+
+protected:
+	pthread_mutex_t m_lock;
+};
+
+
+/**
+ * pthread recursive mutex
+ */
+class lock_mutex_recursive : public lock_mutex
+{
+public:
+	lock_mutex_recursive(const char *name = "lock_mutex_recursive") :
+		lock_mutex(name, PTHREAD_MUTEX_RECURSIVE) {};
+	~lock_mutex_recursive() {};
+};
+
+/**
+ * automatic unlock at end of scope where this object was defined on
+ * Input: lock_base of lock kind as reference
+ */
+class auto_unlocker
+{
+public:
+	inline auto_unlocker(lock_base& lock) : m_lock(lock) {
+		m_lock.lock();
+		//printf("[%s %p] locked\n", m_lock.to_str(), this);
+	};
+	inline ~auto_unlocker() {
+		//printf("[%s %p] unlocking\n", m_lock.to_str(), this);
+		m_lock.unlock();
+	};
+
+private:
+	lock_base& m_lock;
+};
+
+#endif //LOCK_WRAPPER_H
diff --git a/src/utils/rdtsc.h b/src/utils/rdtsc.h
new file mode 100644
index 0000000..d414dfc
--- /dev/null
+++ b/src/utils/rdtsc.h
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef RDTSC_H +#define RDTSC_H + +#include +#include +#include // for MAX & MIN + +#include "asm.h" +#include "clock.h" + +/** + * RDTSC extensions + */ +typedef unsigned long long tscval_t; + +#define TSCVAL_INITIALIZER (0) + +/** +* Read the CPU's Hz (based on /proc/cpuinfo Mhz report) +* Provide the MAX and MIN values, which might be the case if core are running at power control states +* Return true on success, false on any failure +**/ +static bool get_cpu_hz(double &hz_min, double &hz_max) +{ + FILE* f; + char buf[256]; + bool first_run = true; + + f = fopen("/proc/cpuinfo", "r"); + if (!f) { + return false; + } + + while (fgets(buf, sizeof(buf), f)) { + double mhz = 0; + int rc = 0; + +#if defined(__ia64__) + rc = sscanf(buf, "itc MHz : %lf", &mhz); +#elif defined(__powerpc__) + rc = sscanf(buf, "clock : %lf", &mhz); +#elif defined(__aarch64__) + rc = sscanf(buf, "BogoMIPS : %lf", &mhz); + mhz /= 2; +#else + rc = sscanf(buf, "cpu MHz : %lf", &mhz); +#endif + if (rc != 1) { + continue; + } + if (first_run) { + // first time align of all values + first_run = false; + hz_max = hz_min = mhz; + continue; + } + hz_min = MIN(hz_min, mhz); + hz_max = MAX(hz_max, mhz); + } + fclose(f); + + // Convert to Hz before return to caller + // (original values are in MHz) + hz_min = hz_min * 1.0e6; + hz_max = hz_max * 1.0e6; + return true; +} + +/** + * Calibrate TSC with CPU speed + * @return number of tsc ticks per second + */ +static inline tscval_t get_tsc_rate_per_second() +{ + static tscval_t tsc_per_second = TSCVAL_INITIALIZER; + if (!tsc_per_second) { + double hz_min = -1, hz_max = -1; + if (get_cpu_hz(hz_min, hz_max)) { + tsc_per_second = (tscval_t)hz_max; + } + else { + // failure calibrating TSC to CPU speed + tsc_per_second = 2 * 1e6; // assume 2 MHz CPU speed + } + } + return tsc_per_second; +} + +/** + * 'gettimeofday()' based on RDTSC + * Re-sync with system clock no more then once a second + */ +inline int gettimefromtsc(struct timespec *ts) +{ + 
static tscval_t tsc_start = TSCVAL_INITIALIZER; + static struct timespec ts_start = TIMESPEC_INITIALIZER; + + struct timespec ts_delta = TIMESPEC_INITIALIZER; + tscval_t tsc_now, tsc_delta; + uint64_t nsec_delta = 0; + + if (!ts_isset(&ts_start)) { + clock_gettime(CLOCK_MONOTONIC, &ts_start); + gettimeoftsc(&tsc_start); + } + gettimeoftsc(&tsc_now); + tsc_delta = tsc_now - tsc_start; + nsec_delta = tsc_delta * NSEC_PER_SEC / get_tsc_rate_per_second(); + + ts_delta.tv_sec = nsec_delta / NSEC_PER_SEC; + ts_delta.tv_nsec = nsec_delta - ts_delta.tv_sec * NSEC_PER_SEC; + ts_add(&ts_start, &ts_delta, ts); + +#ifndef VMA_TIME_MEASURE + // Once a second re-sync our start time with real time-of-day + if (tsc_delta > get_tsc_rate_per_second()) + ts_clear(&ts_start); +#endif + + return 0; +} + +static inline int gettime(struct timespec *ts) +{ +#ifdef VMA_TIME_MEASURE + return clock_gettime(CLOCK_MONOTONIC, ts); +#else + return gettimefromtsc(ts); +#endif +} + +static inline int gettime(struct timeval *tv) +{ + return gettimeofday(tv, NULL); +} + +#endif //RDTSC_H diff --git a/src/utils/types.h b/src/utils/types.h new file mode 100644 index 0000000..378aaca --- /dev/null +++ b/src/utils/types.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef TYPES_H +#define TYPES_H + +#include + +#ifndef IN +#define IN +#endif + +#ifndef OUT +#define OUT +#endif + +#ifndef INOUT +#define INOUT +#endif + +#ifndef likely +#define likely(x) __builtin_expect(!!(x), 1) +#endif + +#ifndef unlikely +#define unlikely(x) __builtin_expect(!!(x), 0) +#endif + +#ifndef NOT_IN_USE +#define NOT_IN_USE(a) ((void)(a)) +#endif + +#endif //TYPES_H diff --git a/src/vlogger/Makefile.am b/src/vlogger/Makefile.am new file mode 100644 index 0000000..e2cb35f --- /dev/null +++ b/src/vlogger/Makefile.am @@ -0,0 +1,12 @@ +AM_CPPFLAGS := -I$(top_srcdir)/src + +noinst_LTLIBRARIES = libvlogger.la +libvlogger_la_LDFLAGS = -static +libvlogger_la_LIBADD = -lrt +libvlogger_la_SOURCES = vlogger.cpp vlogger.h + +noinst_PROGRAMS = vlogger_test +vlogger_test_LDADD = \ + $(top_builddir)/src/utils/libutils.la \ + libvlogger.la +vlogger_test_SOURCES = main.cpp diff --git a/src/vlogger/main.cpp b/src/vlogger/main.cpp new file mode 100644 index 0000000..988a0c3 --- /dev/null +++ b/src/vlogger/main.cpp @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include + +#include "vlogger.h" + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + +int main(int argc, char **argv) +{ + vlog_levels_t vlog_levels_init = VLOG_WARNING; + if (argc > 1) + vlog_levels_init = (vlog_levels_t)atoi(argv[1]); + + printf(">> starting vlogger in level: %d\n", (int)vlog_levels_init); + vlog_start("Voltaire Logger test module: ", vlog_levels_init); + + vlog_printf(VLOG_PANIC, "%s: test log_print in level VLOG_PANIC\n", __func__); + vlog_printf(VLOG_ERROR, "%s: test log_print in level VLOG_ERROR\n", __func__); + vlog_printf(VLOG_WARNING, "%s: test log_print in level VLOG_WARNING\n", __func__); + vlog_printf(VLOG_INFO, "%s: test log_print in level VLOG_INFO\n", __func__); + vlog_printf(VLOG_DEBUG, "%s: test log_print in level VLOG_DEBUG\n", __func__); + vlog_printf(VLOG_FUNC, "%s: test log_print in level VLOG_FUNC\n", __func__); + vlog_printf(VLOG_FUNC_ALL, "%s: test log_print in level VLOG_FUNC_ALL\n", __func__); + + usleep(10000); + + vlog_stop(); + + return 0; +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif diff --git a/src/vlogger/vlogger.cpp b/src/vlogger/vlogger.cpp new file mode 100644 index 0000000..09fc7b0 --- /dev/null +++ b/src/vlogger/vlogger.cpp @@ -0,0 +1,350 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "vlogger.h" + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils/bullseye.h" +#include "vma/util/utils.h" +#include "vma/util/sys_vars.h" + +#define VLOG_DEFAULT_MODULE_NAME "VMA" +#define VMA_LOG_CB_ENV_VAR "VMA_LOG_CB_FUNC_PTR" + +char g_vlogger_module_name[VLOG_MODULE_MAX_LEN] = VLOG_DEFAULT_MODULE_NAME; +int g_vlogger_fd = -1; +FILE* g_vlogger_file = NULL; +vlog_levels_t g_vlogger_level = VLOG_DEFAULT; +vlog_levels_t* g_p_vlogger_level = NULL; +uint8_t g_vlogger_details = 0; +uint8_t* g_p_vlogger_details = NULL; +uint32_t g_vlogger_usec_on_startup = 0; +bool g_vlogger_log_in_colors = MCE_DEFAULT_LOG_COLORS; +vma_log_cb_t g_vlogger_cb = NULL; + +namespace log_level +{ + typedef struct { + vlog_levels_t level; + const char * output_name; + const char * output_color; + const char ** input_names; + } level_names; + + static const char *log_names_none[] = {"none", NULL}; + static const char *log_names_panic[] = {"panic", "0", NULL}; + static const char *log_names_error[] = {"error", "1", NULL}; + static const char *log_names_warn[] = {"warn", "warning", "2", NULL}; + static const char *log_names_info[] = {"info", "information", "3", NULL}; + static const char *log_names_details[] = 
{"details", NULL}; + static const char *log_names_debug[] = {"debug", "4", NULL}; + static const char *log_names_fine[] = {"fine", "func", "5", NULL}; + static const char *log_names_finer[] = {"finer", "func+", "funcall", "func_all", "func-all", "6", NULL}; + static const char *log_names_all[] = {"all", NULL}; + + // must be by order because "to_str" relies on that! + static const level_names levels[] = { + {VLOG_NONE, "NONE", "\e[0;31m" /*Red*/, (const char ** )log_names_none}, + {VLOG_PANIC, "PANIC", "\e[0;31m" /*Red*/, (const char ** )log_names_panic}, + {VLOG_ERROR, "ERROR", "\e[0;31m" /*Red*/, (const char ** )log_names_error}, + {VLOG_WARNING, "WARNING", "\e[2;35m" /*Magenta*/, (const char ** )log_names_warn}, + {VLOG_INFO, "INFO", "\e[0m" /*Default*/, (const char ** )log_names_info}, + {VLOG_DETAILS, "DETAILS", "\e[0m" /*Default*/, (const char ** )log_names_details}, + {VLOG_DEBUG, "DEBUG", "\e[0m" /*Default*/, (const char ** )log_names_debug}, + {VLOG_FINE, "FINE", "\e[2m" /*Grey*/, (const char ** )log_names_fine}, + {VLOG_FINER, "FINER", "\e[2m" /*Grey*/, (const char ** )log_names_finer}, + {VLOG_ALL, "ALL", "\e[2m" /*Grey*/, (const char ** )log_names_all}, + }; + + // convert str to vlog_levels_t; upon error - returns the given 'def_value' + vlog_levels_t from_str(const char* str, vlog_levels_t def_value) + { + size_t num_levels = sizeof(levels) / sizeof(levels[0]); + for (size_t i = 0; i < num_levels; ++i) { + const char ** input_name = levels[i].input_names; + while (*input_name) { + if (strcasecmp(str, *input_name) == 0) { + /* Set maximum accessible logging level in case + * a user requests level that is reduced during compilation + * or requested one if the level is in valid range + */ + if (levels[i].level <= VMA_MAX_DEFINED_LOG_LEVEL) { + return levels[i].level; + } + def_value = (vlog_levels_t)(VMA_MAX_DEFINED_LOG_LEVEL); + vlog_printf(VLOG_WARNING, "VMA trace level set to max level %s\n", to_str(def_value)); + return def_value; + } + input_name++; 
+ } + } + + return def_value; // not found. use given def_value + } + + // convert int to vlog_levels_t; upon error - returns the given 'def_value' + vlog_levels_t from_int(const int int_log, vlog_levels_t def_value) + { + if (int_log >= VLOG_NONE && int_log <= VLOG_ALL) { + return static_cast(int_log); + } + return def_value; // not found. use given def_value + } + + const char * to_str(vlog_levels_t level) + { + static int base = VLOG_NONE; + return levels[level - base].output_name; + } + + const char * get_color(vlog_levels_t level) + { + static int base = VLOG_NONE; + return levels[level - base].output_color; + } +} + +pid_t gettid(void) +{ + return syscall(__NR_gettid); +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + +// Credit for the C++ de-mangler go to: http://tombarta.wordpress.com/2008/08/01/c-stack-traces-with-gcc/ +#include +void printf_backtrace(void) +{ + char **backtrace_strings; + void* backtrace_addrs[10]; + int backtrace_depth = backtrace(backtrace_addrs, 10); + printf("[tid: %d] ------ printf_backtrace ------ \n", gettid()); + backtrace_strings = backtrace_symbols(backtrace_addrs, backtrace_depth); + for (int i = 1; i < backtrace_depth; i++) { +#if 0 + printf("[%d] %p: %s\n", i, backtrace_addrs[i], backtrace_strings[i]); +#else + size_t sz = 1024; // just a guess, template names will go much wider + char *function = NULL; + char *begin = 0, *end = 0; + // find the parentheses and address offset surrounding the mangled name + for (char *j = backtrace_strings[i]; *j; ++j) { + if (*j == '(') { + begin = j; + } + else if (*j == '+') { + end = j; + } + } + if (begin && end) { + *begin++ = '\0'; + *end = '\0'; + // found our mangled name, now in [begin, end) + + int status; + function = abi::__cxa_demangle(begin, NULL, &sz, &status); + if (NULL == function) { + // demangling failed, just pretend it's a C function with no args + function = static_cast(malloc(sz)); + if (function) { + status = snprintf(function, sz - 1, "%s()", 
begin); + if (status > 0) { + function[status] = '\0'; + } else { + function[0] = '\0'; + } + } + } + // fprintf(out, " %s:%s\n", stack.backtrace_strings[i], function); + printf("[%d] %p: %s:%s\n", i, backtrace_addrs[i], backtrace_strings[i], (function ? function : "n/a")); + if (function) { + free(function); + } + } + else + { + // didn't find the mangled name, just print the whole line + printf("[%d] %p: %s\n", i, backtrace_addrs[i], backtrace_strings[i]); + } +#endif + } + free(backtrace_strings); +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + +//////////////////////////////////////////////////////////////////////////////// +// NOTE: this function matches 'bool vma_log_set_cb_func(vma_log_cb_t log_cb)' that +// we gave customers; hence, you must not change our side without considering their side +static vma_log_cb_t vma_log_get_cb_func() +{ + vma_log_cb_t log_cb = NULL; + const char* const CB_STR = getenv(VMA_LOG_CB_ENV_VAR); + if (!CB_STR || !*CB_STR) return NULL; + + if (1 != sscanf(CB_STR, "%p", &log_cb)) return NULL; + return log_cb; +} + +void vlog_start(const char* log_module_name, vlog_levels_t log_level, const char* log_filename, int log_details, bool log_in_colors) +{ + g_vlogger_file = stderr; + + g_vlogger_cb = vma_log_get_cb_func(); + + strncpy(g_vlogger_module_name, log_module_name, sizeof(g_vlogger_module_name) - 1); + g_vlogger_module_name[sizeof(g_vlogger_module_name) - 1] = '\0'; + + vlog_get_usec_since_start(); + + char local_log_filename[255]; + if (log_filename != NULL && strcmp(log_filename,"")) { + sprintf(local_log_filename, "%s", log_filename); + g_vlogger_fd = open(local_log_filename, O_WRONLY|O_CREAT|O_TRUNC, 0644); + if (g_vlogger_fd < 0) { + vlog_printf(VLOG_PANIC, "Failed to open logfile: %s\n",local_log_filename); + exit(1); + } + g_vlogger_file = fdopen(g_vlogger_fd, "w"); + + BULLSEYE_EXCLUDE_BLOCK_START + if (g_vlogger_file == NULL) { + g_vlogger_file = stderr; + vlog_printf(VLOG_PANIC, "Failed to open 
logfile: %s\n",local_log_filename); + exit(1); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + g_vlogger_level = log_level; + g_p_vlogger_level = &g_vlogger_level; + g_vlogger_details = log_details; + g_p_vlogger_details = &g_vlogger_details; + + + int file_fd = fileno(g_vlogger_file); + if (file_fd >= 0 && isatty(file_fd) && log_in_colors) + g_vlogger_log_in_colors = log_in_colors; +} + +void vlog_stop(void) +{ + // Closing logger + + // Allow only really extreme (PANIC) logs to go out + g_vlogger_level = VLOG_PANIC; + + //set default module name + strcpy(g_vlogger_module_name, VLOG_DEFAULT_MODULE_NAME); + + // Close output stream + if(g_vlogger_file && g_vlogger_file != stderr) + fclose(g_vlogger_file); + + //fix for using LD_PRELOAD with LBM. Unset the pointer given by the parent process, so a child could get his own pointer without issues. + unsetenv(VMA_LOG_CB_ENV_VAR); +} + +void vlog_output(vlog_levels_t log_level, const char* fmt , ... ) +{ + int len = 0; + char buf[VLOGGER_STR_SIZE]; + + // Format header + + // Set color scheme + if (g_vlogger_log_in_colors) + len += snprintf(buf+len, VLOGGER_STR_SIZE-len-1, "%s", log_level::get_color(log_level)); + + switch (g_vlogger_details) { + case 3: // Time + len += snprintf(buf+len, VLOGGER_STR_SIZE-len-1, " Time: %9.3f", ((float)vlog_get_usec_since_start())/1000); // fallthrough + case 2: // Pid + len += snprintf(buf+len, VLOGGER_STR_SIZE-len-1, " Pid: %5u", getpid()); // fallthrough + case 1: // Tid + len += snprintf(buf+len, VLOGGER_STR_SIZE-len-1, " Tid: %5u", gettid()); // fallthrough + case 0: // Func + default: + len += snprintf(buf+len, VLOGGER_STR_SIZE-len-1, " %s %s: ", g_vlogger_module_name, log_level::to_str(log_level)); + } + + if (len < 0) { + return ; + } + buf[len+1] = '\0'; + + // Format body + va_list ap; + va_start(ap, fmt); + if (fmt != NULL) + len += vsnprintf(buf+len, VLOGGER_STR_SIZE-len, fmt, ap); + va_end(ap); + + // Reset color scheme + if (g_vlogger_log_in_colors) { + // Save enough room for 
color code termination and EOL + if (len > VLOGGER_STR_SIZE - VLOGGER_STR_TERMINATION_SIZE) + len = VLOGGER_STR_SIZE - VLOGGER_STR_TERMINATION_SIZE - 1; + + len = snprintf(buf + len, VLOGGER_STR_TERMINATION_SIZE, VLOGGER_STR_COLOR_TERMINATION_STR); + if (len < 0) { + return ; + } + } + + if (g_vlogger_cb) + { + g_vlogger_cb(log_level, buf); + } + else if (g_vlogger_file) + { + // Print out + fprintf(g_vlogger_file, "%s", buf); + fflush(g_vlogger_file); + } + else { + printf("%s", buf); + } +} diff --git a/src/vlogger/vlogger.h b/src/vlogger/vlogger.h new file mode 100644 index 0000000..2aafa35 --- /dev/null +++ b/src/vlogger/vlogger.h @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef VLOGGER_H +#define VLOGGER_H + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include "utils/bullseye.h" +#include "utils/rdtsc.h" + +#define TO_STR(a) TOSTR_HELPER(a) +#define TOSTR_HELPER(a) #a + +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[%p]:%d:%s() " + +#undef MODULE_HDR_ENTRY +#define MODULE_HDR_ENTRY "ENTER: " + +#undef MODULE_HDR_EXIT +#define MODULE_HDR_EXIT "EXIT: " + +#undef __INFO__ +#define __INFO__ this + +#define vlog_printf(_log_level, _format, ... ) \ + do { \ + if (g_vlogger_level >= (_log_level)) { \ + vlog_output((_log_level), _format, ##__VA_ARGS__); \ + } \ + } while (0) + +#define VLOG_PRINTF( log_level, log_fmt, log_args...) vlog_printf(log_level, MODULE_HDR log_fmt "\n", __LINE__, __FUNCTION__, ##log_args) +#define VLOG_PRINTF_INFO(log_level, log_fmt, log_args...) vlog_printf(log_level, MODULE_HDR_INFO log_fmt "\n", __INFO__, __LINE__, __FUNCTION__, ##log_args) +#define VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS(log_level_once, log_level, log_fmt,log_args...) \ + do { \ + static vlog_levels_t ___log_level = log_level_once; \ + VLOG_PRINTF_INFO(___log_level, log_fmt, ##log_args); \ + ___log_level = log_level; \ + } while (0) + +#define VLOG_PRINTF_ONCE_THEN_ALWAYS(log_level_once, log_level, log_fmt,log_args...) \ + do { \ + static vlog_levels_t ___log_level = log_level_once; \ + VLOG_PRINTF(___log_level, log_fmt, ##log_args); \ + ___log_level = log_level; \ + } while (0) + +#define VLOG_PRINTF_ONCE_THEN_DEBUG(log_level_once, log_fmt,log_args...) 
\ + do { \ + static vlog_levels_t ___log_level = log_level_once; \ + vlog_printf(___log_level, log_fmt, ##log_args); \ + ___log_level = VLOG_DEBUG; \ + } while (0) + +#define VLOG_PRINTF_ENTRY(log_level, log_fmt, log_args...) vlog_printf(log_level, MODULE_HDR_ENTRY "%s(" log_fmt ")\n", __FUNCTION__, ##log_args) +#define VLOG_PRINTF_EXIT( log_level, log_fmt, log_args...) vlog_printf(log_level, MODULE_HDR_EXIT "%s() " log_fmt "\n", __FUNCTION__, ##log_args) + + +#define __log_panic(log_fmt, log_args...) do { VLOG_PRINTF(VLOG_PANIC, log_fmt, ##log_args); throw; } while (0) +#define __log_err(log_fmt, log_args...) do { VLOG_PRINTF(VLOG_ERROR, log_fmt, ##log_args); } while (0) +#define __log_warn(log_fmt, log_args...) do { VLOG_PRINTF(VLOG_WARNING, log_fmt, ##log_args); } while (0) +#define __log_info(log_fmt, log_args...) do { VLOG_PRINTF(VLOG_INFO, log_fmt, ##log_args); } while (0) + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_DETAILS) +#define __log_details(log_fmt, log_args...) ((void)0) +#else +#define __log_details(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_DETAILS) VLOG_PRINTF(VLOG_DETAILS, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_DEBUG) +#define __log_dbg(log_fmt, log_args...) ((void)0) +#else +#define __log_dbg(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_DEBUG) VLOG_PRINTF(VLOG_DEBUG, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINE) +#define __log_fine(log_fmt, log_args...) ((void)0) +#else +#define __log_fine(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_FINE) VLOG_PRINTF(VLOG_FINE, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINER) +#define __log_finer(log_fmt, log_args...) ((void)0) +#else +#define __log_finer(log_fmt, log_args...) 
do { if (g_vlogger_level >= VLOG_FINER) VLOG_PRINTF(VLOG_FINER, log_fmt, ##log_args); } while (0) +#endif /* VMA_MAX_DEFINED_LOG_LEVEL */ + +#define __log_info_panic(log_fmt, log_args...) do { VLOG_PRINTF_INFO(VLOG_PANIC, log_fmt, ##log_args); throw; } while (0) +#define __log_info_err(log_fmt, log_args...) do { VLOG_PRINTF_INFO(VLOG_ERROR, log_fmt, ##log_args); } while (0) +#define __log_info_warn(log_fmt, log_args...) do { VLOG_PRINTF_INFO(VLOG_WARNING, log_fmt, ##log_args); } while (0) +#define __log_info_info(log_fmt, log_args...) do { VLOG_PRINTF_INFO(VLOG_INFO, log_fmt, ##log_args); } while (0) + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_DETAILS) +#define __log_info_details(log_fmt, log_args...) ((void)0) +#else +#define __log_info_details(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_DETAILS) VLOG_PRINTF_INFO(VLOG_DETAILS, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_DEBUG) +#define __log_info_dbg(log_fmt, log_args...) ((void)0) +#else +#define __log_info_dbg(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_DEBUG) VLOG_PRINTF_INFO(VLOG_DEBUG, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINE) +#define __log_info_fine(log_fmt, log_args...) ((void)0) +#else +#define __log_info_fine(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_FINE) VLOG_PRINTF_INFO(VLOG_FINE, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINER) +#define __log_info_finer(log_fmt, log_args...) ((void)0) +#else +#define __log_info_finer(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_FINER) VLOG_PRINTF_INFO(VLOG_FINER, log_fmt, ##log_args); } while (0) +#endif /* VMA_MAX_DEFINED_LOG_LEVEL */ + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_DEBUG) +#define __log_entry_dbg(log_fmt, log_args...) ((void)0) +#else +#define __log_entry_dbg(log_fmt, log_args...) 
do { if (g_vlogger_level >= VLOG_DEBUG) VLOG_PRINTF_ENTRY(VLOG_DEBUG, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINE) +#define __log_entry_fine(log_fmt, log_args...) ((void)0) +#else +#define __log_entry_fine(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_FINE) VLOG_PRINTF_ENTRY(VLOG_FINE, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINER) +#define __log_entry_finer(log_fmt, log_args...) ((void)0) +#else +#define __log_entry_finer(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_FINER) VLOG_PRINTF_ENTRY(VLOG_FINER, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_DEBUG) +#define __log_exit_dbg(log_fmt, log_args...) ((void)0) +#else +#define __log_exit_dbg(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_DEBUG) VLOG_PRINTF_EXIT(VLOG_DEBUG, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINE) +#define __log_exit_fine(log_fmt, log_args...) ((void)0) +#else +#define __log_exit_fine(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_FINE) VLOG_PRINTF_EXIT(VLOG_FINE, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINER) +#define __log_exit_finer(log_fmt, log_args...) ((void)0) +#else +#define __log_exit_finer(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_FINER) VLOG_PRINTF_EXIT(VLOG_FINER, log_fmt, ##log_args); } while (0) +#endif /* VMA_MAX_DEFINED_LOG_LEVEL */ + +// deprecated functions - only exist for Backward Compatibility. Please avoid using them! +#define __log_func(...) __log_fine(__VA_ARGS__) +#define __log_funcall(...) __log_finer(__VA_ARGS__) +#define __log_info_func(...) __log_info_fine(__VA_ARGS__) +#define __log_info_funcall(...) __log_info_finer(__VA_ARGS__) +#define __log_entry_func(...) __log_entry_fine(__VA_ARGS__) +#define __log_entry_funcall(...) 
__log_entry_finer(__VA_ARGS__) +#define __log_exit_func(...) __log_exit_fine(__VA_ARGS__) +#define __log_exit_funcall(...) __log_exit_finer(__VA_ARGS__) + +#ifdef __cplusplus +extern "C" { +#endif //__cplusplus + +typedef enum { + VLOG_INIT = DEFINED_VLOG_INIT, + VLOG_NONE = DEFINED_VLOG_NONE, + VLOG_PANIC = DEFINED_VLOG_PANIC, + VLOG_ERROR = DEFINED_VLOG_ERROR, + VLOG_WARNING = DEFINED_VLOG_WARNING, + VLOG_INFO = DEFINED_VLOG_INFO, VLOG_DEFAULT = VLOG_INFO, + VLOG_DETAILS = DEFINED_VLOG_DETAILS, + VLOG_DEBUG = DEFINED_VLOG_DEBUG, + VLOG_FINE = DEFINED_VLOG_FINE, VLOG_FUNC = VLOG_FINE, + VLOG_FINER = DEFINED_VLOG_FINER, VLOG_FUNC_ALL = VLOG_FINER, + VLOG_ALL = DEFINED_VLOG_ALL /* last element */ +} vlog_levels_t; + +namespace log_level { + // convert str to vlog_levels_t; upon error - returns the given 'def_value' + vlog_levels_t from_str(const char* str, vlog_levels_t def_value = VLOG_DEFAULT); + + // convert int to vlog_levels_t; upon error - returns the given 'def_value' + vlog_levels_t from_int(const int int_log, vlog_levels_t def_value = VLOG_DEFAULT); + + const char * to_str(vlog_levels_t level); + const char * get_color(vlog_levels_t level); +} + + +#define VLOG_SINCE_YEAR 1900 +#define VLOG_MODULE_MAX_LEN 10 + +#define VLOGGER_STR_COLOR_TERMINATION_STR "\e[0m" +#define VLOGGER_STR_TERMINATION_SIZE 6 + +typedef void (*vma_log_cb_t)(int log_level, const char* str); + +extern char g_vlogger_module_name[VLOG_MODULE_MAX_LEN]; +extern FILE* g_vlogger_file; +extern int g_vlogger_fd; +extern vlog_levels_t g_vlogger_level; +extern vlog_levels_t* g_p_vlogger_level; +extern uint8_t g_vlogger_details; +extern uint8_t* g_p_vlogger_details; +extern uint32_t g_vlogger_usec_on_startup; +extern bool g_vlogger_log_in_colors; +extern vma_log_cb_t g_vlogger_cb; + +#define vlog_func_enter() vlog_printf(VLOG_FINE,"ENTER %s\n", __PRETTY_FUNCTION__); +#define vlog_func_exit() vlog_printf(VLOG_FINE,"EXIT %s\n",__PRETTY_FUNCTION__); + +#define vlog_func_all_enter() 
vlog_printf(VLOG_FINER,"ENTER %s\n", __PRETTY_FUNCTION__); +#define vlog_func_all_exit() vlog_printf(VLOG_FINER,"EXIT %s\n",__PRETTY_FUNCTION__); + +pid_t gettid(void); // Check vlogger.cpp for implementation + +void printf_backtrace(void); + +void vlog_start(const char* log_module_name, vlog_levels_t log_level = VLOG_DEFAULT, const char* log_filename = NULL, int log_details = 0, bool colored_log = true); +void vlog_stop(void); + +static inline uint32_t vlog_get_usec_since_start() +{ + struct timespec ts_now; + + BULLSEYE_EXCLUDE_BLOCK_START + if (gettime(&ts_now)) { + printf("%s() gettime() Returned with Error (errno=%d %m)\n", __func__, errno); + return (uint32_t)-1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + if (!g_vlogger_usec_on_startup) { + g_vlogger_usec_on_startup = ts_to_usec(&ts_now); + } + + return (ts_to_usec(&ts_now) - g_vlogger_usec_on_startup); +} + +#define VLOGGER_STR_SIZE 512 + +void vlog_output(vlog_levels_t log_level, const char* fmt , ... ); + +static inline void vlog_print_buffer(vlog_levels_t log_level, const char* msg_header, const char* msg_tail, const char* buf_user, int buf_len) +{ + if (g_vlogger_level < log_level) + return; + + int len = 0; + char buf[VLOGGER_STR_SIZE]; + + // Format header + if (g_vlogger_level >= VLOG_DEBUG) { + //vlog_time(log_level, log_msg); + len = snprintf(buf, sizeof(buf)-1, " Tid: %11lx : %s %s: ", + pthread_self(), g_vlogger_module_name, log_level::to_str(log_level)); + } else { + len = snprintf(buf, sizeof(buf)-1, "%s %s: ", + g_vlogger_module_name, log_level::to_str(log_level)); + } + if (len < 0) { + return ; + } + buf[len+1] = '\0'; + + + if (msg_header) + len += snprintf(buf+len, VLOGGER_STR_SIZE-len-1, "%s", msg_header); + + for (int c = 0; c < buf_len && len < (VLOGGER_STR_SIZE-1-6); c++) { + len += sprintf(buf+len, "%2.2X ", (unsigned char)buf_user[c]); + if ((c % 8) == 7) + len += sprintf(buf+len, " "); + } + + if (msg_tail) + len += snprintf(buf+len, VLOGGER_STR_SIZE-len-1, "%s", msg_tail); + + buf[len+1] 
= '\0'; + + // Print out + if (g_vlogger_cb) + { + g_vlogger_cb(log_level, buf); + } + else if (g_vlogger_file) + { + fprintf(g_vlogger_file, "%s", buf); + fflush(g_vlogger_file); + } + else + { + printf("%s", buf); + } + +} + +#ifdef __cplusplus +}; +#endif //__cplusplus + +#endif // VLOGGER_H diff --git a/src/vma/Makefile.am b/src/vma/Makefile.am new file mode 100644 index 0000000..6a710a7 --- /dev/null +++ b/src/vma/Makefile.am @@ -0,0 +1,325 @@ + +noinst_LTLIBRARIES = \ + libconfig_parser.la +libconfig_parser_la_SOURCES = +BUILT_SOURCES = + +AM_LFLAGS = -Plibvma_yy +AM_YFLAGS = -plibvma_yy -d +LEX_OUTPUT_ROOT=lex.libvma_yy + +# consider config_scanner.c, config_parser.c, config_parser.h as source (and not +# as built) because we don't want it to be created by old version of flex/yacc +# on some machines that will generate gcc warmings. +# in case you change the *.l or *.y in the future - than change the commenting in the following 3 lines +#----- +#BUILT_SOURCES += config_scanner.c config_parser.h config_parser.c +#libconfig_parser_la_SOURCES += util/config_scanner.l util/config_parser.y +libconfig_parser_la_SOURCES += config_scanner.c config_parser.c + + +CLEANFILES = $(BUILT_SOURCES) +dist-hook: + cd $(distdir); rm -f $(BUILT_SOURCES) + +SUBDIRS = infra netlink + +EXTRA_DIST = \ + util/hash_map.inl \ + dev/cq_mgr.inl \ + dev/cq_mgr_mlx5.inl \ + util/libvma.conf \ + util/30-libvma-limits.conf + +sysconf_DATA = util/libvma.conf +othersysconfdir=$(sysconfdir)/security/limits.d +othersysconf_DATA=util/30-libvma-limits.conf +otherincludedir = $(includedir)/mellanox +otherinclude_HEADERS = vma_extra.h + +install-exec-hook: + rm -f $(DESTDIR)$(libdir)/libvma.la + rm -f $(DESTDIR)$(libdir)/libvma.a + rm -f $(DESTDIR)$(bindir)/state_machine_test + rm -f $(DESTDIR)$(bindir)/vlogger_test + +uninstall-hook: + rm -f $(DESTDIR)$(libdir)/libvma.so* + +lib_LTLIBRARIES = libvma.la + +AM_CPPFLAGS := \ + -I$(top_srcdir)/src ${LIBNL_CFLAGS} + +if IS_RELEASE_ZERO 
+libvma_la_LDFLAGS := -no-undefined -version-number @VMA_LIBRARY_MAJOR@:@VMA_LIBRARY_MINOR@:@VMA_LIBRARY_REVISION@ +else +libvma_la_LDFLAGS := -no-undefined -version-number @VMA_LIBRARY_MAJOR@:@VMA_LIBRARY_MINOR@:@VMA_LIBRARY_REVISION@ -release @VMA_LIBRARY_RELEASE@ +endif + +libvma_la_LIBADD = \ + -lrt -ldl -lpthread $(LIBNL_LIBS) $(VERBS_LIBS) $(DPCP_LIBS) \ + $(top_builddir)/src/utils/libutils.la \ + $(top_builddir)/src/vlogger/libvlogger.la \ + $(top_builddir)/src/state_machine/libstate_machine.la \ + $(top_builddir)/src/stats/libstats.la \ + $(top_builddir)/src/vma/netlink/libnetlink.la \ + $(top_builddir)/src/vma/infra/libinfra.la \ + libconfig_parser.la + +libvma_la_SOURCES := \ + dev/allocator.cpp \ + dev/buffer_pool.cpp \ + dev/cq_mgr.cpp \ + dev/cq_mgr_mlx5.cpp \ + dev/cq_mgr_mp.cpp \ + dev/dm_mgr.cpp \ + dev/qp_mgr.cpp \ + dev/qp_mgr_eth_mlx5.cpp \ + dev/qp_mgr_eth_direct.cpp \ + dev/qp_mgr_mp.cpp \ + dev/gro_mgr.cpp \ + dev/rfs.cpp \ + dev/rfs_uc.cpp \ + dev/rfs_uc_tcp_gro.cpp \ + dev/rfs_mc.cpp \ + dev/time_converter.cpp \ + dev/time_converter_ptp.cpp \ + dev/time_converter_ib_ctx.cpp \ + dev/ib_ctx_handler.cpp \ + dev/ib_ctx_handler_collection.cpp \ + dev/net_device_val.cpp \ + dev/net_device_entry.cpp \ + dev/net_device_table_mgr.cpp \ + dev/wqe_send_handler.cpp \ + dev/wqe_send_ib_handler.cpp \ + dev/ring.cpp \ + dev/ring_bond.cpp \ + dev/ring_slave.cpp \ + dev/ring_simple.cpp \ + dev/ring_tap.cpp \ + dev/ring_eth_cb.cpp \ + dev/ring_eth_direct.cpp \ + dev/ring_profile.cpp \ + dev/ring_allocation_logic.cpp \ + \ + event/delta_timer.cpp \ + event/event_handler_manager.cpp \ + event/vlogger_timer_handler.cpp \ + event/netlink_event.cpp \ + \ + ib/base/verbs_extra.cpp \ + ib/mlx5/ib_mlx5.cpp \ + ib/mlx5/ib_mlx5_hw.cpp \ + ib/mlx5/ib_mlx5_dv.cpp \ + \ + iomux/epfd_info.cpp \ + iomux/epoll_wait_call.cpp \ + iomux/io_mux_call.cpp \ + iomux/poll_call.cpp \ + iomux/select_call.cpp \ + \ + lwip/pbuf.c \ + lwip/tcp.c \ + lwip/tcp_in.c \ + lwip/tcp_out.c \ + 
lwip/cc.c \ + lwip/cc_lwip.c \ + lwip/cc_cubic.c \ + lwip/cc_none.c \ + lwip/init.c \ + \ + proto/ip_frag.cpp \ + proto/flow_tuple.cpp \ + proto/vma_lwip.cpp \ + proto/neighbour.cpp \ + proto/neighbour_table_mgr.cpp \ + proto/L2_address.cpp \ + proto/route_table_mgr.cpp \ + proto/route_entry.cpp \ + proto/route_val.cpp \ + proto/rule_table_mgr.cpp \ + proto/rule_entry.cpp \ + proto/rule_val.cpp \ + proto/dst_entry.cpp \ + proto/dst_entry_udp.cpp \ + proto/dst_entry_udp_mc.cpp \ + proto/dst_entry_tcp.cpp \ + proto/header.cpp \ + proto/arp.cpp \ + proto/igmp_mgr.cpp \ + proto/igmp_handler.cpp \ + \ + sock/sockinfo.cpp \ + sock/sockinfo_udp.cpp \ + sock/sockinfo_tcp.cpp \ + sock/fd_collection.cpp \ + sock/pipeinfo.cpp \ + sock/socket_fd_api.cpp \ + sock/sock-redirect.cpp \ + \ + util/wakeup.cpp \ + util/wakeup_pipe.cpp \ + util/match.cpp \ + util/utils.cpp \ + util/instrumentation.cpp \ + util/sys_vars.cpp \ + util/agent.cpp \ + util/data_updater.cpp \ + \ + libvma.c \ + main.cpp \ + \ + dev/allocator.h \ + dev/buffer_pool.h \ + dev/cq_mgr.h \ + dev/cq_mgr_mlx5.h \ + dev/cq_mgr_mp.h \ + dev/dm_mgr.h \ + dev/gro_mgr.h \ + dev/ib_ctx_handler_collection.h \ + dev/ib_ctx_handler.h \ + dev/time_converter.h \ + dev/time_converter_ptp.h \ + dev/time_converter_ib_ctx.h \ + dev/net_device_entry.h \ + dev/net_device_table_mgr.h \ + dev/net_device_val.h \ + dev/qp_mgr.h \ + dev/qp_mgr_eth_mlx5.h \ + dev/qp_mgr_eth_direct.h \ + dev/qp_mgr_mp.h \ + dev/rfs.h \ + dev/rfs_mc.h \ + dev/rfs_uc.h \ + dev/rfs_uc_tcp_gro.h \ + dev/ring.h \ + dev/ring_bond.h \ + dev/ring_slave.h \ + dev/ring_simple.h \ + dev/ring_tap.h \ + dev/ring_eth_cb.h \ + dev/ring_eth_direct.h \ + dev/ring_profile.h \ + dev/ring_allocation_logic.h \ + dev/wqe_send_handler.h \ + dev/wqe_send_ib_handler.h \ + \ + event/command.h \ + event/delta_timer.h \ + event/event.h \ + event/event_handler_ibverbs.h \ + event/event_handler_manager.h \ + event/event_handler_rdma_cm.h \ + event/netlink_event.h \ + 
event/timer_handler.h \ + event/timers_group.h \ + event/vlogger_timer_handler.h \ + \ + ib/base/verbs_extra.h \ + ib/mlx5/ib_mlx5.h \ + ib/mlx5/ib_mlx5_hw.h \ + ib/mlx5/ib_mlx5_dv.h \ + \ + infra/sender.h \ + infra/sender_info_dst.h \ + infra/subject_observer.h \ + infra/cache_subject_observer.h \ + \ + iomux/epfd_info.h \ + iomux/epoll_wait_call.h \ + iomux/io_mux_call.h \ + iomux/poll_call.h \ + iomux/select_call.h \ + \ + lwip/cc_cubic.h \ + lwip/cc.h \ + lwip/def.h \ + lwip/err.h \ + lwip/init.h \ + lwip/ip_addr.h \ + lwip/ip.h \ + lwip/opt.h \ + lwip/pbuf.h \ + lwip/stats.h \ + lwip/tcp.h \ + lwip/tcp_impl.h \ + \ + netlink/link_info.h \ + netlink/neigh_info.h \ + netlink/netlink_compatibility.h \ + netlink/netlink_wrapper.h \ + netlink/route_info.h \ + \ + proto/arp.h \ + proto/dst_entry.h \ + proto/dst_entry_tcp.h \ + proto/dst_entry_udp.h \ + proto/dst_entry_udp_mc.h \ + proto/flow_tuple.h \ + proto/header.h \ + proto/igmp_handler.h \ + proto/igmp_mgr.h \ + proto/ip_address.h \ + proto/ip_frag.h \ + proto/L2_address.h \ + proto/mem_buf_desc.h \ + proto/neighbour.h \ + proto/neighbour_observer.h \ + proto/neighbour_table_mgr.h \ + proto/netlink_socket_mgr.h \ + proto/peer_key.h \ + proto/route_entry.h \ + proto/route_rule_table_key.h \ + proto/route_table_mgr.h \ + proto/route_val.h \ + proto/rule_entry.h \ + proto/rule_table_mgr.h \ + proto/rule_val.h \ + proto/vma_lwip.h \ + \ + sock/cleanable_obj.h \ + sock/fd_collection.h \ + sock/pipeinfo.h \ + sock/pkt_rcvr_sink.h \ + sock/pkt_sndr_source.h \ + sock/socket_fd_api.h \ + sock/sockinfo.h \ + sock/sockinfo_tcp.h \ + sock/sockinfo_udp.h \ + sock/sock-redirect.h \ + \ + util/chunk_list.h \ + util/hash_map.h \ + util/if.h \ + util/instrumentation.h \ + util/libvma.h \ + util/list.h \ + util/sg_array.h \ + util/sock_addr.h \ + util/sysctl_reader.h \ + util/sys_vars.h \ + util/to_str.h \ + util/utils.h \ + util/valgrind.h \ + util/vma_list.h \ + util/vma_stats.h \ + util/vtypes.h \ + util/wakeup.h \ + 
util/wakeup_pipe.h \ + util/agent.h \ + util/agent_def.h \ + util/data_updater.h \ + \ + config_parser.h \ + main.h \ + vma_extra.h + +libvma_la_DEPENDENCIES = \ + $(top_builddir)/src/vlogger/libvlogger.la \ + $(top_builddir)/src/state_machine/libstate_machine.la \ + $(top_builddir)/src/stats/libstats.la \ + $(top_builddir)/src/vma/netlink/libnetlink.la \ + $(top_builddir)/src/vma/infra/libinfra.la \ + libconfig_parser.la + diff --git a/src/vma/config_parser.c b/src/vma/config_parser.c new file mode 100644 index 0000000..cefda17 --- /dev/null +++ b/src/vma/config_parser.c @@ -0,0 +1,2226 @@ +/* A Bison parser, made by GNU Bison 2.7. */ + +/* Bison implementation for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2012 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . */ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. 
+ + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +/* C LALR(1) parser skeleton written by Richard Stallman, by + simplifying the original so-called "semantic" parser. */ + +/* All symbols defined below should begin with yy or YY, to avoid + infringing on user name space. This should be done even for local + variables, as they might otherwise be expanded by user macros. + There are some unavoidable exceptions within include files to + define necessary library symbols; they are noted "INFRINGES ON + USER NAME SPACE" below. */ + +/* Identify Bison output. */ +#define YYBISON 1 + +/* Bison version. */ +#define YYBISON_VERSION "2.7" + +/* Skeleton name. */ +#define YYSKELETON_NAME "yacc.c" + +/* Pure parsers. */ +#define YYPURE 0 + +/* Push parsers. */ +#define YYPUSH 0 + +/* Pull parsers. */ +#define YYPULL 1 + + +/* Substitute the variable and function names. */ +#define yyparse libvma_yyparse +#define yylex libvma_yylex +#define yyerror libvma_yyerror +#define yylval libvma_yylval +#define yychar libvma_yychar +#define yydebug libvma_yydebug +#define yynerrs libvma_yynerrs + +/* Copy the first part of user declarations. 
*/ +/* Line 371 of yacc.c */ +/* Line 39 of config_parser.y */ + + +/* header section */ +#include +#include +#include +#include +#include +#include +#include +#include + +typedef enum +{ + CONF_RULE +} configuration_t; + +#define YYERROR_VERBOSE 1 + +extern int yyerror(const char *msg); +extern int yylex(void); +static int parse_err = 0; + +struct dbl_lst __instance_list; + +/* some globals to store intermidiate parser state */ +static struct use_family_rule __vma_rule; +static struct address_port_rule *__vma_address_port_rule = NULL; +static int __vma_rule_push_head = 0; +static int current_role = 0; +static configuration_t current_conf_type = CONF_RULE; +static struct instance *curr_instance = NULL; + +int __vma_config_empty(void) +{ + return ((__instance_list.head == NULL) && (__instance_list.tail == NULL)); +} + +/* define the address by 4 integers */ +static void __vma_set_ipv4_addr(short a0, short a1, short a2, short a3) +{ + char buf[16]; + struct in_addr *p_ipv4 = NULL; + + p_ipv4 = &(__vma_address_port_rule->ipv4); + + snprintf(buf, sizeof(buf), "%hd.%hd.%hd.%hd", a0, a1, a2, a3); + if (1 != inet_pton(AF_INET, (const char*)buf, p_ipv4)) { + parse_err = 1; + yyerror("provided address is not legal"); + } +} + +static void __vma_set_inet_addr_prefix_len(unsigned char prefixlen) +{ + if (prefixlen > 32) + prefixlen = 32; + + __vma_address_port_rule->prefixlen = prefixlen; +} + +// SM: log part is not used... 
+int __vma_min_level = 9; + +void __vma_dump_address_port_rule_config_state(char *buf) { + if (__vma_address_port_rule->match_by_addr) { + char str_addr[INET_ADDRSTRLEN]; + + inet_ntop(AF_INET, &(__vma_address_port_rule->ipv4), str_addr, sizeof(str_addr)); + if ( __vma_address_port_rule->prefixlen != 32 ) { + sprintf(buf+strlen(buf), " %s/%d", str_addr, + __vma_address_port_rule->prefixlen); + } else { + sprintf(buf+strlen(buf), " %s", str_addr); + } + } else { + sprintf(buf+strlen(buf), " *"); + } + + if (__vma_address_port_rule->match_by_port) { + sprintf(buf+strlen(buf), ":%d",__vma_address_port_rule->sport); + if (__vma_address_port_rule->eport > __vma_address_port_rule->sport) + sprintf(buf+strlen(buf), "-%d",__vma_address_port_rule->eport); + } + else + sprintf(buf+strlen(buf), ":*"); +} + +/* dump the current state in readable format */ +static void __vma_dump_rule_config_state(void) { + char buf[1024]; + sprintf(buf, "\tACCESS CONFIG: use %s %s %s ", + __vma_get_transport_str(__vma_rule.target_transport), + __vma_get_role_str(current_role), + __vma_get_protocol_str(__vma_rule.protocol)); + __vma_address_port_rule = &(__vma_rule.first); + __vma_dump_address_port_rule_config_state(buf); + if (__vma_rule.use_second) { + __vma_address_port_rule = &(__vma_rule.second); + __vma_dump_address_port_rule_config_state(buf); + } + sprintf(buf+strlen(buf), "\n"); + __vma_log(1, "%s", buf); +} + +/* dump configuration properites of new instance */ +static void __vma_dump_instance(void) { + char buf[1024]; + + if (curr_instance) { + sprintf(buf, "CONFIGURATION OF INSTANCE "); + if (curr_instance->id.prog_name_expr) + sprintf(buf+strlen(buf), "%s ", curr_instance->id.prog_name_expr); + if (curr_instance->id.user_defined_id) + sprintf(buf+strlen(buf), "%s", curr_instance->id.user_defined_id); + sprintf(buf+strlen(buf), ":\n"); + __vma_log(1, "%s", buf); + } +} + +static void __vma_add_dbl_lst_node_head(struct dbl_lst *lst, struct dbl_lst_node *node) +{ + if (node && lst) { 
+ + node->prev = NULL; + node->next = lst->head; + + if (!lst->head) + lst->tail = node; + else + lst->head->prev = node; + + lst->head = node; + } +} + +static void __vma_add_dbl_lst_node(struct dbl_lst *lst, struct dbl_lst_node *node) +{ + if (node && lst) { + node->prev = lst->tail; + + if (!lst->head) + lst->head = node; + else + lst->tail->next = node; + lst->tail = node; + } +} + +static struct dbl_lst_node* __vma_allocate_dbl_lst_node(void) +{ + struct dbl_lst_node *ret_val = NULL; + + ret_val = (struct dbl_lst_node*) malloc(sizeof(struct dbl_lst_node)); + if (!ret_val) { + yyerror("fail to allocate new node"); + parse_err = 1; + } + else + memset((void*) ret_val, 0, sizeof(struct dbl_lst_node)); + return ret_val; +} + +/* use the above state for adding a new instance */ +static void __vma_add_instance(char *prog_name_expr, char *user_defined_id) { + struct dbl_lst_node *curr, *new_node; + struct instance *new_instance; + + curr = __instance_list.head; + while (curr) { + struct instance *instance = (struct instance*)curr->data; + if (!strcmp(prog_name_expr, instance->id.prog_name_expr) && !strcmp(user_defined_id, instance->id.user_defined_id)) { + curr_instance = (struct instance*)curr->data; + if (__vma_min_level <= 1) __vma_dump_instance(); + return; + } + curr = curr->next; + } + + if (!(new_node = __vma_allocate_dbl_lst_node())) + return; + + new_instance = (struct instance*) malloc(sizeof(struct instance)); + if (!new_instance) { + yyerror("fail to allocate new instance"); + parse_err = 1; + free(new_node); + return; + } + + memset((void*) new_instance, 0, sizeof(struct instance)); + new_instance->id.prog_name_expr = strdup(prog_name_expr); + new_instance->id.user_defined_id = strdup(user_defined_id); + + if (!new_instance->id.prog_name_expr || !new_instance->id.user_defined_id) { + yyerror("failed to allocate memory"); + parse_err = 1; + if (new_instance->id.prog_name_expr) + free(new_instance->id.prog_name_expr); + if 
(new_instance->id.user_defined_id) + free(new_instance->id.user_defined_id); + free(new_node); + free(new_instance); + return; + } + new_node->data = (void*)new_instance; + __vma_add_dbl_lst_node(&__instance_list, new_node); + curr_instance = new_instance; + if (__vma_min_level <= 1) __vma_dump_instance(); +} + +static void __vma_add_inst_with_int_uid(char *prog_name_expr, int user_defined_id) { + char str_id[50]; + sprintf(str_id, "%d", user_defined_id); + __vma_add_instance(prog_name_expr, str_id); +} + +/* use the above state for making a new rule */ +static void __vma_add_rule(void) { + struct dbl_lst *p_lst; + struct use_family_rule *rule; + struct dbl_lst_node *new_node; + + if (!curr_instance) + __vma_add_instance((char *)"*", (char *)"*"); + if (!curr_instance) + return; + + if (__vma_min_level <= 1) __vma_dump_rule_config_state(); + switch (current_role) { + case ROLE_TCP_SERVER: + p_lst = &curr_instance->tcp_srv_rules_lst; + break; + case ROLE_TCP_CLIENT: + p_lst = &curr_instance->tcp_clt_rules_lst; + break; + case ROLE_UDP_SENDER: + p_lst = &curr_instance->udp_snd_rules_lst; + break; + case ROLE_UDP_RECEIVER: + p_lst = &curr_instance->udp_rcv_rules_lst; + break; + case ROLE_UDP_CONNECT: + p_lst = &curr_instance->udp_con_rules_lst; + break; + default: + yyerror("ignoring unknown role"); + parse_err = 1; + return; + break; + } + + if (!(new_node = __vma_allocate_dbl_lst_node())) + return; + + rule = (struct use_family_rule *)malloc(sizeof(*rule)); + if (!rule) { + yyerror("fail to allocate new rule"); + parse_err = 1; + free(new_node); + return; + } + memset(rule, 0, sizeof(*rule)); + new_node->data = (void*)rule; + *((struct use_family_rule *)new_node->data) = __vma_rule; + if (__vma_rule_push_head) + __vma_add_dbl_lst_node_head(p_lst, new_node); + else + __vma_add_dbl_lst_node(p_lst, new_node); +} + + +/* Line 371 of yacc.c */ +/* Line 341 of config_parser.c */ + +# ifndef YY_NULL +# if defined __cplusplus && 201103L <= __cplusplus +# define YY_NULL 
nullptr +# else +# define YY_NULL 0 +# endif +# endif + +/* Enabling verbose error messages. */ +#ifdef YYERROR_VERBOSE +# undef YYERROR_VERBOSE +# define YYERROR_VERBOSE 1 +#else +# define YYERROR_VERBOSE 0 +#endif + +/* In a future release of Bison, this section will be replaced + by #include "y.tab.h". */ +#ifndef YY_LIBVMA_YY_Y_TAB_H_INCLUDED +# define YY_LIBVMA_YY_Y_TAB_H_INCLUDED +/* Enabling traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif +#if YYDEBUG +extern int libvma_yydebug; +#endif + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + USE = 258, + TCP_CLIENT = 259, + TCP_SERVER = 260, + UDP_SENDER = 261, + UDP_RECEIVER = 262, + UDP_CONNECT = 263, + TCP = 264, + UDP = 265, + OS = 266, + VMA = 267, + SDP = 268, + SA = 269, + INT = 270, + APP_ID = 271, + PROGRAM = 272, + USER_DEFINED_ID_STR = 273, + LOG = 274, + DEST = 275, + STDERR = 276, + SYSLOG = 277, + FILENAME = 278, + NAME = 279, + LEVEL = 280, + LINE = 281 + }; +#endif +/* Tokens. */ +#define USE 258 +#define TCP_CLIENT 259 +#define TCP_SERVER 260 +#define UDP_SENDER 261 +#define UDP_RECEIVER 262 +#define UDP_CONNECT 263 +#define TCP 264 +#define UDP 265 +#define OS 266 +#define VMA 267 +#define SDP 268 +#define SA 269 +#define INT 270 +#define APP_ID 271 +#define PROGRAM 272 +#define USER_DEFINED_ID_STR 273 +#define LOG 274 +#define DEST 275 +#define STDERR 276 +#define SYSLOG 277 +#define FILENAME 278 +#define NAME 279 +#define LEVEL 280 +#define LINE 281 + + + +#if ! defined YYSTYPE && ! 
defined YYSTYPE_IS_DECLARED +typedef union YYSTYPE +{ +/* Line 387 of yacc.c */ +/* Line 306 of config_parser.y */ + + int ival; + char *sval; + + +/* Line 387 of yacc.c */ +/* Line 442 of config_parser.c */ +} YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +#endif + +extern YYSTYPE libvma_yylval; + +#ifdef YYPARSE_PARAM +#if defined __STDC__ || defined __cplusplus +int libvma_yyparse (void *YYPARSE_PARAM); +#else +int libvma_yyparse (); +#endif +#else /* ! YYPARSE_PARAM */ +#if defined __STDC__ || defined __cplusplus +int libvma_yyparse (void); +#else +int libvma_yyparse (); +#endif +#endif /* ! YYPARSE_PARAM */ + +#endif /* !YY_LIBVMA_YY_Y_TAB_H_INCLUDED */ + +/* Copy the second part of user declarations. */ +/* Line 390 of yacc.c */ +/* Line 339 of config_parser.y */ + + long __vma_config_line_num; + +/* Line 390 of yacc.c */ +/* Line 474 of config_parser.c */ + +#ifdef short +# undef short +#endif + +#ifdef YYTYPE_UINT8 +typedef YYTYPE_UINT8 yytype_uint8; +#else +typedef unsigned char yytype_uint8; +#endif + +#ifdef YYTYPE_INT8 +typedef YYTYPE_INT8 yytype_int8; +#elif (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +typedef signed char yytype_int8; +#else +typedef short int yytype_int8; +#endif + +#ifdef YYTYPE_UINT16 +typedef YYTYPE_UINT16 yytype_uint16; +#else +typedef unsigned short int yytype_uint16; +#endif + +#ifdef YYTYPE_INT16 +typedef YYTYPE_INT16 yytype_int16; +#else +typedef short int yytype_int16; +#endif + +#ifndef YYSIZE_T +# ifdef __SIZE_TYPE__ +# define YYSIZE_T __SIZE_TYPE__ +# elif defined size_t +# define YYSIZE_T size_t +# elif ! 
defined YYSIZE_T && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include /* INFRINGES ON USER NAME SPACE */ +# define YYSIZE_T size_t +# else +# define YYSIZE_T unsigned int +# endif +#endif + +#define YYSIZE_MAXIMUM ((YYSIZE_T) -1) + +#ifndef YY_ +# if defined YYENABLE_NLS && YYENABLE_NLS +# if ENABLE_NLS +# include /* INFRINGES ON USER NAME SPACE */ +# define YY_(Msgid) dgettext ("bison-runtime", Msgid) +# endif +# endif +# ifndef YY_ +# define YY_(Msgid) Msgid +# endif +#endif + +/* Suppress unused-variable warnings by "using" E. */ +#if ! defined lint || defined __GNUC__ +# define YYUSE(E) ((void) (E)) +#else +# define YYUSE(E) /* empty */ +#endif + +/* Identity function, used to suppress warnings about constant conditions. */ +#ifndef lint +# define YYID(N) (N) +#else +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static int +YYID (int yyi) +#else +static int +YYID (yyi) + int yyi; +#endif +{ + return yyi; +} +#endif + +#if ! defined yyoverflow || YYERROR_VERBOSE + +/* The parser invokes alloca or malloc; define the necessary symbols. */ + +# ifdef YYSTACK_USE_ALLOCA +# if YYSTACK_USE_ALLOCA +# ifdef __GNUC__ +# define YYSTACK_ALLOC __builtin_alloca +# elif defined __BUILTIN_VA_ARG_INCR +# include /* INFRINGES ON USER NAME SPACE */ +# elif defined _AIX +# define YYSTACK_ALLOC __alloca +# elif defined _MSC_VER +# include /* INFRINGES ON USER NAME SPACE */ +# define alloca _alloca +# else +# define YYSTACK_ALLOC alloca +# if ! defined _ALLOCA_H && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +# include /* INFRINGES ON USER NAME SPACE */ + /* Use EXIT_SUCCESS as a witness for stdlib.h. */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# endif +# endif +# endif + +# ifdef YYSTACK_ALLOC + /* Pacify GCC's `empty if-body' warning. 
*/ +# define YYSTACK_FREE(Ptr) do { /* empty */; } while (YYID (0)) +# ifndef YYSTACK_ALLOC_MAXIMUM + /* The OS might guarantee only one guard page at the bottom of the stack, + and a page size can be as small as 4096 bytes. So we cannot safely + invoke alloca (N) if N exceeds 4096. Use a slightly smaller number + to allow for a few compiler-allocated temporary stack slots. */ +# define YYSTACK_ALLOC_MAXIMUM 4032 /* reasonable circa 2006 */ +# endif +# else +# define YYSTACK_ALLOC YYMALLOC +# define YYSTACK_FREE YYFREE +# ifndef YYSTACK_ALLOC_MAXIMUM +# define YYSTACK_ALLOC_MAXIMUM YYSIZE_MAXIMUM +# endif +# if (defined __cplusplus && ! defined EXIT_SUCCESS \ + && ! ((defined YYMALLOC || defined malloc) \ + && (defined YYFREE || defined free))) +# include /* INFRINGES ON USER NAME SPACE */ +# ifndef EXIT_SUCCESS +# define EXIT_SUCCESS 0 +# endif +# endif +# ifndef YYMALLOC +# define YYMALLOC malloc +# if ! defined malloc && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void *malloc (YYSIZE_T); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# ifndef YYFREE +# define YYFREE free +# if ! defined free && ! defined EXIT_SUCCESS && (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +void free (void *); /* INFRINGES ON USER NAME SPACE */ +# endif +# endif +# endif +#endif /* ! defined yyoverflow || YYERROR_VERBOSE */ + + +#if (! defined yyoverflow \ + && (! defined __cplusplus \ + || (defined YYSTYPE_IS_TRIVIAL && YYSTYPE_IS_TRIVIAL))) + +/* A type that is properly aligned for any stack member. */ +union yyalloc +{ + yytype_int16 yyss_alloc; + YYSTYPE yyvs_alloc; +}; + +/* The size of the maximum gap between one aligned stack and the next. */ +# define YYSTACK_GAP_MAXIMUM (sizeof (union yyalloc) - 1) + +/* The size of an array large to enough to hold all stacks, each with + N elements. 
*/ +# define YYSTACK_BYTES(N) \ + ((N) * (sizeof (yytype_int16) + sizeof (YYSTYPE)) \ + + YYSTACK_GAP_MAXIMUM) + +# define YYCOPY_NEEDED 1 + +/* Relocate STACK from its old location to the new one. The + local variables YYSIZE and YYSTACKSIZE give the old and new number of + elements in the stack, and YYPTR gives the new location of the + stack. Advance YYPTR to a properly aligned location for the next + stack. */ +# define YYSTACK_RELOCATE(Stack_alloc, Stack) \ + do \ + { \ + YYSIZE_T yynewbytes; \ + YYCOPY (&yyptr->Stack_alloc, Stack, yysize); \ + Stack = &yyptr->Stack_alloc; \ + yynewbytes = yystacksize * sizeof (*Stack) + YYSTACK_GAP_MAXIMUM; \ + yyptr += yynewbytes / sizeof (*yyptr); \ + } \ + while (YYID (0)) + +#endif + +#if defined YYCOPY_NEEDED && YYCOPY_NEEDED +/* Copy COUNT objects from SRC to DST. The source and destination do + not overlap. */ +# ifndef YYCOPY +# if defined __GNUC__ && 1 < __GNUC__ +# define YYCOPY(Dst, Src, Count) \ + __builtin_memcpy (Dst, Src, (Count) * sizeof (*(Src))) +# else +# define YYCOPY(Dst, Src, Count) \ + do \ + { \ + YYSIZE_T yyi; \ + for (yyi = 0; yyi < (Count); yyi++) \ + (Dst)[yyi] = (Src)[yyi]; \ + } \ + while (YYID (0)) +# endif +# endif +#endif /* !YYCOPY_NEEDED */ + +/* YYFINAL -- State number of the termination state. */ +#define YYFINAL 7 +/* YYLAST -- Last index in YYTABLE. */ +#define YYLAST 48 + +/* YYNTOKENS -- Number of terminals. */ +#define YYNTOKENS 32 +/* YYNNTS -- Number of nonterminals. */ +#define YYNNTS 26 +/* YYNRULES -- Number of rules. */ +#define YYNRULES 50 +/* YYNRULES -- Number of states. */ +#define YYNSTATES 74 + +/* YYTRANSLATE(YYLEX) -- Bison symbol number corresponding to YYLEX. */ +#define YYUNDEFTOK 2 +#define YYMAXUTOK 281 + +#define YYTRANSLATE(YYX) \ + ((unsigned int) (YYX) <= YYMAXUTOK ? yytranslate[YYX] : YYUNDEFTOK) + +/* YYTRANSLATE[YYLEX] -- Bison symbol number corresponding to YYLEX. 
*/ +static const yytype_uint8 yytranslate[] = +{ + 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 27, 2, 2, 31, 30, 29, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 28, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, + 2, 2, 2, 2, 2, 2, 1, 2, 3, 4, + 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, + 25, 26 +}; + +#if YYDEBUG +/* YYPRHS[YYN] -- Index of the first RHS symbol of rule number YYN in + YYRHS. */ +static const yytype_uint8 yyprhs[] = +{ + 0, 0, 3, 5, 8, 9, 10, 12, 15, 16, + 19, 21, 23, 25, 29, 30, 33, 36, 39, 42, + 46, 49, 52, 56, 60, 66, 68, 70, 72, 74, + 76, 78, 80, 82, 84, 86, 88, 90, 92, 96, + 104, 105, 108, 109, 112, 114, 118, 120, 128, 130, + 134 +}; + +/* YYRHS -- A `-1'-separated list of the rules' RHS. 
*/ +static const yytype_int8 yyrhs[] = +{ + 35, 0, -1, 26, -1, 33, 26, -1, -1, -1, + 33, -1, 34, 36, -1, -1, 36, 37, -1, 38, + -1, 42, -1, 44, -1, 19, 39, 33, -1, -1, + 39, 40, -1, 39, 41, -1, 20, 21, -1, 20, + 22, -1, 20, 23, 24, -1, 25, 15, -1, 43, + 33, -1, 16, 17, 18, -1, 16, 17, 15, -1, + 45, 46, 47, 48, 33, -1, 3, -1, 11, -1, + 12, -1, 13, -1, 14, -1, 27, -1, 5, -1, + 4, -1, 7, -1, 6, -1, 8, -1, 49, -1, + 50, -1, 51, 28, 57, -1, 51, 28, 57, 28, + 53, 28, 57, -1, -1, 52, 55, -1, -1, 54, + 55, -1, 56, -1, 56, 29, 15, -1, 27, -1, + 15, 30, 15, 30, 15, 30, 15, -1, 15, -1, + 15, 31, 15, -1, 27, -1 +}; + +/* YYRLINE[YYN] -- source line where rule number YYN was defined. */ +static const yytype_uint16 yyrline[] = +{ + 0, 345, 345, 346, 347, 349, 350, 353, 356, 357, + 361, 362, 363, 367, 370, 371, 372, 376, 377, 378, + 382, 386, 390, 391, 396, 400, 404, 405, 406, 407, + 408, 413, 414, 415, 416, 417, 421, 422, 426, 430, + 434, 434, 438, 438, 442, 443, 444, 448, 452, 453, + 454 +}; +#endif + +#if YYDEBUG || YYERROR_VERBOSE || 0 +/* YYTNAME[SYMBOL-NUM] -- String name of the symbol SYMBOL-NUM. + First, the terminals, then, starting at YYNTOKENS, nonterminals. 
*/ +static const char *const yytname[] = +{ + "$end", "error", "$undefined", "\"use\"", "\"tcp client\"", + "\"tcp server\"", "\"udp sender\"", "\"udp receiver\"", + "\"udp connect\"", "\"tcp\"", "\"udp\"", "\"os\"", "\"vma\"", "\"sdp\"", + "\"sa\"", "\"integer value\"", "\"application id\"", "\"program name\"", + "\"userdefined id str\"", "\"log statement\"", "\"destination\"", + "\"ystderr\"", "\"syslog\"", "\"yfile\"", "\"a name\"", "\"min-level\"", + "\"new line\"", "'*'", "':'", "'/'", "'.'", "'-'", "$accept", "NL", + "ONL", "config", "statements", "statement", "log_statement", "log_opts", + "log_dest", "verbosity", "app_id_statement", "app_id", + "socket_statement", "use", "transport", "role", "tuple", "three_tuple", + "five_tuple", "address_first", "$@1", "address_second", "$@2", "address", + "ipv4", "ports", YY_NULL +}; +#endif + +# ifdef YYPRINT +/* YYTOKNUM[YYLEX-NUM] -- Internal token number corresponding to + token YYLEX-NUM. */ +static const yytype_uint16 yytoknum[] = +{ + 0, 256, 257, 258, 259, 260, 261, 262, 263, 264, + 265, 266, 267, 268, 269, 270, 271, 272, 273, 274, + 275, 276, 277, 278, 279, 280, 281, 42, 58, 47, + 46, 45 +}; +# endif + +/* YYR1[YYN] -- Symbol number of symbol that rule YYN derives. */ +static const yytype_uint8 yyr1[] = +{ + 0, 32, 33, 33, 33, 34, 34, 35, 36, 36, + 37, 37, 37, 38, 39, 39, 39, 40, 40, 40, + 41, 42, 43, 43, 44, 45, 46, 46, 46, 46, + 46, 47, 47, 47, 47, 47, 48, 48, 49, 50, + 52, 51, 54, 53, 55, 55, 55, 56, 57, 57, + 57 +}; + +/* YYR2[YYN] -- Number of symbols composing right hand side of rule YYN. */ +static const yytype_uint8 yyr2[] = +{ + 0, 2, 1, 2, 0, 0, 1, 2, 0, 2, + 1, 1, 1, 3, 0, 2, 2, 2, 2, 3, + 2, 2, 3, 3, 5, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 3, 7, + 0, 2, 0, 2, 1, 3, 1, 7, 1, 3, + 1 +}; + +/* YYDEFACT[STATE-NAME] -- Default reduction number in state STATE-NUM. + Performed when YYTABLE doesn't specify something else to do. Zero + means the default is an error. 
*/ +static const yytype_uint8 yydefact[] = +{ + 4, 2, 6, 8, 0, 3, 7, 1, 25, 0, + 14, 9, 10, 11, 4, 12, 0, 0, 4, 21, + 26, 27, 28, 29, 30, 0, 23, 22, 0, 0, + 13, 15, 16, 32, 31, 34, 33, 35, 40, 17, + 18, 0, 20, 4, 36, 37, 0, 0, 19, 24, + 0, 0, 46, 41, 44, 48, 50, 38, 0, 0, + 0, 42, 0, 45, 49, 0, 0, 0, 0, 43, + 0, 39, 0, 47 +}; + +/* YYDEFGOTO[NTERM-NUM]. */ +static const yytype_int8 yydefgoto[] = +{ + -1, 2, 3, 4, 6, 11, 12, 18, 31, 32, + 13, 14, 15, 16, 25, 38, 43, 44, 45, 46, + 47, 65, 66, 53, 54, 57 +}; + +/* YYPACT[STATE-NUM] -- Index in YYTABLE of the portion describing + STATE-NUM. */ +#define YYPACT_NINF -25 +static const yytype_int8 yypact[] = +{ + -24, -25, -17, -25, 12, -25, -2, -25, -25, -1, + -25, -25, -25, -25, -24, -25, -6, 13, -7, -17, + -25, -25, -25, -25, -25, 19, -25, -25, 11, -4, + -17, -25, -25, -25, -25, -25, -25, -25, -25, -25, + -25, 6, -25, -24, -25, -25, -8, -12, -25, -17, + -5, 5, -25, -25, 7, 8, -25, 9, 23, 25, + 26, -25, 14, -25, -25, 15, -12, 27, -5, -25, + 16, -25, 30, -25 +}; + +/* YYPGOTO[NTERM-NUM]. */ +static const yytype_int8 yypgoto[] = +{ + -25, -14, -25, -25, -25, -25, -25, -25, -25, -25, + -25, -25, -25, -25, -25, -25, -25, -25, -25, -25, + -25, -25, -25, -19, -25, -20 +}; + +/* YYTABLE[YYPACT[STATE-NUM]]. What to do in state STATE-NUM. If + positive, shift that token. If negative, reduce the rule which + number is the opposite. If YYTABLE_NINF, syntax error. 
*/ +#define YYTABLE_NINF -1 +static const yytype_uint8 yytable[] = +{ + 19, 8, 1, 51, 30, 20, 21, 22, 23, 5, + 55, 42, 7, 28, 9, 52, 17, 10, 29, 1, + 50, 24, 56, 33, 34, 35, 36, 37, 26, 49, + 48, 27, 39, 40, 41, 58, 59, 61, 62, 60, + 63, 64, 70, 68, 67, 73, 72, 69, 71 +}; + +#define yypact_value_is_default(Yystate) \ + (!!((Yystate) == (-25))) + +#define yytable_value_is_error(Yytable_value) \ + YYID (0) + +static const yytype_uint8 yycheck[] = +{ + 14, 3, 26, 15, 18, 11, 12, 13, 14, 26, + 15, 15, 0, 20, 16, 27, 17, 19, 25, 26, + 28, 27, 27, 4, 5, 6, 7, 8, 15, 43, + 24, 18, 21, 22, 23, 30, 29, 28, 15, 31, + 15, 15, 15, 28, 30, 15, 30, 66, 68 +}; + +/* YYSTOS[STATE-NUM] -- The (internal number of the) accessing + symbol of state STATE-NUM. */ +static const yytype_uint8 yystos[] = +{ + 0, 26, 33, 34, 35, 26, 36, 0, 3, 16, + 19, 37, 38, 42, 43, 44, 45, 17, 39, 33, + 11, 12, 13, 14, 27, 46, 15, 18, 20, 25, + 33, 40, 41, 4, 5, 6, 7, 8, 47, 21, + 22, 23, 15, 48, 49, 50, 51, 52, 24, 33, + 28, 15, 27, 55, 56, 15, 27, 57, 30, 29, + 31, 28, 15, 15, 15, 53, 54, 30, 28, 55, + 15, 57, 30, 15 +}; + +#define yyerrok (yyerrstatus = 0) +#define yyclearin (yychar = YYEMPTY) +#define YYEMPTY (-2) +#define YYEOF 0 + +#define YYACCEPT goto yyacceptlab +#define YYABORT goto yyabortlab +#define YYERROR goto yyerrorlab + + +/* Like YYERROR except do call yyerror. This remains here temporarily + to ease the transition to the new meaning of YYERROR, for GCC. + Once GCC version 2 has supplanted version 1, this can go. However, + YYFAIL appears to be in use. Nevertheless, it is formally deprecated + in Bison 2.4.2's NEWS entry, where a plan to phase it out is + discussed. */ + +#define YYFAIL goto yyerrlab +#if defined YYFAIL + /* This is here to suppress warnings from the GCC cpp's + -Wunused-macros. Normally we don't worry about that warning, but + some users do, and we want to make it easy for users to remove + YYFAIL uses, which will produce warnings from Bison 2.5. 
*/ +#endif + +#define YYRECOVERING() (!!yyerrstatus) + +#define YYBACKUP(Token, Value) \ +do \ + if (yychar == YYEMPTY) \ + { \ + yychar = (Token); \ + yylval = (Value); \ + YYPOPSTACK (yylen); \ + yystate = *yyssp; \ + goto yybackup; \ + } \ + else \ + { \ + yyerror (YY_("syntax error: cannot back up")); \ + YYERROR; \ + } \ +while (YYID (0)) + +/* Error token number */ +#define YYTERROR 1 +#define YYERRCODE 256 + + +/* This macro is provided for backward compatibility. */ +#ifndef YY_LOCATION_PRINT +# define YY_LOCATION_PRINT(File, Loc) ((void) 0) +#endif + + +/* YYLEX -- calling `yylex' with the right arguments. */ +#ifdef YYLEX_PARAM +# define YYLEX yylex (YYLEX_PARAM) +#else +# define YYLEX yylex () +#endif + +/* Enable debugging if requested. */ +#if YYDEBUG + +# ifndef YYFPRINTF +# include /* INFRINGES ON USER NAME SPACE */ +# define YYFPRINTF fprintf +# endif + +# define YYDPRINTF(Args) \ +do { \ + if (yydebug) \ + YYFPRINTF Args; \ +} while (YYID (0)) + +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) \ +do { \ + if (yydebug) \ + { \ + YYFPRINTF (stderr, "%s ", Title); \ + yy_symbol_print (stderr, \ + Type, Value); \ + YYFPRINTF (stderr, "\n"); \ + } \ +} while (YYID (0)) + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. | +`--------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_value_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_value_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + FILE *yyo = yyoutput; + YYUSE (yyo); + if (!yyvaluep) + return; +# ifdef YYPRINT + if (yytype < YYNTOKENS) + YYPRINT (yyoutput, yytoknum[yytype], *yyvaluep); +# else + YYUSE (yyoutput); +# endif + switch (yytype) + { + default: + break; + } +} + + +/*--------------------------------. +| Print this symbol on YYOUTPUT. 
| +`--------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_symbol_print (FILE *yyoutput, int yytype, YYSTYPE const * const yyvaluep) +#else +static void +yy_symbol_print (yyoutput, yytype, yyvaluep) + FILE *yyoutput; + int yytype; + YYSTYPE const * const yyvaluep; +#endif +{ + if (yytype < YYNTOKENS) + YYFPRINTF (yyoutput, "token %s (", yytname[yytype]); + else + YYFPRINTF (yyoutput, "nterm %s (", yytname[yytype]); + + yy_symbol_value_print (yyoutput, yytype, yyvaluep); + YYFPRINTF (yyoutput, ")"); +} + +/*------------------------------------------------------------------. +| yy_stack_print -- Print the state stack from its BOTTOM up to its | +| TOP (included). | +`------------------------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_stack_print (yytype_int16 *yybottom, yytype_int16 *yytop) +#else +static void +yy_stack_print (yybottom, yytop) + yytype_int16 *yybottom; + yytype_int16 *yytop; +#endif +{ + YYFPRINTF (stderr, "Stack now"); + for (; yybottom <= yytop; yybottom++) + { + int yybot = *yybottom; + YYFPRINTF (stderr, " %d", yybot); + } + YYFPRINTF (stderr, "\n"); +} + +# define YY_STACK_PRINT(Bottom, Top) \ +do { \ + if (yydebug) \ + yy_stack_print ((Bottom), (Top)); \ +} while (YYID (0)) + + +/*------------------------------------------------. +| Report that the YYRULE is going to be reduced. 
| +`------------------------------------------------*/ + +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yy_reduce_print (YYSTYPE *yyvsp, int yyrule) +#else +static void +yy_reduce_print (yyvsp, yyrule) + YYSTYPE *yyvsp; + int yyrule; +#endif +{ + int yynrhs = yyr2[yyrule]; + int yyi; + unsigned long int yylno = yyrline[yyrule]; + YYFPRINTF (stderr, "Reducing stack by rule %d (line %lu):\n", + yyrule - 1, yylno); + /* The symbols being reduced. */ + for (yyi = 0; yyi < yynrhs; yyi++) + { + YYFPRINTF (stderr, " $%d = ", yyi + 1); + yy_symbol_print (stderr, yyrhs[yyprhs[yyrule] + yyi], + &(yyvsp[(yyi + 1) - (yynrhs)]) + ); + YYFPRINTF (stderr, "\n"); + } +} + +# define YY_REDUCE_PRINT(Rule) \ +do { \ + if (yydebug) \ + yy_reduce_print (yyvsp, Rule); \ +} while (YYID (0)) + +/* Nonzero means print parse trace. It is left uninitialized so that + multiple parsers can coexist. */ +int yydebug; +#else /* !YYDEBUG */ +# define YYDPRINTF(Args) +# define YY_SYMBOL_PRINT(Title, Type, Value, Location) +# define YY_STACK_PRINT(Bottom, Top) +# define YY_REDUCE_PRINT(Rule) +#endif /* !YYDEBUG */ + + +/* YYINITDEPTH -- initial size of the parser's stacks. */ +#ifndef YYINITDEPTH +# define YYINITDEPTH 200 +#endif + +/* YYMAXDEPTH -- maximum size the stacks can grow to (effective only + if the built-in stack extension method is used). + + Do not make this value too large; the results are undefined if + YYSTACK_ALLOC_MAXIMUM < YYSTACK_BYTES (YYMAXDEPTH) + evaluated with infinite-precision integer arithmetic. */ + +#ifndef YYMAXDEPTH +# define YYMAXDEPTH 10000 +#endif + + +#if YYERROR_VERBOSE + +# ifndef yystrlen +# if defined __GLIBC__ && defined _STRING_H +# define yystrlen strlen +# else +/* Return the length of YYSTR. 
*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static YYSIZE_T +yystrlen (const char *yystr) +#else +static YYSIZE_T +yystrlen (yystr) + const char *yystr; +#endif +{ + YYSIZE_T yylen; + for (yylen = 0; yystr[yylen]; yylen++) + continue; + return yylen; +} +# endif +# endif + +# ifndef yystpcpy +# if defined __GLIBC__ && defined _STRING_H && defined _GNU_SOURCE +# define yystpcpy stpcpy +# else +/* Copy YYSRC to YYDEST, returning the address of the terminating '\0' in + YYDEST. */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static char * +yystpcpy (char *yydest, const char *yysrc) +#else +static char * +yystpcpy (yydest, yysrc) + char *yydest; + const char *yysrc; +#endif +{ + char *yyd = yydest; + const char *yys = yysrc; + + while ((*yyd++ = *yys++) != '\0') + continue; + + return yyd - 1; +} +# endif +# endif + +# ifndef yytnamerr +/* Copy to YYRES the contents of YYSTR after stripping away unnecessary + quotes and backslashes, so that it's suitable for yyerror. The + heuristic is that double-quoting is unnecessary unless the string + contains an apostrophe, a comma, or backslash (other than + backslash-backslash). YYSTR is taken from yytname. If YYRES is + null, do not copy; instead, return the length of what the result + would have been. */ +static YYSIZE_T +yytnamerr (char *yyres, const char *yystr) +{ + if (*yystr == '"') + { + YYSIZE_T yyn = 0; + char const *yyp = yystr; + + for (;;) + switch (*++yyp) + { + case '\'': + case ',': + goto do_not_strip_quotes; + + /* coverity[unterminated_case] */ + case '\\': + if (*++yyp != '\\') + goto do_not_strip_quotes; + /* Fall through. */ + default: + if (yyres) + yyres[yyn] = *yyp; + yyn++; + break; + + case '"': + if (yyres) + yyres[yyn] = '\0'; + return yyn; + } + do_not_strip_quotes: ; + } + + if (! 
yyres) + return yystrlen (yystr); + + return yystpcpy (yyres, yystr) - yyres; +} +# endif + +/* Copy into *YYMSG, which is of size *YYMSG_ALLOC, an error message + about the unexpected token YYTOKEN for the state stack whose top is + YYSSP. + + Return 0 if *YYMSG was successfully written. Return 1 if *YYMSG is + not large enough to hold the message. In that case, also set + *YYMSG_ALLOC to the required number of bytes. Return 2 if the + required number of bytes is too large to store. */ +static int +yysyntax_error (YYSIZE_T *yymsg_alloc, char **yymsg, + yytype_int16 *yyssp, int yytoken) +{ + YYSIZE_T yysize0 = 0; + YYSIZE_T yysize = yysize0; + enum { YYERROR_VERBOSE_ARGS_MAXIMUM = 5 }; + /* Internationalized format string. */ + const char *yyformat = YY_NULL; + /* Arguments of yyformat. */ + char const *yyarg[YYERROR_VERBOSE_ARGS_MAXIMUM]; + /* Number of reported tokens (one for the "unexpected", one per + "expected"). */ + int yycount = 0; + + if (yytoken < 0) return 1; + yysize0 = yytnamerr (YY_NULL, yytname[yytoken]); + yysize = yysize0; + + /* There are many possibilities here to consider: + - Assume YYFAIL is not used. It's too flawed to consider. See + + for details. YYERROR is fine as it does not invoke this + function. + - If this state is a consistent state with a default action, then + the only way this function was invoked is if the default action + is an error action. In that case, don't check for expected + tokens because there are none. + - The only way there can be no lookahead present (in yychar) is if + this state is a consistent state with a default action. Thus, + detecting the absence of a lookahead is sufficient to determine + that there is no unexpected or expected token to report. In that + case, just report a simple "syntax error". + - Don't assume there isn't a lookahead just because this state is a + consistent state with a default action. 
There might have been a + previous inconsistent state, consistent state with a non-default + action, or user semantic action that manipulated yychar. + - Of course, the expected token list depends on states to have + correct lookahead information, and it depends on the parser not + to perform extra reductions after fetching a lookahead from the + scanner and before detecting a syntax error. Thus, state merging + (from LALR or IELR) and default reductions corrupt the expected + token list. However, the list is correct for canonical LR with + one exception: it will still contain any token that will not be + accepted due to an error action in a later state. + */ + if (yytoken != YYEMPTY) + { + int yyn = yypact[*yyssp]; + yyarg[yycount++] = yytname[yytoken]; + if (!yypact_value_is_default (yyn)) + { + /* Start YYX at -YYN if negative to avoid negative indexes in + YYCHECK. In other words, skip the first -YYN actions for + this state because they are default actions. */ + int yyxbegin = yyn < 0 ? -yyn : 0; + /* Stay within bounds of both yycheck and yytname. */ + int yychecklim = YYLAST - yyn + 1; + int yyxend = yychecklim < YYNTOKENS ? yychecklim : YYNTOKENS; + int yyx; + + for (yyx = yyxbegin; yyx < yyxend; ++yyx) + if (yycheck[yyx + yyn] == yyx && yyx != YYTERROR + && !yytable_value_is_error (yytable[yyx + yyn])) + { + if (yycount == YYERROR_VERBOSE_ARGS_MAXIMUM) + { + yycount = 1; + yysize = yysize0; + break; + } + yyarg[yycount++] = yytname[yyx]; + { + YYSIZE_T yysize1 = yysize + yytnamerr (YY_NULL, yytname[yyx]); + if (! 
(yysize <= yysize1 + && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) + return 2; + yysize = yysize1; + } + } + } + } + + switch (yycount) + { +# define YYCASE_(N, S) \ + case N: \ + yyformat = S; \ + break + YYCASE_(0, YY_("syntax error")); + YYCASE_(1, YY_("syntax error, unexpected %s")); + YYCASE_(2, YY_("syntax error, unexpected %s, expecting %s")); + YYCASE_(3, YY_("syntax error, unexpected %s, expecting %s or %s")); + YYCASE_(4, YY_("syntax error, unexpected %s, expecting %s or %s or %s")); + YYCASE_(5, YY_("syntax error, unexpected %s, expecting %s or %s or %s or %s")); +# undef YYCASE_ + } + + { + YYSIZE_T yysize1 = yysize + (yyformat ? yystrlen (yyformat) : 0); + if (! (yysize <= yysize1 && yysize1 <= YYSTACK_ALLOC_MAXIMUM)) + return 2; + yysize = yysize1; + } + + if (*yymsg_alloc < yysize) + { + *yymsg_alloc = 2 * yysize; + if (! (yysize <= *yymsg_alloc + && *yymsg_alloc <= YYSTACK_ALLOC_MAXIMUM)) + *yymsg_alloc = YYSTACK_ALLOC_MAXIMUM; + return 1; + } + + /* Avoid sprintf, as that infringes on the user's name space. + Don't have undefined behavior even if the translation + produced a string with the wrong number of "%s"s. */ + { + char *yyp = *yymsg; + int yyi = 0; + /* coverity[var_deref_op] */ + while ((*yyp = *yyformat) != '\0') + if (*yyp == '%' && yyformat[1] == 's' && yyi < yycount) + { + yyp += yytnamerr (yyp, yyarg[yyi++]); + yyformat += 2; + } + else + { + yyp++; + yyformat++; + } + } + return 0; +} +#endif /* YYERROR_VERBOSE */ + +/*-----------------------------------------------. +| Release the memory associated to this symbol. 
| +`-----------------------------------------------*/ + +/*ARGSUSED*/ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +static void +yydestruct (const char *yymsg, int yytype, YYSTYPE *yyvaluep) +#else +static void +yydestruct (yymsg, yytype, yyvaluep) + const char *yymsg; + int yytype; + YYSTYPE *yyvaluep; +#endif +{ + YYUSE (yyvaluep); + + if (!yymsg) + yymsg = "Deleting"; + (void)yymsg; + YY_SYMBOL_PRINT (yymsg, yytype, yyvaluep, yylocationp); + + switch (yytype) + { + + default: + break; + } +} + + + + +/* The lookahead symbol. */ +int yychar; + + +#ifndef YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN +# define YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN +# define YY_IGNORE_MAYBE_UNINITIALIZED_END +#endif +#ifndef YY_INITIAL_VALUE +# define YY_INITIAL_VALUE(Value) /* Nothing. */ +#endif + +/* The semantic value of the lookahead symbol. */ +YYSTYPE yylval YY_INITIAL_VALUE(yyval_default); + +/* Number of syntax errors so far. */ +int yynerrs; + + +/*----------. +| yyparse. | +`----------*/ + +#ifdef YYPARSE_PARAM +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void *YYPARSE_PARAM) +#else +int +yyparse (YYPARSE_PARAM) + void *YYPARSE_PARAM; +#endif +#else /* ! YYPARSE_PARAM */ +#if (defined __STDC__ || defined __C99__FUNC__ \ + || defined __cplusplus || defined _MSC_VER) +int +yyparse (void) +#else +int +yyparse () + +#endif +#endif +{ + int yystate; + /* Number of tokens to shift before error messages enabled. */ + int yyerrstatus; + + /* The stacks and their tools: + `yyss': related to states. + `yyvs': related to semantic values. + + Refer to the stacks through separate pointers, to allow yyoverflow + to reallocate them elsewhere. */ + + /* The state stack. */ + yytype_int16 yyssa[YYINITDEPTH]; + yytype_int16 *yyss; + yytype_int16 *yyssp; + + /* The semantic value stack. 
*/ + YYSTYPE yyvsa[YYINITDEPTH]; + YYSTYPE *yyvs; + YYSTYPE *yyvsp; + + YYSIZE_T yystacksize; + + int yyn; + int yyresult; + /* Lookahead token as an internal (translated) token number. */ + int yytoken = 0; + /* The variables used to return semantic value and location from the + action routines. */ + YYSTYPE yyval; + +#if YYERROR_VERBOSE + /* Buffer for error messages, and its allocated size. */ + char yymsgbuf[128]; + char *yymsg = yymsgbuf; + YYSIZE_T yymsg_alloc = sizeof yymsgbuf; +#endif + +#define YYPOPSTACK(N) (yyvsp -= (N), yyssp -= (N)) + + /* The number of symbols on the RHS of the reduced rule. + Keep to zero when no symbol should be popped. */ + int yylen = 0; + + yyssp = yyss = yyssa; + yyvsp = yyvs = yyvsa; + yystacksize = YYINITDEPTH; + + YYDPRINTF ((stderr, "Starting parse\n")); + + yystate = 0; + yyerrstatus = 0; + yynerrs = 0; + yychar = YYEMPTY; /* Cause a token to be read. */ + goto yysetstate; + +/*------------------------------------------------------------. +| yynewstate -- Push a new state, which is found in yystate. | +`------------------------------------------------------------*/ + yynewstate: + /* In all cases, when you get here, the value and location stacks + have just been pushed. So pushing a state here evens the stacks. */ + yyssp++; + + yysetstate: + *yyssp = yystate; + + if (yyss + yystacksize - 1 <= yyssp) + { + /* Get the current used size of the three stacks, in elements. */ + YYSIZE_T yysize = yyssp - yyss + 1; + +#ifdef yyoverflow + { + /* Give user a chance to reallocate the stack. Use copies of + these so that the &'s don't force the real ones into + memory. */ + YYSTYPE *yyvs1 = yyvs; + yytype_int16 *yyss1 = yyss; + + /* Each stack pointer address is followed by the size of the + data in use in that stack, in bytes. This used to be a + conditional around just the two extra args, but that might + be undefined if yyoverflow is a macro. 
*/ + yyoverflow (YY_("memory exhausted"), + &yyss1, yysize * sizeof (*yyssp), + &yyvs1, yysize * sizeof (*yyvsp), + &yystacksize); + + yyss = yyss1; + yyvs = yyvs1; + } +#else /* no yyoverflow */ +# ifndef YYSTACK_RELOCATE + goto yyexhaustedlab; +# else + /* Extend the stack our own way. */ + if (YYMAXDEPTH <= yystacksize) + goto yyexhaustedlab; + yystacksize *= 2; + if (YYMAXDEPTH < yystacksize) + yystacksize = YYMAXDEPTH; + + { + yytype_int16 *yyss1 = yyss; + /* coverity[leaked_storage] */ + union yyalloc *yyptr = + (union yyalloc *) YYSTACK_ALLOC (YYSTACK_BYTES (yystacksize)); + if (! yyptr) + goto yyexhaustedlab; + YYSTACK_RELOCATE (yyss_alloc, yyss); + YYSTACK_RELOCATE (yyvs_alloc, yyvs); +# undef YYSTACK_RELOCATE + if (yyss1 != yyssa) + YYSTACK_FREE (yyss1); + /* coverity[leaked_storage] */ + } +# endif +#endif /* no yyoverflow */ + + /* coverity[ptr_arith] */ + yyssp = yyss + yysize - 1; + yyvsp = yyvs + yysize - 1; + + YYDPRINTF ((stderr, "Stack size increased to %lu\n", + (unsigned long int) yystacksize)); + + if (yyss + yystacksize - 1 <= yyssp) + YYABORT; + } + + YYDPRINTF ((stderr, "Entering state %d\n", yystate)); + + if (yystate == YYFINAL) + YYACCEPT; + + goto yybackup; + +/*-----------. +| yybackup. | +`-----------*/ +yybackup: + + /* Do appropriate processing given the current state. Read a + lookahead token if we need one and don't already have one. */ + + /* First try to decide what to do without reference to lookahead token. */ + yyn = yypact[yystate]; + if (yypact_value_is_default (yyn)) + goto yydefault; + + /* Not known => get a lookahead token if don't already have one. */ + + /* YYCHAR is either YYEMPTY or YYEOF or a valid lookahead symbol. 
*/ + if (yychar == YYEMPTY) + { + YYDPRINTF ((stderr, "Reading a token: ")); + yychar = YYLEX; + } + + if (yychar <= YYEOF) + { + yychar = yytoken = YYEOF; + YYDPRINTF ((stderr, "Now at end of input.\n")); + } + else + { + yytoken = YYTRANSLATE (yychar); + YY_SYMBOL_PRINT ("Next token is", yytoken, &yylval, &yylloc); + } + + /* If the proper action on seeing token YYTOKEN is to reduce or to + detect an error, take that action. */ + yyn += yytoken; + if (yyn < 0 || YYLAST < yyn || yycheck[yyn] != yytoken) + goto yydefault; + yyn = yytable[yyn]; + if (yyn <= 0) + { + if (yytable_value_is_error (yyn)) + goto yyerrlab; + yyn = -yyn; + goto yyreduce; + } + + /* Count tokens shifted since error; after three, turn off error + status. */ + if (yyerrstatus) + yyerrstatus--; + + /* Shift the lookahead token. */ + YY_SYMBOL_PRINT ("Shifting", yytoken, &yylval, &yylloc); + + /* Discard the shifted token. */ + yychar = YYEMPTY; + + yystate = yyn; + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + *++yyvsp = yylval; + YY_IGNORE_MAYBE_UNINITIALIZED_END + + goto yynewstate; + + +/*-----------------------------------------------------------. +| yydefault -- do the default action for the current state. | +`-----------------------------------------------------------*/ +yydefault: + yyn = yydefact[yystate]; + if (yyn == 0) + goto yyerrlab; + goto yyreduce; + + +/*-----------------------------. +| yyreduce -- Do a reduction. | +`-----------------------------*/ +yyreduce: + /* yyn is the number of a rule to reduce with. */ + yylen = yyr2[yyn]; + + /* If YYLEN is nonzero, implement the default value of the action: + `$$ = $1'. + + Otherwise, the following line sets YYVAL to garbage. + This behavior is undocumented and Bison + users should not rely upon it. Assigning to YYVAL + unconditionally makes the parser a bit smaller, and it avoids a + GCC warning that YYVAL may be used uninitialized. 
*/ + yyval = yyvsp[1-yylen]; + + + YY_REDUCE_PRINT (yyn); + switch (yyn) + { + case 17: +/* Line 1792 of yacc.c */ +/* Line 376 of config_parser.y */ + { __vma_log_set_log_stderr(); } + break; + + case 18: +/* Line 1792 of yacc.c */ +/* Line 377 of config_parser.y */ + { __vma_log_set_log_syslog(); } + break; + + case 19: +/* Line 1792 of yacc.c */ +/* Line 378 of config_parser.y */ + { __vma_log_set_log_file((yyvsp[(3) - (3)].sval)); } + break; + + case 20: +/* Line 1792 of yacc.c */ +/* Line 382 of config_parser.y */ + { __vma_log_set_min_level((yyvsp[(2) - (2)].ival)); } + break; + + case 22: +/* Line 1792 of yacc.c */ +/* Line 390 of config_parser.y */ + {__vma_add_instance((yyvsp[(2) - (3)].sval), (yyvsp[(3) - (3)].sval)); if ((yyvsp[(2) - (3)].sval)) free((yyvsp[(2) - (3)].sval)); if ((yyvsp[(3) - (3)].sval)) free((yyvsp[(3) - (3)].sval)); } + break; + + case 23: +/* Line 1792 of yacc.c */ +/* Line 391 of config_parser.y */ + {__vma_add_inst_with_int_uid((yyvsp[(2) - (3)].sval), (yyvsp[(3) - (3)].ival)); if ((yyvsp[(2) - (3)].sval)) free((yyvsp[(2) - (3)].sval)); } + break; + + case 24: +/* Line 1792 of yacc.c */ +/* Line 396 of config_parser.y */ + { __vma_add_rule(); } + break; + + case 25: +/* Line 1792 of yacc.c */ +/* Line 400 of config_parser.y */ + { current_conf_type = CONF_RULE; } + break; + + case 26: +/* Line 1792 of yacc.c */ +/* Line 404 of config_parser.y */ + { __vma_rule.target_transport = TRANS_OS; } + break; + + case 27: +/* Line 1792 of yacc.c */ +/* Line 405 of config_parser.y */ + { __vma_rule.target_transport = TRANS_VMA; } + break; + + case 28: +/* Line 1792 of yacc.c */ +/* Line 406 of config_parser.y */ + { __vma_rule.target_transport = TRANS_SDP; } + break; + + case 29: +/* Line 1792 of yacc.c */ +/* Line 407 of config_parser.y */ + { __vma_rule.target_transport = TRANS_SA; } + break; + + case 30: +/* Line 1792 of yacc.c */ +/* Line 408 of config_parser.y */ + { __vma_rule.target_transport = TRANS_ULP; } + break; + + case 31: +/* 
Line 1792 of yacc.c */ +/* Line 413 of config_parser.y */ + { current_role = ROLE_TCP_SERVER; __vma_rule.protocol = PROTO_TCP; } + break; + + case 32: +/* Line 1792 of yacc.c */ +/* Line 414 of config_parser.y */ + { current_role = ROLE_TCP_CLIENT; __vma_rule.protocol = PROTO_TCP; } + break; + + case 33: +/* Line 1792 of yacc.c */ +/* Line 415 of config_parser.y */ + { current_role = ROLE_UDP_RECEIVER; __vma_rule.protocol = PROTO_UDP; } + break; + + case 34: +/* Line 1792 of yacc.c */ +/* Line 416 of config_parser.y */ + { current_role = ROLE_UDP_SENDER; __vma_rule.protocol = PROTO_UDP; } + break; + + case 35: +/* Line 1792 of yacc.c */ +/* Line 417 of config_parser.y */ + { current_role = ROLE_UDP_CONNECT; __vma_rule.protocol = PROTO_UDP; } + break; + + case 40: +/* Line 1792 of yacc.c */ +/* Line 434 of config_parser.y */ + { __vma_address_port_rule = &(__vma_rule.first); __vma_rule.use_second = 0; } + break; + + case 42: +/* Line 1792 of yacc.c */ +/* Line 438 of config_parser.y */ + { __vma_address_port_rule = &(__vma_rule.second); __vma_rule.use_second = 1; } + break; + + case 44: +/* Line 1792 of yacc.c */ +/* Line 442 of config_parser.y */ + { if (current_conf_type == CONF_RULE) __vma_address_port_rule->match_by_addr = 1; __vma_set_inet_addr_prefix_len(32); } + break; + + case 45: +/* Line 1792 of yacc.c */ +/* Line 443 of config_parser.y */ + { if (current_conf_type == CONF_RULE) __vma_address_port_rule->match_by_addr = 1; __vma_set_inet_addr_prefix_len((yyvsp[(3) - (3)].ival)); } + break; + + case 46: +/* Line 1792 of yacc.c */ +/* Line 444 of config_parser.y */ + { if (current_conf_type == CONF_RULE) __vma_address_port_rule->match_by_addr = 0; __vma_set_inet_addr_prefix_len(32); } + break; + + case 47: +/* Line 1792 of yacc.c */ +/* Line 448 of config_parser.y */ + { __vma_set_ipv4_addr((yyvsp[(1) - (7)].ival),(yyvsp[(3) - (7)].ival),(yyvsp[(5) - (7)].ival),(yyvsp[(7) - (7)].ival)); } + break; + + case 48: +/* Line 1792 of yacc.c */ +/* Line 452 of 
config_parser.y */ + { __vma_address_port_rule->match_by_port = 1; __vma_address_port_rule->sport= (yyvsp[(1) - (1)].ival); __vma_address_port_rule->eport= (yyvsp[(1) - (1)].ival); } + break; + + case 49: +/* Line 1792 of yacc.c */ +/* Line 453 of config_parser.y */ + { __vma_address_port_rule->match_by_port = 1; __vma_address_port_rule->sport= (yyvsp[(1) - (3)].ival); __vma_address_port_rule->eport= (yyvsp[(3) - (3)].ival); } + break; + + case 50: +/* Line 1792 of yacc.c */ +/* Line 454 of config_parser.y */ + { __vma_address_port_rule->match_by_port = 0; __vma_address_port_rule->sport= 0; __vma_address_port_rule->eport= 0; } + break; + + +/* Line 1792 of yacc.c */ +/* Line 1893 of config_parser.c */ + default: break; + } + /* User semantic actions sometimes alter yychar, and that requires + that yytoken be updated with the new translation. We take the + approach of translating immediately before every use of yytoken. + One alternative is translating here after every semantic action, + but that translation would be missed if the semantic action invokes + YYABORT, YYACCEPT, or YYERROR immediately after altering yychar or + if it invokes YYBACKUP. In the case of YYABORT or YYACCEPT, an + incorrect destructor might then be invoked immediately. In the + case of YYERROR or YYBACKUP, subsequent parser actions might lead + to an incorrect destructor call or verbose syntax error message + before the lookahead is translated. */ + YY_SYMBOL_PRINT ("-> $$ =", yyr1[yyn], &yyval, &yyloc); + + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + + *++yyvsp = yyval; + + /* Now `shift' the result of the reduction. Determine what state + that goes to, based on the state we popped back to and the rule + number reduced by. 
*/ + + yyn = yyr1[yyn]; + + yystate = yypgoto[yyn - YYNTOKENS] + *yyssp; + if (0 <= yystate && yystate <= YYLAST && yycheck[yystate] == *yyssp) + yystate = yytable[yystate]; + else + yystate = yydefgoto[yyn - YYNTOKENS]; + + goto yynewstate; + + +/*------------------------------------. +| yyerrlab -- here on detecting error | +`------------------------------------*/ +yyerrlab: + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = yychar == YYEMPTY ? YYEMPTY : YYTRANSLATE (yychar); + + /* If not already recovering from an error, report this error. */ + if (!yyerrstatus) + { + ++yynerrs; +#if ! YYERROR_VERBOSE + yyerror (YY_("syntax error")); +#else +# define YYSYNTAX_ERROR yysyntax_error (&yymsg_alloc, &yymsg, \ + yyssp, yytoken) + { + char const *yymsgp = YY_("syntax error"); + int yysyntax_error_status; + yysyntax_error_status = YYSYNTAX_ERROR; + if (yysyntax_error_status == 0) + yymsgp = yymsg; + else if (yysyntax_error_status == 1) + { + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); + yymsg = (char *) YYSTACK_ALLOC (yymsg_alloc); + if (!yymsg) + { + yymsg = yymsgbuf; + yymsg_alloc = sizeof yymsgbuf; + yysyntax_error_status = 2; + } + else + { + yysyntax_error_status = YYSYNTAX_ERROR; + yymsgp = yymsg; + } + } + yyerror (yymsgp); + if (yysyntax_error_status == 2) + goto yyexhaustedlab; + } +# undef YYSYNTAX_ERROR +#endif + } + + + + if (yyerrstatus == 3) + { + /* If just tried and failed to reuse lookahead token after an + error, discard it. */ + + if (yychar <= YYEOF) + { + /* Return failure if at end of input. */ + if (yychar == YYEOF) + YYABORT; + } + else + { + yydestruct ("Error: discarding", + yytoken, &yylval); + yychar = YYEMPTY; + } + } + + /* Else will try to reuse lookahead token after shifting the error + token. */ + goto yyerrlab1; + + +/*---------------------------------------------------. +| yyerrorlab -- error raised explicitly by YYERROR. 
| +`---------------------------------------------------*/ +yyerrorlab: + + /* Pacify compilers like GCC when the user code never invokes + YYERROR and the label yyerrorlab therefore never appears in user + code. */ + if (/*CONSTCOND*/ 0) + goto yyerrorlab; + + /* Do not reclaim the symbols of the rule which action triggered + this YYERROR. */ + YYPOPSTACK (yylen); + yylen = 0; + YY_STACK_PRINT (yyss, yyssp); + yystate = *yyssp; + goto yyerrlab1; + + +/*-------------------------------------------------------------. +| yyerrlab1 -- common code for both syntax error and YYERROR. | +`-------------------------------------------------------------*/ +yyerrlab1: + yyerrstatus = 3; /* Each real token shifted decrements this. */ + + for (;;) + { + yyn = yypact[yystate]; + if (!yypact_value_is_default (yyn)) + { + yyn += YYTERROR; + if (0 <= yyn && yyn <= YYLAST && yycheck[yyn] == YYTERROR) + { + yyn = yytable[yyn]; + if (0 < yyn) + break; + } + } + + /* Pop the current state because it cannot handle the error token. */ + if (yyssp == yyss) + YYABORT; + + + yydestruct ("Error: popping", + yystos[yystate], yyvsp); + YYPOPSTACK (1); + yystate = *yyssp; + YY_STACK_PRINT (yyss, yyssp); + } + + YY_IGNORE_MAYBE_UNINITIALIZED_BEGIN + *++yyvsp = yylval; + YY_IGNORE_MAYBE_UNINITIALIZED_END + + + /* Shift the error token. */ + YY_SYMBOL_PRINT ("Shifting", yystos[yyn], yyvsp, yylsp); + + yystate = yyn; + goto yynewstate; + + +/*-------------------------------------. +| yyacceptlab -- YYACCEPT comes here. | +`-------------------------------------*/ +yyacceptlab: + yyresult = 0; + goto yyreturn; + +/*-----------------------------------. +| yyabortlab -- YYABORT comes here. | +`-----------------------------------*/ +yyabortlab: + yyresult = 1; + goto yyreturn; + +#if !defined yyoverflow || YYERROR_VERBOSE +/*-------------------------------------------------. +| yyexhaustedlab -- memory exhaustion comes here. 
| +`-------------------------------------------------*/ +yyexhaustedlab: + yyerror (YY_("memory exhausted")); + yyresult = 2; + /* Fall through. */ +#endif + +yyreturn: + if (yychar != YYEMPTY) + { + /* Make sure we have latest lookahead translation. See comments at + user semantic actions for why this is necessary. */ + yytoken = YYTRANSLATE (yychar); + yydestruct ("Cleanup: discarding lookahead", + yytoken, &yylval); + } + /* Do not reclaim the symbols of the rule which action triggered + this YYABORT or YYACCEPT. */ + YYPOPSTACK (yylen); + YY_STACK_PRINT (yyss, yyssp); + while (yyssp != yyss) + { + yydestruct ("Cleanup: popping", + yystos[*yyssp], yyvsp); + YYPOPSTACK (1); + } +#ifndef yyoverflow + if (yyss != yyssa) + YYSTACK_FREE (yyss); +#endif +#if YYERROR_VERBOSE + if (yymsg != yymsgbuf) + YYSTACK_FREE (yymsg); +#endif + /* Make sure YYID is used. */ + return YYID (yyresult); +} + + +/* Line 2055 of yacc.c */ +/* Line 457 of config_parser.y */ + + +int yyerror(const char *msg) +{ + /* replace the $undefined and $end if exists */ + char *orig_msg = (char*)malloc(strlen(msg)+25); + char *final_msg = (char*)malloc(strlen(msg)+25); + + strcpy(orig_msg, msg); + + char *word = strtok(orig_msg, " "); + final_msg[0] = '\0'; + while (word != NULL) { + if (!strncmp(word, "$undefined", 10)) { + strcat(final_msg, "unrecognized-token "); + } else if (!strncmp(word, "$end",4)) { + strcat(final_msg, "end-of-file "); + } else { + strcat(final_msg, word); + strcat(final_msg, " "); + } + word = strtok(NULL, " "); + } + + __vma_log(9, "Error (line:%ld) : %s\n", __vma_config_line_num, final_msg); + parse_err = 1; + + free(orig_msg); + free(final_msg); + return 1; +} + +#include +#include + +/* parse apollo route dump file */ +int __vma_parse_config_file (const char *fileName) { + extern FILE * libvma_yyin; + + /* open the file */ + if (access(fileName, R_OK)) { + /* + * Let upper layer inform about no access to open file - based on log level + */ + return(1); + } + + /* 
coverity[toctou] */ + libvma_yyin = fopen(fileName,"r"); + if (!libvma_yyin) { + printf("libvma Error: Fail to open File:%s\n", fileName); + return(1); + } + __instance_list.head = NULL; + __instance_list.tail = NULL; + parse_err = 0; + __vma_config_line_num = 1; + + /* parse it */ + yyparse(); + + fclose(libvma_yyin); + return(parse_err); +} + +int __vma_parse_config_line (const char *line) { + extern FILE * libvma_yyin; + + __vma_rule_push_head = 1; + + /* The below casting is valid because we open the stream as read-only. */ + libvma_yyin = fmemopen((void*)line, strlen(line), "r"); + + if (!libvma_yyin) { + printf("libvma Error: Fail to parse line:%s\n", line); + return(1); + } + + parse_err = 0; + yyparse(); + + fclose(libvma_yyin); + + return(parse_err); +} diff --git a/src/vma/config_parser.h b/src/vma/config_parser.h new file mode 100644 index 0000000..3502c79 --- /dev/null +++ b/src/vma/config_parser.h @@ -0,0 +1,137 @@ +/* A Bison parser, made by GNU Bison 2.7. */ + +/* Bison interface for Yacc-like parsers in C + + Copyright (C) 1984, 1989-1990, 2000-2012 Free Software Foundation, Inc. + + This program is free software: you can redistribute it and/or modify + it under the terms of the GNU General Public License as published by + the Free Software Foundation, either version 3 of the License, or + (at your option) any later version. + + This program is distributed in the hope that it will be useful, + but WITHOUT ANY WARRANTY; without even the implied warranty of + MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the + GNU General Public License for more details. + + You should have received a copy of the GNU General Public License + along with this program. If not, see . 
*/ + +/* As a special exception, you may create a larger work that contains + part or all of the Bison parser skeleton and distribute that work + under terms of your choice, so long as that work isn't itself a + parser generator using the skeleton or a modified version thereof + as a parser skeleton. Alternatively, if you modify or redistribute + the parser skeleton itself, you may (at your option) remove this + special exception, which will cause the skeleton and the resulting + Bison output files to be licensed under the GNU General Public + License without this special exception. + + This special exception was added by the Free Software Foundation in + version 2.2 of Bison. */ + +#ifndef YY_LIBVMA_YY_CONFIG_PARSER_H_INCLUDED +# define YY_LIBVMA_YY_CONFIG_PARSER_H_INCLUDED +/* Enabling traces. */ +#ifndef YYDEBUG +# define YYDEBUG 0 +#endif +#if YYDEBUG +extern int libvma_yydebug; +#endif + +/* Tokens. */ +#ifndef YYTOKENTYPE +# define YYTOKENTYPE + /* Put the tokens into the symbol table, so that GDB and other debuggers + know about them. */ + enum yytokentype { + USE = 258, + TCP_CLIENT = 259, + TCP_SERVER = 260, + UDP_SENDER = 261, + UDP_RECEIVER = 262, + UDP_CONNECT = 263, + TCP = 264, + UDP = 265, + OS = 266, + VMA = 267, + SDP = 268, + SA = 269, + INT = 270, + APP_ID = 271, + PROGRAM = 272, + USER_DEFINED_ID_STR = 273, + LOG = 274, + DEST = 275, + STDERR = 276, + SYSLOG = 277, + FILENAME = 278, + NAME = 279, + LEVEL = 280, + LINE = 281 + }; +#endif +/* Tokens. */ +#define USE 258 +#define TCP_CLIENT 259 +#define TCP_SERVER 260 +#define UDP_SENDER 261 +#define UDP_RECEIVER 262 +#define UDP_CONNECT 263 +#define TCP 264 +#define UDP 265 +#define OS 266 +#define VMA 267 +#define SDP 268 +#define SA 269 +#define INT 270 +#define APP_ID 271 +#define PROGRAM 272 +#define USER_DEFINED_ID_STR 273 +#define LOG 274 +#define DEST 275 +#define STDERR 276 +#define SYSLOG 277 +#define FILENAME 278 +#define NAME 279 +#define LEVEL 280 +#define LINE 281 + + + +#if ! 
defined YYSTYPE && ! defined YYSTYPE_IS_DECLARED +typedef union YYSTYPE +{ +/* Line 2058 of yacc.c */ +/* Line 306 of config_parser.y */ + + int ival; + char *sval; + + +/* Line 2058 of yacc.c */ +/* Line 115 of config_parser.h */ +} YYSTYPE; +# define YYSTYPE_IS_TRIVIAL 1 +# define yystype YYSTYPE /* obsolescent; will be withdrawn */ +# define YYSTYPE_IS_DECLARED 1 +#endif + +extern YYSTYPE libvma_yylval; + +#ifdef YYPARSE_PARAM +#if defined __STDC__ || defined __cplusplus +int libvma_yyparse (void *YYPARSE_PARAM); +#else +int libvma_yyparse (); +#endif +#else /* ! YYPARSE_PARAM */ +#if defined __STDC__ || defined __cplusplus +int libvma_yyparse (void); +#else +int libvma_yyparse (); +#endif +#endif /* ! YYPARSE_PARAM */ + +#endif /* !YY_LIBVMA_YY_CONFIG_PARSER_H_INCLUDED */ diff --git a/src/vma/config_scanner.c b/src/vma/config_scanner.c new file mode 100644 index 0000000..a885807 --- /dev/null +++ b/src/vma/config_scanner.c @@ -0,0 +1,2379 @@ + +/* Line 3 of config_scanner.c */ + +#define YY_INT_ALIGNED short int + +/* A lexical scanner generated by flex */ + +#define yy_create_buffer libvma_yy_create_buffer +#define yy_delete_buffer libvma_yy_delete_buffer +#define yy_flex_debug libvma_yy_flex_debug +#define yy_init_buffer libvma_yy_init_buffer +#define yy_flush_buffer libvma_yy_flush_buffer +#define yy_load_buffer_state libvma_yy_load_buffer_state +#define yy_switch_to_buffer libvma_yy_switch_to_buffer +#define yyin libvma_yyin +#define yyleng libvma_yyleng +#define yylex libvma_yylex +#define yylineno libvma_yylineno +#define yyout libvma_yyout +#define yyrestart libvma_yyrestart +#define yytext libvma_yytext +#define yywrap libvma_yywrap +#define yyalloc libvma_yyalloc +#define yyrealloc libvma_yyrealloc +#define yyfree libvma_yyfree + +#define FLEX_SCANNER +#define YY_FLEX_MAJOR_VERSION 2 +#define YY_FLEX_MINOR_VERSION 5 +#define YY_FLEX_SUBMINOR_VERSION 39 +#if YY_FLEX_SUBMINOR_VERSION > 0 +#define FLEX_BETA +#endif + +/* First, we deal with 
platform-specific or compiler-specific issues. */ + +/* begin standard C headers. */ +#include +#include +#include +#include + +/* end standard C headers. */ + +/* flex integer type definitions */ + +#ifndef FLEXINT_H +#define FLEXINT_H + +/* C99 systems have . Non-C99 systems may or may not. */ + +#if defined (__STDC_VERSION__) && __STDC_VERSION__ >= 199901L + +/* C99 says to define __STDC_LIMIT_MACROS before including stdint.h, + * if you want the limit (max/min) macros for int types. + */ +#ifndef __STDC_LIMIT_MACROS +#define __STDC_LIMIT_MACROS 1 +#endif + +#include +typedef int8_t flex_int8_t; +typedef uint8_t flex_uint8_t; +typedef int16_t flex_int16_t; +typedef uint16_t flex_uint16_t; +typedef int32_t flex_int32_t; +typedef uint32_t flex_uint32_t; +#else +typedef signed char flex_int8_t; +typedef short int flex_int16_t; +typedef int flex_int32_t; +typedef unsigned char flex_uint8_t; +typedef unsigned short int flex_uint16_t; +typedef unsigned int flex_uint32_t; + +/* Limits of integral types. */ +#ifndef INT8_MIN +#define INT8_MIN (-128) +#endif +#ifndef INT16_MIN +#define INT16_MIN (-32767-1) +#endif +#ifndef INT32_MIN +#define INT32_MIN (-2147483647-1) +#endif +#ifndef INT8_MAX +#define INT8_MAX (127) +#endif +#ifndef INT16_MAX +#define INT16_MAX (32767) +#endif +#ifndef INT32_MAX +#define INT32_MAX (2147483647) +#endif +#ifndef UINT8_MAX +#define UINT8_MAX (255U) +#endif +#ifndef UINT16_MAX +#define UINT16_MAX (65535U) +#endif +#ifndef UINT32_MAX +#define UINT32_MAX (4294967295U) +#endif + +#endif /* ! C99 */ + +#endif /* ! FLEXINT_H */ + +#ifdef __cplusplus + +/* The "const" storage-class-modifier is valid. */ +#define YY_USE_CONST + +#else /* ! __cplusplus */ + +/* C99 requires __STDC__ to be defined as 1. */ +#if defined (__STDC__) + +#define YY_USE_CONST + +#endif /* defined (__STDC__) */ +#endif /* ! __cplusplus */ + +#ifdef YY_USE_CONST +#define yyconst const +#else +#define yyconst +#endif + +/* Returned upon end-of-file. 
*/ +#define YY_NULL 0 + +/* Promotes a possibly negative, possibly signed char to an unsigned + * integer for use as an array index. If the signed char is negative, + * we want to instead treat it as an 8-bit unsigned char, hence the + * double cast. + */ +#define YY_SC_TO_UI(c) ((unsigned int) (unsigned char) c) + +/* Enter a start condition. This macro really ought to take a parameter, + * but we do it the disgusting crufty way forced on us by the ()-less + * definition of BEGIN. + */ +#define BEGIN (yy_start) = 1 + 2 * + +/* Translate the current start state into a value that can be later handed + * to BEGIN to return to the state. The YYSTATE alias is for lex + * compatibility. + */ +#define YY_START (((yy_start) - 1) / 2) +#define YYSTATE YY_START + +/* Action number for EOF rule of a given start state. */ +#define YY_STATE_EOF(state) (YY_END_OF_BUFFER + state + 1) + +/* Special action meaning "start processing a new file". */ +#define YY_NEW_FILE libvma_yyrestart(libvma_yyin ) + +#define YY_END_OF_BUFFER_CHAR 0 + +/* Size of default input buffer. */ +#ifndef YY_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k. + * Moreover, YY_BUF_SIZE is 2*YY_READ_BUF_SIZE in the general case. + * Ditto for the __ia64__ case accordingly. + */ +#define YY_BUF_SIZE 32768 +#else +#define YY_BUF_SIZE 16384 +#endif /* __ia64__ */ +#endif + +/* The state buf must be large enough to hold one state per character in the main buffer. 
+ */ +#define YY_STATE_BUF_SIZE ((YY_BUF_SIZE + 2) * sizeof(yy_state_type)) + +#ifndef YY_TYPEDEF_YY_BUFFER_STATE +#define YY_TYPEDEF_YY_BUFFER_STATE +typedef struct yy_buffer_state *YY_BUFFER_STATE; +#endif + +#ifndef YY_TYPEDEF_YY_SIZE_T +#define YY_TYPEDEF_YY_SIZE_T +typedef size_t yy_size_t; +#endif + +extern yy_size_t libvma_yyleng; + +extern FILE *libvma_yyin, *libvma_yyout; + +#define EOB_ACT_CONTINUE_SCAN 0 +#define EOB_ACT_END_OF_FILE 1 +#define EOB_ACT_LAST_MATCH 2 + + #define YY_LESS_LINENO(n) + #define YY_LINENO_REWIND_TO(ptr) + +/* Return all but the first "n" matched characters back to the input stream. */ +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up libvma_yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + *yy_cp = (yy_hold_char); \ + YY_RESTORE_YY_MORE_OFFSET \ + (yy_c_buf_p) = yy_cp = yy_bp + yyless_macro_arg - YY_MORE_ADJ; \ + YY_DO_BEFORE_ACTION; /* set up libvma_yytext again */ \ + } \ + while ( 0 ) + +#define unput(c) yyunput( c, (yytext_ptr) ) + +#ifndef YY_STRUCT_YY_BUFFER_STATE +#define YY_STRUCT_YY_BUFFER_STATE +struct yy_buffer_state + { + FILE *yy_input_file; + + char *yy_ch_buf; /* input buffer */ + char *yy_buf_pos; /* current position in input buffer */ + + /* Size of input buffer in bytes, not including room for EOB + * characters. + */ + yy_size_t yy_buf_size; + + /* Number of characters read into yy_ch_buf, not including EOB + * characters. + */ + yy_size_t yy_n_chars; + + /* Whether we "own" the buffer - i.e., we know we created it, + * and can realloc() it to grow it, and should free() it to + * delete it. + */ + int yy_is_our_buffer; + + /* Whether this is an "interactive" input source; if so, and + * if we're using stdio for input, then we want to use getc() + * instead of fread(), to make sure we stop fetching input after + * each newline. + */ + int yy_is_interactive; + + /* Whether we're considered to be at the beginning of a line. 
+ * If so, '^' rules will be active on the next match, otherwise + * not. + */ + int yy_at_bol; + + int yy_bs_lineno; /**< The line count. */ + int yy_bs_column; /**< The column count. */ + + /* Whether to try to fill the input buffer when we reach the + * end of it. + */ + int yy_fill_buffer; + + int yy_buffer_status; + +#define YY_BUFFER_NEW 0 +#define YY_BUFFER_NORMAL 1 + /* When an EOF's been seen but there's still some text to process + * then we mark the buffer as YY_EOF_PENDING, to indicate that we + * shouldn't try reading from the input source any more. We might + * still have a bunch of tokens to match, though, because of + * possible backing-up. + * + * When we actually see the EOF, we change the status to "new" + * (via libvma_yyrestart()), so that the user can continue scanning by + * just pointing libvma_yyin at a new input file. + */ +#define YY_BUFFER_EOF_PENDING 2 + + }; +#endif /* !YY_STRUCT_YY_BUFFER_STATE */ + +/* Stack of input buffers. */ +static size_t yy_buffer_stack_top = 0; /**< index of top of stack. */ +static size_t yy_buffer_stack_max = 0; /**< capacity of stack. */ +static YY_BUFFER_STATE * yy_buffer_stack = 0; /**< Stack as an array. */ + +/* We provide macros for accessing buffer states in case in the + * future we want to put the buffer states in a more general + * "scanner state". + * + * Returns the top of the stack, or NULL. + */ +#define YY_CURRENT_BUFFER ( (yy_buffer_stack) \ + ? (yy_buffer_stack)[(yy_buffer_stack_top)] \ + : NULL) + +/* Same as previous macro, but useful when we know that the buffer stack is not + * NULL or when we need an lvalue. For internal use only. + */ +#define YY_CURRENT_BUFFER_LVALUE (yy_buffer_stack)[(yy_buffer_stack_top)] + +/* yy_hold_char holds the character lost when libvma_yytext is formed. */ +static char yy_hold_char; +static yy_size_t yy_n_chars; /* number of characters read into yy_ch_buf */ +yy_size_t libvma_yyleng; + +/* Points to current character in buffer. 
*/ +static char *yy_c_buf_p = (char *) 0; +static int yy_init = 0; /* whether we need to initialize */ +static int yy_start = 0; /* start state number */ + +/* Flag which is used to allow libvma_yywrap()'s to do buffer switches + * instead of setting up a fresh libvma_yyin. A bit of a hack ... + */ +static int yy_did_buffer_switch_on_eof; + +void libvma_yyrestart (FILE *input_file ); +void libvma_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ); +YY_BUFFER_STATE libvma_yy_create_buffer (FILE *file,int size ); +void libvma_yy_delete_buffer (YY_BUFFER_STATE b ); +void libvma_yy_flush_buffer (YY_BUFFER_STATE b ); +void libvma_yypush_buffer_state (YY_BUFFER_STATE new_buffer ); +void libvma_yypop_buffer_state (void ); + +static void libvma_yyensure_buffer_stack (void ); +static void libvma_yy_load_buffer_state (void ); +static void libvma_yy_init_buffer (YY_BUFFER_STATE b,FILE *file ); + +#define YY_FLUSH_BUFFER libvma_yy_flush_buffer(YY_CURRENT_BUFFER ) + +YY_BUFFER_STATE libvma_yy_scan_buffer (char *base,yy_size_t size ); +YY_BUFFER_STATE libvma_yy_scan_string (yyconst char *yy_str ); +YY_BUFFER_STATE libvma_yy_scan_bytes (yyconst char *bytes,yy_size_t len ); + +void *libvma_yyalloc (yy_size_t ); +void *libvma_yyrealloc (void *,yy_size_t ); +void libvma_yyfree (void * ); + +#define yy_new_buffer libvma_yy_create_buffer + +#define yy_set_interactive(is_interactive) \ + { \ + if ( ! YY_CURRENT_BUFFER ){ \ + libvma_yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + libvma_yy_create_buffer(libvma_yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_is_interactive = is_interactive; \ + } + +#define yy_set_bol(at_bol) \ + { \ + if ( ! 
YY_CURRENT_BUFFER ){\ + libvma_yyensure_buffer_stack (); \ + YY_CURRENT_BUFFER_LVALUE = \ + libvma_yy_create_buffer(libvma_yyin,YY_BUF_SIZE ); \ + } \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = at_bol; \ + } + +#define YY_AT_BOL() (YY_CURRENT_BUFFER_LVALUE->yy_at_bol) + +/* Begin user sect3 */ + +typedef unsigned char YY_CHAR; + +FILE *libvma_yyin = (FILE *) 0, *libvma_yyout = (FILE *) 0; + +typedef int yy_state_type; + +extern int libvma_yylineno; + +int libvma_yylineno = 1; + +extern char *libvma_yytext; +#define yytext_ptr libvma_yytext + +static yy_state_type yy_get_previous_state (void ); +static yy_state_type yy_try_NUL_trans (yy_state_type current_state ); +static int yy_get_next_buffer (void ); +static void yy_fatal_error (yyconst char msg[] ); + +/* Done after the current pattern has been matched and before the + * corresponding action - sets up libvma_yytext. + */ +#define YY_DO_BEFORE_ACTION \ + (yytext_ptr) = yy_bp; \ + libvma_yyleng = (size_t) (yy_cp - yy_bp); \ + (yy_hold_char) = *yy_cp; \ + *yy_cp = '\0'; \ + (yy_c_buf_p) = yy_cp; + +#define YY_NUM_RULES 29 +#define YY_END_OF_BUFFER 30 +/* This struct is not used in this scanner, + but its presence is necessary. 
*/ +struct yy_trans_info + { + flex_int32_t yy_verify; + flex_int32_t yy_nxt; + }; +static yyconst flex_int16_t yy_accept[473] = + { 0, + 0, 0, 0, 0, 0, 0, 0, 0, 30, 28, + 27, 25, 26, 5, 5, 28, 28, 28, 28, 28, + 28, 28, 28, 28, 28, 27, 1, 24, 24, 5, + 5, 24, 24, 24, 24, 24, 24, 24, 24, 24, + 24, 1, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 1, 4, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 1, 27, 26, 5, 0, 0, 0, 0, 15, + 18, 0, 0, 0, 0, 0, 0, 0, 0, 27, + 1, 1, 24, 24, 5, 24, 24, 24, 24, 15, + + 18, 24, 24, 24, 24, 24, 24, 24, 24, 1, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 1, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 1, 0, 0, 6, 0, + 17, 0, 13, 14, 12, 16, 0, 0, 1, 24, + 24, 6, 24, 17, 24, 13, 14, 12, 16, 24, + 24, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, + 4, 4, 4, 4, 4, 0, 0, 0, 0, 0, + + 0, 0, 0, 24, 24, 24, 24, 24, 24, 24, + 24, 3, 3, 3, 3, 3, 3, 3, 3, 4, + 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 11, 0, 24, 24, + 24, 24, 24, 24, 24, 24, 24, 11, 24, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 4, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 0, 0, 10, 0, 0, 0, 0, 0, + 0, 24, 24, 24, 10, 24, 24, 24, 24, 24, + 24, 3, 3, 3, 3, 3, 3, 3, 3, 3, + + 3, 4, 4, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 0, 0, 0, 0, 0, 0, 0, 9, + 24, 24, 24, 24, 24, 24, 24, 24, 9, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 4, 4, + 4, 4, 4, 4, 4, 4, 4, 0, 0, 0, + 0, 0, 0, 0, 0, 24, 24, 24, 24, 24, + 24, 24, 24, 3, 3, 3, 3, 3, 3, 3, + 3, 4, 4, 4, 4, 4, 4, 4, 4, 0, + 0, 8, 0, 0, 0, 0, 0, 24, 24, 8, + 24, 24, 24, 24, 24, 3, 3, 3, 3, 3, + + 3, 3, 3, 4, 4, 4, 4, 4, 4, 4, + 4, 0, 0, 19, 20, 0, 0, 21, 24, 24, + 19, 20, 24, 24, 21, 3, 3, 3, 3, 3, + 3, 3, 4, 4, 4, 4, 4, 4, 4, 0, + 7, 23, 0, 24, 7, 23, 24, 3, 3, 3, + 3, 4, 4, 4, 4, 0, 22, 24, 22, 3, + 3, 4, 4, 0, 24, 3, 4, 2, 2, 2, + 2, 0 + } ; + +static yyconst flex_int32_t yy_ec[256] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 2, 3, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 2, 1, 1, 4, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 5, 1, 1, 
6, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 8, 1, 9, 1, 10, 11, + + 12, 13, 14, 1, 15, 1, 1, 16, 17, 18, + 19, 20, 1, 21, 22, 23, 24, 25, 1, 1, + 26, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1 + } ; + +static yyconst flex_int32_t yy_meta[27] = + { 0, + 1, 2, 3, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1 + } ; + +static yyconst flex_int16_t yy_base[485] = + { 0, + 0, 25, 29, 54, 58, 83, 87, 112, 548, 549, + 545, 549, 0, 549, 111, 526, 533, 525, 528, 520, + 110, 531, 104, 523, 107, 120, 0, 0, 537, 0, + 121, 518, 525, 517, 520, 512, 114, 523, 119, 515, + 120, 529, 0, 528, 0, 125, 509, 516, 508, 511, + 503, 126, 514, 123, 506, 125, 520, 0, 519, 0, + 137, 500, 507, 499, 502, 494, 137, 505, 128, 497, + 136, 511, 510, 0, 147, 491, 488, 495, 490, 549, + 549, 487, 484, 485, 484, 491, 493, 486, 477, 153, + 0, 0, 0, 497, 153, 478, 475, 482, 477, 0, + + 0, 474, 471, 472, 471, 478, 480, 473, 464, 484, + 0, 483, 155, 464, 461, 468, 463, 0, 0, 460, + 457, 458, 457, 464, 466, 459, 450, 470, 0, 469, + 158, 450, 447, 454, 449, 0, 0, 446, 443, 444, + 443, 450, 452, 445, 436, 456, 441, 433, 549, 450, + 549, 438, 445, 444, 549, 549, 435, 439, 0, 433, + 425, 0, 442, 0, 430, 437, 436, 0, 0, 427, + 431, 425, 417, 0, 434, 0, 422, 429, 428, 0, + 0, 419, 423, 417, 409, 0, 426, 0, 414, 421, + 420, 0, 0, 411, 415, 410, 409, 407, 403, 146, + + 156, 409, 408, 404, 403, 401, 397, 157, 159, 403, + 402, 398, 397, 395, 391, 160, 162, 397, 396, 392, + 391, 389, 385, 163, 165, 391, 390, 
391, 382, 387, + 384, 381, 384, 376, 382, 381, 549, 371, 381, 372, + 377, 374, 371, 374, 366, 372, 371, 0, 361, 371, + 362, 367, 364, 361, 364, 356, 362, 361, 0, 351, + 361, 352, 357, 354, 351, 354, 346, 352, 351, 0, + 341, 352, 351, 334, 549, 343, 336, 338, 345, 336, + 332, 343, 342, 325, 0, 334, 327, 329, 336, 327, + 323, 334, 333, 316, 0, 325, 318, 320, 327, 318, + + 314, 325, 324, 307, 0, 316, 309, 311, 318, 309, + 305, 302, 301, 311, 310, 296, 302, 307, 307, 549, + 294, 293, 303, 302, 288, 294, 299, 299, 0, 286, + 285, 295, 294, 280, 286, 291, 291, 0, 278, 277, + 287, 286, 272, 278, 283, 283, 0, 278, 277, 275, + 272, 277, 276, 272, 274, 270, 269, 267, 264, 269, + 268, 264, 266, 262, 261, 259, 256, 261, 260, 256, + 258, 254, 253, 251, 248, 253, 252, 248, 250, 242, + 241, 549, 236, 237, 247, 231, 234, 235, 234, 0, + 229, 230, 240, 224, 227, 228, 227, 0, 222, 223, + + 233, 217, 220, 221, 220, 0, 215, 216, 226, 210, + 213, 215, 214, 549, 549, 208, 218, 549, 211, 210, + 0, 0, 204, 214, 0, 207, 206, 0, 0, 200, + 210, 0, 203, 202, 0, 0, 196, 206, 0, 212, + 549, 549, 195, 210, 0, 0, 193, 208, 0, 0, + 191, 206, 0, 0, 189, 194, 549, 193, 0, 192, + 0, 191, 0, 165, 163, 160, 140, 549, 0, 0, + 0, 549, 187, 189, 85, 191, 193, 56, 195, 197, + 27, 199, 201, 203 + } ; + +static yyconst flex_int16_t yy_def[485] = + { 0, + 472, 1, 472, 3, 472, 5, 472, 7, 472, 472, + 472, 472, 473, 472, 472, 472, 472, 472, 472, 472, + 472, 472, 472, 472, 472, 472, 474, 475, 476, 475, + 475, 475, 475, 475, 475, 475, 475, 475, 475, 475, + 475, 477, 478, 479, 478, 478, 478, 478, 478, 478, + 478, 478, 478, 478, 478, 478, 480, 481, 482, 481, + 481, 481, 481, 481, 481, 481, 481, 481, 481, 481, + 481, 483, 472, 473, 472, 472, 472, 472, 472, 472, + 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, + 484, 474, 475, 476, 475, 475, 475, 475, 475, 475, + + 475, 475, 475, 475, 475, 475, 475, 475, 475, 477, + 478, 479, 478, 478, 478, 478, 478, 478, 478, 478, + 478, 478, 478, 478, 478, 478, 478, 480, 481, 482, 
+ 481, 481, 481, 481, 481, 481, 481, 481, 481, 481, + 481, 481, 481, 481, 481, 483, 472, 472, 472, 472, + 472, 472, 472, 472, 472, 472, 472, 472, 484, 475, + 475, 475, 475, 475, 475, 475, 475, 475, 475, 475, + 475, 478, 478, 478, 478, 478, 478, 478, 478, 478, + 478, 478, 478, 481, 481, 481, 481, 481, 481, 481, + 481, 481, 481, 481, 481, 472, 472, 472, 472, 472, + + 472, 472, 472, 475, 475, 475, 475, 475, 475, 475, + 475, 478, 478, 478, 478, 478, 478, 478, 478, 481, + 481, 481, 481, 481, 481, 481, 481, 472, 472, 472, + 472, 472, 472, 472, 472, 472, 472, 472, 475, 475, + 475, 475, 475, 475, 475, 475, 475, 475, 475, 478, + 478, 478, 478, 478, 478, 478, 478, 478, 478, 478, + 481, 481, 481, 481, 481, 481, 481, 481, 481, 481, + 481, 472, 472, 472, 472, 472, 472, 472, 472, 472, + 472, 475, 475, 475, 475, 475, 475, 475, 475, 475, + 475, 478, 478, 478, 478, 478, 478, 478, 478, 478, + + 478, 481, 481, 481, 481, 481, 481, 481, 481, 481, + 481, 472, 472, 472, 472, 472, 472, 472, 472, 472, + 475, 475, 475, 475, 475, 475, 475, 475, 475, 478, + 478, 478, 478, 478, 478, 478, 478, 478, 481, 481, + 481, 481, 481, 481, 481, 481, 481, 472, 472, 472, + 472, 472, 472, 472, 472, 475, 475, 475, 475, 475, + 475, 475, 475, 478, 478, 478, 478, 478, 478, 478, + 478, 481, 481, 481, 481, 481, 481, 481, 481, 472, + 472, 472, 472, 472, 472, 472, 472, 475, 475, 475, + 475, 475, 475, 475, 475, 478, 478, 478, 478, 478, + + 478, 478, 478, 481, 481, 481, 481, 481, 481, 481, + 481, 472, 472, 472, 472, 472, 472, 472, 475, 475, + 475, 475, 475, 475, 475, 478, 478, 478, 478, 478, + 478, 478, 481, 481, 481, 481, 481, 481, 481, 472, + 472, 472, 472, 475, 475, 475, 475, 478, 478, 478, + 478, 481, 481, 481, 481, 472, 472, 475, 475, 478, + 478, 481, 481, 472, 475, 478, 481, 472, 475, 478, + 481, 0, 472, 472, 472, 472, 472, 472, 472, 472, + 472, 472, 472, 472 + } ; + +static yyconst flex_int16_t yy_nxt[576] = + { 0, + 10, 11, 12, 13, 10, 14, 15, 10, 16, 10, + 17, 10, 10, 10, 10, 18, 19, 10, 20, 10, + 10, 21, 
22, 23, 24, 25, 26, 129, 27, 28, + 11, 12, 29, 28, 30, 31, 28, 32, 28, 33, + 28, 28, 28, 28, 34, 35, 28, 36, 28, 28, + 37, 38, 39, 40, 41, 26, 111, 42, 43, 11, + 12, 44, 43, 45, 46, 43, 47, 43, 48, 43, + 43, 43, 43, 49, 50, 43, 51, 43, 43, 52, + 53, 54, 55, 56, 26, 93, 57, 58, 11, 12, + 59, 58, 60, 61, 58, 62, 58, 63, 58, 58, + + 58, 58, 64, 65, 58, 66, 58, 58, 67, 68, + 69, 70, 71, 26, 85, 72, 75, 75, 81, 88, + 82, 90, 101, 91, 102, 86, 95, 95, 89, 105, + 113, 113, 108, 123, 119, 83, 120, 126, 141, 103, + 106, 109, 131, 131, 124, 137, 127, 138, 144, 142, + 471, 121, 75, 75, 90, 232, 91, 145, 95, 95, + 113, 113, 139, 131, 131, 234, 243, 233, 245, 254, + 470, 256, 265, 469, 267, 468, 235, 236, 244, 246, + 247, 255, 257, 258, 266, 268, 269, 74, 74, 92, + 92, 94, 94, 110, 110, 112, 112, 128, 128, 130, + + 130, 146, 146, 159, 159, 467, 466, 465, 464, 463, + 462, 461, 460, 459, 458, 457, 456, 455, 454, 453, + 452, 451, 450, 449, 448, 447, 446, 445, 444, 443, + 442, 441, 440, 439, 438, 437, 436, 435, 434, 433, + 432, 431, 430, 429, 428, 427, 426, 425, 424, 423, + 422, 421, 420, 419, 418, 417, 416, 415, 414, 413, + 412, 411, 410, 409, 408, 407, 406, 405, 404, 403, + 402, 401, 400, 399, 398, 397, 396, 395, 394, 393, + 392, 391, 390, 389, 388, 387, 386, 385, 384, 383, + 382, 381, 380, 379, 378, 377, 376, 375, 374, 373, + + 372, 371, 370, 369, 368, 367, 366, 365, 364, 363, + 362, 361, 360, 359, 358, 357, 356, 355, 354, 353, + 352, 351, 350, 349, 348, 347, 346, 345, 344, 343, + 342, 341, 340, 339, 338, 337, 336, 335, 334, 333, + 332, 331, 330, 329, 328, 327, 326, 325, 324, 323, + 322, 321, 320, 319, 318, 317, 316, 315, 314, 313, + 312, 311, 310, 309, 308, 307, 306, 305, 304, 303, + 302, 301, 300, 299, 298, 297, 296, 295, 294, 293, + 292, 291, 290, 289, 288, 287, 286, 285, 284, 283, + 282, 281, 280, 279, 278, 277, 276, 275, 274, 273, + + 272, 271, 270, 264, 263, 262, 261, 260, 259, 253, + 252, 251, 250, 249, 248, 242, 241, 240, 239, 238, + 237, 231, 230, 229, 228, 227, 226, 
225, 224, 223, + 222, 221, 220, 219, 218, 217, 216, 215, 214, 213, + 212, 211, 210, 209, 208, 207, 206, 205, 204, 203, + 202, 201, 200, 199, 198, 197, 196, 92, 195, 194, + 193, 192, 191, 190, 189, 188, 187, 186, 185, 184, + 74, 92, 183, 182, 181, 180, 179, 178, 177, 176, + 175, 174, 173, 172, 74, 92, 171, 170, 169, 168, + 167, 166, 165, 164, 163, 162, 161, 160, 74, 158, + + 157, 156, 155, 154, 153, 152, 151, 150, 149, 148, + 147, 73, 92, 143, 140, 136, 135, 134, 133, 132, + 74, 92, 125, 122, 118, 117, 116, 115, 114, 74, + 92, 107, 104, 100, 99, 98, 97, 96, 74, 87, + 84, 80, 79, 78, 77, 76, 73, 472, 9, 472, + 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, + 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, + 472, 472, 472, 472, 472 + } ; + +static yyconst flex_int16_t yy_chk[576] = + { 0, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, + 1, 1, 1, 1, 1, 1, 2, 481, 2, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, + 3, 3, 3, 3, 3, 4, 478, 4, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, + 5, 5, 5, 5, 6, 475, 6, 7, 7, 7, + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + + 7, 7, 7, 7, 7, 7, 7, 7, 7, 7, + 7, 7, 7, 8, 23, 8, 15, 15, 21, 25, + 21, 26, 37, 26, 37, 23, 31, 31, 25, 39, + 46, 46, 41, 54, 52, 21, 52, 56, 69, 37, + 39, 41, 61, 61, 54, 67, 56, 67, 71, 69, + 467, 52, 75, 75, 90, 200, 90, 71, 95, 95, + 113, 113, 67, 131, 131, 201, 208, 200, 209, 216, + 466, 217, 224, 465, 225, 464, 201, 201, 208, 209, + 209, 216, 217, 217, 224, 225, 225, 473, 473, 474, + 474, 476, 476, 477, 477, 479, 479, 480, 480, 482, + + 482, 483, 483, 484, 484, 462, 460, 458, 456, 455, + 452, 451, 448, 447, 444, 443, 440, 438, 437, 434, + 433, 431, 430, 427, 426, 424, 423, 420, 419, 417, + 416, 413, 412, 411, 410, 409, 408, 407, 405, 404, + 403, 402, 401, 400, 399, 397, 396, 395, 394, 393, + 392, 391, 389, 388, 387, 386, 385, 384, 383, 381, + 380, 379, 378, 377, 376, 375, 374, 373, 372, 371, + 370, 369, 368, 367, 366, 365, 364, 363, 362, 361, + 
360, 359, 358, 357, 356, 355, 354, 353, 352, 351, + 350, 349, 348, 346, 345, 344, 343, 342, 341, 340, + + 339, 337, 336, 335, 334, 333, 332, 331, 330, 328, + 327, 326, 325, 324, 323, 322, 321, 319, 318, 317, + 316, 315, 314, 313, 312, 311, 310, 309, 308, 307, + 306, 304, 303, 302, 301, 300, 299, 298, 297, 296, + 294, 293, 292, 291, 290, 289, 288, 287, 286, 284, + 283, 282, 281, 280, 279, 278, 277, 276, 274, 273, + 272, 271, 269, 268, 267, 266, 265, 264, 263, 262, + 261, 260, 258, 257, 256, 255, 254, 253, 252, 251, + 250, 249, 247, 246, 245, 244, 243, 242, 241, 240, + 239, 238, 236, 235, 234, 233, 232, 231, 230, 229, + + 228, 227, 226, 223, 222, 221, 220, 219, 218, 215, + 214, 213, 212, 211, 210, 207, 206, 205, 204, 203, + 202, 199, 198, 197, 196, 195, 194, 191, 190, 189, + 187, 185, 184, 183, 182, 179, 178, 177, 175, 173, + 172, 171, 170, 167, 166, 165, 163, 161, 160, 158, + 157, 154, 153, 152, 150, 148, 147, 146, 145, 144, + 143, 142, 141, 140, 139, 138, 135, 134, 133, 132, + 130, 128, 127, 126, 125, 124, 123, 122, 121, 120, + 117, 116, 115, 114, 112, 110, 109, 108, 107, 106, + 105, 104, 103, 102, 99, 98, 97, 96, 94, 89, + + 88, 87, 86, 85, 84, 83, 82, 79, 78, 77, + 76, 73, 72, 70, 68, 66, 65, 64, 63, 62, + 59, 57, 55, 53, 51, 50, 49, 48, 47, 44, + 42, 40, 38, 36, 35, 34, 33, 32, 29, 24, + 22, 20, 19, 18, 17, 16, 11, 9, 472, 472, + 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, + 472, 472, 472, 472, 472, 472, 472, 472, 472, 472, + 472, 472, 472, 472, 472 + } ; + +static yy_state_type yy_last_accepting_state; +static char *yy_last_accepting_cpos; + +extern int libvma_yy_flex_debug; +int libvma_yy_flex_debug = 0; + +/* The intent behind this definition is that it'll catch + * any uses of REJECT which flex missed. 
+ */ +#define REJECT reject_used_but_not_detected +#define yymore() yymore_used_but_not_detected +#define YY_MORE_ADJ 0 +#define YY_RESTORE_YY_MORE_OFFSET +char *libvma_yytext; +/* Line 1 of config_scanner.l */ +/* + * Copyright (c) 2001-2020 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $Id: ibnl_scanner.ll,v 1.4 2005/02/23 21:08:37 eitan Exp $ + */ +/* Line 36 of config_scanner.l */ + +//#define DEBUG 1 + +#define yyparse libvma_yyparse +#define libvma_yylex libvma_yylex +#define yyerror libvma_yyerror +#define yylval libvma_yylval +#define yychar libvma_yychar +#define yydebug libvma_yydebug +#define yynerrs libvma_yynerrs + +#define libvma_yywrap libvma_yywrap + +#include +#include +#include "config_parser.h" +extern long __vma_config_line_num; +#define YY_NO_INPUT 1 + +/* Line 812 of config_scanner.c */ + +#define INITIAL 0 +#define CANNAME 1 +#define APP_ID_S1 2 +#define APP_ID_S2 3 + +#ifndef YY_NO_UNISTD_H +/* Special case for "unistd.h", since it is non-ANSI. We include it way + * down here because we want the user's section 1 to have been scanned first. + * The user has a chance to override it with an option. + */ +#include +#endif + +#ifndef YY_EXTRA_TYPE +#define YY_EXTRA_TYPE void * +#endif + +static int yy_init_globals (void ); + +/* Accessor methods to globals. + These are made visible to non-reentrant scanners for convenience. */ + +int libvma_yylex_destroy (void ); + +int libvma_yyget_debug (void ); + +void libvma_yyset_debug (int debug_flag ); + +YY_EXTRA_TYPE libvma_yyget_extra (void ); + +void libvma_yyset_extra (YY_EXTRA_TYPE user_defined ); + +FILE *libvma_yyget_in (void ); + +void libvma_yyset_in (FILE * in_str ); + +FILE *libvma_yyget_out (void ); + +void libvma_yyset_out (FILE * out_str ); + +yy_size_t libvma_yyget_leng (void ); + +char *libvma_yyget_text (void ); + +int libvma_yyget_lineno (void ); + +void libvma_yyset_lineno (int line_number ); + +/* Macros after this point can all be overridden by user definitions in + * section 1. 
+ */ + +#ifndef YY_SKIP_YYWRAP +#ifdef __cplusplus +extern "C" int libvma_yywrap (void ); +#else +extern int libvma_yywrap (void ); +#endif +#endif + +#ifndef yytext_ptr +static void yy_flex_strncpy (char *,yyconst char *,int ); +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * ); +#endif + +#ifndef YY_NO_INPUT + +#ifdef __cplusplus +static int yyinput (void ); +#else +static int input (void ); +#endif + +#endif + +/* Amount of stuff to slurp up with each read. */ +#ifndef YY_READ_BUF_SIZE +#ifdef __ia64__ +/* On IA-64, the buffer size is 16k, not 8k */ +#define YY_READ_BUF_SIZE 16384 +#else +#define YY_READ_BUF_SIZE 8192 +#endif /* __ia64__ */ +#endif + +/* Copy whatever the last rule matched to the standard output. */ +#ifndef ECHO +/* This used to be an fputs(), but since the string might contain NUL's, + * we now use fwrite(). + */ +#define ECHO do { if (fwrite( libvma_yytext, libvma_yyleng, 1, libvma_yyout )) {} } while (0) +#endif + +/* Gets input and stuffs it into "buf". number of characters read, or YY_NULL, + * is returned in "result". 
+ */ +#ifndef YY_INPUT +#define YY_INPUT(buf,result,max_size) \ + if ( YY_CURRENT_BUFFER_LVALUE->yy_is_interactive ) \ + { \ + int c = '*'; \ + size_t n; \ + for ( n = 0; n < max_size && \ + (c = getc( libvma_yyin )) != EOF && c != '\n'; ++n ) \ + buf[n] = (char) c; \ + if ( c == '\n' ) \ + buf[n++] = (char) c; \ + if ( c == EOF && ferror( libvma_yyin ) ) \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + result = n; \ + } \ + else \ + { \ + errno=0; \ + while ( (result = fread(buf, 1, max_size, libvma_yyin))==0 && ferror(libvma_yyin)) \ + { \ + if( errno != EINTR) \ + { \ + YY_FATAL_ERROR( "input in flex scanner failed" ); \ + break; \ + } \ + errno=0; \ + clearerr(libvma_yyin); \ + } \ + }\ +\ + +#endif + +/* No semi-colon after return; correct usage is to write "yyterminate();" - + * we don't want an extra ';' after the "return" because that will cause + * some compilers to complain about unreachable statements. + */ +#ifndef yyterminate +#define yyterminate() return YY_NULL +#endif + +/* Number of entries by which start-condition stack grows. */ +#ifndef YY_START_STACK_INCR +#define YY_START_STACK_INCR 25 +#endif + +/* Report a fatal error. */ +#ifndef YY_FATAL_ERROR +#define YY_FATAL_ERROR(msg) yy_fatal_error( msg ) +#endif + +/* end tables serialization structures and prototypes */ + +/* Default declaration of generated scanner - a define so the user can + * easily add parameters. + */ +#ifndef YY_DECL +#define YY_DECL_IS_OURS 1 + +extern int libvma_yylex (void); + +#define YY_DECL int libvma_yylex (void) +#endif /* !YY_DECL */ + +/* Code executed at the beginning of each rule, after libvma_yytext and libvma_yyleng + * have been set up. + */ +#ifndef YY_USER_ACTION +#define YY_USER_ACTION +#endif + +/* Code executed at the end of each rule. 
*/ +#ifndef YY_BREAK +#define YY_BREAK break; +#endif + +#define YY_RULE_SETUP \ + if ( libvma_yyleng > 0 ) \ + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = \ + (libvma_yytext[libvma_yyleng - 1] == '\n'); \ + YY_USER_ACTION + +/** The main scanner function which does all the work. + */ +YY_DECL +{ + register yy_state_type yy_current_state; + register char *yy_cp, *yy_bp; + register int yy_act; + + if ( !(yy_init) ) + { + (yy_init) = 1; + +#ifdef YY_USER_INIT + YY_USER_INIT; +#endif + + if ( ! (yy_start) ) + (yy_start) = 1; /* first start state */ + + if ( ! libvma_yyin ) + libvma_yyin = stdin; + + if ( ! libvma_yyout ) + libvma_yyout = stdout; + + if ( ! YY_CURRENT_BUFFER ) { + libvma_yyensure_buffer_stack (); + if ( yy_buffer_stack ) { + YY_CURRENT_BUFFER_LVALUE = + libvma_yy_create_buffer(libvma_yyin,YY_BUF_SIZE ); + } + } + + libvma_yy_load_buffer_state( ); + } + + { +/* Line 57 of config_scanner.l */ + + +/* Line 1033 of config_scanner.c */ + + while ( 1 ) /* loops until end-of-file is reached */ + { + yy_cp = (yy_c_buf_p); + + /* Support of libvma_yytext. */ + *yy_cp = (yy_hold_char); + + /* yy_bp points to the position in yy_ch_buf of the start of + * the current run. 
+ */ + yy_bp = yy_cp; + + /* coverity[var_deref_op] */ + yy_current_state = (yy_start); + yy_current_state += YY_AT_BOL(); +yy_match: + do + { + register YY_CHAR yy_c = yy_ec[YY_SC_TO_UI(*yy_cp)] ; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 473 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + ++yy_cp; + } + while ( yy_base[yy_current_state] != 549 ); + +yy_find_action: + yy_act = yy_accept[yy_current_state]; + if ( yy_act == 0 ) + { /* have to back up */ + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + yy_act = yy_accept[yy_current_state]; + } + + YY_DO_BEFORE_ACTION; + +do_action: /* This label is used only to access EOF actions. */ + + switch ( yy_act ) + { /* beginning of action switch */ + case 0: /* must back up */ + /* undo the effects of YY_DO_BEFORE_ACTION */ + *yy_cp = (yy_hold_char); + yy_cp = (yy_last_accepting_cpos); + yy_current_state = (yy_last_accepting_state); + goto yy_find_action; + +case 1: +YY_RULE_SETUP +/* Line 59 of config_scanner.l */ +{} + YY_BREAK +case 2: +YY_RULE_SETUP +/* Line 61 of config_scanner.l */ +{ + yylval.ival = APP_ID; +#ifdef DEBUG + printf("APP_ID\n"); +#endif + BEGIN(APP_ID_S1); + return APP_ID; +} + YY_BREAK +case 3: +YY_RULE_SETUP +/* Line 70 of config_scanner.l */ +{ + yylval.sval = (char *)malloc(strlen(libvma_yytext) + 1); + strcpy(yylval.sval, libvma_yytext); +#ifdef DEBUG + printf("PROGRAM:%s\n",yylval.sval); +#endif + BEGIN(APP_ID_S2); + return (PROGRAM); +} + YY_BREAK +case 4: +YY_RULE_SETUP +/* Line 80 of config_scanner.l */ +{ + yylval.sval = (char *)malloc(strlen(libvma_yytext) + 1); + strcpy(yylval.sval, libvma_yytext); +#ifdef DEBUG + 
printf("USER_DEFINED_ID_STR:%s\n",yylval.sval); +#endif + BEGIN(0); + return (USER_DEFINED_ID_STR); +} + YY_BREAK +case 5: +YY_RULE_SETUP +/* Line 90 of config_scanner.l */ +{ + yylval.ival = atoi(libvma_yytext); +#ifdef DEBUG + printf("INT:%d\n",yylval.ival); +#endif + return INT; +} + YY_BREAK +case 6: +YY_RULE_SETUP +/* Line 98 of config_scanner.l */ +{ + yylval.ival = LOG; +#ifdef DEBUG + printf("LOG\n"); +#endif + return LOG; +} + YY_BREAK +case 7: +YY_RULE_SETUP +/* Line 106 of config_scanner.l */ +{ + yylval.ival = DEST; +#ifdef DEBUG + printf("DEST\n"); +#endif + return DEST; +} + YY_BREAK +case 8: +YY_RULE_SETUP +/* Line 114 of config_scanner.l */ +{ + yylval.ival = LEVEL; +#ifdef DEBUG + printf("LEVEL\n"); +#endif + return LEVEL; +} + YY_BREAK +case 9: +YY_RULE_SETUP +/* Line 122 of config_scanner.l */ +{ + yylval.ival = STDERR; +#ifdef DEBUG + printf("STDERR\n"); +#endif + return STDERR; +} + YY_BREAK +case 10: +YY_RULE_SETUP +/* Line 130 of config_scanner.l */ +{ + yylval.ival = SYSLOG; +#ifdef DEBUG + printf("SYSLOG\n"); +#endif + return SYSLOG; +} + YY_BREAK +case 11: +YY_RULE_SETUP +/* Line 138 of config_scanner.l */ +{ + yylval.ival = FILENAME; +#ifdef DEBUG + printf("FILENAME\n"); +#endif + BEGIN(CANNAME); + return FILENAME; +} + YY_BREAK +case 12: +YY_RULE_SETUP +/* Line 149 of config_scanner.l */ +{ + yylval.ival = USE; +#ifdef DEBUG + printf("USE\n"); +#endif + return USE; +} + YY_BREAK +case 13: +YY_RULE_SETUP +/* Line 157 of config_scanner.l */ +{ + yylval.ival = TCP; +#ifdef DEBUG + printf("TCP\n"); +#endif + return TCP; +} + YY_BREAK +case 14: +YY_RULE_SETUP +/* Line 165 of config_scanner.l */ +{ + yylval.ival = UDP; +#ifdef DEBUG + printf("UDP\n"); +#endif + return UDP; +} + YY_BREAK +case 15: +YY_RULE_SETUP +/* Line 173 of config_scanner.l */ +{ + yylval.ival = OS; +#ifdef DEBUG + printf("OS\n"); +#endif + return OS; +} + YY_BREAK +case 16: +YY_RULE_SETUP +/* Line 181 of config_scanner.l */ +{ + yylval.ival = VMA; +#ifdef DEBUG + 
printf("VMA\n"); +#endif + return VMA; +} + YY_BREAK +case 17: +YY_RULE_SETUP +/* Line 189 of config_scanner.l */ +{ + yylval.ival = SDP; +#ifdef DEBUG + printf("SDP\n"); +#endif + return SDP; +} + YY_BREAK +case 18: +YY_RULE_SETUP +/* Line 197 of config_scanner.l */ +{ + yylval.ival = SA; +#ifdef DEBUG + printf("SA\n"); +#endif + return SA; +} + YY_BREAK +case 19: +YY_RULE_SETUP +/* Line 205 of config_scanner.l */ +{ + yylval.ival = TCP_CLIENT; +#ifdef DEBUG + printf("TCP CLIENT\n"); +#endif + return TCP_CLIENT; +} + YY_BREAK +case 20: +YY_RULE_SETUP +/* Line 213 of config_scanner.l */ +{ + yylval.ival = TCP_SERVER; +#ifdef DEBUG + printf("TCP SERVER\n"); +#endif + return TCP_SERVER; +} + YY_BREAK +case 21: +YY_RULE_SETUP +/* Line 221 of config_scanner.l */ +{ + yylval.ival = UDP_SENDER; +#ifdef DEBUG + printf("UDP SENDER\n"); +#endif + return UDP_SENDER; +} + YY_BREAK +case 22: +YY_RULE_SETUP +/* Line 229 of config_scanner.l */ +{ + yylval.ival = UDP_RECEIVER; +#ifdef DEBUG + printf("UDP RECEIVER\n"); +#endif + return UDP_RECEIVER; +} + YY_BREAK +case 23: +YY_RULE_SETUP +/* Line 237 of config_scanner.l */ +{ + yylval.ival = UDP_CONNECT; +#ifdef DEBUG + printf("UDP CONNECT\n"); +#endif + return UDP_CONNECT; +} + YY_BREAK +case 24: +YY_RULE_SETUP +/* Line 245 of config_scanner.l */ +{ + yylval.sval = (char *)malloc(strlen(libvma_yytext) + 1); + strcpy(yylval.sval, libvma_yytext); +#ifdef DEBUG + printf("NAME:%s\n",yylval.sval); +#endif + BEGIN(0); + return (NAME); +} + YY_BREAK +case 25: +/* rule 25 can match eol */ +YY_RULE_SETUP +/* Line 255 of config_scanner.l */ +{ + __vma_config_line_num++; +#ifdef DEBUG + printf("LINE\n"); +#endif + yylval.ival = LINE; + return(LINE); +} + YY_BREAK +case 26: +YY_RULE_SETUP +/* Line 264 of config_scanner.l */ +{ + __vma_config_line_num++; +} + YY_BREAK +case 27: +YY_RULE_SETUP +/* Line 268 of config_scanner.l */ +{} + YY_BREAK +case 28: +YY_RULE_SETUP +/* Line 270 of config_scanner.l */ +{ +#ifdef DEBUG + 
printf("CHAR:%c\n",libvma_yytext[0]); +#endif + return(libvma_yytext[0]); +} + YY_BREAK +case 29: +YY_RULE_SETUP +/* Line 277 of config_scanner.l */ +ECHO; + YY_BREAK +/* Line 1397 of config_scanner.c */ +case YY_STATE_EOF(INITIAL): +case YY_STATE_EOF(CANNAME): +case YY_STATE_EOF(APP_ID_S1): +case YY_STATE_EOF(APP_ID_S2): + yyterminate(); + + case YY_END_OF_BUFFER: + { + /* Amount of text matched not including the EOB char. */ + int yy_amount_of_matched_text = (int) (yy_cp - (yytext_ptr)) - 1; + + /* Undo the effects of YY_DO_BEFORE_ACTION. */ + *yy_cp = (yy_hold_char); + YY_RESTORE_YY_MORE_OFFSET + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_NEW ) + { + /* We're scanning a new file or input source. It's + * possible that this happened because the user + * just pointed libvma_yyin at a new source and called + * libvma_yylex(). If so, then we have to assure + * consistency between YY_CURRENT_BUFFER and our + * globals. Here is the right place to do so, because + * this is the first action (other than possibly a + * back-up) that will match for the new input source. + */ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + YY_CURRENT_BUFFER_LVALUE->yy_input_file = libvma_yyin; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = YY_BUFFER_NORMAL; + } + + /* Note that here we test for yy_c_buf_p "<=" to the position + * of the first EOB in the buffer, since yy_c_buf_p will + * already have been incremented past the NUL character + * (since all states make transitions on EOB to the + * end-of-buffer state). Contrast this with the test + * in input(). + */ + if ( (yy_c_buf_p) <= &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + { /* This was really a NUL. */ + yy_state_type yy_next_state; + + (yy_c_buf_p) = (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + /* Okay, we're now positioned to make the NUL + * transition. 
We couldn't have + * yy_get_previous_state() go ahead and do it + * for us because it doesn't know how to deal + * with the possibility of jamming (and we don't + * want to build jamming into it because then it + * will run more slowly). + */ + + yy_next_state = yy_try_NUL_trans( yy_current_state ); + + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + + if ( yy_next_state ) + { + /* Consume the NUL. */ + yy_cp = ++(yy_c_buf_p); + yy_current_state = yy_next_state; + goto yy_match; + } + + else + { + yy_cp = (yy_c_buf_p); + goto yy_find_action; + } + } + + else switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_END_OF_FILE: + { + (yy_did_buffer_switch_on_eof) = 0; + + if ( libvma_yywrap( ) ) + { + /* Note: because we've taken care in + * yy_get_next_buffer() to have set up + * libvma_yytext, we can now set up + * yy_c_buf_p so that if some total + * hoser (like flex itself) wants to + * call the scanner after we return the + * YY_NULL, it'll still work - another + * YY_NULL will get returned. + */ + (yy_c_buf_p) = (yytext_ptr) + YY_MORE_ADJ; + + yy_act = YY_STATE_EOF(YY_START); + goto do_action; + } + + else + { + if ( ! 
(yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; + } + break; + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = + (yytext_ptr) + yy_amount_of_matched_text; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_match; + + case EOB_ACT_LAST_MATCH: + (yy_c_buf_p) = + &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)]; + + yy_current_state = yy_get_previous_state( ); + + yy_cp = (yy_c_buf_p); + yy_bp = (yytext_ptr) + YY_MORE_ADJ; + goto yy_find_action; + } + break; + } + + default: + YY_FATAL_ERROR( + "fatal flex scanner internal error--no action found" ); + } /* end of action switch */ + } /* end of scanning one token */ + } /* end of user's declarations */ +} /* end of libvma_yylex */ + +/* yy_get_next_buffer - try to read in a new buffer + * + * Returns a code representing an action: + * EOB_ACT_LAST_MATCH - + * EOB_ACT_CONTINUE_SCAN - continue scanning from current position + * EOB_ACT_END_OF_FILE - end of file + */ +static int yy_get_next_buffer (void) +{ + register char *dest = YY_CURRENT_BUFFER_LVALUE->yy_ch_buf; + register char *source = (yytext_ptr); + register int number_to_move, i; + int ret_val; + + if ( (yy_c_buf_p) > &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] ) + YY_FATAL_ERROR( + "fatal flex scanner internal error--end of buffer missed" ); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_fill_buffer == 0 ) + { /* Don't try to fill the buffer, so this is an EOF. */ + if ( (yy_c_buf_p) - (yytext_ptr) - YY_MORE_ADJ == 1 ) + { + /* We matched a single character, the EOB, so + * treat this as a final EOF. + */ + return EOB_ACT_END_OF_FILE; + } + + else + { + /* We matched some text prior to the EOB, first + * process it. + */ + return EOB_ACT_LAST_MATCH; + } + } + + /* Try to read more data. */ + + /* First move last chars to start of buffer. 
*/ + number_to_move = (int) ((yy_c_buf_p) - (yytext_ptr)) - 1; + + for ( i = 0; i < number_to_move; ++i ) + *(dest++) = *(source++); + + if ( YY_CURRENT_BUFFER_LVALUE->yy_buffer_status == YY_BUFFER_EOF_PENDING ) + /* don't do the read, it's not guaranteed to return an EOF, + * just force an EOF + */ + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars) = 0; + + else + { + yy_size_t num_to_read = + YY_CURRENT_BUFFER_LVALUE->yy_buf_size - number_to_move - 1; + + while ( num_to_read <= 0 ) + { /* Not enough room in the buffer - grow it. */ + + /* just a shorter name for the current buffer */ + YY_BUFFER_STATE b = YY_CURRENT_BUFFER_LVALUE; + + int yy_c_buf_p_offset = + (int) ((yy_c_buf_p) - b->yy_ch_buf); + + if ( b->yy_is_our_buffer ) + { + yy_size_t new_size = b->yy_buf_size * 2; + + if ( new_size <= 0 ) + b->yy_buf_size += b->yy_buf_size / 8; + else + b->yy_buf_size *= 2; + + b->yy_ch_buf = (char *) + /* Include room in for 2 EOB chars. */ + libvma_yyrealloc((void *) b->yy_ch_buf,b->yy_buf_size + 2 ); + } + else + /* Can't grow it, we don't own it. */ + b->yy_ch_buf = 0; + + if ( ! b->yy_ch_buf ) + YY_FATAL_ERROR( + "fatal error - scanner input buffer overflow" ); + + (yy_c_buf_p) = &b->yy_ch_buf[yy_c_buf_p_offset]; + + num_to_read = YY_CURRENT_BUFFER_LVALUE->yy_buf_size - + number_to_move - 1; + + } + + if ( num_to_read > YY_READ_BUF_SIZE ) + num_to_read = YY_READ_BUF_SIZE; + + /* Read in more data. 
*/ + YY_INPUT( (&YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[number_to_move]), + (yy_n_chars), num_to_read ); + + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( (yy_n_chars) == 0 ) + { + if ( number_to_move == YY_MORE_ADJ ) + { + ret_val = EOB_ACT_END_OF_FILE; + libvma_yyrestart(libvma_yyin ); + } + + else + { + ret_val = EOB_ACT_LAST_MATCH; + YY_CURRENT_BUFFER_LVALUE->yy_buffer_status = + YY_BUFFER_EOF_PENDING; + } + } + + else + ret_val = EOB_ACT_CONTINUE_SCAN; + + if ((yy_size_t) ((yy_n_chars) + number_to_move) > YY_CURRENT_BUFFER_LVALUE->yy_buf_size) { + /* Extend the array by 50%, plus the number we really need. */ + yy_size_t new_size = (yy_n_chars) + number_to_move + ((yy_n_chars) >> 1); + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf = (char *) libvma_yyrealloc((void *) YY_CURRENT_BUFFER_LVALUE->yy_ch_buf,new_size ); + if ( ! YY_CURRENT_BUFFER_LVALUE->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in yy_get_next_buffer()" ); + } + + (yy_n_chars) += number_to_move; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] = YY_END_OF_BUFFER_CHAR; + YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars) + 1] = YY_END_OF_BUFFER_CHAR; + + (yytext_ptr) = &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[0]; + + return ret_val; +} + +/* yy_get_previous_state - get the state just before the EOB char was reached */ + + static yy_state_type yy_get_previous_state (void) +{ + register yy_state_type yy_current_state; + register char *yy_cp; + + yy_current_state = (yy_start); + yy_current_state += YY_AT_BOL(); + + for ( yy_cp = (yytext_ptr) + YY_MORE_ADJ; yy_cp < (yy_c_buf_p); ++yy_cp ) + { + register YY_CHAR yy_c = (*yy_cp ? 
yy_ec[YY_SC_TO_UI(*yy_cp)] : 1); + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 473 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + } + + return yy_current_state; +} + +/* yy_try_NUL_trans - try to make a transition on the NUL character + * + * synopsis + * next_state = yy_try_NUL_trans( current_state ); + */ + static yy_state_type yy_try_NUL_trans (yy_state_type yy_current_state ) +{ + register int yy_is_jam; + register char *yy_cp = (yy_c_buf_p); + + register YY_CHAR yy_c = 1; + if ( yy_accept[yy_current_state] ) + { + (yy_last_accepting_state) = yy_current_state; + (yy_last_accepting_cpos) = yy_cp; + } + while ( yy_chk[yy_base[yy_current_state] + yy_c] != yy_current_state ) + { + yy_current_state = (int) yy_def[yy_current_state]; + if ( yy_current_state >= 473 ) + yy_c = yy_meta[(unsigned int) yy_c]; + } + yy_current_state = yy_nxt[yy_base[yy_current_state] + (unsigned int) yy_c]; + yy_is_jam = (yy_current_state == 472); + + return yy_is_jam ? 0 : yy_current_state; +} + +#ifndef YY_NO_INPUT +#ifdef __cplusplus + static int yyinput (void) +#else + static int input (void) +#endif + +{ + int c; + + *(yy_c_buf_p) = (yy_hold_char); + + if ( *(yy_c_buf_p) == YY_END_OF_BUFFER_CHAR ) + { + /* yy_c_buf_p now points to the character we want to return. + * If this occurs *before* the EOB characters, then it's a + * valid NUL; if not, then we've hit the end of the buffer. + */ + if ( (yy_c_buf_p) < &YY_CURRENT_BUFFER_LVALUE->yy_ch_buf[(yy_n_chars)] ) + /* This was really a NUL. 
*/ + *(yy_c_buf_p) = '\0'; + + else + { /* need more input */ + yy_size_t offset = (yy_c_buf_p) - (yytext_ptr); + ++(yy_c_buf_p); + + switch ( yy_get_next_buffer( ) ) + { + case EOB_ACT_LAST_MATCH: + /* This happens because yy_g_n_b() + * sees that we've accumulated a + * token and flags that we need to + * try matching the token before + * proceeding. But for input(), + * there's no matching to consider. + * So convert the EOB_ACT_LAST_MATCH + * to EOB_ACT_END_OF_FILE. + */ + + /* Reset buffer status. */ + libvma_yyrestart(libvma_yyin ); + + /*FALLTHROUGH*/ + + case EOB_ACT_END_OF_FILE: + { + if ( libvma_yywrap( ) ) + return EOF; + + if ( ! (yy_did_buffer_switch_on_eof) ) + YY_NEW_FILE; +#ifdef __cplusplus + return yyinput(); +#else + return input(); +#endif + } + + case EOB_ACT_CONTINUE_SCAN: + (yy_c_buf_p) = (yytext_ptr) + offset; + break; + } + } + } + + c = *(unsigned char *) (yy_c_buf_p); /* cast for 8-bit char's */ + *(yy_c_buf_p) = '\0'; /* preserve libvma_yytext */ + (yy_hold_char) = *++(yy_c_buf_p); + + YY_CURRENT_BUFFER_LVALUE->yy_at_bol = (c == '\n'); + + return c; +} +#endif /* ifndef YY_NO_INPUT */ + +/** Immediately switch to a different input stream. + * @param input_file A readable stream. + * + * @note This function does not reset the start condition to @c INITIAL . + */ + void libvma_yyrestart (FILE * input_file ) +{ + + if ( ! YY_CURRENT_BUFFER ){ + libvma_yyensure_buffer_stack (); + if ( yy_buffer_stack ) { + YY_CURRENT_BUFFER_LVALUE = + libvma_yy_create_buffer(libvma_yyin,YY_BUF_SIZE ); + } + } + + libvma_yy_init_buffer(YY_CURRENT_BUFFER,input_file ); + libvma_yy_load_buffer_state( ); +} + +/** Switch to a different input buffer. + * @param new_buffer The new input buffer. + * + */ + void libvma_yy_switch_to_buffer (YY_BUFFER_STATE new_buffer ) +{ + + /* TODO. 
We should be able to replace this entire function body + * with + * libvma_yypop_buffer_state(); + * libvma_yypush_buffer_state(new_buffer); + */ + libvma_yyensure_buffer_stack (); + if ( YY_CURRENT_BUFFER == new_buffer ) + return; + + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + if ( yy_buffer_stack ) { + YY_CURRENT_BUFFER_LVALUE = new_buffer; + } + libvma_yy_load_buffer_state( ); + + /* We don't actually know whether we did this switch during + * EOF (libvma_yywrap()) processing, but the only time this flag + * is looked at is after libvma_yywrap() is called, so it's safe + * to go ahead and always set it. + */ + (yy_did_buffer_switch_on_eof) = 1; +} + +static void libvma_yy_load_buffer_state (void) +{ + (yy_n_chars) = YY_CURRENT_BUFFER_LVALUE->yy_n_chars; + (yytext_ptr) = (yy_c_buf_p) = YY_CURRENT_BUFFER_LVALUE->yy_buf_pos; + libvma_yyin = YY_CURRENT_BUFFER_LVALUE->yy_input_file; + (yy_hold_char) = *(yy_c_buf_p); +} + +/** Allocate and initialize an input buffer state. + * @param file A readable stream. + * @param size The character buffer size in bytes. When in doubt, use @c YY_BUF_SIZE. + * + * @return the allocated buffer state. + */ + YY_BUFFER_STATE libvma_yy_create_buffer (FILE * file, int size ) +{ + YY_BUFFER_STATE b; + + b = (YY_BUFFER_STATE) libvma_yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! b ) + YY_FATAL_ERROR( "out of dynamic memory in libvma_yy_create_buffer()" ); + + b->yy_buf_size = size; + + /* yy_ch_buf has to be 2 characters longer than the size given because + * we need to put in 2 end-of-buffer characters. + */ + b->yy_ch_buf = (char *) libvma_yyalloc(b->yy_buf_size + 2 ); + if ( ! 
b->yy_ch_buf ) + YY_FATAL_ERROR( "out of dynamic memory in libvma_yy_create_buffer()" ); + + b->yy_is_our_buffer = 1; + + libvma_yy_init_buffer(b,file ); + + return b; +} + +/** Destroy the buffer. + * @param b a buffer created with libvma_yy_create_buffer() + * + */ + void libvma_yy_delete_buffer (YY_BUFFER_STATE b ) +{ + + if ( ! b ) + return; + + if ( b == YY_CURRENT_BUFFER ) /* Not sure if we should pop here. */ + YY_CURRENT_BUFFER_LVALUE = (YY_BUFFER_STATE) 0; + + if ( b->yy_is_our_buffer ) + libvma_yyfree((void *) b->yy_ch_buf ); + + libvma_yyfree((void *) b ); +} + +/* Initializes or reinitializes a buffer. + * This function is sometimes called more than once on the same buffer, + * such as during a libvma_yyrestart() or at EOF. + */ + static void libvma_yy_init_buffer (YY_BUFFER_STATE b, FILE * file ) + +{ + int oerrno = errno; + + libvma_yy_flush_buffer(b ); + + b->yy_input_file = file; + b->yy_fill_buffer = 1; + + /* If b is the current buffer, then libvma_yy_init_buffer was _probably_ + * called from libvma_yyrestart() or through yy_get_next_buffer. + * In that case, we don't want to reset the lineno or column. + */ + if (b != YY_CURRENT_BUFFER){ + b->yy_bs_lineno = 1; + b->yy_bs_column = 0; + } + + b->yy_is_interactive = file ? (isatty( fileno(file) ) > 0) : 0; + + errno = oerrno; +} + +/** Discard all buffered characters. On the next scan, YY_INPUT will be called. + * @param b the buffer state to be flushed, usually @c YY_CURRENT_BUFFER. + * + */ + void libvma_yy_flush_buffer (YY_BUFFER_STATE b ) +{ + if ( ! b ) + return; + + b->yy_n_chars = 0; + + /* We always need two end-of-buffer characters. The first causes + * a transition to the end-of-buffer state. The second causes + * a jam in that state. 
+ */ + b->yy_ch_buf[0] = YY_END_OF_BUFFER_CHAR; + b->yy_ch_buf[1] = YY_END_OF_BUFFER_CHAR; + + b->yy_buf_pos = &b->yy_ch_buf[0]; + + b->yy_at_bol = 1; + b->yy_buffer_status = YY_BUFFER_NEW; + + if ( b == YY_CURRENT_BUFFER ) + libvma_yy_load_buffer_state( ); +} + +/** Pushes the new state onto the stack. The new state becomes + * the current state. This function will allocate the stack + * if necessary. + * @param new_buffer The new state. + * + */ +void libvma_yypush_buffer_state (YY_BUFFER_STATE new_buffer ) +{ + if (new_buffer == NULL) + return; + + libvma_yyensure_buffer_stack(); + + /* This block is copied from libvma_yy_switch_to_buffer. */ + if ( YY_CURRENT_BUFFER ) + { + /* Flush out information for old buffer. */ + *(yy_c_buf_p) = (yy_hold_char); + YY_CURRENT_BUFFER_LVALUE->yy_buf_pos = (yy_c_buf_p); + YY_CURRENT_BUFFER_LVALUE->yy_n_chars = (yy_n_chars); + } + + /* Only push if top exists. Otherwise, replace top. */ + if (YY_CURRENT_BUFFER) { + (yy_buffer_stack_top)++; + } + if (yy_buffer_stack) { + YY_CURRENT_BUFFER_LVALUE = new_buffer; + } + + /* copied from libvma_yy_switch_to_buffer. */ + libvma_yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; +} + +/** Removes and deletes the top of the stack, if present. + * The next element becomes the new top. + * + */ +void libvma_yypop_buffer_state (void) +{ + if (!YY_CURRENT_BUFFER) + return; + + libvma_yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + if ((yy_buffer_stack_top) > 0) + --(yy_buffer_stack_top); + + if (YY_CURRENT_BUFFER) { + libvma_yy_load_buffer_state( ); + (yy_did_buffer_switch_on_eof) = 1; + } +} + +/* Allocates the stack if it does not exist. + * Guarantees space for at least one push. + */ +static void libvma_yyensure_buffer_stack (void) +{ + yy_size_t num_to_alloc; + + if (!(yy_buffer_stack)) { + + /* First allocation is just for 2 elements, since we don't know if this + * scanner will even need a stack. 
We use 2 instead of 1 to avoid an + * immediate realloc on the next call. + */ + num_to_alloc = 1; + (yy_buffer_stack) = (struct yy_buffer_state**)libvma_yyalloc + (num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in libvma_yyensure_buffer_stack()" ); + + memset((yy_buffer_stack), 0, num_to_alloc * sizeof(struct yy_buffer_state*)); + + (yy_buffer_stack_max) = num_to_alloc; + (yy_buffer_stack_top) = 0; + return; + } + + if ((yy_buffer_stack_top) >= ((yy_buffer_stack_max)) - 1){ + + /* Increase the buffer to prepare for a possible push. */ + int grow_size = 8 /* arbitrary grow size */; + + num_to_alloc = (yy_buffer_stack_max) + grow_size; + (yy_buffer_stack) = (struct yy_buffer_state**)libvma_yyrealloc + ((yy_buffer_stack), + num_to_alloc * sizeof(struct yy_buffer_state*) + ); + if ( ! (yy_buffer_stack) ) + YY_FATAL_ERROR( "out of dynamic memory in libvma_yyensure_buffer_stack()" ); + + /* zero only the new slots.*/ + memset((yy_buffer_stack) + (yy_buffer_stack_max), 0, grow_size * sizeof(struct yy_buffer_state*)); + (yy_buffer_stack_max) = num_to_alloc; + } +} + +/** Setup the input buffer state to scan directly from a user-specified character buffer. + * @param base the character buffer + * @param size the size in bytes of the character buffer + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE libvma_yy_scan_buffer (char * base, yy_size_t size ) +{ + YY_BUFFER_STATE b; + + if ( size < 2 || + base[size-2] != YY_END_OF_BUFFER_CHAR || + base[size-1] != YY_END_OF_BUFFER_CHAR ) + /* They forgot to leave room for the EOB's. */ + return 0; + + b = (YY_BUFFER_STATE) libvma_yyalloc(sizeof( struct yy_buffer_state ) ); + if ( ! 
b ) + YY_FATAL_ERROR( "out of dynamic memory in libvma_yy_scan_buffer()" ); + + b->yy_buf_size = size - 2; /* "- 2" to take care of EOB's */ + b->yy_buf_pos = b->yy_ch_buf = base; + b->yy_is_our_buffer = 0; + b->yy_input_file = 0; + b->yy_n_chars = b->yy_buf_size; + b->yy_is_interactive = 0; + b->yy_at_bol = 1; + b->yy_fill_buffer = 0; + b->yy_buffer_status = YY_BUFFER_NEW; + + libvma_yy_switch_to_buffer(b ); + + return b; +} + +/** Setup the input buffer state to scan a string. The next call to libvma_yylex() will + * scan from a @e copy of @a str. + * @param yystr a NUL-terminated string to scan + * + * @return the newly allocated buffer state object. + * @note If you want to scan bytes that may contain NUL values, then use + * libvma_yy_scan_bytes() instead. + */ +YY_BUFFER_STATE libvma_yy_scan_string (yyconst char * yystr ) +{ + + return libvma_yy_scan_bytes(yystr,strlen(yystr) ); +} + +/** Setup the input buffer state to scan the given bytes. The next call to libvma_yylex() will + * scan from a @e copy of @a bytes. + * @param yybytes the byte buffer to scan + * @param _yybytes_len the number of bytes in the buffer pointed to by @a bytes. + * + * @return the newly allocated buffer state object. + */ +YY_BUFFER_STATE libvma_yy_scan_bytes (yyconst char * yybytes, yy_size_t _yybytes_len ) +{ + YY_BUFFER_STATE b; + char *buf; + yy_size_t n; + yy_size_t i; + + /* Get memory for full buffer, including space for trailing EOB's. */ + n = _yybytes_len + 2; + buf = (char *) libvma_yyalloc(n ); + if ( ! buf ) + YY_FATAL_ERROR( "out of dynamic memory in libvma_yy_scan_bytes()" ); + + for ( i = 0; i < _yybytes_len; ++i ) + buf[i] = yybytes[i]; + + buf[_yybytes_len] = buf[_yybytes_len+1] = YY_END_OF_BUFFER_CHAR; + + b = libvma_yy_scan_buffer(buf,n ); + if ( ! b ) + YY_FATAL_ERROR( "bad buffer in libvma_yy_scan_bytes()" ); + + /* It's okay to grow etc. this buffer, and we should throw it + * away when we're done. 
+ */ + b->yy_is_our_buffer = 1; + + return b; +} + +#ifndef YY_EXIT_FAILURE +#define YY_EXIT_FAILURE 2 +#endif + +static void yy_fatal_error (yyconst char* msg ) +{ + (void) fprintf( stderr, "%s\n", msg ); + exit( YY_EXIT_FAILURE ); +} + +/* Redefine yyless() so it works in section 3 code. */ + +#undef yyless +#define yyless(n) \ + do \ + { \ + /* Undo effects of setting up libvma_yytext. */ \ + int yyless_macro_arg = (n); \ + YY_LESS_LINENO(yyless_macro_arg);\ + libvma_yytext[libvma_yyleng] = (yy_hold_char); \ + (yy_c_buf_p) = libvma_yytext + yyless_macro_arg; \ + (yy_hold_char) = *(yy_c_buf_p); \ + *(yy_c_buf_p) = '\0'; \ + libvma_yyleng = yyless_macro_arg; \ + } \ + while ( 0 ) + +/* Accessor methods (get/set functions) to struct members. */ + +/** Get the current line number. + * + */ +int libvma_yyget_lineno (void) +{ + + return libvma_yylineno; +} + +/** Get the input stream. + * + */ +FILE *libvma_yyget_in (void) +{ + return libvma_yyin; +} + +/** Get the output stream. + * + */ +FILE *libvma_yyget_out (void) +{ + return libvma_yyout; +} + +/** Get the length of the current token. + * + */ +yy_size_t libvma_yyget_leng (void) +{ + return libvma_yyleng; +} + +/** Get the current token. + * + */ + +char *libvma_yyget_text (void) +{ + return libvma_yytext; +} + +/** Set the current line number. + * @param line_number + * + */ +void libvma_yyset_lineno (int line_number ) +{ + + libvma_yylineno = line_number; +} + +/** Set the input stream. This does not discard the current + * input buffer. + * @param in_str A readable stream. 
+ * + * @see libvma_yy_switch_to_buffer + */ +void libvma_yyset_in (FILE * in_str ) +{ + libvma_yyin = in_str ; +} + +void libvma_yyset_out (FILE * out_str ) +{ + libvma_yyout = out_str ; +} + +int libvma_yyget_debug (void) +{ + return libvma_yy_flex_debug; +} + +void libvma_yyset_debug (int bdebug ) +{ + libvma_yy_flex_debug = bdebug ; +} + +static int yy_init_globals (void) +{ + /* Initialization is the same as for the non-reentrant scanner. + * This function is called from libvma_yylex_destroy(), so don't allocate here. + */ + + (yy_buffer_stack) = 0; + (yy_buffer_stack_top) = 0; + (yy_buffer_stack_max) = 0; + (yy_c_buf_p) = (char *) 0; + (yy_init) = 0; + (yy_start) = 0; + +/* Defined in main.c */ +#ifdef YY_STDINIT + libvma_yyin = stdin; + libvma_yyout = stdout; +#else + libvma_yyin = (FILE *) 0; + libvma_yyout = (FILE *) 0; +#endif + + /* For future reference: Set errno on error, since we are called by + * libvma_yylex_init() + */ + return 0; +} + +/* libvma_yylex_destroy is for both reentrant and non-reentrant scanners. */ +int libvma_yylex_destroy (void) +{ + + /* Pop the buffer stack, destroying each element. */ + while(YY_CURRENT_BUFFER){ + libvma_yy_delete_buffer(YY_CURRENT_BUFFER ); + YY_CURRENT_BUFFER_LVALUE = NULL; + libvma_yypop_buffer_state(); + } + + /* Destroy the stack itself. */ + libvma_yyfree((yy_buffer_stack) ); + (yy_buffer_stack) = NULL; + + /* Reset the globals. This is important in a non-reentrant scanner so the next time + * libvma_yylex() is called, initialization will occur. */ + yy_init_globals( ); + + return 0; +} + +/* + * Internal utility routines. 
+ */ + +#ifndef yytext_ptr +static void yy_flex_strncpy (char* s1, yyconst char * s2, int n ) +{ + register int i; + for ( i = 0; i < n; ++i ) + s1[i] = s2[i]; +} +#endif + +#ifdef YY_NEED_STRLEN +static int yy_flex_strlen (yyconst char * s ) +{ + register int n; + for ( n = 0; s[n]; ++n ) + ; + + return n; +} +#endif + +void *libvma_yyalloc (yy_size_t size ) +{ + return (void *) malloc( size ); +} + +void *libvma_yyrealloc (void * ptr, yy_size_t size ) +{ + /* The cast to (char *) in the following accommodates both + * implementations that use char* generic pointers, and those + * that use void* generic pointers. It works with the latter + * because both ANSI C and C++ allow castless assignment from + * any pointer type to void*, and deal with argument conversions + * as though doing an assignment. + */ + return (void *) realloc( (char *) ptr, size ); +} + +void libvma_yyfree (void * ptr ) +{ + free( (char *) ptr ); /* see libvma_yyrealloc() for (char *) cast */ +} + +#define YYTABLES_NAME "yytables" + +/* Line 276 of config_scanner.l */ + + + +int libvma_yywrap () +{ + return (1); +} + + diff --git a/src/vma/dev/allocator.cpp b/src/vma/dev/allocator.cpp new file mode 100644 index 0000000..2e7ec5a --- /dev/null +++ b/src/vma/dev/allocator.cpp @@ -0,0 +1,373 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include + +#define MODULE_NAME "allocator" + +vma_allocator::vma_allocator() +{ + __log_info_dbg(""); + + m_shmid = -1; + m_length = 0; + m_data_block = NULL; + m_mem_alloc_type = safe_mce_sys().mem_alloc_type; + + __log_info_dbg("Done"); +} + +vma_allocator::~vma_allocator() +{ + __log_info_dbg(""); + + // Unregister memory + deregister_memory(); + if (!m_data_block) { + __log_info_dbg("m_data_block is null"); + return; + } + switch (m_mem_alloc_type) { + case ALLOC_TYPE_EXTERNAL: + // not allocated by us + case ALLOC_TYPE_CONTIG: + // freed as part of deregister_memory + break; + case ALLOC_TYPE_HUGEPAGES: + if (m_shmid > 0) { + if (shmdt(m_data_block) != 0) { + __log_info_err("shmem detach failure %m"); + } + } else { // used mmap + if (munmap(m_data_block, m_length)) { + __log_info_err("failed freeing memory " + "with munmap errno " + "%d", errno); + } + } + break; + case ALLOC_TYPE_ANON: + free(m_data_block); + break; + default: + __log_info_err("Unknown memory allocation type %d", + m_mem_alloc_type); + break; + } + __log_info_dbg("Done"); +} + +void* vma_allocator::alloc_and_reg_mr(size_t size, ib_ctx_handler *p_ib_ctx_h, void *ptr /* NULL */) +{ + uint64_t access = VMA_IBV_ACCESS_LOCAL_WRITE; + + if (ptr) { + 
m_mem_alloc_type = ALLOC_TYPE_EXTERNAL; + } + switch (m_mem_alloc_type) { + case ALLOC_TYPE_EXTERNAL: + m_data_block = ptr; + register_memory(size, p_ib_ctx_h, access); + break; + case ALLOC_TYPE_HUGEPAGES: + if (!hugetlb_alloc(size)) { + __log_info_dbg("Failed allocating huge pages, " + "falling back to another memory allocation method"); + } + else { + __log_info_dbg("Huge pages allocation passed successfully"); + m_mem_alloc_type = ALLOC_TYPE_HUGEPAGES; + register_memory(size, p_ib_ctx_h, access); + break; + } + // fallthrough + case ALLOC_TYPE_CONTIG: +#ifdef VMA_IBV_ACCESS_ALLOCATE_MR + if (mce_sys_var::HYPER_MSHV != safe_mce_sys().hypervisor) { + register_memory(size, p_ib_ctx_h, (access | VMA_IBV_ACCESS_ALLOCATE_MR)); + __log_info_dbg("Contiguous pages allocation passed successfully"); + m_mem_alloc_type = ALLOC_TYPE_CONTIG; + break; + } +#endif + // fallthrough + case ALLOC_TYPE_ANON: + default: + __log_info_dbg("allocating memory using malloc()"); + align_simple_malloc(size); // if fail will raise exception + m_mem_alloc_type = ALLOC_TYPE_ANON; + register_memory(size, p_ib_ctx_h, access); + break; + } + __log_info_dbg("allocated memory using type: %d at %p, size %zd", + m_mem_alloc_type, m_data_block, size); + + return m_data_block; +} + +ibv_mr* vma_allocator::find_ibv_mr_by_ib_ctx(ib_ctx_handler *p_ib_ctx_h) const +{ + lkey_map_ib_ctx_map_t::const_iterator iter = m_lkey_map_ib_ctx.find(p_ib_ctx_h); + if (iter != m_lkey_map_ib_ctx.end()) { + return p_ib_ctx_h->get_mem_reg(iter->second); + } + + return NULL; +} + +uint32_t vma_allocator::find_lkey_by_ib_ctx(ib_ctx_handler *p_ib_ctx_h) const +{ + lkey_map_ib_ctx_map_t::const_iterator iter = m_lkey_map_ib_ctx.find(p_ib_ctx_h); + if (iter != m_lkey_map_ib_ctx.end()) { + return iter->second; + } + + return (uint32_t)(-1); +} + +bool vma_allocator::hugetlb_alloc(size_t sz_bytes) +{ + const size_t hugepagemask = 4 * 1024 * 1024 - 1; + + m_length = (sz_bytes + hugepagemask) & (~hugepagemask); + + if 
(hugetlb_mmap_alloc()) { + return true; + } + if (hugetlb_sysv_alloc()) { + return true; + } + + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_WARNING, "**************************************************************\n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_WARNING, "* NO IMMEDIATE ACTION NEEDED! \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_WARNING, "* Not enough hugepage resources for VMA memory allocation. \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_WARNING, "* VMA will continue working with regular memory allocation. \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_INFO, " * Optional: \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_INFO, " * 1. Switch to a different memory allocation type \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_INFO, " * (%s!= %d) \n", + SYS_VAR_MEM_ALLOC_TYPE, ALLOC_TYPE_HUGEPAGES); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_INFO, " * 2. Restart process after increasing the number of \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_INFO, " * hugepages resources in the system: \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_INFO, " * \"echo 1000000000 > /proc/sys/kernel/shmmax\" \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_INFO, " * \"echo 800 > /proc/sys/vm/nr_hugepages\" \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_WARNING, "* Please refer to the memory allocation section in the VMA's \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_WARNING, "* User Manual for more information \n"); + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_WARNING, "**************************************************************\n"); + return false; +} + +bool vma_allocator::hugetlb_mmap_alloc() +{ +#ifdef MAP_HUGETLB + __log_info_dbg("Allocating %zd bytes in huge tlb using mmap", m_length); + + m_data_block = mmap(NULL, m_length, + PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS | + MAP_POPULATE | MAP_HUGETLB, -1, 0); + if (m_data_block == MAP_FAILED) { + __log_info_dbg("failed allocating %zd using mmap %d", m_length, + errno); + m_data_block = NULL; + return false; + } + return true; +#else + return false; +#endif +} + + +bool 
vma_allocator::hugetlb_sysv_alloc() +{ + __log_info_dbg("Allocating %zd bytes in huge tlb with shmget", m_length); + + // allocate memory + m_shmid = shmget(IPC_PRIVATE, m_length, + SHM_HUGETLB | IPC_CREAT | SHM_R | SHM_W); + if (m_shmid < 0) { + return false; + } + + // get pointer to allocated memory + m_data_block = shmat(m_shmid, NULL, 0); + if (m_data_block == (void*)-1) { + __log_info_warn("Shared memory attach failure (errno=%d %m)", errno); + shmctl(m_shmid, IPC_RMID, NULL); + m_shmid = -1; + m_data_block = NULL; + return false; + } + + // mark 'to be destroyed' when process detaches from shmem segment + // this will clear the HugePage resources even if process if killed not nicely + if (shmctl(m_shmid, IPC_RMID, NULL)) { + __log_info_warn("Shared memory contrl mark 'to be destroyed' failed " + "(errno=%d %m)", errno); + } + + // We want to determine now that we can lock it. Note: it was claimed + // that without actual mlock, linux might be buggy on this with huge-pages + int rc = mlock(m_data_block, m_length); + if (rc!=0) { + __log_info_warn("mlock of shared memory failure (errno=%d %m)", errno); + if (shmdt(m_data_block) != 0) { + __log_info_err("shmem detach failure %m"); + } + m_data_block = NULL; // no value to try shmdt later + m_shmid = -1; + return false; + } + + return true; +} + +void vma_allocator::align_simple_malloc(size_t sz_bytes) +{ + int ret = 0; + long page_size = sysconf(_SC_PAGESIZE); + + if (page_size > 0) { + m_length = (sz_bytes + page_size - 1) & (~page_size - 1); + ret = posix_memalign(&m_data_block, page_size, m_length); + if (!ret) { + __log_info_dbg("allocated %zd aligned memory at %p", + m_length, m_data_block); + return; + } + } + __log_info_dbg("failed allocating memory with posix_memalign size %zd " + "returned %d (errno=%d %m) ", m_length, ret, errno); + + m_length = sz_bytes; + m_data_block = malloc(sz_bytes); + + if (m_data_block == NULL) { + __log_info_dbg("failed allocating data memory block " + "(size=%d bytes) 
(errno=%d %m)", sz_bytes, errno); + throw_vma_exception("failed allocating data memory block"); + } + __log_info_dbg("allocated memory using malloc()"); +} + +void vma_allocator::register_memory(size_t size, ib_ctx_handler *p_ib_ctx_h, + uint64_t access) +{ + ib_context_map_t *ib_ctx_map = NULL; + ib_ctx_handler *p_ib_ctx_h_ref = p_ib_ctx_h; + uint32_t lkey = (uint32_t)(-1); + bool failed = false; + + ib_ctx_map = g_p_ib_ctx_handler_collection->get_ib_cxt_list(); + if (ib_ctx_map) { + ib_context_map_t::iterator iter; + + for (iter = ib_ctx_map->begin(); iter != ib_ctx_map->end(); iter++) { + p_ib_ctx_h = iter->second; + if (p_ib_ctx_h_ref && p_ib_ctx_h != p_ib_ctx_h_ref) { + continue; + } + lkey = p_ib_ctx_h->mem_reg(m_data_block, size, access); + if (lkey == (uint32_t)(-1)) { + __log_info_warn("Failure during memory registration on dev: %s addr=%p length=%d", + p_ib_ctx_h->get_ibname(), m_data_block, size); + failed = true; + break; + } else { + m_lkey_map_ib_ctx[p_ib_ctx_h] = lkey; + if (NULL == m_data_block) { + m_data_block = p_ib_ctx_h->get_mem_reg(lkey)->addr; + } + errno = 0; //ibv_reg_mr() set errno=12 despite successful returning +#ifdef VMA_IBV_ACCESS_ALLOCATE_MR + if ((access & VMA_IBV_ACCESS_ALLOCATE_MR) != 0) { // contig pages mode + // When using 'IBV_ACCESS_ALLOCATE_MR', ibv_reg_mr will return a pointer that its 'addr' field will hold the address of the allocated memory. + // Second registration and above is done using 'IBV_ACCESS_LOCAL_WRITE' and the 'addr' we received from the first registration. + access &= ~VMA_IBV_ACCESS_ALLOCATE_MR; + } +#endif + __log_info_dbg("Registered memory on dev: %s addr=%p length=%d", + p_ib_ctx_h->get_ibname(), m_data_block, size); + } + if (p_ib_ctx_h == p_ib_ctx_h_ref) { + break; + } + } + } + + /* Possible cases: + * 1. no IB device: it is not possible to register memory + * - return w/o error + * 2. 
p_ib_ctx_h is null: try to register on all IB devices + * - fatal return if at least one IB device can not register memory + * - return w/o error in case no issue is observed + * 3. p_ib_ctx is defined: try to register on specific device + * - fatal return if device is found and registration fails + * - return w/o error in case no issue is observed or device is not found + */ + if (failed) { + __log_info_warn("Failed registering memory, This might happen " + "due to low MTT entries. Please refer to README.txt " + "for more info"); + if (m_data_block) { + __log_info_dbg("Failed registering memory block with device " + "(ptr=%p size=%ld%s) (errno=%d %m)", + m_data_block, size, errno); + } + throw_vma_exception("Failed registering memory"); + } + + return; +} + +void vma_allocator::deregister_memory() +{ + ib_ctx_handler *p_ib_ctx_h = NULL; + ib_context_map_t *ib_ctx_map = NULL; + uint32_t lkey = (uint32_t)(-1); + + ib_ctx_map = g_p_ib_ctx_handler_collection->get_ib_cxt_list(); + if (ib_ctx_map) { + ib_context_map_t::iterator iter; + + for (iter = ib_ctx_map->begin(); iter != ib_ctx_map->end(); iter++) { + p_ib_ctx_h = iter->second; + lkey = find_lkey_by_ib_ctx(p_ib_ctx_h); + if (lkey != (uint32_t)(-1)) { + p_ib_ctx_h->mem_dereg(lkey); + m_lkey_map_ib_ctx.erase(p_ib_ctx_h); + } + } + } + m_lkey_map_ib_ctx.clear(); +} diff --git a/src/vma/dev/allocator.h b/src/vma/dev/allocator.h new file mode 100644 index 0000000..f96ce21 --- /dev/null +++ b/src/vma/dev/allocator.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef SRC_VMA_DEV_ALLOCATOR_H_ +#define SRC_VMA_DEV_ALLOCATOR_H_ + +#include "vlogger/vlogger.h" +#include "ib_ctx_handler_collection.h" + + +class ib_ctx_handler; +typedef std::tr1::unordered_map lkey_map_ib_ctx_map_t; + +class vma_allocator { +public: + vma_allocator(); + ~vma_allocator(); + void* alloc_and_reg_mr(size_t size, ib_ctx_handler *p_ib_ctx_h, void *ptr = NULL); + uint32_t find_lkey_by_ib_ctx(ib_ctx_handler *p_ib_ctx_h) const; + ibv_mr* find_ibv_mr_by_ib_ctx(ib_ctx_handler *p_ib_ctx_h) const; + void register_memory(size_t size, ib_ctx_handler *p_ib_ctx_h, uint64_t access); + void deregister_memory(); +private: + void align_simple_malloc(size_t sz_bytes); + bool hugetlb_alloc(size_t sz_bytes); + bool hugetlb_mmap_alloc(); + bool hugetlb_sysv_alloc(); + lkey_map_ib_ctx_map_t m_lkey_map_ib_ctx; + int m_shmid; + size_t m_length; + void *m_data_block; + alloc_mode_t m_mem_alloc_type; +}; + +#endif /* SRC_VMA_DEV_ALLOCATOR_H_ */ diff --git a/src/vma/dev/buffer_pool.cpp b/src/vma/dev/buffer_pool.cpp new file mode 100644 index 0000000..8a53a22 --- /dev/null +++ b/src/vma/dev/buffer_pool.cpp @@ -0,0 +1,374 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "buffer_pool.h" + +#include +#include // for MIN + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "vma/util/sys_vars.h" +#include "vma/proto/mem_buf_desc.h" +#include "ib_ctx_handler_collection.h" + +#define MODULE_NAME "bpool" + +buffer_pool *g_buffer_pool_rx = NULL; +buffer_pool *g_buffer_pool_tx = NULL; + +// inlining a function only help in case it come before using it... +inline void buffer_pool::put_buffer_helper(mem_buf_desc_t *buff) +{ +#if VLIST_DEBUG + if (buff->buffer_node.is_list_member()) { + __log_info_warn("Buffer is already a member in a list! id=[%s]", buff->buffer_node.list_id()); + } +#endif + buff->p_next_desc = m_p_head; + free_lwip_pbuf(&buff->lwip_pbuf); + m_p_head = buff; + m_n_buffers++; + m_p_bpool_stat->n_buffer_pool_size++; +} + +/** Free-callback function to free a 'struct pbuf_custom_ref', called by + * pbuf_free. 
*/ +void buffer_pool::free_rx_lwip_pbuf_custom(struct pbuf *p_buff) +{ + g_buffer_pool_rx->put_buffers_thread_safe((mem_buf_desc_t *)p_buff); +} + +void buffer_pool::free_tx_lwip_pbuf_custom(struct pbuf *p_buff) +{ + g_buffer_pool_tx->put_buffers_thread_safe((mem_buf_desc_t *)p_buff); +} + +buffer_pool::buffer_pool(size_t buffer_count, size_t buf_size, pbuf_free_custom_fn custom_free_function) : + m_lock_spin("buffer_pool"), + m_n_buffers(0), + m_n_buffers_created(buffer_count), + m_p_head(NULL) +{ + size_t sz_aligned_element = 0; + uint8_t *ptr_buff, *ptr_desc; + + __log_info_func("count = %d", buffer_count); + + m_p_bpool_stat = &m_bpool_stat_static; + memset(m_p_bpool_stat , 0, sizeof(*m_p_bpool_stat)); + vma_stats_instance_create_bpool_block(m_p_bpool_stat); + + if (buffer_count) { + sz_aligned_element = (buf_size + MCE_ALIGNMENT) & (~MCE_ALIGNMENT); + m_size = (sizeof(mem_buf_desc_t) + sz_aligned_element) * buffer_count + MCE_ALIGNMENT; + } else { + m_size = buf_size; + } + void *data_block = m_allocator.alloc_and_reg_mr(m_size, NULL); + + + if (!buffer_count) return; + + // Align pointers + ptr_buff = (uint8_t *)((unsigned long)((char*)data_block + MCE_ALIGNMENT) & (~MCE_ALIGNMENT)); + ptr_desc = ptr_buff + sz_aligned_element * buffer_count; + + // Split the block to buffers + for (size_t i = 0; i < buffer_count; ++i) { + mem_buf_desc_t *desc = new (ptr_desc) mem_buf_desc_t(ptr_buff, buf_size, custom_free_function); + put_buffer_helper(desc); + + ptr_buff += sz_aligned_element; + ptr_desc += sizeof(mem_buf_desc_t); + } + + print_val_tbl(); + + __log_info_func("done"); +} + +buffer_pool::~buffer_pool() +{ + free_bpool_resources(); +} + +void buffer_pool::free_bpool_resources() +{ + if (m_n_buffers == m_n_buffers_created) { + __log_info_func("count %lu, missing %lu", m_n_buffers, + m_n_buffers_created-m_n_buffers); + } + else { + __log_info_dbg("count %lu, missing %lu", m_n_buffers, + m_n_buffers_created - m_n_buffers); + } + + 
vma_stats_instance_remove_bpool_block(m_p_bpool_stat); + + __log_info_func("done"); +} + +void buffer_pool::register_memory(ib_ctx_handler *p_ib_ctx_h) +{ + m_allocator.register_memory(m_size, p_ib_ctx_h, VMA_IBV_ACCESS_LOCAL_WRITE); +} + +void buffer_pool::print_val_tbl() +{ + __log_info_dbg("pool 0x%X size: %ld buffers: %lu", this, m_size, m_n_buffers); +} + +bool buffer_pool::get_buffers_thread_safe(descq_t &pDeque, ring_slave* desc_owner, size_t count, uint32_t lkey) +{ + auto_unlocker lock(m_lock_spin); + + mem_buf_desc_t *head; + + __log_info_funcall("requested %lu, present %lu, created %lu", count, m_n_buffers, m_n_buffers_created); + + if (unlikely(m_n_buffers < count)) { + VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS(VLOG_DEBUG, VLOG_FUNC, "ERROR! not enough buffers in the pool (requested: %lu, have: %lu, created: %lu, Buffer pool type: %s)", + count, m_n_buffers, m_n_buffers_created, m_p_bpool_stat->is_rx ? "Rx" : "Tx"); + + m_p_bpool_stat->n_buffer_pool_no_bufs++; + return false; + } + + // pop buffers from the list + m_n_buffers -= count; + m_p_bpool_stat->n_buffer_pool_size -= count; + while (count-- > 0) { + // Remove from list + head = m_p_head; + m_p_head = m_p_head->p_next_desc; + head->p_next_desc = NULL; + + // Init + head->lkey = lkey; + head->p_desc_owner = desc_owner; + + // Push to queue + pDeque.push_back(head); + } + + return true; +} + +uint32_t buffer_pool::find_lkey_by_ib_ctx_thread_safe(ib_ctx_handler* p_ib_ctx_h) +{ + auto_unlocker lock(m_lock_spin); + return m_allocator.find_lkey_by_ib_ctx(p_ib_ctx_h); +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + +/* + * this function is minimal C version of Floyd's cycle-finding algorithm + * just for determining whether a circle exists or not. 
+ * Complexity is O(n) + * see: http://en.wikipedia.org/wiki/Cycle_detection#Tortoise_and_hare + */ +bool isCircle (mem_buf_desc_t *pNode) { + if (!pNode) return false; + + mem_buf_desc_t *p1 = pNode; + mem_buf_desc_t *p2 = pNode; + + while (p2->p_next_desc && p2->p_next_desc->p_next_desc) + { + p1 = p1->p_next_desc; + p2 = p2->p_next_desc->p_next_desc; + if (p1 == p2) + return true; + } + return false; +} + +typedef mem_buf_desc_t* Node; + +static inline Node f(Node x) { + // NOTE: after we determined we have a circle, no need to check for nullity + return x->p_next_desc; +} + +// full version of Floyd's cycle-finding algorithm +// see: http://en.wikipedia.org/wiki/Cycle_detection#Tortoise_and_hare +void Floyd_LogCircleInfo(Node x0) { + + // The main phase of the algorithm, finding a repetition x_mu = x_2mu + // The hare moves twice as quickly as the tortoise + Node tortoise = f(x0); // f(x0) is the element/node next to x0. + Node hare = f(f(x0)); + while (tortoise != hare) { + tortoise = f(tortoise); + hare = f(f(hare)); + } + + // at this point tortoise position is equvi-distant from x0 + // and current hare position (which is the same as tortoise position). This is + // true because tortoise moved exactly half of the hare way. + // so hare (set to tortoise-current position and move at tortoise speed) moving in + // circle and tortoise (set to x0 ) moving towards circle, must meet at + // current hare position (== current turtle position). Realize that they move + // in same speed, the first intersection will be the beginning of the circle. 
+ // + + // Find the position of the first repetition of length mu + // The hare and tortoise move at the same speeds + int mu = 0; // first index that starts the circle + hare = tortoise; + tortoise = x0; + const int MAX_STEPS = 1 << 24; // = 16M + while (tortoise != hare) { + tortoise = f(tortoise); + hare = f(hare); + mu++; + if (mu > MAX_STEPS) break; // extra safety; not really needed + } + + // Find the length of the shortest cycle starting from x_mu + // The hare moves while the tortoise stays still + int lambda = 1; //circle length + hare = f(tortoise); + while (tortoise != hare) { + hare = f(hare); + lambda++; + if (lambda > MAX_STEPS) break; // extra safety; not really needed + } + vlog_printf (VLOG_ERROR, "circle first index (mu) = %d, circle length (lambda) = %d", mu, lambda); +} + +void buffer_pool::buffersPanic() +{ + if (isCircle(m_p_head)) + { + __log_info_err("Circle was found in buffer_pool"); + + // print mu & lambda of circle + Floyd_LogCircleInfo(m_p_head); + } + else + { + __log_info_info("no circle was found in buffer_pool"); + } + + // log backtrace + const int MAX_BACKTRACE = 25; + char **symbols; + void *addresses[MAX_BACKTRACE]; + int count = backtrace(addresses, MAX_BACKTRACE); + symbols = backtrace_symbols(addresses, count); + for (int i = 0; i < count; ++i) { + vlog_printf(VLOG_ERROR, " %2d %s\n", i, symbols[i]); + } + + __log_info_panic("m_n_buffers(%lu) > m_n_buffers_created(%lu)", m_n_buffers, m_n_buffers_created); +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + +inline void buffer_pool::put_buffers(mem_buf_desc_t *buff_list) +{ + mem_buf_desc_t *next; + __log_info_funcall("returning list, present %lu, created %lu", m_n_buffers, m_n_buffers_created); + while (buff_list) { + next = buff_list->p_next_desc; + put_buffer_helper(buff_list); + buff_list = next; + } + + if (unlikely(m_n_buffers > m_n_buffers_created)) { + buffersPanic(); + } +} + +void buffer_pool::put_buffers_thread_safe(mem_buf_desc_t *buff_list) +{ + 
auto_unlocker lock(m_lock_spin); + put_buffers(buff_list); +} + +void buffer_pool::put_buffers(descq_t *buffers, size_t count) +{ + mem_buf_desc_t *buff_list, *next; + size_t amount; + __log_info_funcall("returning %lu, present %lu, created %lu", count, m_n_buffers, m_n_buffers_created); + for (amount = MIN(count, buffers->size()); amount > 0 ; amount--) { + buff_list = buffers->get_and_pop_back(); + while (buff_list) { + next = buff_list->p_next_desc; + put_buffer_helper(buff_list); + buff_list = next; + } + } + + if (unlikely(m_n_buffers > m_n_buffers_created)) { + buffersPanic(); + } +} + +void buffer_pool::put_buffers_thread_safe(descq_t *buffers, size_t count) +{ + auto_unlocker lock(m_lock_spin); + put_buffers(buffers, count); +} + +void buffer_pool::put_buffers_after_deref_thread_safe(descq_t *pDeque) +{ + auto_unlocker lock(m_lock_spin); + while (!pDeque->empty()) { + mem_buf_desc_t * list = pDeque->get_and_pop_front(); + if (list->dec_ref_count() <= 1 && (list->lwip_pbuf.pbuf.ref-- <= 1)) { + put_buffers(list); + } + } +} + +size_t buffer_pool::get_free_count() +{ + return m_n_buffers; +} + +void buffer_pool::set_RX_TX_for_stats(bool rx) +{ + if (rx) + m_p_bpool_stat->is_rx = true; + else + m_p_bpool_stat->is_tx = true; +} + diff --git a/src/vma/dev/buffer_pool.h b/src/vma/dev/buffer_pool.h new file mode 100644 index 0000000..606f37b --- /dev/null +++ b/src/vma/dev/buffer_pool.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef BUFFER_POOL_H +#define BUFFER_POOL_H + +#include "utils/lock_wrapper.h" +#include "vma/util/vma_stats.h" +#include "vma/proto/mem_buf_desc.h" +#include "vma/dev/allocator.h" + +inline static void free_lwip_pbuf(struct pbuf_custom *pbuf_custom) +{ + pbuf_custom->pbuf.flags = 0; + pbuf_custom->pbuf.ref = 0; +} + +/** + * A buffer pool which internally sorts the buffers. 
+ */ +class buffer_pool +{ +public: + buffer_pool(size_t buffer_count, size_t size, pbuf_free_custom_fn custom_free_function); + ~buffer_pool(); + + void register_memory(ib_ctx_handler *p_ib_ctx_h); + void print_val_tbl(); + + uint32_t find_lkey_by_ib_ctx_thread_safe(ib_ctx_handler* p_ib_ctx_h); + + /** + * Get buffers from the pool - thread safe + * @parma pDeque List to put the buffers. + * @param desc_owner The new owner of the buffers. + * @param count Number of buffers required. + * @param lkey The registered memory lkey. + * @return False if no buffers are available, else True. + */ + bool get_buffers_thread_safe(descq_t &pDeque, ring_slave* desc_owner, size_t count, uint32_t lkey); + + /** + * Return buffers to the pool. + */ + void put_buffers(descq_t *buffers, size_t count); + void put_buffers_thread_safe(descq_t *buffers, size_t count); + void put_buffers(mem_buf_desc_t *buff_list); + void put_buffers_thread_safe(mem_buf_desc_t *buff_list); + static void free_rx_lwip_pbuf_custom(struct pbuf *p_buff); + static void free_tx_lwip_pbuf_custom(struct pbuf *p_buff); + + /** + * Assume locked owner!!! Return buffers to the pool with ref_count check. + */ + void put_buffers_after_deref_thread_safe(descq_t *pDeque); + + /** + * @return Number of free buffers in the pool. 
+ */ + size_t get_free_count(); + + void set_RX_TX_for_stats(bool rx); + +private: + lock_spin m_lock_spin; + // XXX-dummy buffer list head and count + // to be replaced with a bucket-sorted array + + size_t m_size; /* pool size in bytes */ + size_t m_n_buffers; + size_t m_n_buffers_created; + mem_buf_desc_t *m_p_head; + + bpool_stats_t* m_p_bpool_stat; + bpool_stats_t m_bpool_stat_static; + vma_allocator m_allocator; + /** + * Add a buffer to the pool + */ + inline void put_buffer_helper(mem_buf_desc_t *buff); + + void buffersPanic(); + + /** + * dtor + */ + inline void free_bpool_resources(); +}; + +extern buffer_pool* g_buffer_pool_rx; +extern buffer_pool* g_buffer_pool_tx; + + +#endif diff --git a/src/vma/dev/cq_mgr.cpp b/src/vma/dev/cq_mgr.cpp new file mode 100644 index 0000000..c276563 --- /dev/null +++ b/src/vma/dev/cq_mgr.cpp @@ -0,0 +1,984 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "cq_mgr.h" +#include "cq_mgr.inl" +#include +#include +#include +#include + +#include "utils/bullseye.h" +#include +#include +#include "vma/util/instrumentation.h" +#include +#include "vma/ib/base/verbs_extra.h" + +#include "buffer_pool.h" +#include "qp_mgr.h" +#include "ring_simple.h" + +#define MODULE_NAME "cqm" + +#define cq_logpanic __log_info_panic +#define cq_logerr __log_info_err +#define cq_logwarn __log_info_warn +#define cq_loginfo __log_info_info +#define cq_logdbg __log_info_dbg +#define cq_logfunc __log_info_func +#define cq_logfuncall __log_info_funcall + +#define cq_logdbg_no_funcname(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_DEBUG) vlog_printf(VLOG_DEBUG, MODULE_NAME "[%p]:%d: " log_fmt "\n", __INFO__, __LINE__, ##log_args); } while (0) + +#if VLIST_DEBUG +#define VLIST_DEBUG_CQ_MGR_PRINT_ERROR_IS_MEMBER do { \ + if (buff->buffer_node.is_list_member()) \ + cq_logwarn("Buffer is already a member in a list! 
id=[%s]", buff->buffer_node.list_id()); \ + } while (0) +#else +#define VLIST_DEBUG_CQ_MGR_PRINT_ERROR_IS_MEMBER +#endif + +atomic_t cq_mgr::m_n_cq_id_counter = ATOMIC_INIT(1); + +uint64_t cq_mgr::m_n_global_sn = 0; + +cq_mgr::cq_mgr(ring_simple* p_ring, ib_ctx_handler* p_ib_ctx_handler, int cq_size, struct ibv_comp_channel* p_comp_event_channel, bool is_rx, bool config) : + m_p_ibv_cq(NULL) + ,m_b_is_rx(is_rx) + ,m_cq_id(0) + ,m_n_cq_poll_sn(0) + ,m_p_ring(p_ring) + ,m_n_wce_counter(0) + ,m_b_was_drained(false) + ,m_b_is_rx_hw_csum_on(false) + ,m_n_sysvar_cq_poll_batch_max(safe_mce_sys().cq_poll_batch_max) + ,m_n_sysvar_progress_engine_wce_max(safe_mce_sys().progress_engine_wce_max) + ,m_p_cq_stat(&m_cq_stat_static) // use local copy of stats by default (on rx cq get shared memory stats) + ,m_transport_type(m_p_ring->get_transport_type()) + ,m_p_next_rx_desc_poll(NULL) + ,m_n_sysvar_rx_prefetch_bytes_before_poll(safe_mce_sys().rx_prefetch_bytes_before_poll) + ,m_n_sysvar_rx_prefetch_bytes(safe_mce_sys().rx_prefetch_bytes) + ,m_sz_transport_header(0) + ,m_p_ib_ctx_handler(p_ib_ctx_handler) + ,m_n_sysvar_rx_num_wr_to_post_recv(safe_mce_sys().rx_num_wr_to_post_recv) + ,m_comp_event_channel(p_comp_event_channel) + ,m_b_notification_armed(false) + ,m_n_sysvar_qp_compensation_level(safe_mce_sys().qp_compensation_level) + ,m_rx_lkey(g_buffer_pool_rx->find_lkey_by_ib_ctx_thread_safe(m_p_ib_ctx_handler)) + ,m_b_sysvar_cq_keep_qp_full(safe_mce_sys().cq_keep_qp_full) + ,m_n_out_of_free_bufs_warning(0) + ,m_rx_buffs_rdy_for_free_head(NULL) + ,m_rx_buffs_rdy_for_free_tail(NULL) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (m_rx_lkey == 0) { + __log_info_panic("invalid lkey found %lu", m_rx_lkey); + } + BULLSEYE_EXCLUDE_BLOCK_END + + memset(&m_cq_stat_static, 0, sizeof(m_cq_stat_static)); + memset(&m_qp_rec, 0, sizeof(m_qp_rec)); + m_rx_queue.set_id("cq_mgr (%p) : m_rx_queue", this); + m_rx_pool.set_id("cq_mgr (%p) : m_rx_pool", this); + m_cq_id = 
atomic_fetch_and_inc(&m_n_cq_id_counter); // cq id is nonzero + if (config) + configure(cq_size); +} + +void cq_mgr::configure(int cq_size) +{ + vma_ibv_cq_init_attr attr; + memset(&attr, 0, sizeof(attr)); + + prep_ibv_cq(attr); + + m_p_ibv_cq = vma_ibv_create_cq(m_p_ib_ctx_handler->get_ibv_context(), + cq_size - 1, (void *)this, m_comp_event_channel, 0, &attr); + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_ibv_cq) { + throw_vma_exception("ibv_create_cq failed"); + } + BULLSEYE_EXCLUDE_BLOCK_END + VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_cq, sizeof(ibv_cq)); + switch (m_transport_type) { + case VMA_TRANSPORT_IB: + m_sz_transport_header = GRH_HDR_LEN; + break; + case VMA_TRANSPORT_ETH: + m_sz_transport_header = ETH_HDR_LEN; + break; + BULLSEYE_EXCLUDE_BLOCK_START + default: + cq_logpanic("Unknown transport type: %d", m_transport_type); + break; + BULLSEYE_EXCLUDE_BLOCK_END + } + + if (m_b_is_rx) { + vma_stats_instance_create_cq_block(m_p_cq_stat); + } + + if (m_b_is_rx) { + m_b_is_rx_hw_csum_on = vma_is_rx_hw_csum_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); + cq_logdbg("RX CSUM support = %d", m_b_is_rx_hw_csum_on); + } + + cq_logdbg("Created CQ as %s with fd[%d] and of size %d elements (ibv_cq_hndl=%p)", (m_b_is_rx?"Rx":"Tx"), get_channel_fd(), cq_size, m_p_ibv_cq); +} + +void cq_mgr::prep_ibv_cq(vma_ibv_cq_init_attr& attr) const +{ + if (m_p_ib_ctx_handler->get_ctx_time_converter_status()) { + vma_ibv_cq_init_ts_attr(&attr); + } +} + +uint32_t cq_mgr::clean_cq() +{ + uint32_t ret_total = 0; + int ret = 0; + uint64_t cq_poll_sn = 0; + mem_buf_desc_t* buff = NULL; + /* coverity[stack_use_local_overflow] */ + vma_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; + while ((ret = poll(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { + for (int i = 0; i < ret; i++) { + if (m_b_is_rx) { + buff = process_cq_element_rx(&wce[i]); + } else { + buff = process_cq_element_tx(&wce[i]); + } + if (buff) + m_rx_queue.push_back(buff); + } + ret_total += ret; + } + + return ret_total; +} + 
+cq_mgr::~cq_mgr() +{ + cq_logfunc(""); + cq_logdbg("destroying CQ as %s", (m_b_is_rx?"Rx":"Tx")); + + if (m_rx_buffs_rdy_for_free_head) { + reclaim_recv_buffers(m_rx_buffs_rdy_for_free_head); + } + + m_b_was_drained = true; + if (m_rx_queue.size() + m_rx_pool.size()) { + cq_logdbg("Returning %d buffers to global Rx pool (ready queue %d, free pool %d))", m_rx_queue.size() + m_rx_pool.size(), m_rx_queue.size(), m_rx_pool.size()); + + g_buffer_pool_rx->put_buffers_thread_safe(&m_rx_queue, m_rx_queue.size()); + m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); + + g_buffer_pool_rx->put_buffers_thread_safe(&m_rx_pool, m_rx_pool.size()); + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + } + + cq_logfunc("destroying ibv_cq"); + IF_VERBS_FAILURE_EX(ibv_destroy_cq(m_p_ibv_cq), EIO) { + cq_logdbg("destroy cq failed (errno=%d %m)", errno); + } ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(m_p_ibv_cq, sizeof(ibv_cq)); + + statistics_print(); + if (m_b_is_rx) + vma_stats_instance_remove_cq_block(m_p_cq_stat); + + cq_logdbg("done"); +} + +void cq_mgr::statistics_print() +{ + if (m_p_cq_stat->n_rx_pkt_drop || m_p_cq_stat->n_rx_sw_queue_len || + m_p_cq_stat->n_rx_drained_at_once_max || m_p_cq_stat->n_buffer_pool_len) { + cq_logdbg_no_funcname("Packets dropped: %12llu", m_p_cq_stat->n_rx_pkt_drop); + cq_logdbg_no_funcname("Drained max: %17u", m_p_cq_stat->n_rx_drained_at_once_max); + } +} + +ibv_cq* cq_mgr::get_ibv_cq_hndl() +{ + return m_p_ibv_cq; +} + +int cq_mgr::get_channel_fd() +{ + return m_comp_event_channel->fd; +} + +void cq_mgr::add_qp_rx(qp_mgr* qp) +{ + cq_logdbg("qp_mgr=%p", qp); + descq_t temp_desc_list; + temp_desc_list.set_id("cq_mgr (%p) : temp_desc_list", this); + + m_p_cq_stat->n_rx_drained_at_once_max = 0; + + /* return_extra_buffers(); */ //todo?? 
+ + // Initial fill of receiver work requests + uint32_t qp_rx_wr_num = qp->get_rx_max_wr_num(); + cq_logdbg("Trying to push %d WRE to allocated qp (%p)", qp_rx_wr_num, qp); + while (qp_rx_wr_num) { + uint32_t n_num_mem_bufs = m_n_sysvar_rx_num_wr_to_post_recv; + if (n_num_mem_bufs > qp_rx_wr_num) + n_num_mem_bufs = qp_rx_wr_num; + bool res = g_buffer_pool_rx->get_buffers_thread_safe(temp_desc_list, m_p_ring, n_num_mem_bufs, m_rx_lkey); + if (!res) { + VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS(VLOG_WARNING, VLOG_DEBUG, "WARNING Out of mem_buf_desc from Rx buffer pool for qp_mgr qp_mgr initialization (qp=%p),\n" + "\tThis might happen due to wrong setting of VMA_RX_BUFS and VMA_RX_WRE. Please refer to README.txt for more info", qp); + break; + } + + qp->post_recv_buffers(&temp_desc_list, temp_desc_list.size()); + if (!temp_desc_list.empty()) { + cq_logdbg("qp post recv is already full (push=%d, planned=%d)", qp->get_rx_max_wr_num()-qp_rx_wr_num, qp->get_rx_max_wr_num()); + g_buffer_pool_rx->put_buffers_thread_safe(&temp_desc_list, temp_desc_list.size()); + break; + } + qp_rx_wr_num -= n_num_mem_bufs; + } + cq_logdbg("Successfully post_recv qp with %d new Rx buffers (planned=%d)", qp->get_rx_max_wr_num()-qp_rx_wr_num, qp->get_rx_max_wr_num()); + + // Add qp_mgr to map + m_qp_rec.qp = qp; + m_qp_rec.debt = 0; +} + +void cq_mgr::del_qp_rx(qp_mgr *qp) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (m_qp_rec.qp != qp) { + cq_logdbg("wrong qp_mgr=%p != m_qp_rec.qp=%p", qp, m_qp_rec.qp); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + cq_logdbg("qp_mgr=%p", m_qp_rec.qp); + return_extra_buffers(); + + clean_cq(); + memset(&m_qp_rec, 0, sizeof(m_qp_rec)); +} + +void cq_mgr::add_qp_tx(qp_mgr* qp) +{ + //Assume locked! + cq_logdbg("qp_mgr=%p", qp); + m_qp_rec.qp = qp; + m_qp_rec.debt = 0; +} + +bool cq_mgr::request_more_buffers() +{ + cq_logfuncall("Allocating additional %d buffers for internal use", m_n_sysvar_qp_compensation_level); + + // Assume locked! 
+ // Add an additional free buffer descs to RX cq mgr + bool res = g_buffer_pool_rx->get_buffers_thread_safe(m_rx_pool, m_p_ring, m_n_sysvar_qp_compensation_level, m_rx_lkey); + if (!res) { + cq_logfunc("Out of mem_buf_desc from RX free pool for internal object pool"); + return false; + }; + + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + return true; +} + +void cq_mgr::return_extra_buffers() +{ + if (m_rx_pool.size() < m_n_sysvar_qp_compensation_level * 2) + return; + int buff_to_rel = m_rx_pool.size() - m_n_sysvar_qp_compensation_level; + + cq_logfunc("releasing %d buffers to global rx pool", buff_to_rel); + g_buffer_pool_rx->put_buffers_thread_safe(&m_rx_pool, buff_to_rel); + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); +} + +int cq_mgr::poll(vma_ibv_wc* p_wce, int num_entries, uint64_t* p_cq_poll_sn) +{ + // Assume locked!!! + cq_logfuncall(""); + +#ifdef RDTSC_MEASURE_RX_VERBS_READY_POLL + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VERBS_READY_POLL]); +#endif //RDTSC_MEASURE_RX_VERBS_READY_POLL + +#ifdef RDTSC_MEASURE_RX_VERBS_IDLE_POLL + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VERBS_IDLE_POLL]); +#endif //RDTSC_MEASURE_RX_VERBS_IDLE_POLL + +#ifdef RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VMA_TCP_IDLE_POLL]); +#endif //RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL + int ret = vma_ibv_poll_cq(m_p_ibv_cq, num_entries, p_wce); + if (ret <= 0) { +#ifdef RDTSC_MEASURE_RX_VERBS_IDLE_POLL + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VERBS_IDLE_POLL]); +#endif + +#ifdef RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VMA_TCP_IDLE_POLL]); +#endif + // Zero polled wce OR ibv_poll_cq() has driver specific errors + // so we can't really do anything with them +#ifdef RDTSC_MEASURE_RX_CQE_RECEIVEFROM + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_CQE_TO_RECEIVEFROM]); +#endif + *p_cq_poll_sn = m_n_global_sn; +#ifdef VMA_TIME_MEASURE + 
INC_ERR_POLL_COUNT; +#endif + return 0; + } + else { +#ifdef RDTSC_MEASURE_RX_VERBS_READY_POLL + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VERBS_READY_POLL]); +#endif //RDTSC_MEASURE_RX_VERBS_READY_POLL + +#ifdef RDTSC_MEASURE_RX_READY_POLL_TO_LWIP + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_READY_POLL_TO_LWIP]); +#endif + } + +#ifdef VMA_TIME_MEASURE + TAKE_POLL_CQ_IN; +#endif + + if (unlikely(g_vlogger_level >= VLOG_FUNC_ALL)) { + for (int i = 0; i < ret; i++) { + cq_logfuncall("wce[%d] info wr_id=%x, status=%x, opcode=%x, vendor_err=%x, byte_len=%d, imm_data=%x", i, p_wce[i].wr_id, p_wce[i].status, vma_wc_opcode(p_wce[i]), p_wce[i].vendor_err, p_wce[i].byte_len, p_wce[i].imm_data); + cq_logfuncall("qp_num=%x, src_qp=%x, wc_flags=%x, pkey_index=%x, slid=%x, sl=%x, dlid_path_bits=%x", p_wce[i].qp_num, p_wce[i].src_qp, vma_wc_flags(p_wce[i]), p_wce[i].pkey_index, p_wce[i].slid, p_wce[i].sl, p_wce[i].dlid_path_bits); + } + } + + // spoil the global sn if we have packets ready + union __attribute__((packed)) { + uint64_t global_sn; + struct { + uint32_t cq_id; + uint32_t cq_sn; + } bundle; + } next_sn; + next_sn.bundle.cq_sn = ++m_n_cq_poll_sn; + next_sn.bundle.cq_id = m_cq_id; + + *p_cq_poll_sn = m_n_global_sn = next_sn.global_sn; + + return ret; +} + +void cq_mgr::process_cq_element_log_helper(mem_buf_desc_t* p_mem_buf_desc, vma_ibv_wc* p_wce) +{ + BULLSEYE_EXCLUDE_BLOCK_START + // wce with bad status value + if (p_wce->status == IBV_WC_SUCCESS) { + cq_logdbg("wce: wr_id=%#x, status=%#x, vendor_err=%#x, qp_num=%#x", p_wce->wr_id, p_wce->status, p_wce->vendor_err, p_wce->qp_num); + if (m_b_is_rx_hw_csum_on && ! 
vma_wc_rx_hw_csum_ok(*p_wce)) + cq_logdbg("wce: bad rx_csum"); + cq_logdbg("wce: opcode=%#x, byte_len=%#d, src_qp=%#x, wc_flags=%#x", vma_wc_opcode(*p_wce), p_wce->byte_len, p_wce->src_qp, vma_wc_flags(*p_wce)); + cq_logdbg("wce: pkey_index=%#x, slid=%#x, sl=%#x, dlid_path_bits=%#x, imm_data=%#x", p_wce->pkey_index, p_wce->slid, p_wce->sl, p_wce->dlid_path_bits, p_wce->imm_data); + cq_logdbg("mem_buf_desc: lkey=%#x, p_buffer=%p, sz_buffer=%#x", p_mem_buf_desc->lkey, p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_buffer); + } else if (p_wce->status != IBV_WC_WR_FLUSH_ERR) { + cq_logwarn("wce: wr_id=%#x, status=%#x, vendor_err=%#x, qp_num=%#x", p_wce->wr_id, p_wce->status, p_wce->vendor_err, p_wce->qp_num); + cq_loginfo("wce: opcode=%#x, byte_len=%#d, src_qp=%#x, wc_flags=%#x", vma_wc_opcode(*p_wce), p_wce->byte_len, p_wce->src_qp, vma_wc_flags(*p_wce)); + cq_loginfo("wce: pkey_index=%#x, slid=%#x, sl=%#x, dlid_path_bits=%#x, imm_data=%#x", p_wce->pkey_index, p_wce->slid, p_wce->sl, p_wce->dlid_path_bits, p_wce->imm_data); + + if (p_mem_buf_desc) { + cq_logwarn("mem_buf_desc: lkey=%#x, p_buffer=%p, sz_buffer=%#x", p_mem_buf_desc->lkey, p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_buffer); + } + } + BULLSEYE_EXCLUDE_BLOCK_END + + cq_logfunc("wce error status '%s' [%d] (wr_id=%p, qp_num=%x)", priv_ibv_wc_status_str(p_wce->status), p_wce->status, p_wce->wr_id, p_wce->qp_num); +} + +mem_buf_desc_t* cq_mgr::process_cq_element_tx(vma_ibv_wc* p_wce) +{ + // Assume locked!!! + cq_logfuncall(""); + + // Get related mem_buf_desc pointer from the wr_id + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(uintptr_t)p_wce->wr_id; + + if (unlikely(p_wce->status != IBV_WC_SUCCESS)) { + process_cq_element_log_helper(p_mem_buf_desc, p_wce); + + if (p_mem_buf_desc == NULL) { + cq_logdbg("wce->wr_id = 0!!! 
When status != IBV_WC_SUCCESS"); + return NULL; + } + if (p_mem_buf_desc->p_desc_owner) { + m_p_ring->mem_buf_desc_completion_with_error_tx(p_mem_buf_desc); + } else { + // AlexR: can this wce have a valid mem_buf_desc pointer? + // AlexR: are we throwing away a data buffer and a mem_buf_desc element? + cq_logdbg("no desc_owner(wr_id=%p, qp_num=%x)", p_wce->wr_id, p_wce->qp_num); + } + + return NULL; + } + + if (p_mem_buf_desc == NULL) { + cq_logdbg("wce->wr_id = 0!!! When status == IBV_WC_SUCCESS"); + return NULL; + } + + return p_mem_buf_desc; +} + +mem_buf_desc_t* cq_mgr::process_cq_element_rx(vma_ibv_wc* p_wce) +{ + // Assume locked!!! + cq_logfuncall(""); + + // Get related mem_buf_desc pointer from the wr_id + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(uintptr_t)p_wce->wr_id; + + bool bad_wce = p_wce->status != IBV_WC_SUCCESS; + + if (unlikely(bad_wce || p_mem_buf_desc == NULL)) { + if (p_mem_buf_desc == NULL) { + m_p_next_rx_desc_poll = NULL; + cq_logdbg("wce->wr_id = 0!!! When status == IBV_WC_SUCCESS"); + return NULL; + } + + process_cq_element_log_helper(p_mem_buf_desc, p_wce); + + m_p_next_rx_desc_poll = NULL; + + if (p_mem_buf_desc == NULL) { + cq_logdbg("wce->wr_id = 0!!! When status != IBV_WC_SUCCESS"); + return NULL; + } + if (p_mem_buf_desc->p_desc_owner) { + m_p_ring->mem_buf_desc_completion_with_error_rx(p_mem_buf_desc); + return NULL; + } + // AlexR: can this wce have a valid mem_buf_desc pointer? + // AlexR: are we throwing away a data buffer and a mem_buf_desc element? 
+ cq_logdbg("no desc_owner(wr_id=%p, qp_num=%x)", p_wce->wr_id, p_wce->qp_num); + return NULL; + } + + if (m_n_sysvar_rx_prefetch_bytes_before_poll) { + /*for debug: + if (m_p_next_rx_desc_poll && m_p_next_rx_desc_poll != p_mem_buf_desc) { + cq_logerr("prefetched wrong buffer"); + }*/ + m_p_next_rx_desc_poll = p_mem_buf_desc->p_prev_desc; + p_mem_buf_desc->p_prev_desc = NULL; + } + + p_mem_buf_desc->rx.is_sw_csum_need = !(m_b_is_rx_hw_csum_on && vma_wc_rx_hw_csum_ok(*p_wce)); + + if (likely(vma_wc_opcode(*p_wce) & VMA_IBV_WC_RECV)) { + // Save recevied total bytes + p_mem_buf_desc->sz_data = p_wce->byte_len; + + //we use context to verify that on reclaim rx buffer path we return the buffer to the right CQ + p_mem_buf_desc->rx.is_vma_thr = false; + p_mem_buf_desc->rx.context = this; + p_mem_buf_desc->rx.socketxtreme_polled = false; + + //this is not a deadcode if timestamping is defined in verbs API + // coverity[dead_error_condition] + if (vma_wc_flags(*p_wce) & VMA_IBV_WC_WITH_TIMESTAMP) { + p_mem_buf_desc->rx.hw_raw_timestamp = vma_wc_timestamp(*p_wce); + } + + VALGRIND_MAKE_MEM_DEFINED(p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_data); + + prefetch_range((uint8_t*)p_mem_buf_desc->p_buffer + m_sz_transport_header, + std::min(p_mem_buf_desc->sz_data - m_sz_transport_header, (size_t)m_n_sysvar_rx_prefetch_bytes)); + //prefetch((uint8_t*)p_mem_buf_desc->p_buffer + m_sz_transport_header); + } + + return p_mem_buf_desc; +} + +bool cq_mgr::compensate_qp_poll_success(mem_buf_desc_t* buff_cur) +{ + // Assume locked!!! 
+ // Compensate QP for all completions that we found + if (m_rx_pool.size() || request_more_buffers()) { + size_t buffers = std::min(m_qp_rec.debt, m_rx_pool.size()); + m_qp_rec.qp->post_recv_buffers(&m_rx_pool, buffers); + m_qp_rec.debt -= buffers; + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + } + else if (m_b_sysvar_cq_keep_qp_full || + m_qp_rec.debt + MCE_MAX_CQ_POLL_BATCH > (int)m_qp_rec.qp->m_rx_num_wr) { + m_p_cq_stat->n_rx_pkt_drop++; + m_qp_rec.qp->post_recv_buffer(buff_cur); + --m_qp_rec.debt; + return true; + } + + return false; +} + +void cq_mgr::compensate_qp_poll_failed() +{ + // Assume locked!!! + // Compensate QP for all completions debt + if (m_qp_rec.debt) { + if (likely(m_rx_pool.size() || request_more_buffers())) { + size_t buffers = std::min(m_qp_rec.debt, m_rx_pool.size()); + m_qp_rec.qp->post_recv_buffers(&m_rx_pool, buffers); + m_qp_rec.debt -= buffers; + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + } + } +} + +void cq_mgr::reclaim_recv_buffer_helper(mem_buf_desc_t* buff) +{ + if (buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) { + if (likely(buff->p_desc_owner == m_p_ring)) { + mem_buf_desc_t* temp = NULL; + while (buff) { + VLIST_DEBUG_CQ_MGR_PRINT_ERROR_IS_MEMBER; + temp = buff; + buff = temp->p_next_desc; + temp->p_next_desc = NULL; + temp->p_prev_desc = NULL; + temp->reset_ref_count(); + temp->rx.tcp.gro = 0; + temp->rx.is_vma_thr = false; + temp->rx.socketxtreme_polled = false; + temp->rx.flow_tag_id = 0; + temp->rx.tcp.p_ip_h = NULL; + temp->rx.tcp.p_tcp_h = NULL; + temp->rx.timestamps.sw.tv_nsec = 0; + temp->rx.timestamps.sw.tv_sec = 0; + temp->rx.timestamps.hw.tv_nsec = 0; + temp->rx.timestamps.hw.tv_sec = 0; + temp->rx.hw_raw_timestamp = 0; + free_lwip_pbuf(&temp->lwip_pbuf); + m_rx_pool.push_back(temp); + } + m_p_cq_stat->n_buffer_pool_len = m_rx_pool.size(); + } + else { + cq_logfunc("Buffer returned to wrong CQ"); + g_buffer_pool_rx->put_buffers_thread_safe(buff); + } + } +} + +void 
cq_mgr::process_tx_buffer_list(mem_buf_desc_t* p_mem_buf_desc) +{ + // Assume locked!!! + BULLSEYE_EXCLUDE_BLOCK_START + if (p_mem_buf_desc && (p_mem_buf_desc->p_desc_owner == m_p_ring /*|| m_p_ring->get_parent()->is_member(p_mem_buf_desc->p_desc_owner)*/)) { + m_p_ring->mem_buf_desc_return_to_owner_tx(p_mem_buf_desc); + /* if decided to free buffers of another ring here, need to modify return_to_owner to check owner and return to gpool. */ + } + else if (p_mem_buf_desc && m_p_ring->get_parent()->is_member(p_mem_buf_desc->p_desc_owner)) { + cq_logerr("got buffer of wrong owner, high-availability event? buf=%p, owner=%p", p_mem_buf_desc, p_mem_buf_desc ? p_mem_buf_desc->p_desc_owner : NULL); + /* if decided to free buffers here, remember its a list and need to deref members. */ + //p_mem_buf_desc->p_desc_owner->mem_buf_desc_return_to_owner_tx(p_mem_buf_desc); /* this can cause a deadlock between rings, use trylock? */ + } else { + cq_logerr("got buffer of wrong owner, buf=%p, owner=%p", p_mem_buf_desc, p_mem_buf_desc ? p_mem_buf_desc->p_desc_owner : NULL); + } + BULLSEYE_EXCLUDE_BLOCK_END +} + +void cq_mgr::mem_buf_desc_completion_with_error(mem_buf_desc_t* p_mem_buf_desc) +{ + cq_logfuncall(""); + // lock(); Called from cq_mgr context which is already locked!! + reclaim_recv_buffer_helper(p_mem_buf_desc); + // unlock(); Called from cq_mgr context which is already locked!! +} + +void cq_mgr::mem_buf_desc_return_to_owner(mem_buf_desc_t* p_mem_buf_desc, void* pv_fd_ready_array /*=NULL*/) +{ + cq_logfuncall(""); + NOT_IN_USE(pv_fd_ready_array); + reclaim_recv_buffer_helper(p_mem_buf_desc); +} + +int cq_mgr::poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array) +{ + // Assume locked!!! 
+ cq_logfuncall(""); + + /* coverity[stack_use_local_overflow] */ + vma_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; + + int ret; + uint32_t ret_rx_processed = process_recv_queue(pv_fd_ready_array); + if (unlikely(ret_rx_processed >= m_n_sysvar_cq_poll_batch_max)) { + m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); + return ret_rx_processed; + } + + if (m_p_next_rx_desc_poll) { + prefetch_range((uint8_t*)m_p_next_rx_desc_poll->p_buffer, m_n_sysvar_rx_prefetch_bytes_before_poll); + } + + ret = poll(wce, m_n_sysvar_cq_poll_batch_max, p_cq_poll_sn); + if (ret > 0) { + m_n_wce_counter += ret; + if (ret < (int)m_n_sysvar_cq_poll_batch_max) + m_b_was_drained = true; + + for (int i = 0; i < ret; i++) { + mem_buf_desc_t *buff = process_cq_element_rx((&wce[i])); + if (buff) { + if (vma_wc_opcode(wce[i]) & VMA_IBV_WC_RECV) { + if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff)) { + process_recv_buffer(buff, pv_fd_ready_array); + } + } + } + } + ret_rx_processed += ret; + m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); + } else { + compensate_qp_poll_failed(); + } + + return ret_rx_processed; +} + +int cq_mgr::poll_and_process_element_tx(uint64_t* p_cq_poll_sn) +{ + // Assume locked!!! 
+ cq_logfuncall(""); + + /* coverity[stack_use_local_overflow] */ + vma_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; + int ret = poll(wce, m_n_sysvar_cq_poll_batch_max, p_cq_poll_sn); + if (ret > 0) { + m_n_wce_counter += ret; + if (ret < (int)m_n_sysvar_cq_poll_batch_max) + m_b_was_drained = true; + + for (int i = 0; i < ret; i++) { + mem_buf_desc_t *buff = process_cq_element_tx((&wce[i])); + if (buff) { + process_tx_buffer_list(buff); + } + } + } + + return ret; +} + +int cq_mgr::poll_and_process_element_rx(mem_buf_desc_t **p_desc_lst) +{ + NOT_IN_USE(p_desc_lst); + cq_logerr("SocketXtreme mode is supported by mlx5 cq manager only"); + return 0; +} + +bool cq_mgr::reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst) +{ + if (m_rx_buffs_rdy_for_free_head) { + reclaim_recv_buffer_helper(m_rx_buffs_rdy_for_free_head); + m_rx_buffs_rdy_for_free_head = m_rx_buffs_rdy_for_free_tail = NULL; + } + reclaim_recv_buffer_helper(rx_reuse_lst); + return_extra_buffers(); + + return true; +} + +bool cq_mgr::reclaim_recv_buffers_no_lock(mem_buf_desc_t *rx_reuse_lst) +{ + if (likely(rx_reuse_lst)) { + reclaim_recv_buffer_helper(rx_reuse_lst); + return true; + } + return false; +} + +int cq_mgr::reclaim_recv_single_buffer(mem_buf_desc_t* rx_reuse) +{ + int ret_val = 0; + + ret_val = rx_reuse->lwip_pbuf_dec_ref_count(); + if ((ret_val == 0) && (rx_reuse->get_ref_count() <= 0)) { + /*if ((safe_mce_sys().thread_mode > THREAD_MODE_SINGLE)) { + m_lock_ring_rx.lock(); + }*/ + if (!m_rx_buffs_rdy_for_free_head) { + m_rx_buffs_rdy_for_free_head = m_rx_buffs_rdy_for_free_tail = rx_reuse; + } + else { + m_rx_buffs_rdy_for_free_tail->p_next_desc = rx_reuse; + m_rx_buffs_rdy_for_free_tail = rx_reuse; + } + m_rx_buffs_rdy_for_free_tail->p_next_desc = NULL; + /*if ((safe_mce_sys().thread_mode > THREAD_MODE_SINGLE)) { + m_lock_ring_rx.lock(); + }*/ + } + return ret_val; +} + +bool cq_mgr::reclaim_recv_buffers(descq_t *rx_reuse) +{ + cq_logfuncall(""); + // Called from outside cq_mgr context which is not 
locked!! + while (!rx_reuse->empty()) { + mem_buf_desc_t* buff = rx_reuse->get_and_pop_front(); + reclaim_recv_buffer_helper(buff); + } + return_extra_buffers(); + + return true; +} + +// +// @OUT: p_recycle_buffers_last_wr_id Returns the final WR_ID handled. When set, this indicates this is a CQE drain flow. +// @OUT: returns total number of processes CQE's +// + + +int cq_mgr::drain_and_proccess(uintptr_t* p_recycle_buffers_last_wr_id /*=NULL*/) +{ + cq_logfuncall("cq was %s drained. %d processed wce since last check. %d wce in m_rx_queue", (m_b_was_drained?"":"not "), m_n_wce_counter, m_rx_queue.size()); + + // CQ polling loop until max wce limit is reached for this interval or CQ is drained + uint32_t ret_total = 0; + uint64_t cq_poll_sn = 0; + + if (p_recycle_buffers_last_wr_id != NULL) { + m_b_was_drained = false; + } + + while ((m_n_sysvar_progress_engine_wce_max > m_n_wce_counter) && (!m_b_was_drained)) { + + /* coverity[stack_use_local_overflow] */ + vma_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; + int ret = poll(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn); + if (ret <= 0) { + m_b_was_drained = true; + m_p_ring->m_gro_mgr.flush_all(NULL); + return ret_total; + } + + m_n_wce_counter += ret; + if (ret < MCE_MAX_CQ_POLL_BATCH) + m_b_was_drained = true; + + for (int i = 0; i < ret; i++) { + mem_buf_desc_t* buff = process_cq_element_rx(&wce[i]); + if (buff) { + if (p_recycle_buffers_last_wr_id) { + m_p_cq_stat->n_rx_pkt_drop++; + reclaim_recv_buffer_helper(buff); + } else { + bool procces_now = false; + if (m_transport_type == VMA_TRANSPORT_ETH) { + procces_now = is_eth_tcp_frame(buff); + } + if (m_transport_type == VMA_TRANSPORT_IB) { + procces_now = is_ib_tcp_frame(buff); + } + // We process immediately all non udp/ip traffic.. 
+ if (procces_now) { + buff->rx.is_vma_thr = true; + if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff)) { + process_recv_buffer(buff, NULL); + } + } + else { //udp/ip traffic we just put in the cq's rx queue + m_rx_queue.push_back(buff); + mem_buf_desc_t* buff_cur = m_rx_queue.get_and_pop_front(); + if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff_cur)) { + m_rx_queue.push_front(buff_cur); + } + } + } + } + if (p_recycle_buffers_last_wr_id) { + *p_recycle_buffers_last_wr_id = (uintptr_t)wce[i].wr_id; + } + } + ret_total += ret; + } + m_p_ring->m_gro_mgr.flush_all(NULL); + + m_n_wce_counter = 0; + m_b_was_drained = false; + + // Update cq statistics + m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); + m_p_cq_stat->n_rx_drained_at_once_max = std::max(ret_total, m_p_cq_stat->n_rx_drained_at_once_max); + + return ret_total; +} + +// 1 -> busy +// 0 -> ok +// -1 -> error +int cq_mgr::ack_and_request_notification() +{ + int res, cq_ev_count = 0; + ibv_cq* ib_cq; + void *cq_context; + do { + res = ibv_get_cq_event(m_comp_event_channel, &ib_cq, &cq_context); + if (res == 0) { + ++cq_ev_count; + } + } while (res == 0); + if (errno != EAGAIN) { + return -1; + } + if (cq_ev_count > 0) { + get_cq_event(cq_ev_count); + ibv_ack_cq_events(m_p_ibv_cq, cq_ev_count); + return 1; + } + IF_VERBS_FAILURE(req_notify_cq()) { + cq_logerr("Failure arming the qp_mgr notification channel (errno=%d %m)", errno); + return -1; + } + ENDIF_VERBS_FAILURE + return 0; +} + +int cq_mgr::request_notification(uint64_t poll_sn) +{ + int ret = -1; + + cq_logfuncall(""); + + if ((m_n_global_sn > 0 && poll_sn != m_n_global_sn)) { + // The cq_mgr's has receive packets pending processing (or got processed since cq_poll_sn) + cq_logfunc("miss matched poll sn (user=0x%lx, cq=0x%lx)", poll_sn, m_n_cq_poll_sn); + return 1; + } + + if (m_b_notification_armed == false) { + + cq_logfunc("arming cq_mgr 
notification channel"); + + // Arm the CQ notification channel + IF_VERBS_FAILURE(req_notify_cq()) { + cq_logerr("Failure arming the qp_mgr notification channel (errno=%d %m)", errno); + } + else { + ret = 0; + m_b_notification_armed = true; + + } ENDIF_VERBS_FAILURE; + } + else { + // cq_mgr notification channel already armed + ret = 0; + } + + cq_logfuncall("returning with %d", ret); + return ret; +} + +int cq_mgr::wait_for_notification_and_process_element(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array) +{ + int ret = -1; + + cq_logfunc(""); + + if (m_b_notification_armed) { + cq_mgr* p_cq_mgr_context = NULL; + struct ibv_cq* p_cq_hndl = NULL; + void *p; // deal with compiler warnings + + // Block on the cq_mgr's notification event channel + IF_VERBS_FAILURE(ibv_get_cq_event(m_comp_event_channel, &p_cq_hndl, &p)) { + cq_logfunc("waiting on cq_mgr event returned with error (errno=%d %m)", errno); + } + else { + get_cq_event(); + p_cq_mgr_context = (cq_mgr*)p; + if (p_cq_mgr_context != this) { + cq_logerr("mismatch with cq_mgr returned from new event (event->cq_mgr->%p)", p_cq_mgr_context); + // this can be if we are using a single channel for several/all cq_mgrs + // in this case we need to deliver the event to the correct cq_mgr + } + + // Ack event + ibv_ack_cq_events(m_p_ibv_cq, 1); + + // Clear flag + m_b_notification_armed = false; + + // Now try processing the ready element + if (m_b_is_rx) { + ret = poll_and_process_element_rx(p_cq_poll_sn, pv_fd_ready_array); + } else { + ret = poll_and_process_element_tx(p_cq_poll_sn); + } + } ENDIF_VERBS_FAILURE; + } + else { + cq_logfunc("notification channel is not armed"); + errno = EAGAIN; + } + + return ret; +} + +cq_mgr* get_cq_mgr_from_cq_event(struct ibv_comp_channel* p_cq_channel) +{ + cq_mgr* p_cq_mgr = NULL; + struct ibv_cq* p_cq_hndl = NULL; + void *p_context; // deal with compiler warnings + + // read & ack the CQ event + IF_VERBS_FAILURE(ibv_get_cq_event(p_cq_channel, &p_cq_hndl, &p_context)) { + 
vlog_printf(VLOG_INFO, MODULE_NAME ":%d: waiting on cq_mgr event returned with error (errno=%d %m)\n", __LINE__, errno); + } + else { + p_cq_mgr = (cq_mgr*)p_context; // Save the cq_mgr + p_cq_mgr->get_cq_event(); + ibv_ack_cq_events(p_cq_hndl, 1); // Ack the ibv event + } ENDIF_VERBS_FAILURE; + + return p_cq_mgr; +} diff --git a/src/vma/dev/cq_mgr.h b/src/vma/dev/cq_mgr.h new file mode 100644 index 0000000..c3be28b --- /dev/null +++ b/src/vma/dev/cq_mgr.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef CQ_MGR_H +#define CQ_MGR_H + +#include "vma/ib/base/verbs_extra.h" +#include "utils/atomic.h" +#include "vma/dev/qp_mgr.h" +#include "vma/dev/ib_ctx_handler.h" +#include "vma/util/sys_vars.h" +#include "vma/util/hash_map.h" +#include "vma/util/vma_stats.h" +#include "vma/proto/mem_buf_desc.h" +#include "vma/proto/vma_lwip.h" +#include "vma/vma_extra.h" + + +class net_device_mgr; +class ring; +class qp_mgr; +class ring_simple; + +#define LOCAL_IF_INFO_INVALID (local_if_info_t){0,0} + +struct cq_request_info_t { + struct ibv_device* p_ibv_device; + struct ibv_context* p_ibv_context; + int n_port; + qp_mgr* p_qp_mgr; +}; + +struct buff_lst_info_t { + mem_buf_desc_t* buff_lst; + uint32_t n_buff_num; +}; + +typedef std::pair local_if_info_key_t; + +typedef struct local_if_info_t { + in_addr_t addr; + uint32_t attached_grp_ref_cnt; +} local_if_info_t; + +struct qp_rec { + qp_mgr *qp; + int debt; +}; + +// Class cq_mgr +// +class cq_mgr +{ + friend class ring; // need to expose the m_n_global_sn only to ring + friend class ring_simple; // need to expose the m_n_global_sn only to ring + friend class ring_bond; // need to expose the m_n_global_sn only to ring + +public: + cq_mgr(ring_simple *p_ring, ib_ctx_handler *p_ib_ctx_handler, + int cq_size, struct ibv_comp_channel *p_comp_event_channel, + bool is_rx, bool config=true); + virtual ~cq_mgr(); + + void configure(int cq_size); + + ibv_cq *get_ibv_cq_hndl(); + int get_channel_fd(); + // ack events and rearm CQ + int ack_and_request_notification(); + /** + * Arm the managed CQ's notification channel + * Calling this more then once without get_event() will return without + * doing anything (arm flag is changed to true on first call). 
+ * This call will also check if a wce was processes between the + * last poll and this arm request - if true it will not arm the CQ + * @return ==0 cq is armed + * ==1 cq not armed (cq poll_sn out of sync) + * < 0 on error + */ + int request_notification(uint64_t poll_sn); + + /** + * Block on the CQ's notification channel for the next event and process + * it before exiting. + * + * @return >=0 number of processed wce + * < 0 error or if channel not armed or channel would block + * (on non-blocked channel) (some other thread beat you to it) + */ + int wait_for_notification_and_process_element(uint64_t* p_cq_poll_sn, + void* pv_fd_ready_array = NULL); + + /** + * This will poll n_num_poll time on the cq or stop early if it gets + * a wce (work completion element). If a wce was found 'processing' will + * occur. + * @return >=0 number of wce processed + * < 0 error + */ + virtual int poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL); + virtual int poll_and_process_element_tx(uint64_t* p_cq_poll_sn); + virtual int poll_and_process_element_rx(mem_buf_desc_t **p_desc_lst); + + /** + * This will check if the cq was drained, and if it wasn't it will drain it. + * @param restart - In case of restart - don't process any buffer + * @return >=0 number of wce processed + * < 0 error + */ + virtual int drain_and_proccess(uintptr_t* p_recycle_buffers_last_wr_id = NULL); + + // CQ implements the Rx mem_buf_desc_owner. 
+ // These callbacks will be called for each Rx buffer that passed processed completion + // Rx completion handling at the cq_mgr level is forwarding the packet to the ib_comm_mgr layer + void mem_buf_desc_completion_with_error(mem_buf_desc_t* p_rx_wc_buf_desc); + void mem_buf_desc_return_to_owner(mem_buf_desc_t* p_mem_buf_desc, void* pv_fd_ready_array = NULL); + + virtual void add_qp_rx(qp_mgr* qp); + virtual void del_qp_rx(qp_mgr *qp); + virtual uint32_t clean_cq(); + + virtual void add_qp_tx(qp_mgr* qp); + + bool reclaim_recv_buffers(descq_t *rx_reuse); + bool reclaim_recv_buffers(mem_buf_desc_t *rx_reuse_lst); + bool reclaim_recv_buffers_no_lock(mem_buf_desc_t *rx_reuse_lst); + int reclaim_recv_single_buffer(mem_buf_desc_t* rx_reuse); + + //maps between qpn and vlan id to the local interface + void map_vlan_and_qpn_to_local_if(int qp_num, uint16_t vlan_id, in_addr_t local_if); + + //unmaps the qpn and vlan id + void unmap_vlan_and_qpn(int qp_num, uint16_t vlan_id); + + virtual bool fill_cq_hw_descriptors(struct hw_cq_data &data) {NOT_IN_USE(data);return false;}; + virtual void get_cq_event(int count = 1) {NOT_IN_USE(count);}; + +protected: + + /** + * Poll the CQ that is managed by this object + * @p_wce pointer to array where to save the wce in + * @num_entries Size of the p_wce (max number of wce to poll at once) + * @p_cq_poll_sn global unique wce id that maps last wce polled + * @return Number of successfully polled wce + */ + virtual int poll(vma_ibv_wc* p_wce, int num_entries, uint64_t* p_cq_poll_sn); + void compensate_qp_poll_failed(); + inline void process_recv_buffer(mem_buf_desc_t* buff, void* pv_fd_ready_array = NULL); + + /* Process a WCE... meaning... 
+ * - extract the mem_buf_desc from the wce.wr_id and then loop on all linked mem_buf_desc + * and deliver them to their owner for further processing (sockinfo on Tx path and ib_conn_mgr on Rx path) + * - for Tx wce the data buffers will be released to the associated ring before the mem_buf_desc are returned + */ + mem_buf_desc_t* process_cq_element_tx(vma_ibv_wc* p_wce); + mem_buf_desc_t* process_cq_element_rx(vma_ibv_wc* p_wce); + void reclaim_recv_buffer_helper(mem_buf_desc_t* buff); + + // Returns true if the given buffer was used, + //false if the given buffer was not used. + bool compensate_qp_poll_success(mem_buf_desc_t* buff); + inline uint32_t process_recv_queue(void* pv_fd_ready_array = NULL); + + virtual void prep_ibv_cq(vma_ibv_cq_init_attr &attr) const; + //returns list of buffers to the owner. + void process_tx_buffer_list(mem_buf_desc_t* p_mem_buf_desc); + + struct ibv_cq* m_p_ibv_cq; + bool m_b_is_rx; + descq_t m_rx_queue; + static uint64_t m_n_global_sn; + uint32_t m_cq_id; + uint32_t m_n_cq_poll_sn; + ring_simple* m_p_ring; + uint32_t m_n_wce_counter; + bool m_b_was_drained; + bool m_b_is_rx_hw_csum_on; + qp_rec m_qp_rec; + const uint32_t m_n_sysvar_cq_poll_batch_max; + const uint32_t m_n_sysvar_progress_engine_wce_max; + cq_stats_t* m_p_cq_stat; + transport_type_t m_transport_type; + mem_buf_desc_t* m_p_next_rx_desc_poll; + const uint32_t m_n_sysvar_rx_prefetch_bytes_before_poll; + const uint32_t m_n_sysvar_rx_prefetch_bytes; + size_t m_sz_transport_header; + ib_ctx_handler* m_p_ib_ctx_handler; + const uint32_t m_n_sysvar_rx_num_wr_to_post_recv; +private: + struct ibv_comp_channel *m_comp_event_channel; + bool m_b_notification_armed; + const uint32_t m_n_sysvar_qp_compensation_level; + const uint32_t m_rx_lkey; + const bool m_b_sysvar_cq_keep_qp_full; + descq_t m_rx_pool; + int32_t m_n_out_of_free_bufs_warning; + cq_stats_t m_cq_stat_static; + static atomic_t m_n_cq_id_counter; + + /* This fields are needed to track internal memory buffers + * 
represented as struct vma_buff_t + * from user application by special VMA extended API + */ + mem_buf_desc_t* m_rx_buffs_rdy_for_free_head; + mem_buf_desc_t* m_rx_buffs_rdy_for_free_tail; + + void handle_tcp_ctl_packets(uint32_t rx_processed, void* pv_fd_ready_array); + + // requests safe_mce_sys().qp_compensation_level buffers from global pool + bool request_more_buffers() __attribute__((noinline)); + + // returns safe_mce_sys().qp_compensation_level buffers to global pool + void return_extra_buffers() __attribute__((noinline)); + + void statistics_print(); + + //Finds and sets the local if to which the buff is addressed (according to qpn and vlan id). + inline void find_buff_dest_local_if(mem_buf_desc_t * buff); + + //Finds and sets the vma if to which the buff is addressed (according to qpn). + inline void find_buff_dest_vma_if_ctx(mem_buf_desc_t * buff); + + void process_cq_element_log_helper(mem_buf_desc_t* p_mem_buf_desc, vma_ibv_wc* p_wce); + + virtual int req_notify_cq() { + return ibv_req_notify_cq(m_p_ibv_cq, 0); + }; +}; + +// Helper gunction to extract the Tx cq_mgr from the CQ event, +// Since we have a single TX CQ comp channel for all cq_mgr's, it might not be the active_cq object +cq_mgr* get_cq_mgr_from_cq_event(struct ibv_comp_channel* p_cq_channel); + +#endif //CQ_MGR_H diff --git a/src/vma/dev/cq_mgr.inl b/src/vma/dev/cq_mgr.inl new file mode 100644 index 0000000..68f8875 --- /dev/null +++ b/src/vma/dev/cq_mgr.inl @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef CQ_MGR_INL_H +#define CQ_MGR_INL_H + +#include "cq_mgr.h" +#include "ring_simple.h" + +/**/ +/** inlining functions can only help if they are implemented before their usage **/ +/**/ + +inline void cq_mgr::process_recv_buffer(mem_buf_desc_t* p_mem_buf_desc, void* pv_fd_ready_array) +{ + // Assume locked!!! + + // Pass the Rx buffer ib_comm_mgr for further IP processing + if (!m_p_ring->rx_process_buffer(p_mem_buf_desc, pv_fd_ready_array)) { + // If buffer is dropped by callback - return to RX pool + reclaim_recv_buffer_helper(p_mem_buf_desc); + } +} + +inline uint32_t cq_mgr::process_recv_queue(void* pv_fd_ready_array) +{ + // Assume locked!!! 
+ // If we have packets in the queue, dequeue one and process it + // until reaching cq_poll_batch_max or empty queue + uint32_t processed = 0; + + while (!m_rx_queue.empty()) { + mem_buf_desc_t* buff = m_rx_queue.get_and_pop_front(); + process_recv_buffer(buff, pv_fd_ready_array); + if (++processed >= m_n_sysvar_cq_poll_batch_max) + break; + } + m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); + return processed; +} + +inline bool is_eth_tcp_frame(mem_buf_desc_t* buff) +{ + struct ethhdr* p_eth_h = (struct ethhdr*)(buff->p_buffer); + uint16_t h_proto = p_eth_h->h_proto; + + size_t transport_header_len = ETH_HDR_LEN; + struct vlanhdr* p_vlan_hdr = NULL; + if (h_proto == htons(ETH_P_8021Q)) { + p_vlan_hdr = (struct vlanhdr*)((uint8_t*)p_eth_h + transport_header_len); + transport_header_len = ETH_VLAN_HDR_LEN; + h_proto = p_vlan_hdr->h_vlan_encapsulated_proto; + } + struct iphdr *p_ip_h = (struct iphdr*)(buff->p_buffer + transport_header_len); + if (likely(h_proto == htons(ETH_P_IP)) && (p_ip_h->protocol == IPPROTO_TCP)) { + return true; + } + return false; +} + +inline bool is_ib_tcp_frame(mem_buf_desc_t* buff) +{ + struct ipoibhdr* p_ipoib_h = (struct ipoibhdr*)(buff->p_buffer + GRH_HDR_LEN); + + // Validate IPoIB header + if (unlikely(p_ipoib_h->ipoib_header != htonl(IPOIB_HEADER))) { + return false; + } + + size_t transport_header_len = GRH_HDR_LEN + IPOIB_HDR_LEN; + + struct iphdr * p_ip_h = (struct iphdr*)(buff->p_buffer + transport_header_len); + if (likely(p_ip_h->protocol == IPPROTO_TCP)) { + return true; + } + return false; +} + +#endif//CQ_MGR_INL_H diff --git a/src/vma/dev/cq_mgr_mlx5.cpp b/src/vma/dev/cq_mgr_mlx5.cpp new file mode 100644 index 0000000..246c5b7 --- /dev/null +++ b/src/vma/dev/cq_mgr_mlx5.cpp @@ -0,0 +1,852 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "cq_mgr_mlx5.h" + +#if defined(DEFINED_DIRECT_VERBS) + +#include +#include "cq_mgr.inl" +#include "cq_mgr_mlx5.inl" +#include "qp_mgr.h" +#include "qp_mgr_eth_mlx5.h" +#include "ring_simple.h" + +#define MODULE_NAME "cqm_mlx5" + +#define cq_logfunc __log_info_func +#define cq_logdbg __log_info_dbg +#define cq_logerr __log_info_err +#define cq_logpanic __log_info_panic +#define cq_logfuncall __log_info_funcall + + +cq_mgr_mlx5::cq_mgr_mlx5(ring_simple* p_ring, ib_ctx_handler* p_ib_ctx_handler, + uint32_t cq_size, struct ibv_comp_channel* p_comp_event_channel, + bool is_rx, bool call_configure): + cq_mgr(p_ring, p_ib_ctx_handler, cq_size, p_comp_event_channel, is_rx, call_configure) + ,m_qp(NULL) + ,m_b_sysvar_enable_socketxtreme(safe_mce_sys().enable_socketxtreme) + ,m_rx_hot_buffer(NULL) +{ + cq_logfunc(""); + + memset(&m_mlx5_cq, 0, sizeof(m_mlx5_cq)); +} + +uint32_t cq_mgr_mlx5::clean_cq() +{ + uint32_t ret_total = 0; + uint64_t cq_poll_sn = 0; + mem_buf_desc_t* buff; + + if (m_b_is_rx) { + /* Sanity check for cq: initialization of tx and rx cq has difference: + * tx - is done in qp_mgr::configure() + * rx - is done in qp_mgr::up() + * as a result rx cq can be created but not initialized + */ + if (NULL == m_qp) return 0; + + buff_status_e status = BS_OK; + while((buff = poll(status))) { + if (process_cq_element_rx( buff, status)) { + m_rx_queue.push_back(buff); + } + ++ret_total; + } + update_global_sn(cq_poll_sn, ret_total); + } else {//Tx + int ret = 0; + /* coverity[stack_use_local_overflow] */ + vma_ibv_wc wce[MCE_MAX_CQ_POLL_BATCH]; + while ((ret = cq_mgr::poll(wce, MCE_MAX_CQ_POLL_BATCH, &cq_poll_sn)) > 0) { + for (int i = 0; i < ret; i++) { + buff = process_cq_element_tx(&wce[i]); + if (buff) + m_rx_queue.push_back(buff); + } + ret_total += ret; + } + } + + return ret_total; +} + +cq_mgr_mlx5::~cq_mgr_mlx5() +{ + cq_logfunc(""); + cq_logdbg("destroying CQ as %s", (m_b_is_rx?"Rx":"Tx")); +} + +mem_buf_desc_t* cq_mgr_mlx5::poll(enum 
buff_status_e& status) +{ + mem_buf_desc_t *buff = NULL; + +#ifdef RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL + RDTSC_TAKE_END(RDTSC_FLOW_RX_VMA_TCP_IDLE_POLL); +#endif //RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL + +#if defined(RDTSC_MEASURE_RX_VERBS_READY_POLL) || defined(RDTSC_MEASURE_RX_VERBS_IDLE_POLL) + RDTSC_TAKE_START_RX_VERBS_POLL(RDTSC_FLOW_RX_VERBS_READY_POLL, RDTSC_FLOW_RX_VERBS_IDLE_POLL); +#endif //RDTSC_MEASURE_RX_VERBS_READY_POLL || RDTSC_MEASURE_RX_VERBS_IDLE_POLL + + if (unlikely(NULL == m_rx_hot_buffer)) { + if (likely(m_qp->m_mlx5_qp.rq.tail != (m_qp->m_mlx5_qp.rq.head))) { + uint32_t index = m_qp->m_mlx5_qp.rq.tail & (m_qp_rec.qp->m_rx_num_wr - 1); + m_rx_hot_buffer = (mem_buf_desc_t *)m_qp->m_rq_wqe_idx_to_wrid[index]; + m_qp->m_rq_wqe_idx_to_wrid[index] = 0; + prefetch((void*)m_rx_hot_buffer); + prefetch((uint8_t*)m_mlx5_cq.cq_buf + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); + } else { +#ifdef RDTSC_MEASURE_RX_VERBS_IDLE_POLL + RDTSC_TAKE_END(RDTSC_FLOW_RX_VERBS_IDLE_POLL); +#endif + +#if defined(RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL) || defined(RDTSC_MEASURE_RX_CQE_RECEIVEFROM) + RDTSC_TAKE_START_VMA_IDLE_POLL_CQE_TO_RECVFROM(RDTSC_FLOW_RX_VMA_TCP_IDLE_POLL, + RDTSC_FLOW_RX_CQE_TO_RECEIVEFROM); +#endif //RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL || RDTSC_MEASURE_RX_CQE_RECEIVEFROM + /* If rq_tail and rq_head are pointing to the same wqe, + * the wq is empty and there is no cqe to be received */ + return NULL; + } + } + mlx5_cqe64 *cqe = check_cqe(); + if (likely(cqe)) { + /* Update the consumer index */ + ++m_mlx5_cq.cq_ci; + rmb(); + cqe64_to_mem_buff_desc(cqe, m_rx_hot_buffer, status); + + ++m_qp->m_mlx5_qp.rq.tail; + *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci & 0xffffff); + + buff = m_rx_hot_buffer; + m_rx_hot_buffer = NULL; + +#ifdef RDTSC_MEASURE_RX_VERBS_READY_POLL + RDTSC_TAKE_END(RDTSC_FLOW_RX_VERBS_READY_POLL); +#endif //RDTSC_MEASURE_RX_VERBS_READY_POLL + +#ifdef RDTSC_MEASURE_RX_READY_POLL_TO_LWIP + 
RDTSC_TAKE_START(RDTSC_FLOW_RX_READY_POLL_TO_LWIP); +#endif + } else { +#ifdef RDTSC_MEASURE_RX_VERBS_IDLE_POLL + RDTSC_TAKE_END(RDTSC_FLOW_RX_VERBS_IDLE_POLL); +#endif + +#if defined(RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL) || defined(RDTSC_MEASURE_RX_CQE_RECEIVEFROM) + RDTSC_TAKE_START_VMA_IDLE_POLL_CQE_TO_RECVFROM(RDTSC_FLOW_RX_VMA_TCP_IDLE_POLL, + RDTSC_FLOW_RX_CQE_TO_RECEIVEFROM); +#endif //RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL || RDTSC_MEASURE_RX_CQE_RECEIVEFROM + + prefetch((void*)m_rx_hot_buffer); + } + + prefetch((uint8_t*)m_mlx5_cq.cq_buf + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); + + return buff; +} + +inline void cq_mgr_mlx5::cqe64_to_mem_buff_desc(struct mlx5_cqe64 *cqe, mem_buf_desc_t* p_rx_wc_buf_desc, enum buff_status_e &status) +{ + struct mlx5_err_cqe *ecqe; + ecqe = (struct mlx5_err_cqe *)cqe; + + switch (MLX5_CQE_OPCODE(cqe->op_own)) { + case MLX5_CQE_RESP_WR_IMM: + cq_logerr("IBV_WC_RECV_RDMA_WITH_IMM is not supported"); + status = BS_CQE_RESP_WR_IMM_NOT_SUPPORTED; + break; + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + { + status = BS_OK; + p_rx_wc_buf_desc->sz_data = ntohl(cqe->byte_cnt); + p_rx_wc_buf_desc->rx.hw_raw_timestamp = ntohll(cqe->timestamp); + p_rx_wc_buf_desc->rx.flow_tag_id = vma_get_flow_tag(cqe); + p_rx_wc_buf_desc->rx.is_sw_csum_need = !(m_b_is_rx_hw_csum_on && + (cqe->hds_ip_ext & MLX5_CQE_L4_OK) && (cqe->hds_ip_ext & MLX5_CQE_L3_OK)); + return; + } + case MLX5_CQE_INVALID: /* No cqe!*/ + { + cq_logerr("We should no receive a buffer without a cqe\n"); + status = BS_CQE_INVALID; + break; + } + case MLX5_CQE_REQ: + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + default: + { + if (MLX5_CQE_SYNDROME_WR_FLUSH_ERR == ecqe->syndrome) { + status = BS_IBV_WC_WR_FLUSH_ERR; + } else { + status = BS_GENERAL_ERR; + } + /* + IB compliant completion with error syndrome: + 0x1: Local_Length_Error + 0x2: Local_QP_Operation_Error + 0x4: Local_Protection_Error + 0x5: 
Work_Request_Flushed_Error + 0x6: Memory_Window_Bind_Error + 0x10: Bad_Response_Error + 0x11: Local_Access_Error + 0x12: Remote_Invalid_Request_Error + 0x13: Remote_Access_Error + 0x14: Remote_Operation_Error + 0x15: Transport_Retry_Counter_Exceeded + 0x16: RNR_Retry_Counter_Exceeded + 0x22: Aborted_Error + other: Reserved + */ + break; + } + } +} + +int cq_mgr_mlx5::drain_and_proccess(uintptr_t* p_recycle_buffers_last_wr_id /*=NULL*/) +{ + cq_logfuncall("cq was %s drained. %d processed wce since last check. %d wce in m_rx_queue", (m_b_was_drained?"":"not "), m_n_wce_counter, m_rx_queue.size()); + + /* CQ polling loop until max wce limit is reached for this interval or CQ is drained */ + uint32_t ret_total = 0; + uint64_t cq_poll_sn = 0; + + if (p_recycle_buffers_last_wr_id != NULL) { + m_b_was_drained = false; + } + + if (m_b_sysvar_enable_socketxtreme) { + while ((m_n_sysvar_progress_engine_wce_max && (m_n_sysvar_progress_engine_wce_max > m_n_wce_counter)) && + !m_b_was_drained) { + int ret = 0; + mlx5_cqe64 *cqe_arr[MCE_MAX_CQ_POLL_BATCH]; + + for (int i = 0; i < MCE_MAX_CQ_POLL_BATCH; ++i) + { + cqe_arr[i] = get_cqe64(); + if (cqe_arr[i]) { + ++ret; + wmb(); + *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci); + if (m_b_is_rx) { + ++m_qp->m_mlx5_qp.rq.tail; + } + } + else { + break; + } + } + + if (!ret) { + m_b_was_drained = true; + return ret_total; + } + + + m_n_wce_counter += ret; + if (ret < MCE_MAX_CQ_POLL_BATCH) + m_b_was_drained = true; + + for (int i = 0; i < ret; i++) { + uint32_t wqe_sz = 0; + mlx5_cqe64 *cqe = cqe_arr[i]; + vma_ibv_wc wce; + + uint16_t wqe_ctr = ntohs(cqe->wqe_counter); + if (m_b_is_rx) { + wqe_sz = m_qp->m_rx_num_wr; + } + else { + wqe_sz = m_qp->m_tx_num_wr; + } + + int index = wqe_ctr & (wqe_sz - 1); + + /* We need to processes rx data in case + * wce.status == IBV_WC_SUCCESS + * and release buffers to rx pool + * in case failure + */ + m_rx_hot_buffer = (mem_buf_desc_t*)(uintptr_t)m_qp->m_rq_wqe_idx_to_wrid[index]; + memset(&wce, 0, 
sizeof(wce)); + wce.wr_id = (uintptr_t)m_rx_hot_buffer; + cqe64_to_vma_wc(cqe, &wce); + + m_rx_hot_buffer = cq_mgr::process_cq_element_rx(&wce); + if (m_rx_hot_buffer) { + if (p_recycle_buffers_last_wr_id) { + m_p_cq_stat->n_rx_pkt_drop++; + reclaim_recv_buffer_helper(m_rx_hot_buffer); + } else { + bool procces_now = false; + if (m_transport_type == VMA_TRANSPORT_ETH) { + procces_now = is_eth_tcp_frame(m_rx_hot_buffer); + } + if (m_transport_type == VMA_TRANSPORT_IB) { + procces_now = is_ib_tcp_frame(m_rx_hot_buffer); + } + // We process immediately all non udp/ip traffic.. + if (procces_now) { + m_rx_hot_buffer->rx.is_vma_thr = true; + if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(m_rx_hot_buffer)) { + process_recv_buffer(m_rx_hot_buffer, NULL); + } + } + else { //udp/ip traffic we just put in the cq's rx queue + m_rx_queue.push_back(m_rx_hot_buffer); + mem_buf_desc_t* buff_cur = m_rx_queue.get_and_pop_front(); + if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff_cur)) { + m_rx_queue.push_front(buff_cur); + } + } + } + } + if (p_recycle_buffers_last_wr_id) { + *p_recycle_buffers_last_wr_id = (uintptr_t)wce.wr_id; + } + } + ret_total += ret; + } + } else { + if (p_recycle_buffers_last_wr_id != NULL) { + m_b_was_drained = false; + } + + while ((m_n_sysvar_progress_engine_wce_max > m_n_wce_counter) && + !m_b_was_drained) { + buff_status_e status = BS_OK; + mem_buf_desc_t* buff = poll(status); + if (NULL == buff) { + update_global_sn(cq_poll_sn, ret_total); + m_b_was_drained = true; + m_p_ring->m_gro_mgr.flush_all(NULL); + return ret_total; + } + + ++m_n_wce_counter; + + if (process_cq_element_rx(buff, status)) { + if (p_recycle_buffers_last_wr_id) { + m_p_cq_stat->n_rx_pkt_drop++; + reclaim_recv_buffer_helper(buff); + } else { + bool procces_now = false; + if (m_transport_type == VMA_TRANSPORT_ETH) { + procces_now = is_eth_tcp_frame(buff); + } + if (m_transport_type == 
VMA_TRANSPORT_IB) { + procces_now = is_ib_tcp_frame(buff); + } + /* We process immediately all non udp/ip traffic.. */ + if (procces_now) { + buff->rx.is_vma_thr = true; + if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff)) { + process_recv_buffer(buff, NULL); + } + } + else { /* udp/ip traffic we just put in the cq's rx queue */ + m_rx_queue.push_back(buff); + mem_buf_desc_t* buff_cur = m_rx_queue.front(); + m_rx_queue.pop_front(); + if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff_cur)) { + m_rx_queue.push_front(buff_cur); + } + } + } + } + + if (p_recycle_buffers_last_wr_id) { + *p_recycle_buffers_last_wr_id = (uintptr_t)buff; + } + + ++ret_total; + } + + update_global_sn(cq_poll_sn, ret_total); + + m_p_ring->m_gro_mgr.flush_all(NULL); + } + + m_n_wce_counter = 0; + m_b_was_drained = false; + + // Update cq statistics + m_p_cq_stat->n_rx_sw_queue_len = m_rx_queue.size(); + m_p_cq_stat->n_rx_drained_at_once_max = std::max(ret_total, m_p_cq_stat->n_rx_drained_at_once_max); + + return ret_total; +} + +inline void cq_mgr_mlx5::update_global_sn(uint64_t& cq_poll_sn, uint32_t num_polled_cqes) +{ + if (num_polled_cqes > 0) { + // spoil the global sn if we have packets ready + union __attribute__((packed)) { + uint64_t global_sn; + struct { + uint32_t cq_id; + uint32_t cq_sn; + } bundle; + } next_sn; + m_n_cq_poll_sn += num_polled_cqes; + next_sn.bundle.cq_sn = m_n_cq_poll_sn; + next_sn.bundle.cq_id = m_cq_id; + + m_n_global_sn = next_sn.global_sn; + } + + cq_poll_sn = m_n_global_sn; +} + +mem_buf_desc_t* cq_mgr_mlx5::process_cq_element_rx(mem_buf_desc_t* p_mem_buf_desc, enum buff_status_e status) +{ + /* Assume locked!!! 
*/ + cq_logfuncall(""); + + /* we use context to verify that on reclaim rx buffer path we return the buffer to the right CQ */ + p_mem_buf_desc->rx.is_vma_thr = false; + p_mem_buf_desc->rx.context = NULL; + p_mem_buf_desc->rx.socketxtreme_polled = false; + + if (unlikely(status != BS_OK)) { + m_p_next_rx_desc_poll = NULL; + if (p_mem_buf_desc->p_desc_owner) { + m_p_ring->mem_buf_desc_completion_with_error_rx(p_mem_buf_desc); + } else { + /* AlexR: are we throwing away a data buffer and a mem_buf_desc element? */ + cq_logdbg("no desc_owner(wr_id=%p)", p_mem_buf_desc); + } + + return NULL; + } + + if (m_n_sysvar_rx_prefetch_bytes_before_poll) { + m_p_next_rx_desc_poll = p_mem_buf_desc->p_prev_desc; + p_mem_buf_desc->p_prev_desc = NULL; + } + + VALGRIND_MAKE_MEM_DEFINED(p_mem_buf_desc->p_buffer, p_mem_buf_desc->sz_data); + + prefetch_range((uint8_t*)p_mem_buf_desc->p_buffer + m_sz_transport_header, + std::min(p_mem_buf_desc->sz_data - m_sz_transport_header, (size_t)m_n_sysvar_rx_prefetch_bytes)); + + + return p_mem_buf_desc; +} + +int cq_mgr_mlx5::poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array) +{ + /* Assume locked!!! 
*/ + cq_logfuncall(""); + + uint32_t ret_rx_processed = process_recv_queue(pv_fd_ready_array); + if (unlikely(ret_rx_processed >= m_n_sysvar_cq_poll_batch_max)) { + m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); + return ret_rx_processed; + } + + if (m_p_next_rx_desc_poll) { + prefetch_range((uint8_t*)m_p_next_rx_desc_poll->p_buffer, m_n_sysvar_rx_prefetch_bytes_before_poll); + } + + if (m_b_sysvar_enable_socketxtreme) { + if (unlikely(m_rx_hot_buffer == NULL)) { + int index = m_qp->m_mlx5_qp.rq.tail & (m_qp->m_rx_num_wr - 1); + m_rx_hot_buffer = (mem_buf_desc_t*)(uintptr_t)m_qp->m_rq_wqe_idx_to_wrid[index]; + m_rx_hot_buffer->rx.context = NULL; + m_rx_hot_buffer->rx.is_vma_thr = false; + m_rx_hot_buffer->rx.socketxtreme_polled = false; + } + else { + mlx5_cqe64 *cqe_err = NULL; + mlx5_cqe64 *cqe = get_cqe64(&cqe_err); + + if (likely(cqe)) { + ++m_n_wce_counter; + ++m_qp->m_mlx5_qp.rq.tail; + m_rx_hot_buffer->sz_data = ntohl(cqe->byte_cnt); + m_rx_hot_buffer->rx.flow_tag_id = vma_get_flow_tag(cqe); + m_rx_hot_buffer->rx.is_sw_csum_need = !(m_b_is_rx_hw_csum_on && + (cqe->hds_ip_ext & MLX5_CQE_L4_OK) && (cqe->hds_ip_ext & MLX5_CQE_L3_OK)); + + if (unlikely(++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv)) { + (void)compensate_qp_poll_success(m_rx_hot_buffer); + } + process_recv_buffer(m_rx_hot_buffer, pv_fd_ready_array); + ++ret_rx_processed; + m_rx_hot_buffer = NULL; + } + else if (cqe_err) { + ret_rx_processed += poll_and_process_error_element_rx(cqe_err, pv_fd_ready_array); + } + else { + compensate_qp_poll_failed(); + } + + } + } else { + buff_status_e status = BS_OK; + uint32_t ret = 0; + while (ret < m_n_sysvar_cq_poll_batch_max) { + mem_buf_desc_t *buff = poll(status); + if (buff) { + ++ret; + if (process_cq_element_rx(buff, status)) { + if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(buff)) { + process_recv_buffer(buff, pv_fd_ready_array); + } + } + } else { + m_b_was_drained = true; + break; + } + 
} + + update_global_sn(*p_cq_poll_sn, ret); + + if (likely(ret > 0)) { + ret_rx_processed += ret; + m_n_wce_counter += ret; + m_p_ring->m_gro_mgr.flush_all(pv_fd_ready_array); + } else { + compensate_qp_poll_failed(); + } + } + + return ret_rx_processed; +} + +int cq_mgr_mlx5::poll_and_process_element_rx(mem_buf_desc_t **p_desc_lst) +{ + int packets_num = 0; + + if (unlikely(m_rx_hot_buffer == NULL)) { + int index = m_qp->m_mlx5_qp.rq.tail & (m_qp->m_rx_num_wr - 1); + m_rx_hot_buffer = (mem_buf_desc_t*)(uintptr_t)m_qp->m_rq_wqe_idx_to_wrid[index]; + m_rx_hot_buffer->rx.context = NULL; + m_rx_hot_buffer->rx.is_vma_thr = false; + } + //prefetch_range((uint8_t*)m_rx_hot_buffer->p_buffer,safe_mce_sys().rx_prefetch_bytes_before_poll); +#ifdef RDTSC_MEASURE_RX_VERBS_READY_POLL + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VERBS_READY_POLL]); +#endif //RDTSC_MEASURE_RX_VERBS_READY_POLL + +#ifdef RDTSC_MEASURE_RX_VERBS_IDLE_POLL + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VERBS_IDLE_POLL]); +#endif //RDTSC_MEASURE_RX_VERBS_IDLE_POLL + +#ifdef RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VMA_TCP_IDLE_POLL]); +#endif //RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL + mlx5_cqe64 *cqe_err = NULL; + mlx5_cqe64 *cqe = get_cqe64(&cqe_err); + + if (likely(cqe)) { + ++m_n_wce_counter; + ++m_qp->m_mlx5_qp.rq.tail; + m_rx_hot_buffer->sz_data = ntohl(cqe->byte_cnt); + m_rx_hot_buffer->rx.hw_raw_timestamp = ntohll(cqe->timestamp); + m_rx_hot_buffer->rx.flow_tag_id = vma_get_flow_tag(cqe); + + m_rx_hot_buffer->rx.is_sw_csum_need = !(m_b_is_rx_hw_csum_on && (cqe->hds_ip_ext & MLX5_CQE_L4_OK) && (cqe->hds_ip_ext & MLX5_CQE_L3_OK)); + + if (unlikely(++m_qp_rec.debt >= (int)m_n_sysvar_rx_num_wr_to_post_recv)) { + (void)compensate_qp_poll_success(m_rx_hot_buffer); + } + ++packets_num; + *p_desc_lst = m_rx_hot_buffer; + m_rx_hot_buffer = NULL; + } + else if (cqe_err) { + /* Return nothing in case error wc + * It is difference with 
poll_and_process_element_rx() + */ + poll_and_process_error_element_rx(cqe_err, NULL); + *p_desc_lst = NULL; + } + else { +#ifdef RDTSC_MEASURE_RX_VERBS_IDLE_POLL + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VERBS_IDLE_POLL]); +#endif + +#ifdef RDTSC_MEASURE_RX_VMA_TCP_IDLE_POLL + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_VMA_TCP_IDLE_POLL]); +#endif + +#ifdef RDTSC_MEASURE_RX_CQE_RECEIVEFROM + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_CQE_TO_RECEIVEFROM]); +#endif + compensate_qp_poll_failed(); + } + + return packets_num; + +} + +inline void cq_mgr_mlx5::cqe64_to_vma_wc(struct mlx5_cqe64 *cqe, vma_ibv_wc *wc) +{ + struct mlx5_err_cqe* ecqe = (struct mlx5_err_cqe *)cqe; + + switch (cqe->op_own >> 4) { + case MLX5_CQE_RESP_WR_IMM: + cq_logerr("IBV_WC_RECV_RDMA_WITH_IMM is not supported"); + break; + case MLX5_CQE_RESP_SEND: + case MLX5_CQE_RESP_SEND_IMM: + case MLX5_CQE_RESP_SEND_INV: + vma_wc_opcode(*wc) = VMA_IBV_WC_RECV; + wc->byte_len = ntohl(cqe->byte_cnt); + wc->status = IBV_WC_SUCCESS; + return; + case MLX5_CQE_REQ: + wc->status = IBV_WC_SUCCESS; + return; + default: + break; + } + + /* Only IBV_WC_WR_FLUSH_ERR is used in code */ + if (MLX5_CQE_SYNDROME_WR_FLUSH_ERR == ecqe->syndrome) { + wc->status = IBV_WC_WR_FLUSH_ERR; + } else { + wc->status = IBV_WC_GENERAL_ERR; + } + + wc->vendor_err = ecqe->vendor_err_synd; +} + +inline struct mlx5_cqe64* cq_mgr_mlx5::check_error_completion(struct mlx5_cqe64 *cqe, uint32_t *ci, + uint8_t op_own) +{ + switch (op_own >> 4) { + case MLX5_CQE_REQ_ERR: + case MLX5_CQE_RESP_ERR: + ++(*ci); + rmb(); + *m_mlx5_cq.dbrec = htonl((*ci)); + return cqe; + + case MLX5_CQE_INVALID: + default: + return NULL; /* No CQE */ + } +} + +inline struct mlx5_cqe64 *cq_mgr_mlx5::get_cqe64(struct mlx5_cqe64 **cqe_err) +{ + struct mlx5_cqe64 *cqe = (struct mlx5_cqe64 *)(((uint8_t*)m_mlx5_cq.cq_buf) + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); + uint8_t op_own = cqe->op_own; + + 
/* Check ownership and invalid opcode + * Return cqe_err for 0x80 - MLX5_CQE_REQ_ERR, MLX5_CQE_RESP_ERR or MLX5_CQE_INVALID + */ + if (unlikely((op_own & MLX5_CQE_OWNER_MASK) == !(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count))) { + return NULL; + } else if (unlikely((op_own >> 4) == MLX5_CQE_INVALID)) { + return NULL; + } else if (cqe_err && (op_own & 0x80)) { + *cqe_err = check_error_completion(cqe, &m_mlx5_cq.cq_ci, op_own); + return NULL; + } + + ++m_mlx5_cq.cq_ci; + rmb(); + *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci); + + return cqe; +} + +int cq_mgr_mlx5::poll_and_process_error_element_tx(struct mlx5_cqe64 *cqe, uint64_t* p_cq_poll_sn) +{ + uint16_t wqe_ctr = ntohs(cqe->wqe_counter); + int index = wqe_ctr & (m_qp->m_tx_num_wr - 1); + mem_buf_desc_t* buff = NULL; + vma_ibv_wc wce; + + // spoil the global sn if we have packets ready + union __attribute__((packed)) { + uint64_t global_sn; + struct { + uint32_t cq_id; + uint32_t cq_sn; + } bundle; + } next_sn; + next_sn.bundle.cq_sn = ++m_n_cq_poll_sn; + next_sn.bundle.cq_id = m_cq_id; + + *p_cq_poll_sn = m_n_global_sn = next_sn.global_sn; + + memset(&wce, 0, sizeof(wce)); + if (m_qp->m_sq_wqe_idx_to_wrid) { + wce.wr_id = m_qp->m_sq_wqe_idx_to_wrid[index]; + cqe64_to_vma_wc(cqe, &wce); + + buff = cq_mgr::process_cq_element_tx(&wce); + if (buff) { + cq_mgr::process_tx_buffer_list(buff); + } + return 1; + } + return 0; +} + +int cq_mgr_mlx5::poll_and_process_element_tx(uint64_t* p_cq_poll_sn) +{ + // Assume locked!!! 
+ cq_logfuncall(""); + + int ret = 0; + mlx5_cqe64 *cqe_err = NULL; + mlx5_cqe64 *cqe = get_cqe64(&cqe_err); + + if (likely(cqe)) { + uint16_t wqe_ctr = ntohs(cqe->wqe_counter); + int index = wqe_ctr & (m_qp->m_tx_num_wr - 1); + mem_buf_desc_t* buff = (mem_buf_desc_t*)(uintptr_t)m_qp->m_sq_wqe_idx_to_wrid[index]; + // spoil the global sn if we have packets ready + union __attribute__((packed)) { + uint64_t global_sn; + struct { + uint32_t cq_id; + uint32_t cq_sn; + } bundle; + } next_sn; + next_sn.bundle.cq_sn = ++m_n_cq_poll_sn; + next_sn.bundle.cq_id = m_cq_id; + + *p_cq_poll_sn = m_n_global_sn = next_sn.global_sn; + + cq_mgr::process_tx_buffer_list(buff); + ret = 1; + } + else if (cqe_err) { + ret = poll_and_process_error_element_tx(cqe_err, p_cq_poll_sn); + } + else { + *p_cq_poll_sn = m_n_global_sn; + } + + return ret; +} + +void cq_mgr_mlx5::set_qp_rq(qp_mgr* qp) +{ + m_qp = static_cast (qp); + + m_qp->m_rq_wqe_counter = 0; /* In case of bonded qp, wqe_counter must be reset to zero */ + m_rx_hot_buffer = NULL; + + if (0 != vma_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { + cq_logpanic("vma_ib_mlx5_get_cq failed (errno=%d %m)", errno); + } + VALGRIND_MAKE_MEM_DEFINED(&m_mlx5_cq, sizeof(m_mlx5_cq)); + cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, m_mlx5_cq.cq_buf); +} + +void cq_mgr_mlx5::add_qp_rx(qp_mgr* qp) +{ + cq_logfunc(""); + set_qp_rq(qp); + cq_mgr::add_qp_rx(qp); +} + +void cq_mgr_mlx5::add_qp_tx(qp_mgr* qp) +{ + //Assume locked! 
+ cq_mgr::add_qp_tx(qp); + m_qp = static_cast (qp); + + if (0 != vma_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { + cq_logpanic("vma_ib_mlx5_get_cq failed (errno=%d %m)", errno); + } + + cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, m_mlx5_cq.cq_buf); +} + +bool cq_mgr_mlx5::fill_cq_hw_descriptors(struct hw_cq_data &data) +{ + cq_logdbg("Returning HW descriptors for CQ %p cqn %u cqe_cnt %u buf %p " + "dbrec %p cqe_size %u", m_p_ibv_cq, m_mlx5_cq.cq_num, m_mlx5_cq.cqe_count, + m_mlx5_cq.cq_buf, m_mlx5_cq.dbrec, m_mlx5_cq.cqe_size); + + data.buf = m_mlx5_cq.cq_buf; + data.cons_idx = &m_mlx5_cq.cq_ci; + data.cq_size = m_mlx5_cq.cqe_count; + data.cqe_size = m_mlx5_cq.cqe_size; + data.cqn = m_mlx5_cq.cq_num; + data.dbrec = m_mlx5_cq.dbrec; + + /* Not supported yet */ + data.uar = NULL; + + return true; +} + +int cq_mgr_mlx5::poll_and_process_error_element_rx(struct mlx5_cqe64 *cqe, void* pv_fd_ready_array) +{ + vma_ibv_wc wce; + + memset(&wce, 0, sizeof(wce)); + wce.wr_id = (uintptr_t)m_rx_hot_buffer; + cqe64_to_vma_wc(cqe, &wce); + + ++m_n_wce_counter; + ++m_qp->m_mlx5_qp.rq.tail; + + m_rx_hot_buffer = cq_mgr::process_cq_element_rx(&wce); + if (m_rx_hot_buffer) { + if (vma_wc_opcode(wce) & VMA_IBV_WC_RECV) { + if ((++m_qp_rec.debt < (int)m_n_sysvar_rx_num_wr_to_post_recv) || + !compensate_qp_poll_success(m_rx_hot_buffer)) { + process_recv_buffer(m_rx_hot_buffer, pv_fd_ready_array); + } + } + } + m_rx_hot_buffer = NULL; + + return 1; +} + +#endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/vma/dev/cq_mgr_mlx5.h b/src/vma/dev/cq_mgr_mlx5.h new file mode 100644 index 0000000..5abff22 --- /dev/null +++ b/src/vma/dev/cq_mgr_mlx5.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef CQ_MGR_MLX5_H +#define CQ_MGR_MLX5_H + +#include "cq_mgr.h" +#include "qp_mgr_eth_mlx5.h" + +#if defined(DEFINED_DIRECT_VERBS) +class qp_mgr_eth_mlx5; + +/* Get CQE opcode. */ +#define MLX5_CQE_OPCODE(op_own) ((op_own) >> 4) + +/* Get CQE owner bit. 
*/ +#define MLX5_CQE_OWNER(op_own) ((op_own) & MLX5_CQE_OWNER_MASK) + +class cq_mgr_mlx5: public cq_mgr +{ +public: + + enum buff_status_e{BS_OK, BS_CQE_RESP_WR_IMM_NOT_SUPPORTED, BS_IBV_WC_WR_FLUSH_ERR, BS_CQE_INVALID, BS_GENERAL_ERR}; + + cq_mgr_mlx5(ring_simple* p_ring, ib_ctx_handler* p_ib_ctx_handler, uint32_t cq_size, + struct ibv_comp_channel* p_comp_event_channel, bool is_rx, bool call_configure = true); + virtual ~cq_mgr_mlx5(); + + virtual mem_buf_desc_t* poll(enum buff_status_e& status); + virtual int drain_and_proccess(uintptr_t* p_recycle_buffers_last_wr_id = NULL); + virtual int poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL); + virtual int poll_and_process_element_rx(mem_buf_desc_t **p_desc_lst); + virtual int poll_and_process_element_tx(uint64_t* p_cq_poll_sn); + int poll_and_process_error_element_tx(struct mlx5_cqe64 *cqe, uint64_t* p_cq_poll_sn); + int poll_and_process_error_element_rx(struct mlx5_cqe64 *cqe, void* pv_fd_ready_array); + + virtual mem_buf_desc_t* process_cq_element_rx(mem_buf_desc_t* p_mem_buf_desc, enum buff_status_e status); + virtual void add_qp_rx(qp_mgr* qp); + void set_qp_rq(qp_mgr* qp); + virtual void add_qp_tx(qp_mgr* qp); + virtual uint32_t clean_cq(); + virtual bool fill_cq_hw_descriptors(struct hw_cq_data &data); + virtual void get_cq_event(int count = 1) { vma_ib_mlx5_get_cq_event(&m_mlx5_cq, count); }; + +protected: + qp_mgr_eth_mlx5* m_qp; + vma_ib_mlx5_cq_t m_mlx5_cq; + inline struct mlx5_cqe64* check_cqe(void); + +private: + const bool m_b_sysvar_enable_socketxtreme; + mem_buf_desc_t *m_rx_hot_buffer; + + inline struct mlx5_cqe64* get_cqe64(struct mlx5_cqe64 **cqe_err = NULL); + inline void cqe64_to_mem_buff_desc(struct mlx5_cqe64 *cqe, mem_buf_desc_t* p_rx_wc_buf_desc, enum buff_status_e& status); + void cqe64_to_vma_wc(struct mlx5_cqe64 *cqe, vma_ibv_wc *wc); + inline struct mlx5_cqe64* check_error_completion(struct mlx5_cqe64 *cqe, uint32_t *ci, uint8_t op_own); + inline void 
update_global_sn(uint64_t& cq_poll_sn, uint32_t rettotal); + + virtual int req_notify_cq() { + return vma_ib_mlx5_req_notify_cq(&m_mlx5_cq, 0); + }; +}; + +#endif /* DEFINED_DIRECT_VERBS */ +#endif //CQ_MGR_MLX5_H diff --git a/src/vma/dev/cq_mgr_mlx5.inl b/src/vma/dev/cq_mgr_mlx5.inl new file mode 100644 index 0000000..17a3a59 --- /dev/null +++ b/src/vma/dev/cq_mgr_mlx5.inl @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef CQ_MGR_MLX5_INL_H +#define CQ_MGR_MLX5_INL_H + +#include "dev/cq_mgr_mlx5.h" + +#if defined(DEFINED_DIRECT_VERBS) + +/**/ +/** inlining functions can only help if they are implemented before their usage **/ +/**/ +inline struct mlx5_cqe64* cq_mgr_mlx5::check_cqe(void) +{ + struct mlx5_cqe64* cqe = (struct mlx5_cqe64 *)(((uint8_t *)m_mlx5_cq.cq_buf) + + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); + /* + * CQE ownership is defined by Owner bit in the CQE. + * The value indicating SW ownership is flipped every + * time CQ wraps around. + * */ + if (likely((MLX5_CQE_OPCODE(cqe->op_own)) != MLX5_CQE_INVALID) && + !((MLX5_CQE_OWNER(cqe->op_own)) ^ !!(m_mlx5_cq.cq_ci & m_mlx5_cq.cqe_count))) { + return cqe; + } + + return NULL; +} + +#endif /* DEFINED_DIRECT_VERBS */ +#endif//CQ_MGR_MLX5_INL_H diff --git a/src/vma/dev/cq_mgr_mp.cpp b/src/vma/dev/cq_mgr_mp.cpp new file mode 100644 index 0000000..5510676 --- /dev/null +++ b/src/vma/dev/cq_mgr_mp.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "vma/util/valgrind.h" +#include "dev/cq_mgr_mp.h" +#include "dev/cq_mgr_mlx5.inl" +#include "dev/qp_mgr_mp.h" + +#define MODULE_NAME "cqm" + +#define cq_logpanic __log_info_panic +#define cq_logerr __log_info_err +#define cq_logwarn __log_info_warn +#define cq_loginfo __log_info_info +#define cq_logdbg __log_info_dbg +#define cq_logfunc __log_info_func +#define cq_logfuncall __log_info_funcall +#define cq_logfine __log_info_fine + + +#ifdef HAVE_MP_RQ + +enum { + /* Masks to handle the CQE byte_count field in case of MP RQ */ + MP_RQ_BYTE_CNT_FIELD_MASK = 0x0000FFFF, + MP_RQ_NUM_STRIDES_FIELD_MASK = 0x7FFF0000, + MP_RQ_FILLER_FIELD_MASK = 0x80000000, + MP_RQ_NUM_STRIDES_FIELD_SHIFT = 16, +}; + +// for optimization expected checksum for receiving packets +const uint32_t cq_mgr_mp::UDP_OK_FLAGS = IBV_EXP_CQ_RX_IP_CSUM_OK | + IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK; + +cq_mgr_mp::cq_mgr_mp(const ring_eth_cb *p_ring, ib_ctx_handler *p_ib_ctx_handler, + uint32_t cq_size, + struct ibv_comp_channel *p_comp_event_channel, + bool is_rx, bool external_mem): + cq_mgr_mlx5((ring_simple*)p_ring, p_ib_ctx_handler, + cq_size , p_comp_event_channel, is_rx, false), + m_rq_tail(0), + m_p_ring(p_ring), + m_external_mem(external_mem), + m_qp(NULL) +{ + // must call from derive in order to call derived hooks + m_p_cq_stat->n_buffer_pool_len = cq_size; + m_p_cq_stat->n_rx_drained_at_once_max = 0; + configure(cq_size); +} + +void cq_mgr_mp::prep_ibv_cq(vma_ibv_cq_init_attr &attr) const +{ + 
cq_mgr::prep_ibv_cq(attr); + attr.comp_mask |= IBV_EXP_CQ_INIT_ATTR_RES_DOMAIN; + attr.res_domain = m_p_ring->get_res_domain(); +} + +void cq_mgr_mp::add_qp_rx(qp_mgr *qp) +{ + cq_logdbg("qp_mp_mgr=%p", qp); + qp_mgr_mp* mp_qp = dynamic_cast(qp); + + if (mp_qp == NULL) { + cq_logdbg("this qp is not of type qp_mgr_mp %p", qp); + throw_vma_exception("this qp is not of type qp_mgr_mp"); + } + set_qp_rq(qp); + m_qp_rec.qp = qp; + if (m_external_mem) { + cq_logdbg("this qp uses an external memory %p", qp); + } else { + if (mp_qp->post_recv(0, mp_qp->get_wq_count()) != 0) { + cq_logdbg("qp post recv failed"); + } else { + cq_logdbg("Successfully post_recv qp with %d new Rx buffers", + mp_qp->get_wq_count()); + } + } +} + +uint32_t cq_mgr_mp::clean_cq() +{ + return 0; +} + +/** + * this function polls the CQ, and extracts the needed fields + * upon CQE error state it will return -1 + * if a bad checksum packet or a filler bit it will return VMA_MP_RQ_BAD_PACKET + */ +int cq_mgr_mp::poll_mp_cq(uint16_t &size, uint32_t &strides_used, + uint32_t &flags, struct mlx5_cqe64 *&out_cqe64) +{ + struct mlx5_cqe64 *cqe= check_cqe(); + if (likely(cqe)) { + if (unlikely(MLX5_CQE_OPCODE(cqe->op_own) != MLX5_CQE_RESP_SEND)) { + cq_logdbg("Warning op_own is %x", MLX5_CQE_OPCODE(cqe->op_own)); + // optimize checks in ring by setting size non zero + if (MLX5_CQE_OPCODE(cqe->op_own) == MLX5_CQE_RESP_ERR) { + cq_logdbg("poll_length, CQE response error, " + "syndrome=0x%x, vendor syndrome error=0x%x, " + "HW syndrome 0x%x, HW syndrome type 0x%x\n", + ((struct mlx5_err_cqe *)cqe)->syndrome, + ((struct mlx5_err_cqe *)cqe)->vendor_err_synd, + ((struct mlx5_err_cqe *)cqe)->hw_err_synd, + ((struct mlx5_err_cqe *)cqe)->hw_synd_type); + } + size = 1; + m_p_cq_stat->n_rx_pkt_drop++; + return -1; + } + m_p_cq_stat->n_rx_pkt_drop += cqe->sop_qpn.sop; + out_cqe64 = cqe; + uint32_t stride_byte_cnt = ntohl(cqe->byte_cnt); + strides_used = (stride_byte_cnt & MP_RQ_NUM_STRIDES_FIELD_MASK) >> + 
MP_RQ_NUM_STRIDES_FIELD_SHIFT; + flags = (!!(cqe->hds_ip_ext & MLX5_CQE_L4_OK) * IBV_EXP_CQ_RX_TCP_UDP_CSUM_OK) | + (!!(cqe->hds_ip_ext & MLX5_CQE_L3_OK) * IBV_EXP_CQ_RX_IP_CSUM_OK); + if (likely(flags == UDP_OK_FLAGS)) { + size = stride_byte_cnt & MP_RQ_BYTE_CNT_FIELD_MASK; + } else { + // if CSUM is bad it can be either filler or bad packet + flags = VMA_MP_RQ_BAD_PACKET; + size = 1; + if (stride_byte_cnt & MP_RQ_FILLER_FIELD_MASK) { + m_p_cq_stat->n_rx_pkt_drop++; + } + } + ++m_mlx5_cq.cq_ci; + prefetch((uint8_t*)m_mlx5_cq.cq_buf + ((m_mlx5_cq.cq_ci & (m_mlx5_cq.cqe_count - 1)) << m_mlx5_cq.cqe_size_log)); + } else { + size = 0; + flags = 0; + } + cq_logfine("returning packet size %d, stride used %d " + "flags %d", size, strides_used, flags); + return 0; +} + +void cq_mgr_mp::set_qp_rq(qp_mgr* qp) +{ + m_qp = static_cast (qp); + + mlx5_rwq *mrwq = container_of(m_qp->get_wq(), struct mlx5_rwq, wq); + m_rq_tail = &mrwq->rq.tail; + + if (0 != vma_ib_mlx5_get_cq(m_p_ibv_cq, &m_mlx5_cq)) { + cq_logpanic("vma_ib_mlx5_get_cq failed (errno=%d %m)", errno); + } + VALGRIND_MAKE_MEM_DEFINED(&m_mlx5_cq, sizeof(m_mlx5_cq)); + cq_logfunc("qp_mgr=%p m_mlx5_cq.dbrec=%p m_mlx5_cq.cq_buf=%p", m_qp, m_mlx5_cq.dbrec, m_mlx5_cq.cq_buf); +} + +void cq_mgr_mp::update_dbell() +{ + wmb(); + (*m_rq_tail)++; + *m_mlx5_cq.dbrec = htonl(m_mlx5_cq.cq_ci & 0xffffff); +} + +cq_mgr_mp::~cq_mgr_mp() +{ + struct mlx5_cqe64 *out_cqe64; + uint16_t size; + uint32_t strides_used = 0, flags = 0; + int ret; + do { + ret = poll_mp_cq(size, strides_used, flags, out_cqe64); + } while (size > 0 || ret); + // prevents seg fault in mlx5 destructor +} +#endif // HAVE_MP_RQ + diff --git a/src/vma/dev/cq_mgr_mp.h b/src/vma/dev/cq_mgr_mp.h new file mode 100644 index 0000000..5e01e4d --- /dev/null +++ b/src/vma/dev/cq_mgr_mp.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef SRC_VMA_DEV_CQ_MGR_MP_H_ +#define SRC_VMA_DEV_CQ_MGR_MP_H_ + +#include "dev/cq_mgr_mlx5.h" +#include "dev/ring_eth_cb.h" +#include "dev/qp_mgr_mp.h" + +#ifdef HAVE_MP_RQ + +class cq_mgr_mp : public cq_mgr_mlx5 +{ +public: + cq_mgr_mp(const ring_eth_cb *p_ring, ib_ctx_handler *p_ib_ctx_handler, + uint32_t cq_size, struct ibv_comp_channel *p_comp_event_channel, + bool is_rx, bool external_mem); + ~cq_mgr_mp(); + int poll_mp_cq(uint16_t &size, uint32_t &strides_used, + uint32_t &flags, + struct mlx5_cqe64 *&cqe64); + void update_dbell(); + void update_max_drain(uint32_t t) { m_p_cq_stat->n_rx_drained_at_once_max = + std::max(m_p_cq_stat->n_rx_drained_at_once_max, t);} +protected: + virtual void prep_ibv_cq(vma_ibv_cq_init_attr &attr) const; + virtual void add_qp_rx(qp_mgr *qp); + virtual void set_qp_rq(qp_mgr *qp); + virtual uint32_t clean_cq(); +private: + uint32_t *m_rq_tail; + const ring_eth_cb *m_p_ring; + bool m_external_mem; + qp_mgr_mp *m_qp; + static const uint32_t UDP_OK_FLAGS; +}; +#endif /* HAVE_MP_RQ */ + +#endif /* SRC_VMA_DEV_CQ_MGR_MP_H_ */ diff --git a/src/vma/dev/dm_mgr.cpp b/src/vma/dev/dm_mgr.cpp new file mode 100644 index 0000000..135cc37 --- /dev/null +++ b/src/vma/dev/dm_mgr.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "dm_mgr.h" +#include "vlogger/vlogger.h" +#include "vma/proto/mem_buf_desc.h" +#include "vma/dev/ib_ctx_handler.h" + +#if defined(DEFINED_DIRECT_VERBS) +#if defined(DEFINED_IBV_DM) + +#define DM_MEMORY_MASK_8 7 +#define DM_MEMORY_MASK_64 63 +#define DM_ALIGN_SIZE(size, mask) ((size + mask) & (~mask)) + +#undef MODULE_NAME +#define MODULE_NAME "dm_mgr" +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + +#define dm_logerr __log_info_err +#define dm_logwarn __log_info_warn +#define dm_logdbg __log_info_dbg +#define dm_logfunc __log_info_func + +dm_mgr::dm_mgr() : + m_p_dm_mr(NULL), + m_p_ibv_dm(NULL), + m_p_ring_stat(NULL), + m_allocation(0), + m_used(0), + m_head(0) +{}; + +/* + * Allocate dev_mem resources + */ +bool dm_mgr::allocate_resources(ib_ctx_handler* ib_ctx, ring_stats_t* ring_stats) +{ + size_t allocation_size = DM_ALIGN_SIZE(safe_mce_sys().ring_dev_mem_tx, DM_MEMORY_MASK_64); + vma_ibv_alloc_dm_attr dm_attr; + vma_ibv_reg_mr_in mr_in; + m_p_ring_stat = ring_stats; + if (!allocation_size) { + // On Device Memory usage was disabled by the user + return false; + } + + if (!ib_ctx->get_on_device_memory_size()) { + // On Device Memory usage is not supported + return false; + } + + // Allocate on device memory buffer + 
memset(&dm_attr, 0, sizeof(dm_attr)); + dm_attr.length = allocation_size; + m_p_ibv_dm = vma_ibv_alloc_dm(ib_ctx->get_ibv_context(), &dm_attr); + if (!m_p_ibv_dm) { + // Memory allocation can fail if we have already allocated the maximum possible. + dm_logdbg("ibv_alloc_dm error - On Device Memory allocation failed, %d %m", errno); + errno = 0; + return false; + } + + // Initialize MR attributes + memset(&mr_in, 0, sizeof(mr_in)); + vma_ibv_init_dm_mr(mr_in, ib_ctx->get_ibv_pd(), allocation_size, m_p_ibv_dm); + + // Register On Device Memory MR + m_p_dm_mr = vma_ibv_reg_dm_mr(&mr_in); + if (!m_p_dm_mr) { + vma_ibv_free_dm(m_p_ibv_dm); + m_p_ibv_dm = NULL; + dm_logerr("ibv_free_dm error - dm_mr registration failed, %d %m", errno); + return false; + } + + m_allocation = allocation_size; + m_p_ring_stat->simple.n_tx_dev_mem_allocated = m_allocation; + + dm_logdbg("Device memory allocation completed successfully! device[%s] bytes[%zu] dm_mr handle[%d] dm_mr lkey[%d]", + ib_ctx->get_ibv_device()->name, dm_attr.length, m_p_dm_mr->handle, m_p_dm_mr->lkey); + + return true; +} + +/* + * Release dev_mem resources + */ +void dm_mgr::release_resources() +{ + if (m_p_dm_mr) { + if (ibv_dereg_mr(m_p_dm_mr)) { + dm_logerr("ibv_dereg_mr failed, %d %m", errno); + } else { + dm_logdbg("ibv_dereg_mr success"); + } + m_p_dm_mr = NULL; + } + + if (m_p_ibv_dm) { + if (vma_ibv_free_dm(m_p_ibv_dm)) { + dm_logerr("ibv_free_dm failed %d %m", errno); + } else { + dm_logdbg("ibv_free_dm success"); + } + m_p_ibv_dm = NULL; + } + + m_p_ring_stat = NULL; + + dm_logdbg("Device memory release completed!"); +} + +/* + * Copy data into the On Device Memory buffer. + * + * On Device Memory buffer is implemented in a cycle way using two variables : + * m_head - index of the next offset to be written. + * m_used - amount of used bytes within the On Device Memory buffer (which also used to calculate the tail of the buffer). 
+ * + * In order to maintain a proper order of allocation and release, we must distinguish between three possible cases: + * + * First case: + * Free space exists in the beginning and in the end of the array. + * + * |-------------------------------------------| + * | |XXXXXXXXXX| | + * |-------------------------------------------| + * tail head + * + * Second case: + * There is not enough free space at the end of the array. + * |-------------------------------------------| + * | |XXXXXXXXXX| | + * |-------------------------------------------| + * tail head + * + * In the case above, we will move the head to the beginning of the array. + * |-------------------------------------------| + * | |XXXXXXXXXXXXX| + * |-------------------------------------------| + * head tail + * + * Third case: + * Free space exists in the middle of the array + * |-------------------------------------------| + * |XXXXXXXXXXXXXX| |XXXXXX| + * |-------------------------------------------| + * head tail + * + * Due to hardware limitations: + * 1. Data should be written to 4bytes aligned addresses. + * 2. Data length should be aligned to 4bytes. + * + * Due to performance reasons: + * 1. Data should be written to a continuous memory area. + * 2. Data will be written to 8bytes aligned addresses. + */ +bool dm_mgr::copy_data(struct mlx5_wqe_data_seg* seg, uint8_t* src, uint32_t length, mem_buf_desc_t* buff) +{ + vma_ibv_memcpy_dm_attr memcpy_attr; + uint32_t length_aligned_8 = DM_ALIGN_SIZE(length, DM_MEMORY_MASK_8); + size_t continuous_left = 0; + size_t &dev_mem_length = buff->tx.dev_mem_length = 0; + + // Check if On Device Memory buffer is full + if (m_used >= m_allocation) { + goto dev_mem_oob; + } + + // Check for a continuous space to write + if (m_head >= m_used) { // First case + if ((continuous_left = m_allocation - m_head) < length_aligned_8) { // Second case + if (m_head - m_used >= length_aligned_8) { + // There is enough space at the beginning of the buffer. 
+ m_head = 0; + dev_mem_length = continuous_left; + } else { + // There no enough space at the beginning of the buffer. + goto dev_mem_oob; + } + } + } else if ((continuous_left = m_allocation - m_used) < length_aligned_8) { // Third case + goto dev_mem_oob; + } + + // Initialize memcopy attributes + memset(&memcpy_attr, 0, sizeof(memcpy_attr)); + vma_ibv_init_memcpy_dm(memcpy_attr, src, m_head, length_aligned_8); + + // Copy data into the On Device Memory buffer. + if (vma_ibv_memcpy_dm(m_p_ibv_dm, &memcpy_attr)) { + dm_logfunc("Failed to memcopy data into the memic buffer %m"); + return false; + } + + // Update values + seg->lkey = htonl(m_p_dm_mr->lkey); + seg->addr = htonll(m_head); + m_head = (m_head + length_aligned_8) % m_allocation; + dev_mem_length += length_aligned_8; + m_used += dev_mem_length; + + // Update On Device Memory statistics + m_p_ring_stat->simple.n_tx_dev_mem_pkt_count++; + m_p_ring_stat->simple.n_tx_dev_mem_byte_count += length; + + dm_logfunc("Send completed successfully! Buffer[%p] length[%d] length_aligned_8[%d] continuous_left[%zu] head[%zu] used[%zu]", + buff, length, length_aligned_8, continuous_left, m_head, m_used); + + return true; + +dev_mem_oob: + dm_logfunc("Send OOB! Buffer[%p] length[%d] length_aligned_8[%d] continuous_left[%zu] head[%zu] used[%zu]", + buff, length, length_aligned_8, continuous_left, m_head, m_used); + + m_p_ring_stat->simple.n_tx_dev_mem_oob++; + + return false; +} + +/* + * Release On Device Memory buffer. + * This method should be called after completion was received. + */ +void dm_mgr::release_data(mem_buf_desc_t* buff) +{ + m_used -= buff->tx.dev_mem_length; + buff->tx.dev_mem_length = 0; + + dm_logfunc("Device memory release! 
buffer[%p] buffer_dev_mem_length[%zu] head[%zu] used[%zu]", + buff, buff->tx.dev_mem_length, m_head, m_used); + +} + +#endif /* DEFINED_IBV_DM */ +#endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/vma/dev/dm_mgr.h b/src/vma/dev/dm_mgr.h new file mode 100644 index 0000000..0f2c60e --- /dev/null +++ b/src/vma/dev/dm_mgr.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef DM_MGR_H +#define DM_MGR_H + +#include "vma/ib/base/verbs_extra.h" +#include "vma/util/vma_stats.h" + +class mem_buf_desc_t; +class ib_ctx_handler; + +#if defined(DEFINED_DIRECT_VERBS) +#if defined(DEFINED_IBV_DM) + +#define DM_COMPLETION_THRESHOLD 8192 + +class dm_mgr { +public: + + dm_mgr(); + bool allocate_resources(ib_ctx_handler* ib_ctx, ring_stats_t* ring_stats); + void release_resources(); + bool copy_data(struct mlx5_wqe_data_seg* seg, uint8_t* src, uint32_t length, mem_buf_desc_t* buff); + void release_data(mem_buf_desc_t* buff); + inline bool is_completion_need() { return m_allocation - m_used < DM_COMPLETION_THRESHOLD; }; + +private: + + struct ibv_mr *m_p_dm_mr; + vma_ibv_dm *m_p_ibv_dm; + ring_stats_t *m_p_ring_stat; + size_t m_allocation; // Size of device memory buffer (bytes) + size_t m_used; // Next available index inside the buffer + size_t m_head; // Device memory used bytes +}; + +#else + +class dm_mgr { +public: + inline bool allocate_resources(ib_ctx_handler* ib_ctx, ring_stats_t* ring_stats) { NOT_IN_USE(ib_ctx); NOT_IN_USE(ring_stats); return false; }; + inline void release_resources() {}; + inline bool copy_data(struct mlx5_wqe_data_seg* seg, uint8_t* src, uint32_t length, mem_buf_desc_t* buff) { NOT_IN_USE(seg); NOT_IN_USE(src); NOT_IN_USE(length); NOT_IN_USE(buff); return false; }; + inline void release_data(mem_buf_desc_t* buff) { NOT_IN_USE(buff); }; + inline bool is_completion_need() { return false; }; +}; + +#endif /* DEFINED_IBV_DM */ +#endif /* DEFINED_DIRECT_VERBS */ +#endif /* DM_MGR_H */ diff --git a/src/vma/dev/gro_mgr.cpp b/src/vma/dev/gro_mgr.cpp new file mode 100644 index 0000000..e4f7534 --- /dev/null +++ b/src/vma/dev/gro_mgr.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vma/dev/gro_mgr.h" +#include "vma/dev/rfs_uc_tcp_gro.h" + +#define MODULE_NAME "gro_mgr" + +gro_mgr::gro_mgr(uint32_t flow_max, uint32_t buf_max) : m_n_flow_max(flow_max), m_n_buf_max(buf_max), m_n_flow_count(0) +{ + m_p_rfs_arr = new rfs_uc_tcp_gro*[flow_max]; + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_rfs_arr) { + __log_panic("could not allocate memory"); + } + BULLSEYE_EXCLUDE_BLOCK_END +} + +gro_mgr::~gro_mgr() +{ + delete [] m_p_rfs_arr; +} + +bool gro_mgr::reserve_stream(rfs_uc_tcp_gro* rfs_uc_tcp_gro) +{ + if (is_stream_max()) return false; + + m_p_rfs_arr[m_n_flow_count] = rfs_uc_tcp_gro; + m_n_flow_count++; + return true; +} + +bool gro_mgr::is_stream_max() +{ + return (m_n_flow_count >= m_n_flow_max); +} + + + +void gro_mgr::flush_all(void* pv_fd_ready_array) +{ + for (uint32_t i = 0; i < m_n_flow_count; i++) { + m_p_rfs_arr[i]->flush(pv_fd_ready_array); + } + m_n_flow_count = 0; +} diff --git a/src/vma/dev/gro_mgr.h b/src/vma/dev/gro_mgr.h new file mode 100644 index 0000000..b83a3d5 --- /dev/null +++ b/src/vma/dev/gro_mgr.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef GRO_MGR_H_ +#define GRO_MGR_H_ + +#include + +#define MAX_AGGR_BYTE_PER_STREAM 0xFFFF +#define MAX_GRO_BUFS 32 + +class rfs_uc_tcp_gro; + +class gro_mgr +{ +public: + gro_mgr(uint32_t flow_max, uint32_t buf_max); + bool reserve_stream(rfs_uc_tcp_gro* rfs_uc_tcp_gro); + bool is_stream_max(); + inline uint32_t get_buf_max() { return m_n_buf_max;} + inline uint32_t get_byte_max() { return MAX_AGGR_BYTE_PER_STREAM;} + void flush_all(void* pv_fd_ready_array); + virtual ~gro_mgr(); + +private: + const uint32_t m_n_flow_max; + const uint32_t m_n_buf_max; + + uint32_t m_n_flow_count; + + rfs_uc_tcp_gro** m_p_rfs_arr; +}; + +#endif /* GRO_MGR_H_ */ diff --git a/src/vma/dev/ib_ctx_handler.cpp b/src/vma/dev/ib_ctx_handler.cpp new file mode 100644 index 0000000..a6b3152 --- /dev/null +++ b/src/vma/dev/ib_ctx_handler.cpp @@ -0,0 +1,669 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include +#include "vma/dev/ib_ctx_handler.h" +#include "vma/ib/base/verbs_extra.h" +#include "vma/dev/time_converter_ib_ctx.h" +#include "vma/dev/time_converter_ptp.h" +#include "util/valgrind.h" +#include "vma/event/event_handler_manager.h" + +#define MODULE_NAME "ibch" + +#define ibch_logpanic __log_panic +#define ibch_logerr __log_err +#define ibch_logwarn __log_warn +#define ibch_loginfo __log_info +#define ibch_logdbg __log_info_dbg +#define ibch_logfunc __log_info_func +#define ibch_logfuncall __log_info_funcall + + +ib_ctx_handler::ib_ctx_handler(struct ib_ctx_handler_desc *desc) : + m_flow_tag_enabled(false) + , m_on_device_memory(0) + , m_removed(false) + , m_lock_umr("spin_lock_umr") + , m_umr_cq(NULL) + , m_umr_qp(NULL) + , m_p_ctx_time_converter(NULL) +{ + if (NULL == desc) { + ibch_logpanic("Invalid ib_ctx_handler"); + } + + m_p_ibv_device = desc->device; + + if (m_p_ibv_device == NULL) { + ibch_logpanic("m_p_ibv_device is invalid"); + } + +#ifdef DEFINED_DPCP + m_p_adapter = set_dpcp_adapter(); + if (NULL == m_p_adapter) +#endif /* DEFINED_DPCP */ + { + m_p_ibv_context = ibv_open_device(m_p_ibv_device); + if (m_p_ibv_context == NULL) { + ibch_logpanic("m_p_ibv_context is invalid"); + } + + // Create pd for this device + m_p_ibv_pd = ibv_alloc_pd(m_p_ibv_context); + if (m_p_ibv_pd == NULL) { + ibch_logpanic("ibv device %p pd allocation failure (ibv context %p) (errno=%d %m)", + m_p_ibv_device, m_p_ibv_context, errno); + } + } + VALGRIND_MAKE_MEM_DEFINED(m_p_ibv_pd, sizeof(struct ibv_pd)); + + m_p_ibv_device_attr = new vma_ibv_device_attr_ex(); + if (m_p_ibv_device_attr == NULL) { + ibch_logpanic("ibv device %p attr allocation failure (ibv context %p) (errno=%d %m)", + m_p_ibv_device, m_p_ibv_context, errno); + } + vma_ibv_device_attr_comp_mask(m_p_ibv_device_attr); + IF_VERBS_FAILURE(vma_ibv_query_device(m_p_ibv_context, m_p_ibv_device_attr)) { + 
ibch_logerr("ibv_query_device failed on ibv device %p (ibv context %p) (errno=%d %m)", + m_p_ibv_device, m_p_ibv_context, errno); + goto err; + } ENDIF_VERBS_FAILURE; + + // update device memory capabilities + m_on_device_memory = vma_ibv_dm_size(m_p_ibv_device_attr); + +#ifdef DEFINED_IBV_PACKET_PACING_CAPS + if (vma_is_pacing_caps_supported(m_p_ibv_device_attr)) { + m_pacing_caps.rate_limit_min = m_p_ibv_device_attr->packet_pacing_caps.qp_rate_limit_min; + m_pacing_caps.rate_limit_max = m_p_ibv_device_attr->packet_pacing_caps.qp_rate_limit_max; + } +#endif // DEFINED_IBV_PACKET_PACING_CAPS + + g_p_event_handler_manager->register_ibverbs_event(m_p_ibv_context->async_fd, + this, m_p_ibv_context, 0); + + return; + +err: + if (m_p_ibv_device_attr) { + delete m_p_ibv_device_attr; + } + + if (m_p_ibv_pd) { + ibv_dealloc_pd(m_p_ibv_pd); + } + +#ifdef DEFINED_DPCP + if (m_p_adapter) { + delete m_p_adapter; + m_p_ibv_context = NULL; + } +#endif /* DEFINED_DPCP */ + if (m_p_ibv_context) { + ibv_close_device(m_p_ibv_context); + m_p_ibv_context = NULL; + } +} + +ib_ctx_handler::~ib_ctx_handler() +{ + if (!m_removed) { + g_p_event_handler_manager->unregister_ibverbs_event(m_p_ibv_context->async_fd, this); + } + + // must delete ib_ctx_handler only after freeing all resources that + // are still associated with the PD m_p_ibv_pd + BULLSEYE_EXCLUDE_BLOCK_START + + mr_map_lkey_t::iterator iter; + while ((iter = m_mr_map_lkey.begin()) != m_mr_map_lkey.end()) { + mem_dereg(iter->first); + } + if (m_umr_qp) { + IF_VERBS_FAILURE_EX(ibv_destroy_qp(m_umr_qp), EIO) { + ibch_logdbg("destroy qp failed (errno=%d %m)", errno); + } ENDIF_VERBS_FAILURE; + m_umr_qp = NULL; + } + if (m_umr_cq) { + IF_VERBS_FAILURE_EX(ibv_destroy_cq(m_umr_cq), EIO) { + ibch_logdbg("destroy cq failed (errno=%d %m)", errno); + } ENDIF_VERBS_FAILURE; + m_umr_cq = NULL; + } + if (m_p_ibv_pd) { + IF_VERBS_FAILURE_EX(ibv_dealloc_pd(m_p_ibv_pd), EIO) { + ibch_logdbg("pd deallocation failure (errno=%d %m)", errno); + } 
ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(m_p_ibv_pd, sizeof(struct ibv_pd)); + m_p_ibv_pd = NULL; + } + + if (m_p_ctx_time_converter) { + m_p_ctx_time_converter->clean_obj(); + } + delete m_p_ibv_device_attr; + +#ifdef DEFINED_DPCP + if (m_p_adapter) { + delete m_p_adapter; + m_p_ibv_context = NULL; + } +#endif /* DEFINED_DPCP */ + if (m_p_ibv_context) { + ibv_close_device(m_p_ibv_context); + m_p_ibv_context = NULL; + } + + BULLSEYE_EXCLUDE_BLOCK_END +} + +void ib_ctx_handler::set_str() +{ + char str_x[512] = {0}; + + m_str[0] = '\0'; + + str_x[0] = '\0'; + sprintf(str_x, " %s:", get_ibname()); + strcat(m_str, str_x); + + str_x[0] = '\0'; + sprintf(str_x, " port(s): %d", get_ibv_device_attr()->phys_port_cnt); + strcat(m_str, str_x); + + str_x[0] = '\0'; + sprintf(str_x, " vendor: %d", get_ibv_device_attr()->vendor_part_id); + strcat(m_str, str_x); + + str_x[0] = '\0'; + sprintf(str_x, " fw: %s", get_ibv_device_attr()->fw_ver); + strcat(m_str, str_x); + + str_x[0] = '\0'; + sprintf(str_x, " max_qp_wr: %d", get_ibv_device_attr()->max_qp_wr); + strcat(m_str, str_x); + + str_x[0] = '\0'; + sprintf(str_x, " on_device_memory: %zu", m_on_device_memory); + strcat(m_str, str_x); + + str_x[0] = '\0'; + sprintf(str_x, " packet_pacing_caps: min rate %u, max rate %u", m_pacing_caps.rate_limit_min, m_pacing_caps.rate_limit_max); + strcat(m_str, str_x); +} + +void ib_ctx_handler::print_val() +{ + set_str(); + ibch_logdbg("%s", m_str); +} + +#ifdef DEFINED_DPCP +dpcp::adapter* ib_ctx_handler::set_dpcp_adapter() +{ + dpcp::status status = dpcp::DPCP_ERR_NO_SUPPORT; + dpcp::provider *p_provider = NULL; + dpcp::adapter_info* dpcp_lst = NULL; + size_t adapters_num = 0; + size_t i = 0; + + m_p_adapter = NULL; + if (!m_p_ibv_device) { + return NULL; + } + + status = dpcp::provider::get_instance(p_provider); + if (dpcp::DPCP_OK != status) { + ibch_logerr("failed getting provider status = %d", status); + goto err; + } + + status = p_provider->get_adapter_info_lst(NULL, 
adapters_num); + if (0 == adapters_num) { + ibch_logdbg("found no adapters status = %d", status); + goto err; + } + + dpcp_lst = new (std::nothrow)dpcp::adapter_info[adapters_num]; + if (!dpcp_lst) { + ibch_logerr("failed allocating memory for devices"); + goto err; + } + + status = p_provider->get_adapter_info_lst(dpcp_lst, adapters_num); + if (dpcp::DPCP_OK != status) { + ibch_logerr("failed getting adapter list"); + goto err; + } + + for (i = 0; i < adapters_num; i++) { + if (dpcp_lst[i].name == m_p_ibv_device->name) { + dpcp::adapter *adapter = NULL; + + status = p_provider->open_adapter(dpcp_lst[i].name, adapter); + if ((dpcp::DPCP_OK == status) && (adapter)) { + int ret = 0; + struct ibv_context *ctx = NULL; + struct ibv_pd *pd = NULL; + mlx5dv_obj mlx5_obj; + + ctx = (ibv_context*)adapter->get_ibv_context(); + if (!ctx) { + ibch_logerr("failed getting context for adapter %p (errno=%d %m) ", + adapter, errno); + delete adapter; + goto err; + } + + pd = ibv_alloc_pd(ctx); + if (!pd) { + ibch_logerr("failed pd allocation for %p context (errno=%d %m) ", + ctx, errno); + delete adapter; + goto err; + } + + mlx5_obj.pd.in = pd; + mlx5dv_pd out_pd; + mlx5_obj.pd.out = &out_pd; + + ret = vma_ib_mlx5dv_init_obj(&mlx5_obj, MLX5DV_OBJ_PD); + if (ret) { + ibch_logerr("failed getting mlx5_pd for %p (errno=%d %m) ", + m_p_ibv_pd, errno); + ibv_dealloc_pd(pd); + delete adapter; + goto err; + } + + adapter->set_pd(out_pd.pdn); + status = adapter->open(); + if (dpcp::DPCP_OK != status) { + ibch_logerr("failed opening dpcp adapter %s got %d", + adapter->get_name().c_str(), status); + ibv_dealloc_pd(pd); + delete adapter; + goto err; + } + + m_p_adapter = adapter; + m_p_ibv_context = ctx; + m_p_ibv_pd = pd; + ibch_logdbg("dpcp adapter: %s is up", adapter->get_name().c_str()); + } + + break; + } + } + +err: + if (dpcp_lst) { + delete[] dpcp_lst; + } + + return m_p_adapter; +} +#endif /* DEFINED_DPCP */ + +void ib_ctx_handler::set_ctx_time_converter_status(ts_conversion_mode_t 
conversion_mode) +{ +#ifdef DEFINED_IBV_CQ_TIMESTAMP + switch (conversion_mode) { + case TS_CONVERSION_MODE_DISABLE: + m_p_ctx_time_converter = new time_converter_ib_ctx(m_p_ibv_context, TS_CONVERSION_MODE_DISABLE, 0); + break; + case TS_CONVERSION_MODE_PTP: { +#ifdef DEFINED_IBV_CLOCK_INFO + if (is_mlx4()) { + m_p_ctx_time_converter = new time_converter_ib_ctx(m_p_ibv_context, TS_CONVERSION_MODE_SYNC, m_p_ibv_device_attr->hca_core_clock); + ibch_logwarn("ptp is not supported for mlx4 devices, reverting to mode TS_CONVERSION_MODE_SYNC (ibv context %p)", + m_p_ibv_context); + } else { + vma_ibv_clock_info clock_info; + memset(&clock_info, 0, sizeof(clock_info)); + int ret = vma_ibv_query_clock_info(m_p_ibv_context, &clock_info); + if (ret == 0) { + m_p_ctx_time_converter = new time_converter_ptp(m_p_ibv_context); + } else { + m_p_ctx_time_converter = new time_converter_ib_ctx(m_p_ibv_context, TS_CONVERSION_MODE_SYNC, m_p_ibv_device_attr->hca_core_clock); + ibch_logwarn("vma_ibv_query_clock_info failure for clock_info, reverting to mode TS_CONVERSION_MODE_SYNC (ibv context %p) (ret %d)", + m_p_ibv_context, ret); + } + } +# else + m_p_ctx_time_converter = new time_converter_ib_ctx(m_p_ibv_context, TS_CONVERSION_MODE_SYNC, m_p_ibv_device_attr->hca_core_clock); + ibch_logwarn("PTP is not supported by the underlying Infiniband verbs. DEFINED_IBV_CLOCK_INFO not defined. " + "reverting to mode TS_CONVERSION_MODE_SYNC"); +# endif // DEFINED_IBV_CLOCK_INFO + } + break; + default: + m_p_ctx_time_converter = new time_converter_ib_ctx(m_p_ibv_context, + conversion_mode, + m_p_ibv_device_attr->hca_core_clock); + break; + } +#else + m_p_ctx_time_converter = new time_converter_ib_ctx(m_p_ibv_context, TS_CONVERSION_MODE_DISABLE, 0); + if (conversion_mode != TS_CONVERSION_MODE_DISABLE) { + ibch_logwarn("time converter mode not applicable (configuration " + "value=%d). 
set to TS_CONVERSION_MODE_DISABLE.", + conversion_mode); + } +#endif // DEFINED_IBV_CQ_TIMESTAMP +} + +ts_conversion_mode_t ib_ctx_handler::get_ctx_time_converter_status() +{ + return m_p_ctx_time_converter ? m_p_ctx_time_converter->get_converter_status(): TS_CONVERSION_MODE_DISABLE; +} + +uint32_t ib_ctx_handler::mem_reg(void *addr, size_t length, uint64_t access) +{ + struct ibv_mr *mr = NULL; + uint32_t lkey = (uint32_t)(-1); + +#ifdef DEFINED_IBV_EXP_ACCESS_ALLOCATE_MR + struct ibv_exp_reg_mr_in in; + memset(&in, 0 ,sizeof(in)); + in.exp_access = access; + in.addr = addr; + in.length = length; + in.pd = m_p_ibv_pd; + mr = ibv_exp_reg_mr(&in); +#else + mr = ibv_reg_mr(m_p_ibv_pd, addr, length, access); +#endif + VALGRIND_MAKE_MEM_DEFINED(mr, sizeof(ibv_mr)); + if (NULL == mr) { + ibch_logerr("failed registering a memory region " + "(errno=%d %m)", errno); + } else { + m_mr_map_lkey[mr->lkey] = mr; + lkey = mr->lkey; + + ibch_logdbg("dev:%s (%p) addr=%p length=%d pd=%p", + get_ibname(), m_p_ibv_device, addr, length, m_p_ibv_pd); + } + + return lkey; +} + +void ib_ctx_handler::mem_dereg(uint32_t lkey) +{ + mr_map_lkey_t::iterator iter = m_mr_map_lkey.find(lkey); + if (iter != m_mr_map_lkey.end()) { + struct ibv_mr* mr = iter->second; + ibch_logdbg("dev:%s (%p) addr=%p length=%d pd=%p", + get_ibname(), m_p_ibv_device, mr->addr, mr->length, m_p_ibv_pd); + IF_VERBS_FAILURE_EX(ibv_dereg_mr(mr), EIO) { + ibch_logdbg("failed de-registering a memory region " + "(errno=%d %m)", errno); + } ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(mr, sizeof(ibv_mr)); + m_mr_map_lkey.erase(iter); + } +} + +struct ibv_mr* ib_ctx_handler::get_mem_reg(uint32_t lkey) +{ + mr_map_lkey_t::iterator iter = m_mr_map_lkey.find(lkey); + if (iter != m_mr_map_lkey.end()) { + return iter->second; + } + + return NULL; +} + +void ib_ctx_handler::set_flow_tag_capability(bool flow_tag_capability) +{ + m_flow_tag_enabled = flow_tag_capability; +} + +void ib_ctx_handler::set_burst_capability(bool 
burst) +{ + m_pacing_caps.burst = burst; +} + +bool ib_ctx_handler::is_packet_pacing_supported(uint32_t rate /* =1 */) +{ + if (rate) { + return m_pacing_caps.rate_limit_min <= rate && rate <= m_pacing_caps.rate_limit_max; + } else { + return true; + } +} + +bool ib_ctx_handler::is_active(int port_num) +{ + ibv_port_attr port_attr; + + memset(&port_attr, 0, sizeof(ibv_port_attr)); + IF_VERBS_FAILURE(ibv_query_port(m_p_ibv_context, port_num, &port_attr)) { + ibch_logdbg("ibv_query_port failed on ibv device %p, port %d " + "(errno=%d)", m_p_ibv_context, port_num, errno); + }ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_DEFINED(&port_attr.state, sizeof(port_attr.state)); + return port_attr.state == IBV_PORT_ACTIVE; +} + +void ib_ctx_handler::handle_event_ibverbs_cb(void *ev_data, void *ctx) +{ + NOT_IN_USE(ctx); + + struct ibv_async_event *ibv_event = (struct ibv_async_event*)ev_data; + ibch_logdbg("received ibv_event '%s' (%d)", + priv_ibv_event_desc_str(ibv_event->event_type), + ibv_event->event_type); + + if (ibv_event->event_type == IBV_EVENT_DEVICE_FATAL) { + handle_event_device_fatal(); + } +} + +void ib_ctx_handler::handle_event_device_fatal() +{ + m_removed = true; + + /* After getting IBV_EVENT_DEVICE_FATAL event rdma library returns + * an EIO from destroy commands when the kernel resources were already released. + * This comes to prevent memory leakage in the + * user space area upon device disassociation. Applications cannot + * call ibv_get_cq_event or ibv_get_async_event concurrently with any call to an + * object destruction function. 
+ */ + g_p_event_handler_manager->unregister_ibverbs_event(m_p_ibv_context->async_fd, this); +} + +bool ib_ctx_handler::post_umr_wr(struct ibv_exp_send_wr &wr) +{ +#ifdef HAVE_MP_RQ + auto_unlocker lock(m_lock_umr); + ibv_exp_send_wr *bad_wr = NULL; + ibv_exp_wc wc; + + if (!m_umr_qp && !create_umr_qp()) { + ibch_logwarn("failed creating umr_qp"); + return false; + } + int res = ibv_exp_post_send(m_umr_qp, &wr, &bad_wr); + + if (res) { + if (bad_wr) { + ibch_logdbg("bad_wr info: wr_id=%#x, send_flags=%#x, " + "addr=%#x, length=%d, lkey=%#x", + bad_wr->wr_id, + bad_wr->exp_send_flags, + bad_wr->sg_list[0].addr, + bad_wr->sg_list[0].length, + bad_wr->sg_list[0].lkey); + } + return false; + } + int ret; + do { + ret = ibv_exp_poll_cq(m_umr_cq, 1, &wc, sizeof(wc)); + if (ret < 0) { + ibch_logdbg("poll CQ failed after %d errno:%d\n", ret, errno); + return false; + } + } while (!ret); + + if (wc.status != IBV_WC_SUCCESS) { + ibch_logdbg("post_umr_wr comp status %d\n", wc.status); + return false; + } + return true; +#else + NOT_IN_USE(wr); + return false; +#endif +} + +bool ib_ctx_handler::create_umr_qp() +{ +#ifdef HAVE_MP_RQ + ibch_logdbg(""); + int ret = 0; + uint8_t *gid_raw; + const int port_num = 1; + //create TX_QP & CQ for UMR + vma_ibv_cq_init_attr cq_attr; + memset(&cq_attr, 0, sizeof(cq_attr)); + + m_umr_cq = vma_ibv_create_cq(m_p_ibv_context, 16, NULL, NULL, 0, &cq_attr); + if (m_umr_cq == NULL) { + ibch_logdbg("failed creating UMR CQ (errno=%d %m)", errno); + return false; + } + // Create QP + vma_ibv_qp_init_attr qp_init_attr; + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + + qp_init_attr.qp_type = IBV_QPT_RC; + qp_init_attr.recv_cq = m_umr_cq; + qp_init_attr.send_cq = m_umr_cq; + qp_init_attr.cap.max_send_wr = 16; + qp_init_attr.cap.max_recv_wr = 16; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + vma_ibv_qp_init_attr_comp_mask(m_p_ibv_pd, qp_init_attr); + qp_init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | + 
IBV_EXP_QP_INIT_ATTR_MAX_INL_KLMS; + qp_init_attr.exp_create_flags |= IBV_EXP_QP_CREATE_UMR; + // max UMR needed is 4, in STRIP with HEADER mode. net, hdr, payload, padding + qp_init_attr.max_inl_send_klms = 4; + m_umr_qp = vma_ibv_create_qp(m_p_ibv_pd, &qp_init_attr); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_umr_qp) { + ibch_logdbg("vma_ibv_create_qp failed (errno=%d %m)", errno); + goto err_destroy_cq; + } + BULLSEYE_EXCLUDE_BLOCK_END + // Modify QP to INIT state + struct ibv_qp_attr qp_attr; + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IBV_QPS_INIT; + qp_attr.port_num = port_num; + ret = ibv_modify_qp(m_umr_qp, &qp_attr, + IBV_QP_STATE | IBV_QP_PKEY_INDEX | IBV_QP_PORT | IBV_QP_ACCESS_FLAGS); + if (ret) { + ibch_logdbg("Failed to modify UMR QP to INIT: (errno=%d %m)", errno); + goto err_destroy_qp; + } + // Modify to RTR + qp_attr.qp_state = IBV_QPS_RTR; + qp_attr.dest_qp_num = m_umr_qp->qp_num; + memset(&qp_attr.ah_attr, 0, sizeof(qp_attr.ah_attr)); + qp_attr.ah_attr.port_num = port_num; + qp_attr.ah_attr.is_global = 1; + if (ibv_query_gid(m_p_ibv_context, port_num, + 0, &qp_attr.ah_attr.grh.dgid)) { + ibch_logdbg("Failed getting port gid: (errno=%d %m)", errno); + goto err_destroy_qp; + } + gid_raw = qp_attr.ah_attr.grh.dgid.raw; + if ((*(uint64_t *)gid_raw == 0) && (*(uint64_t *)(gid_raw + 8) == 0)) { + ibch_logdbg("Port gid is zero: (errno=%d %m)", errno); + goto err_destroy_qp; + } + qp_attr.path_mtu = IBV_MTU_512; + qp_attr.min_rnr_timer = 7; + qp_attr.max_dest_rd_atomic = 1; + ret = ibv_modify_qp(m_umr_qp, &qp_attr, + IBV_QP_STATE | IBV_QP_AV | IBV_QP_PATH_MTU | IBV_QP_DEST_QPN | + IBV_QP_RQ_PSN | IBV_QP_MAX_DEST_RD_ATOMIC | IBV_QP_MIN_RNR_TIMER); + if (ret) { + ibch_logdbg("Failed to modify UMR QP to RTR:(errno=%d %m)", errno); + goto err_destroy_qp; + } + + /* Modify to RTS */ + qp_attr.qp_state = IBV_QPS_RTS; + qp_attr.sq_psn = 0; + qp_attr.timeout = 7; + qp_attr.rnr_retry = 7; + qp_attr.retry_cnt = 7; + qp_attr.max_rd_atomic = 1; + 
ret = ibv_modify_qp(m_umr_qp, &qp_attr, + IBV_QP_STATE | IBV_QP_TIMEOUT | IBV_QP_RETRY_CNT | + IBV_QP_RETRY_CNT | IBV_QP_RNR_RETRY | IBV_QP_SQ_PSN | + IBV_QP_MAX_QP_RD_ATOMIC); + if (ret) { + ibch_logdbg("Failed to modify UMR QP to RTS:(errno=%d %m)", errno); + goto err_destroy_qp; + } + + return true; +err_destroy_qp: + IF_VERBS_FAILURE(ibv_destroy_qp(m_umr_qp)) { + ibch_logdbg("destroy qp failed (errno=%d %m)", errno); + } ENDIF_VERBS_FAILURE; + m_umr_qp = NULL; +err_destroy_cq: + IF_VERBS_FAILURE(ibv_destroy_cq(m_umr_cq)) { + ibch_logdbg("destroy cq failed (errno=%d %m)", errno); + } ENDIF_VERBS_FAILURE; + m_umr_cq = NULL; + return false; +#else + return false; +#endif +} diff --git a/src/vma/dev/ib_ctx_handler.h b/src/vma/dev/ib_ctx_handler.h new file mode 100644 index 0000000..dfb58e1 --- /dev/null +++ b/src/vma/dev/ib_ctx_handler.h @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef IB_CTX_HANDLER_H +#define IB_CTX_HANDLER_H + +#include +#include + +#include "vma/event/event_handler_ibverbs.h" +#include "vma/dev/time_converter.h" +#include "vma/ib/base/verbs_extra.h" +#include "utils/lock_wrapper.h" + +#ifdef DEFINED_DPCP +#include +#endif /* DEFINED_DPCP */ + + +typedef std::tr1::unordered_map mr_map_lkey_t; + +struct pacing_caps_t { + uint32_t rate_limit_min; + uint32_t rate_limit_max; + bool burst; + + pacing_caps_t() : rate_limit_min(0), rate_limit_max(0), burst(false) {}; +}; + +// client to event manager 'command' invoker (??) +// +class ib_ctx_handler : public event_handler_ibverbs +{ +public: + struct ib_ctx_handler_desc { + struct ibv_device *device; + }; +public: + ib_ctx_handler(struct ib_ctx_handler_desc *desc); + virtual ~ib_ctx_handler(); + + /* + * on init or constructor: + * register to event manager with m_channel and this. + * */ + ibv_pd* get_ibv_pd() { return m_p_ibv_pd; } + bool post_umr_wr(struct ibv_exp_send_wr &wr); + ibv_device* get_ibv_device() { return m_p_ibv_device; } + inline char* get_ibname() { return (m_p_ibv_device ? 
m_p_ibv_device->name : (char *)""); } + struct ibv_context* get_ibv_context() { return m_p_ibv_context; } +#ifdef DEFINED_DPCP + dpcp::adapter* set_dpcp_adapter(); + dpcp::adapter* get_dpcp_adapter() { return m_p_adapter; } +#endif /* DEFINED_DPCP */ + vma_ibv_device_attr* get_ibv_device_attr() { return vma_get_device_orig_attr(m_p_ibv_device_attr); } +#ifdef DEFINED_TSO + vma_ibv_device_attr_ex* get_ibv_device_attr_ex() { return m_p_ibv_device_attr; } +#endif /* DEFINED_TSO */ + uint32_t mem_reg(void *addr, size_t length, uint64_t access); + void mem_dereg(uint32_t lkey); + struct ibv_mr* get_mem_reg(uint32_t lkey); + bool is_removed() { return m_removed;} + void set_ctx_time_converter_status(ts_conversion_mode_t conversion_mode); + ts_conversion_mode_t get_ctx_time_converter_status(); + void set_flow_tag_capability(bool flow_tag_capability); + bool get_flow_tag_capability() { return m_flow_tag_enabled; } // m_flow_tag_capability + void set_burst_capability(bool burst); + bool get_burst_capability() { return m_pacing_caps.burst; } + bool is_packet_pacing_supported(uint32_t rate = 1); + size_t get_on_device_memory_size() { return m_on_device_memory; } + bool is_active(int port_num); + bool is_mlx4(){ return is_mlx4(get_ibname()); } + static bool is_mlx4(const char *dev) { return strncmp(dev, "mlx4", 4) == 0; } + virtual void handle_event_ibverbs_cb(void *ev_data, void *ctx); + + void set_str(); + void print_val(); + + inline void convert_hw_time_to_system_time(uint64_t hwtime, struct timespec* systime) + { + m_p_ctx_time_converter->convert_hw_time_to_system_time(hwtime, systime); + } +private: + bool create_umr_qp(); + void handle_event_device_fatal(); + ibv_device* m_p_ibv_device; // HCA handle + struct ibv_context* m_p_ibv_context; +#ifdef DEFINED_DPCP + dpcp::adapter *m_p_adapter; +#endif /* DEFINED_DPCP */ + vma_ibv_device_attr_ex* m_p_ibv_device_attr; + ibv_pd* m_p_ibv_pd; + bool m_flow_tag_enabled; + pacing_caps_t m_pacing_caps; + size_t m_on_device_memory; + 
bool m_removed; + lock_spin m_lock_umr; + struct ibv_cq* m_umr_cq; + struct ibv_qp* m_umr_qp; + time_converter* m_p_ctx_time_converter; + mr_map_lkey_t m_mr_map_lkey; + + char m_str[255]; +}; + +#endif diff --git a/src/vma/dev/ib_ctx_handler_collection.cpp b/src/vma/dev/ib_ctx_handler_collection.cpp new file mode 100644 index 0000000..2b7de39 --- /dev/null +++ b/src/vma/dev/ib_ctx_handler_collection.cpp @@ -0,0 +1,239 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "ib_ctx_handler_collection.h" + +#include "vma/ib/base/verbs_extra.h" +#include "vma/util/utils.h" +#include "vma/event/event_handler_manager.h" + +#define MODULE_NAME "ib_ctx_collection" + + +#define ibchc_logpanic __log_panic +#define ibchc_logerr __log_err +#define ibchc_logwarn __log_warn +#define ibchc_loginfo __log_info +#define ibchc_logdbg __log_info_dbg +#define ibchc_logfunc __log_info_func +#define ibchc_logfuncall __log_info_funcall + +ib_ctx_handler_collection* g_p_ib_ctx_handler_collection = NULL; + +void check_flow_steering_log_num_mgm_entry_size() +{ + static bool checked_mlx4_steering = false; + if (checked_mlx4_steering) { + return; + } + + checked_mlx4_steering = true; + char flow_steering_val[4] = {0}; + if (priv_safe_try_read_file((const char*)FLOW_STEERING_MGM_ENTRY_SIZE_PARAM_FILE, flow_steering_val, sizeof(flow_steering_val)) == -1) { + vlog_printf(VLOG_DEBUG, "Flow steering option for mlx4 driver does not exist in current OFED version\n"); + } else if (flow_steering_val[0] != '-' || (strtol(&flow_steering_val[1], NULL, 0) % 2) == 0) { + char module_info[3] = {0}; + if (!run_and_retreive_system_command("modinfo mlx4_core > /dev/null 2>&1 ; echo $?", + module_info, sizeof(module_info)) && + (strlen(module_info) > 0)) { + if (module_info[0] == '0') { + vlog_printf(VLOG_WARNING, "***************************************************************************************\n"); + vlog_printf(VLOG_WARNING, "* VMA will not operate properly while flow steering option is disabled *\n"); + vlog_printf(VLOG_WARNING, "* In order to enable flow steering please restart your VMA applications after running *\n"); + vlog_printf(VLOG_WARNING, "* the following: *\n"); + vlog_printf(VLOG_WARNING, "* For your information the following steps will restart your network interface *\n"); + vlog_printf(VLOG_WARNING, "* 1. 
\"echo options mlx4_core log_num_mgm_entry_size=-1 > /etc/modprobe.d/mlnx.conf\" *\n"); + vlog_printf(VLOG_WARNING, "* 2. Restart openibd or rdma service depending on your system configuration *\n"); + vlog_printf(VLOG_WARNING, "* Read more about the Flow Steering support in the VMA's User Manual *\n"); + vlog_printf(VLOG_WARNING, "***************************************************************************************\n"); + } else { + vlog_printf(VLOG_DEBUG, "***************************************************************************************\n"); + vlog_printf(VLOG_DEBUG, "* VMA will not operate properly while flow steering option is disabled *\n"); + vlog_printf(VLOG_DEBUG, "* Read more about the Flow Steering support in the VMA's User Manual *\n"); + vlog_printf(VLOG_DEBUG, "***************************************************************************************\n"); + } + } + } +} + +ib_ctx_handler_collection::ib_ctx_handler_collection() +{ + ibchc_logdbg(""); + + /* Read ib table from kernel and save it in local variable. 
*/ + update_tbl(); + + //Print table + print_val_tbl(); + + ibchc_logdbg("Done"); +} + +ib_ctx_handler_collection::~ib_ctx_handler_collection() +{ + ibchc_logdbg(""); + + ib_context_map_t::iterator ib_ctx_iter; + while ((ib_ctx_iter = m_ib_ctx_map.begin()) != m_ib_ctx_map.end()) { + ib_ctx_handler* p_ib_ctx_handler = ib_ctx_iter->second; + delete p_ib_ctx_handler; + m_ib_ctx_map.erase(ib_ctx_iter); + } + + ibchc_logdbg("Done"); +} + +void ib_ctx_handler_collection::update_tbl(const char *ifa_name) +{ + struct ibv_device **dev_list = NULL; + ib_ctx_handler * p_ib_ctx_handler = NULL; + int num_devices = 0; + int i; + + ibchc_logdbg("Checking for offload capable IB devices..."); + + dev_list = vma_ibv_get_device_list(&num_devices); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!dev_list) { + ibchc_logerr("Failure in vma_ibv_get_device_list() (error=%d %m)", errno); + ibchc_logerr("Please check rdma configuration"); + throw_vma_exception("No IB capable devices found!"); + } + if (!num_devices) { + vlog_levels_t _level = ifa_name ? VLOG_DEBUG : VLOG_ERROR; // Print an error only during initialization. + vlog_printf(_level, "VMA does not detect IB capable devices\n"); + vlog_printf(_level, "No performance gain is expected in current configuration\n"); + } + + BULLSEYE_EXCLUDE_BLOCK_END + + for (i = 0; i < num_devices; i++) { + struct ib_ctx_handler::ib_ctx_handler_desc desc = {dev_list[i]}; + + /* 2. Skip existing devices (compare by name) */ + if (ifa_name && !check_device_name_ib_name(ifa_name, dev_list[i]->name)) { + continue; + } + + if (ib_ctx_handler::is_mlx4(dev_list[i]->name)) { + // Note: mlx4 does not support this capability. + if(safe_mce_sys().enable_socketxtreme) { + ibchc_logdbg("Blocking offload: mlx4 interfaces in socketxtreme mode"); + continue; + } + + // Check if mlx4 steering creation is supported. + check_flow_steering_log_num_mgm_entry_size(); + } + + /* 3. 
Add new ib devices */ + p_ib_ctx_handler = new ib_ctx_handler(&desc); + if (!p_ib_ctx_handler) { + ibchc_logerr("failed allocating new ib_ctx_handler (errno=%d %m)", errno); + continue; + } + m_ib_ctx_map[p_ib_ctx_handler->get_ibv_device()] = p_ib_ctx_handler; + } + + ibchc_logdbg("Check completed. Found %d offload capable IB devices", m_ib_ctx_map.size()); + + if (dev_list) { + ibv_free_device_list(dev_list); + } +} + +void ib_ctx_handler_collection::print_val_tbl() +{ + ib_context_map_t::iterator itr; + for (itr = m_ib_ctx_map.begin(); itr != m_ib_ctx_map.end(); itr++) { + ib_ctx_handler* p_ib_ctx_handler = itr->second; + p_ib_ctx_handler->print_val(); + } +} + +ib_ctx_handler* ib_ctx_handler_collection::get_ib_ctx(const char *ifa_name) +{ + char active_slave[IFNAMSIZ] = {0}; + unsigned int slave_flags = 0; + ib_context_map_t::iterator ib_ctx_iter; + + if (check_netvsc_device_exist(ifa_name)) { + if (!get_netvsc_slave(ifa_name, active_slave, slave_flags)) { + return NULL; + } + ifa_name = (const char *)active_slave; + } else if (check_device_exist(ifa_name, BOND_DEVICE_FILE)) { + /* active/backup: return active slave */ + if (!get_bond_active_slave_name(ifa_name, active_slave, sizeof(active_slave))) { + char slaves[IFNAMSIZ * 16] = {0}; + char* slave_name; + char* save_ptr; + + /* active/active: return the first slave */ + if (!get_bond_slaves_name_list(ifa_name, slaves, sizeof(slaves))) { + return NULL; + } + slave_name = strtok_r(slaves, " ", &save_ptr); + if (NULL == slave_name) { + return NULL; + } + save_ptr = strchr(slave_name, '\n'); + if (save_ptr) *save_ptr = '\0'; // Remove the tailing 'new line" char + strncpy(active_slave, slave_name, sizeof(active_slave) - 1); + } + ifa_name = (const char *)active_slave; + } + + for (ib_ctx_iter = m_ib_ctx_map.begin(); ib_ctx_iter != m_ib_ctx_map.end(); ib_ctx_iter++) { + if (check_device_name_ib_name(ifa_name, ib_ctx_iter->second->get_ibname())) { + return ib_ctx_iter->second; + } + } + + return NULL; +} + +void 
ib_ctx_handler_collection::del_ib_ctx(ib_ctx_handler* ib_ctx) +{ + if (ib_ctx) { + ib_context_map_t::iterator ib_ctx_iter = m_ib_ctx_map.find(ib_ctx->get_ibv_device()); + if (ib_ctx_iter != m_ib_ctx_map.end()) { + delete ib_ctx_iter->second; + m_ib_ctx_map.erase(ib_ctx_iter); + } + } +} diff --git a/src/vma/dev/ib_ctx_handler_collection.h b/src/vma/dev/ib_ctx_handler_collection.h new file mode 100644 index 0000000..692dc25 --- /dev/null +++ b/src/vma/dev/ib_ctx_handler_collection.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef IB_CTX_HANDLER_COLLECTION_H +#define IB_CTX_HANDLER_COLLECTION_H + +#include + +#include "vma/ib/base/verbs_extra.h" +#include "ib_ctx_handler.h" + +typedef std::tr1::unordered_map ib_context_map_t; + +class ib_ctx_handler_collection +{ +public: + ib_ctx_handler_collection(); + ~ib_ctx_handler_collection(); + + void update_tbl(const char *ifa_name = NULL); + void print_val_tbl(); + + inline ib_context_map_t* get_ib_cxt_list() { + return (m_ib_ctx_map.size() ? &m_ib_ctx_map : NULL); + } + ib_ctx_handler* get_ib_ctx(const char *ifa_name); + void del_ib_ctx(ib_ctx_handler* ib_ctx); + +private: + ib_context_map_t m_ib_ctx_map; +}; + +extern ib_ctx_handler_collection* g_p_ib_ctx_handler_collection; + +#endif diff --git a/src/vma/dev/net_device_entry.cpp b/src/vma/dev/net_device_entry.cpp new file mode 100644 index 0000000..7670edd --- /dev/null +++ b/src/vma/dev/net_device_entry.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "utils/bullseye.h" +#include "net_device_entry.h" +#include "net_device_table_mgr.h" +#include "vma/event/event_handler_manager.h" + +#define MODULE_NAME "nde" + +#define nde_logdbg __log_info_dbg +#define nde_logerr __log_info_err + +#define SLAVE_CHECK_TIMER_PERIOD_MSEC 1000 +#define SLAVE_CHECK_FAST_TIMER_PERIOD_MSEC 10 +#define SLAVE_CHECK_FAST_NUM_TIMES 10 + +net_device_entry::net_device_entry(in_addr_t local_ip, net_device_val* ndv) : cache_entry_subject(ip_address(local_ip)) +{ + nde_logdbg(""); + m_val = ndv; + m_is_valid = false; + m_cma_id_bind_trial_count = 0; + m_timer_handle = NULL; + timer_count = -1; + m_bond = net_device_val::NO_BOND; + + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_val) { + nde_logdbg("ERROR: received m_val = NULL"); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_is_valid = true; + m_bond = ndv->get_is_bond(); + if(m_bond != net_device_val::NO_BOND) { + m_timer_handle = g_p_event_handler_manager->register_timer_event(SLAVE_CHECK_TIMER_PERIOD_MSEC, this, PERIODIC_TIMER, 0); + } + if(ndv->get_is_bond() == net_device_val::LAG_8023ad) { + ndv->register_to_ibverbs_events(this); + } + nde_logdbg("Done"); +} + +net_device_entry::~net_device_entry() +{ + if (m_timer_handle) { + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = NULL; + } + net_device_val* p_ndv = dynamic_cast(m_val); + if(p_ndv && p_ndv->get_is_bond() == net_device_val::LAG_8023ad) { + p_ndv->unregister_to_ibverbs_events(this); + } 
+ nde_logdbg("Done"); +} + +bool net_device_entry::get_val(INOUT net_device_val* &val) +{ + auto_unlocker lock(m_lock); + val = m_val; + return is_valid(); +} + +void net_device_entry::handle_event_ibverbs_cb(void *ev_data, void *ctx) +{ + NOT_IN_USE(ctx); + struct ibv_async_event *ibv_event = (struct ibv_async_event*)ev_data; + nde_logdbg("received ibv_event '%s' (%d)", priv_ibv_event_desc_str(ibv_event->event_type), ibv_event->event_type); + if (ibv_event->event_type == IBV_EVENT_PORT_ERR || ibv_event->event_type == IBV_EVENT_PORT_ACTIVE) { + timer_count = 0; + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = g_p_event_handler_manager->register_timer_event(SLAVE_CHECK_FAST_TIMER_PERIOD_MSEC, this, PERIODIC_TIMER, 0); + } +} + +void net_device_entry::handle_timer_expired(void* user_data) +{ + NOT_IN_USE(user_data); + auto_unlocker lock(m_lock); + net_device_val* p_ndv = dynamic_cast(m_val); + if (p_ndv) { + if(m_bond == net_device_val::ACTIVE_BACKUP) { + if(p_ndv->update_active_backup_slaves()) { + //active slave was changed + notify_observers(); + } + } else if(m_bond == net_device_val::LAG_8023ad){ + if(p_ndv->update_active_slaves()) { + //slave state was changed + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = g_p_event_handler_manager->register_timer_event(SLAVE_CHECK_TIMER_PERIOD_MSEC, this, PERIODIC_TIMER, 0); + notify_observers(); + } else { + if (timer_count >= 0) { + timer_count++; + if (timer_count == SLAVE_CHECK_FAST_NUM_TIMES) { + timer_count = -1; + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = g_p_event_handler_manager->register_timer_event(SLAVE_CHECK_TIMER_PERIOD_MSEC, this, PERIODIC_TIMER, 0); + } + } + } + } + } +} diff --git a/src/vma/dev/net_device_entry.h b/src/vma/dev/net_device_entry.h new file mode 100644 index 0000000..0bda949 --- /dev/null +++ b/src/vma/dev/net_device_entry.h @@ -0,0 +1,69 @@ +/* + * 
Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef NET_DEVICE_ENTRY_H +#define NET_DEVICE_ENTRY_H + +#include "net_device_val.h" +#include "vma/infra/cache_subject_observer.h" +#include "vma/proto/ip_address.h" +#include "vma/event/timer_handler.h" + +#define MAX_CMA_ID_BIND_TRIAL_COUNT 10 +#define CMA_ID_BIND_TIMER_PERIOD_MSEC 100 + +class net_device_entry : public cache_entry_subject , public event_handler_ibverbs, public timer_handler +{ +public: + friend class net_device_table_mgr; + + net_device_entry(in_addr_t local_ip, net_device_val* ndv); + virtual ~net_device_entry(); + + bool get_val(INOUT net_device_val* &val); + bool is_valid() { return m_is_valid; }; // m_val is NULL at first + + virtual void handle_event_ibverbs_cb(void *ev_data, void *ctx); + + void handle_timer_expired(void* user_data); + +private: + + bool m_is_valid; + size_t m_cma_id_bind_trial_count; + void* m_timer_handle; + net_device_val::bond_type m_bond; + int timer_count; +}; + +#endif diff --git a/src/vma/dev/net_device_table_mgr.cpp b/src/vma/dev/net_device_table_mgr.cpp new file mode 100644 index 0000000..42bf056 --- /dev/null +++ b/src/vma/dev/net_device_table_mgr.cpp @@ -0,0 +1,664 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include +#include +#include + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "vma/event/netlink_event.h" +#include "vma/event/event_handler_manager.h" +#include "vma/util/vtypes.h" +#include "vma/util/utils.h" +#include "vma/util/valgrind.h" +#include "vma/sock/sock-redirect.h" +#include "vma/sock/fd_collection.h" +#include "vma/dev/ring.h" +#include "net_device_table_mgr.h" + +#include "ib_ctx_handler_collection.h" + +#define MODULE_NAME "ndtm" + + +#define ndtm_logpanic __log_panic +#define ndtm_logerr __log_err +#define ndtm_logwarn __log_warn +#define ndtm_loginfo __log_info +#define ndtm_logdbg __log_info_dbg +#define ndtm_logfunc __log_info_func +#define ndtm_logfuncall __log_info_funcall + +net_device_table_mgr* g_p_net_device_table_mgr = NULL; + +enum net_device_table_mgr_timers { + RING_PROGRESS_ENGINE_TIMER, + RING_ADAPT_CQ_MODERATION_TIMER +}; + +net_device_table_mgr::net_device_table_mgr() : + cache_table_mgr("net_device_table_mgr"), + m_lock("net_device_table_mgr"), + m_time_conversion_mode(TS_CONVERSION_MODE_DISABLE) +{ + m_num_devices = 0; + m_global_ring_epfd = 0; + m_max_mtu = 0; + + ndtm_logdbg(""); + + m_global_ring_epfd = orig_os_api.epoll_create(48); + + BULLSEYE_EXCLUDE_BLOCK_START + 
if (m_global_ring_epfd == -1) { + ndtm_logerr("epoll_create failed. (errno=%d %m)", errno); + free_ndtm_resources(); + throw_vma_exception("epoll_create failed"); + } + + if (orig_os_api.pipe(m_global_ring_pipe_fds)) { + ndtm_logerr("pipe create failed. (errno=%d %m)", errno); + free_ndtm_resources(); + throw_vma_exception("pipe create failed"); + } + if (orig_os_api.write(m_global_ring_pipe_fds[1], "#", 1) != 1) { + ndtm_logerr("pipe write failed. (errno=%d %m)", errno); + free_ndtm_resources(); + throw_vma_exception("pipe write failed"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + /* Read Link table from kernel and save it in local variable. */ + update_tbl(); + + /* throw exception if there are no supported devices. */ + if (m_net_device_map_index.empty()) { + int num_devices = 0; + struct ibv_device **dev_list = NULL; + dev_list = vma_ibv_get_device_list(&num_devices); + if (dev_list && num_devices == 0) { + ibv_free_device_list(dev_list); + ndtm_logdbg("net_device_map is empty %d", num_devices); + free_ndtm_resources(); + throw_vma_exception("net_device_map is empty"); + } + } + + //Print table + print_val_tbl(); + + // Calculate and update time conversion mode + m_time_conversion_mode = time_converter::update_device_converters_status(m_net_device_map_index); + + // register to netlink event + g_p_netlink_handler->register_event(nlgrpLINK, this); + ndtm_logdbg("Registered to g_p_netlink_handler"); + +#ifndef DEFINED_NO_THREAD_LOCK + if (safe_mce_sys().progress_engine_interval_msec != MCE_CQ_DRAIN_INTERVAL_DISABLED && safe_mce_sys().progress_engine_wce_max != 0) { + ndtm_logdbg("registering timer for ring draining with %d msec intervales", safe_mce_sys().progress_engine_interval_msec); + g_p_event_handler_manager->register_timer_event(safe_mce_sys().progress_engine_interval_msec, this, PERIODIC_TIMER, (void*)RING_PROGRESS_ENGINE_TIMER); + } + + if (safe_mce_sys().cq_aim_interval_msec != MCE_CQ_ADAPTIVE_MODERATION_DISABLED) { + ndtm_logdbg("registering timer for cq 
adaptive moderation with %d msec intervales", safe_mce_sys().cq_aim_interval_msec); + g_p_event_handler_manager->register_timer_event(safe_mce_sys().cq_aim_interval_msec, this, PERIODIC_TIMER, (void*)RING_ADAPT_CQ_MODERATION_TIMER); + } +#endif // DEFINED_NO_THREAD_LOCK + + ndtm_logdbg("Done"); +} + +void net_device_table_mgr::free_ndtm_resources() +{ + m_lock.lock(); + + if (m_global_ring_epfd > 0) { + orig_os_api.close(m_global_ring_epfd); + m_global_ring_epfd = 0; + } + + orig_os_api.close(m_global_ring_pipe_fds[1]); + orig_os_api.close(m_global_ring_pipe_fds[0]); + + net_device_map_index_t::iterator itr; + while ((itr = m_net_device_map_index.begin()) != m_net_device_map_index.end()) { + delete itr->second; + m_net_device_map_index.erase(itr); + } + m_net_device_map_addr.clear(); + + m_lock.unlock(); +} + +net_device_table_mgr::~net_device_table_mgr() +{ + ndtm_logdbg(""); + free_ndtm_resources(); + ndtm_logdbg("Done"); +} + +void net_device_table_mgr::update_tbl() +{ + int rc = 0; + int fd = -1; + struct { + struct nlmsghdr hdr; + struct ifinfomsg infomsg; + } nl_req; + struct nlmsghdr *nl_msg; + int nl_msglen = 0; + char nl_res[8096]; + static int _seq = 0; + net_device_val* p_net_device_val; + + /* Set up the netlink socket */ + fd = orig_os_api.socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (fd < 0) { + ndtm_logerr("netlink socket() creation"); + return; + } + + ndtm_logdbg("Checking for offload capable network interfaces..."); + + /* Prepare RTM_GETLINK request */ + memset(&nl_req, 0, sizeof(nl_req)); + nl_req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifinfomsg)); + nl_req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + nl_req.hdr.nlmsg_type = RTM_GETLINK; + nl_req.hdr.nlmsg_seq = _seq++; + nl_req.hdr.nlmsg_pid = getpid(); + nl_req.infomsg.ifi_family = AF_INET; + + /* Send the netlink request */ + rc = orig_os_api.send(fd, &nl_req, nl_req.hdr.nlmsg_len, 0); + if (rc < 0) { + ndtm_logerr("netlink send() operation"); + goto ret; + } + + m_lock.lock(); + 
+ do { + /* Receive the netlink reply */ + rc = orig_os_api.recv(fd, nl_res, sizeof(nl_res), 0); + if (rc < 0) { + ndtm_logerr("netlink recv() operation"); + goto ret; + } + + nl_msg = (struct nlmsghdr *)nl_res; + nl_msglen = rc; + while (NLMSG_OK(nl_msg, (size_t)nl_msglen) && (nl_msg->nlmsg_type != NLMSG_ERROR)) { + struct ifinfomsg *nl_msgdata; + + nl_msgdata = (struct ifinfomsg *)NLMSG_DATA(nl_msg); + + /* Skip existing interfaces */ + if (m_net_device_map_index.find(nl_msgdata->ifi_index) != m_net_device_map_index.end()) { + goto next; + } + + /* Skip some types */ + if (!(nl_msgdata->ifi_flags & IFF_SLAVE)) { + struct net_device_val::net_device_val_desc desc = {nl_msg}; + /* Add new interfaces */ + switch (nl_msgdata->ifi_type) { + case ARPHRD_ETHER: + p_net_device_val = new net_device_val_eth(&desc); + break; + case ARPHRD_INFINIBAND: + p_net_device_val = new net_device_val_ib(&desc); + break; + default: + goto next; + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_net_device_val) { + ndtm_logerr("failed allocating new net_device (errno=%d %m)", errno); + goto next; + } + if (p_net_device_val->get_state() == net_device_val::INVALID) { + delete p_net_device_val; + goto next; + } + + BULLSEYE_EXCLUDE_BLOCK_END + if ((int)get_max_mtu() < p_net_device_val->get_mtu()) { + set_max_mtu(p_net_device_val->get_mtu()); + } + + const ip_data_vector_t& ip = p_net_device_val->get_ip_array(); + for (size_t i = 0; i < ip.size(); i++) { + m_net_device_map_addr[ip[i]->local_addr] = p_net_device_val; + } + m_net_device_map_index[p_net_device_val->get_if_idx()] = p_net_device_val; + } + +next: + + /* Check if it is the last message */ + if(nl_msg->nlmsg_type == NLMSG_DONE) { + goto ret; + } + nl_msg = NLMSG_NEXT(nl_msg, nl_msglen); + } + } while (1); + +ret: + + m_lock.unlock(); + ndtm_logdbg("Check completed. 
Found %d offload capable network interfaces", m_net_device_map_index.size()); + + orig_os_api.close(fd); +} + +void net_device_table_mgr::print_val_tbl() +{ + net_device_map_index_t::iterator itr; + for (itr = m_net_device_map_index.begin(); itr != m_net_device_map_index.end(); itr++) { + net_device_val* p_ndev = dynamic_cast (itr->second); + p_ndev->print_val(); + } +} + +net_device_val* net_device_table_mgr::get_net_device_val(in_addr_t local_addr) +{ + auto_unlocker lock(m_lock); + + net_device_map_addr_t::iterator iter = m_net_device_map_addr.find(local_addr); + if (iter != m_net_device_map_addr.end()) { + net_device_val* net_dev = iter->second; + ndtm_logdbg("Found %s for addr: %d.%d.%d.%d", net_dev->to_str().c_str(), NIPQUAD(local_addr)); + if (net_dev->get_state() == net_device_val::INVALID) { + ndtm_logdbg("invalid net_device %s", net_dev->to_str().c_str()); + return NULL; + } + return iter->second; + } + ndtm_logdbg("Can't find net_device for addr: %d.%d.%d.%d", NIPQUAD(local_addr)); + return NULL; +} + +net_device_val* net_device_table_mgr::get_net_device_val(int if_index) +{ + net_device_map_index_t::iterator iter; + net_device_val* net_dev = NULL; + + auto_unlocker lock(m_lock); + + /* Find master interface */ + for (iter = m_net_device_map_index.begin(); iter != m_net_device_map_index.end(); iter++) { + net_dev = iter->second; + /* Check if interface is master */ + if (if_index == net_dev->get_if_idx()) { + goto out; + } + /* Check if interface is slave */ + const slave_data_vector_t& slaves = net_dev->get_slave_array(); + for (size_t i = 0; i < slaves.size(); i++) { + if (if_index == slaves[i]->if_index) { + goto out; + } + } + /* Check if interface is new netvsc slave */ + if (net_dev->get_is_bond() == net_device_val::NETVSC) { + char if_name[IFNAMSIZ] = {0}; + char sys_path[256] = {0}; + int ret = 0; + if (if_indextoname(if_index, if_name)) { + ret = snprintf(sys_path, sizeof(sys_path), NETVSC_DEVICE_UPPER_FILE, if_name, net_dev->get_ifname()); + if 
(ret > 0 && (size_t)ret < sizeof(sys_path)) { + ret = errno; /* to suppress errno */ + int fd = open(sys_path, O_RDONLY); + if (fd >= 0) { + close(fd); + goto out; + } + errno = ret; + } + } + } + } + + ndtm_logdbg("Can't find net_device for index: %d", if_index); + return NULL; + +out: + + ndtm_logdbg("Found %s for index: %d", net_dev->to_str().c_str(), if_index); + if (net_dev->get_state() == net_device_val::INVALID) { + ndtm_logdbg("invalid net_device %s", net_dev->to_str().c_str()); + return NULL; + } + return net_dev; +} + +net_device_entry* net_device_table_mgr::create_new_entry(ip_address local_ip, const observer* obs) +{ + ndtm_logdbg(""); + NOT_IN_USE(obs); + + net_device_val *p_ndv = get_net_device_val(local_ip.get_in_addr()); + + if (p_ndv) { + return new net_device_entry(local_ip.get_in_addr(), p_ndv); + } + return NULL; +} + +local_ip_list_t net_device_table_mgr::get_ip_list(int if_index) +{ + net_device_map_index_t::iterator iter; + local_ip_list_t ip_list; + size_t i; + + m_lock.lock(); + + iter = (if_index > 0 ? 
+ m_net_device_map_index.find(if_index) :
+ m_net_device_map_index.begin());
+
+ /* Walk either the single matching entry (if_index > 0) or the whole map. */
+ for (; iter != m_net_device_map_index.end(); iter++) {
+ net_device_val* p_ndev = iter->second;
+ const ip_data_vector_t& ip = p_ndev->get_ip_array();
+ for (i = 0; i < ip.size(); i++) {
+ ip_list.push_back(*ip[i]);
+ }
+ if (if_index > 0) {
+ break; /* only the requested interface was asked for */
+ }
+ }
+
+ m_lock.unlock();
+
+ return ip_list;
+}
+
+/*
+ * Poll once on the rings of ALL managed net devices and process any ready
+ * completion elements (see description in net_device_table_mgr.h).
+ * Returns the accumulated count of processed elements (>= 0), or the first
+ * negative per-device error, which aborts the iteration.
+ * NOTE(review): m_net_device_map_index is traversed here without taking
+ * m_lock (unlike get_ip_list() above) - presumably the map is stable on
+ * this fast path; confirm.
+ */
+int net_device_table_mgr::global_ring_poll_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array/*= NULL*/)
+{
+ ndtm_logfunc("");
+ int ret_total = 0;
+
+ net_device_map_index_t::iterator net_dev_iter;
+ for (net_dev_iter=m_net_device_map_index.begin(); net_dev_iter!=m_net_device_map_index.end(); net_dev_iter++) {
+ int ret = net_dev_iter->second->global_ring_poll_and_process_element(p_poll_sn, pv_fd_ready_array);
+ if (ret < 0) {
+ ndtm_logdbg("Error in net_device_val[%p]->poll_and_process_element() (errno=%d %m)", net_dev_iter->second, errno);
+ return ret;
+ }
+ ret_total += ret;
+ }
+ if (ret_total) {
+ ndtm_logfunc("ret_total=%d", ret_total);
+ } else {
+ ndtm_logfuncall("ret_total=%d", ret_total);
+ }
+ return ret_total;
+}
+
+/*
+ * Arm the CQ notification channel of every managed device's rings.
+ * Returns the summed per-device results (>= 0), or the first negative
+ * error code, which aborts the iteration.
+ */
+int net_device_table_mgr::global_ring_request_notification(uint64_t poll_sn)
+{
+ ndtm_logfunc("");
+ int ret_total = 0;
+ net_device_map_index_t::iterator net_dev_iter;
+ for (net_dev_iter = m_net_device_map_index.begin(); m_net_device_map_index.end() != net_dev_iter; net_dev_iter++) {
+ int ret = net_dev_iter->second->global_ring_request_notification(poll_sn);
+ BULLSEYE_EXCLUDE_BLOCK_START
+ if (ret < 0) {
+ ndtm_logerr("Error in net_device_val[%p]->request_notification() (errno=%d %m)", net_dev_iter->second, errno);
+ return ret;
+ }
+ BULLSEYE_EXCLUDE_BLOCK_END
+ ret_total += ret;
+ }
+ return ret_total;
+
+}
+
+/* Accessor for the internal epoll fd aggregating all ring CQ channel fds. */
+int net_device_table_mgr::global_ring_epfd_get()
+{
+ return m_global_ring_epfd;
+}
+
+/*
+ * Check the global ring epoll fd for ready CQ notification channels and
+ * process completion elements on the owning rings.
+ * NOTE(review): epoll_wait() below is called with timeout 0, so despite
+ * the name this call never blocks.
+ */
+int net_device_table_mgr::global_ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array /*=NULL*/)
+{
+ ndtm_logfunc("");
+ int ret_total = 0;
+ int max_fd = 16;
+ /* NOTE(review): runtime-sized array is a compiler extension (VLA) in C++;
+ * max_fd could be a compile-time constant. */
+ struct epoll_event events[max_fd];
+
+ int res = orig_os_api.epoll_wait(global_ring_epfd_get(), events, max_fd, 0);
+ if (res > 0) {
+ for (int event_idx = 0; event_idx < res ; ++event_idx) {
+ int fd = events[event_idx].data.fd; // This is the Rx cq channel fd
+ cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(fd);
+ if (p_cq_ch_info) {
+ ring* p_ready_ring = p_cq_ch_info->get_ring();
+ // Handle the CQ notification channel
+ int ret = p_ready_ring->wait_for_notification_and_process_element(fd, p_poll_sn, pv_fd_ready_array);
+ if (ret < 0) {
+ /* EAGAIN is treated as an expected/transient condition: debug log only. */
+ if (errno == EAGAIN) {
+ ndtm_logdbg("Error in ring[%d]->wait_for_notification_and_process_element() of %p (errno=%d %m)", event_idx, p_ready_ring, errno);
+ }
+ else {
+ ndtm_logerr("Error in ring[%d]->wait_for_notification_and_process_element() of %p (errno=%d %m)", event_idx, p_ready_ring, errno);
+ }
+ continue;
+ }
+ if (ret > 0) {
+ ndtm_logfunc("ring[%p] Returned with: %d (sn=%d)", p_ready_ring, ret, *p_poll_sn);
+ }
+ ret_total += ret;
+ }
+ /* fd is not a known CQ channel - assume it is the wakeup pipe and
+ * detach it; ENOENT/EBADF mean it was already removed. */
+ else {
+ ndtm_logdbg("removing wakeup fd from epfd");
+ BULLSEYE_EXCLUDE_BLOCK_START
+ if ((orig_os_api.epoll_ctl(m_global_ring_epfd, EPOLL_CTL_DEL,
+ m_global_ring_pipe_fds[0], NULL)) && (!(errno == ENOENT || errno == EBADF))) {
+ ndtm_logerr("failed to del pipe channel fd from internal epfd (errno=%d %m)", errno);
+ }
+ BULLSEYE_EXCLUDE_BLOCK_END
+ }
+ }
+ }
+ if (ret_total) {
+ ndtm_logfunc("ret_total=%d", ret_total);
+ } else {
+ ndtm_logfuncall("ret_total=%d", ret_total);
+ }
+ return ret_total;
+}
+
+/*
+ * Drain and process outstanding completions on all managed devices' rings.
+ * Driven by RING_PROGRESS_ENGINE_TIMER (see handle_timer_expired()).
+ * A negative per-device result with errno==EAGAIN is tolerated; any other
+ * error aborts the iteration.
+ * NOTE(review): the "procces" misspelling matches the declaration in
+ * net_device_table_mgr.h - renaming would break callers.
+ */
+int net_device_table_mgr::global_ring_drain_and_procces()
+{
+ ndtm_logfuncall("");
+ int ret_total = 0;
+
+ net_device_map_index_t::iterator net_dev_iter;
+ for (net_dev_iter=m_net_device_map_index.begin(); m_net_device_map_index.end() != net_dev_iter; net_dev_iter++) {
+ int ret = net_dev_iter->second->ring_drain_and_proccess();
+ if (ret < 0 && errno != EAGAIN) {
+ ndtm_logerr("Error in 
ring[%p]->drain() (errno=%d %m)", net_dev_iter->second, errno);
+ return ret;
+ }
+ ret_total += ret;
+ }
+ if (ret_total) {
+ ndtm_logfunc("ret_total=%d", ret_total);
+ } else {
+ ndtm_logfuncall("ret_total=%d", ret_total);
+ }
+ return ret_total;
+}
+
+/*
+ * Invoke adaptive CQ moderation on every managed device's rings.
+ * Driven by RING_ADAPT_CQ_MODERATION_TIMER (see handle_timer_expired()).
+ */
+void net_device_table_mgr::global_ring_adapt_cq_moderation()
+{
+ ndtm_logfuncall("");
+
+ net_device_map_index_t::iterator net_dev_iter;
+ for (net_dev_iter=m_net_device_map_index.begin(); m_net_device_map_index.end() != net_dev_iter; net_dev_iter++) {
+ net_dev_iter->second->ring_adapt_cq_moderation();
+ }
+}
+
+/*
+ * Timer callback: the timer id is smuggled in the user_data pointer value.
+ * NOTE(review): the (uint64_t) value is narrowed into an int here - fine
+ * for the small timer-id constants, but an explicit (int) cast would make
+ * the intent clearer and silence conversion warnings.
+ */
+void net_device_table_mgr::handle_timer_expired(void* user_data)
+{
+ int timer_type = (uint64_t)user_data;
+ switch (timer_type) {
+ case RING_PROGRESS_ENGINE_TIMER:
+ global_ring_drain_and_procces();
+ break;
+ case RING_ADAPT_CQ_MODERATION_TIMER:
+ global_ring_adapt_cq_moderation();
+ break;
+ default:
+ ndtm_logerr("unrecognized timer %d", timer_type);
+ }
+}
+
+/*
+ * Force the global ring epoll fd to become ready by (re-)adding the wakeup
+ * pipe fd (m_global_ring_pipe_fds[0]) to it. errno is saved and restored
+ * because EPOLL_CTL_ADD may legitimately fail with EEXIST.
+ */
+void net_device_table_mgr::global_ring_wakeup()
+{
+ ndtm_logdbg("");
+ epoll_event ev = {0, {0}};
+
+ ev.events = EPOLLIN;
+ ev.data.ptr = NULL;
+ int errno_tmp = errno; //don't let wakeup affect errno, as this can fail with EEXIST
+ BULLSEYE_EXCLUDE_BLOCK_START
+ if ((orig_os_api.epoll_ctl(m_global_ring_epfd, EPOLL_CTL_ADD,
+ m_global_ring_pipe_fds[0], &ev)) && (errno != EEXIST)) {
+ ndtm_logerr("failed to add pipe channel fd to internal epfd (errno=%d %m)", errno);
+ }
+ BULLSEYE_EXCLUDE_BLOCK_END
+ errno = errno_tmp;
+}
+
+/* Record the cached max MTU value (private; see declaration in header). */
+void net_device_table_mgr::set_max_mtu(uint32_t mtu)
+{
+ m_max_mtu = mtu;
+}
+
+/* Return the cached max MTU value. */
+uint32_t net_device_table_mgr::get_max_mtu()
+{
+ return m_max_mtu;
+}
+
+/*
+ * Handle RTM_DELLINK for a slave interface: if it belongs to a managed
+ * NETVSC master, let the master refresh its slave list.
+ */
+void net_device_table_mgr::del_link_event(const netlink_link_info* info)
+{
+ ndtm_logdbg("netlink event: RTM_DELLINK if_index: %d", info->ifindex);
+
+ /* This flow is actual when interface is removed quickly
+ * w/o moving it in DOWN state.
+ * Usually interface is removed during sequence of RTM_NEWLINK events
+ * that puts it in DOWN state. In this case VMA has more time to release
+ * resources correctly.
+ */
+ if (info->flags & IFF_SLAVE) {
+ net_device_val* net_dev = NULL;
+ int if_index = info->ifindex;
+
+ ndtm_logdbg("netlink event: if_index: %d state: %s",
+ info->ifindex, (info->flags & IFF_RUNNING ? "Up" : "Down"));
+
+ /* The event's ifindex must resolve to a different (master) NETVSC
+ * device that currently lists it as a slave. */
+ net_dev = get_net_device_val(if_index);
+ if (net_dev &&
+ (if_index != net_dev->get_if_idx()) &&
+ (net_dev->get_is_bond() == net_device_val::NETVSC) &&
+ (net_dev->get_slave(if_index))) {
+ ndtm_logdbg("found entry [%p]: if_index: %d : %s",
+ net_dev, net_dev->get_if_idx(), net_dev->get_ifname());
+ net_dev->update_netvsc_slaves(info->ifindex, info->flags);
+ }
+ }
+}
+
+/*
+ * Handle RTM_NEWLINK for a slave interface: propagate UP/DOWN transitions
+ * of a NETVSC master's slave to the master.
+ */
+void net_device_table_mgr::new_link_event(const netlink_link_info* info)
+{
+ ndtm_logdbg("netlink event: RTM_NEWLINK if_index: %d", info->ifindex);
+
+ /* This flow is used to process interface UP and DOWN scenarios.
+ * It is important that interface can be removed w/o putting it into
+ * DOWN state (see RTM_DELLINK).
+ */
+ if (info->flags & IFF_SLAVE) {
+ net_device_val* net_dev = NULL;
+ int if_index = info->ifindex;
+
+ ndtm_logdbg("netlink event: if_index: %d state: %s",
+ info->ifindex, (info->flags & IFF_RUNNING ? "Up" : "Down"));
+
+ /* Trigger a slave update only on a state transition: a known slave
+ * that stopped RUNNING, or an unknown one that became RUNNING. */
+ net_dev = get_net_device_val(if_index);
+ if (net_dev &&
+ (if_index != net_dev->get_if_idx()) &&
+ (net_dev->get_is_bond() == net_device_val::NETVSC) &&
+ ((net_dev->get_slave(if_index) && !(info->flags & IFF_RUNNING)) ||
+ (!net_dev->get_slave(if_index) && (info->flags & IFF_RUNNING)))) {
+ ndtm_logdbg("found entry [%p]: if_index: %d : %s",
+ net_dev, net_dev->get_if_idx(), net_dev->get_ifname());
+ net_dev->update_netvsc_slaves(info->ifindex, info->flags);
+ }
+ }
+}
+
+/*
+ * Observer callback for netlink LINK events: validates the event object and
+ * dispatches RTM_NEWLINK / RTM_DELLINK to new_link_event() / del_link_event().
+ */
+void net_device_table_mgr::notify_cb(event *ev)
+{
+ ndtm_logdbg("netlink event: LINK");
+
+ /* NOTE(review): the dynamic_cast template argument (likely
+ * <link_nl_event *>) appears to have been lost when this patch text was
+ * extracted - confirm against the upstream libvma source. */
+ link_nl_event *link_netlink_ev = dynamic_cast (ev);
+ if (!link_netlink_ev) {
+ ndtm_logwarn("netlink event: invalid!!!");
+ return;
+ }
+
+ const netlink_link_info* p_netlink_link_info = link_netlink_ev->get_link_info();
+ if (!p_netlink_link_info) {
+ ndtm_logwarn("netlink event: invalid!!!");
+ return;
+ }
+
+ switch(link_netlink_ev->nl_type) {
+ case RTM_NEWLINK:
+ new_link_event(p_netlink_link_info);
+ break;
+ case RTM_DELLINK:
+ del_link_event(p_netlink_link_info);
+ break;
+ default:
+ ndtm_logdbg("netlink event: (%u) is not handled", link_netlink_ev->nl_type);
+ break;
+ }
+} diff --git a/src/vma/dev/net_device_table_mgr.h b/src/vma/dev/net_device_table_mgr.h new file mode 100644 index 0000000..c14235b --- /dev/null +++ b/src/vma/dev/net_device_table_mgr.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef NET_DEVICE_TABLE_MGR_H +#define NET_DEVICE_TABLE_MGR_H + +#include +#include +#include +#include +#include + +#include "vma/event/timer_handler.h" +#include "vma/util/sys_vars.h" +#include "vma/proto/ip_address.h" +#include "vma/infra/cache_subject_observer.h" +#include "net_device_val.h" +#include "net_device_entry.h" + +typedef std::tr1::unordered_map net_device_map_addr_t; +typedef std::tr1::unordered_map net_device_map_index_t; +typedef std::list local_ip_list_t; + +class net_device_table_mgr : public cache_table_mgr, public observer +{ +public: + net_device_table_mgr(); + virtual ~net_device_table_mgr(); + + void update_tbl(); + void print_val_tbl(); + + virtual void notify_cb(event *ev); + net_device_entry* create_new_entry(ip_address local_ip, const observer* dst); + + net_device_val* get_net_device_val(const in_addr_t local_ip); + net_device_val* get_net_device_val(int if_index); + + local_ip_list_t get_ip_list(int if_index = 0); // return list of the table_mgr managed ips + + /** + * Arm ALL the managed CQ's notification channel + * This call will also check for race condition by polling each CQ after arming the notification channel. + * If race condition case occures then that CQ is polled and processed (and the CQ notification is armed) + * Returns >=0 the total number of wce processed + * < 0 on error + */ + int global_ring_poll_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array = NULL); + + + /** + * This will poll one time on the ALL the managed CQ's + * If a wce was found 'processing' will occur. + * Returns: >=0 the total number of wce processed + * < 0 error + */ + int global_ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array = NULL); + + int global_ring_request_notification(uint64_t poll_sn); + + /** + * This will poll one time on the ALL the managed CQ's + * If a wce was found 'processing' will occur. 
+ * Returns: >=0 the total number of wce processed + * < 0 error + */ + int global_ring_drain_and_procces(); + + void global_ring_adapt_cq_moderation(); + + void global_ring_wakeup(); + + int global_ring_epfd_get(); + + void handle_timer_expired(void* user_data); + + uint32_t get_max_mtu(); + + inline ts_conversion_mode_t get_ctx_time_conversion_mode() { + return m_time_conversion_mode; + }; + +private: + void del_link_event(const netlink_link_info* info); + void new_link_event(const netlink_link_info* info); + + void free_ndtm_resources(); + void set_max_mtu(uint32_t); + + lock_mutex m_lock; + ts_conversion_mode_t m_time_conversion_mode; + net_device_map_addr_t m_net_device_map_addr; + net_device_map_index_t m_net_device_map_index; + int m_num_devices; + + int m_global_ring_epfd; + int m_global_ring_pipe_fds[2]; + + uint32_t m_max_mtu; +}; + +extern net_device_table_mgr* g_p_net_device_table_mgr; + +#endif diff --git a/src/vma/dev/net_device_val.cpp b/src/vma/dev/net_device_val.cpp new file mode 100644 index 0000000..cb8eb78 --- /dev/null +++ b/src/vma/dev/net_device_val.cpp @@ -0,0 +1,1751 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils/bullseye.h" +#include "vma/util/if.h" +#include "vma/dev/net_device_val.h" +#include "vma/util/vtypes.h" +#include "vma/util/utils.h" +#include "vma/util/valgrind.h" +#include "vma/event/event_handler_manager.h" +#include "vma/proto/L2_address.h" +#include "vma/dev/ib_ctx_handler_collection.h" +#include "vma/dev/ring_tap.h" +#include "vma/dev/ring_simple.h" +#include "vma/dev/ring_eth_cb.h" +#include "vma/dev/ring_eth_direct.h" +#include "vma/dev/ring_slave.h" +#include "vma/dev/ring_bond.h" +#include "vma/sock/sock-redirect.h" +#include "vma/dev/net_device_table_mgr.h" +#include "vma/proto/neighbour_table_mgr.h" +#include "ring_profile.h" + +#ifdef HAVE_LIBNL3 +#include +#endif + +#define MODULE_NAME "ndv" + +#define nd_logpanic __log_panic +#define nd_logerr __log_err +#define nd_logwarn __log_warn +#define nd_loginfo __log_info +#define nd_logdbg __log_info_dbg +#define nd_logfunc __log_info_func +#define nd_logfuncall __log_info_funcall + +ring_alloc_logic_attr::ring_alloc_logic_attr(): + m_ring_alloc_logic(RING_LOGIC_PER_INTERFACE), + m_ring_profile_key(0), + m_user_id_key(0) +{ + m_mem_desc.iov_base = NULL; + m_mem_desc.iov_len = 0; 
+ init(); +} + +ring_alloc_logic_attr::ring_alloc_logic_attr(ring_logic_t ring_logic): + m_ring_alloc_logic(ring_logic), + m_ring_profile_key(0), + m_user_id_key(0) +{ + m_mem_desc.iov_base = NULL; + m_mem_desc.iov_len = 0; + init(); +} + +ring_alloc_logic_attr::ring_alloc_logic_attr(const ring_alloc_logic_attr &other): + m_hash(other.m_hash), + m_ring_alloc_logic(other.m_ring_alloc_logic), + m_ring_profile_key(other.m_ring_profile_key), + m_user_id_key(other.m_user_id_key), + m_mem_desc(other.m_mem_desc) +{ + snprintf(m_str, RING_ALLOC_STR_SIZE, "%s", other.m_str); +} + +void ring_alloc_logic_attr::init() +{ + size_t h = 5381; + int c; + char buff[RING_ALLOC_STR_SIZE]; + + snprintf(m_str, RING_ALLOC_STR_SIZE, + "allocation logic %d profile %d key %ld user address %p " + "user length %zd", m_ring_alloc_logic, m_ring_profile_key, + m_user_id_key, m_mem_desc.iov_base, m_mem_desc.iov_len); + snprintf(buff, RING_ALLOC_STR_SIZE, "%d%d%ld%p%zd", m_ring_alloc_logic, + m_ring_profile_key, m_user_id_key, m_mem_desc.iov_base, + m_mem_desc.iov_len); + const char* chr = buff; + while ((c = *chr++)) + h = ((h << 5) + h) + c; /* m_hash * 33 + c */ + m_hash = h; +} + +void ring_alloc_logic_attr::set_ring_alloc_logic(ring_logic_t logic) +{ + if (m_ring_alloc_logic != logic) { + m_ring_alloc_logic = logic; + init(); + } +} + +void ring_alloc_logic_attr::set_ring_profile_key(vma_ring_profile_key profile) +{ + if (m_ring_profile_key != profile) { + m_ring_profile_key = profile; + init(); + } +} + +void ring_alloc_logic_attr::set_memory_descriptor(iovec &mem_desc) +{ + if (m_mem_desc.iov_base != mem_desc.iov_base || + m_mem_desc.iov_len != mem_desc.iov_len) { + m_mem_desc = mem_desc; + init(); + } +} + +void ring_alloc_logic_attr::set_user_id_key(uint64_t user_id_key) +{ + if (m_user_id_key != user_id_key) { + m_user_id_key = user_id_key; + init(); + } +} + +net_device_val::net_device_val(struct net_device_val_desc *desc) : m_lock("net_device_val lock") +{ + bool valid = false; + 
ib_ctx_handler* ib_ctx; + struct nlmsghdr *nl_msg = NULL; + struct ifinfomsg *nl_msgdata = NULL; + int nl_attrlen; + struct rtattr *nl_attr; + + m_if_idx = 0; + m_if_link = 0; + m_type = 0; + m_flags = 0; + m_mtu = 0; + m_state = INVALID; + m_p_L2_addr = NULL; + m_p_br_addr = NULL; + m_bond = NO_BOND; + m_if_active = 0; + m_bond_xmit_hash_policy = XHP_LAYER_2; + m_bond_fail_over_mac = 0; + m_transport_type = VMA_TRANSPORT_UNKNOWN; + + if (NULL == desc) { + nd_logerr("Invalid net_device_val name=%s", "NA"); + m_state = INVALID; + return; + } + + nl_msg = desc->nl_msg; + nl_msgdata = (struct ifinfomsg *)NLMSG_DATA(nl_msg); + + nl_attr = (struct rtattr *)IFLA_RTA(nl_msgdata); + nl_attrlen = IFLA_PAYLOAD(nl_msg); + + set_type(nl_msgdata->ifi_type); + set_if_idx(nl_msgdata->ifi_index); + set_flags(nl_msgdata->ifi_flags); + while (RTA_OK(nl_attr, nl_attrlen)) { + char *nl_attrdata = (char *)RTA_DATA(nl_attr); + size_t nl_attrpayload = RTA_PAYLOAD(nl_attr); + + switch (nl_attr->rta_type) { + case IFLA_MTU: + set_mtu(*(int32_t *)nl_attrdata); + break; + case IFLA_LINK: + set_if_link(*(int32_t *)nl_attrdata); + break; + case IFLA_IFNAME: + set_ifname(nl_attrdata); + break; + case IFLA_ADDRESS: + set_l2_if_addr((uint8_t *)nl_attrdata, nl_attrpayload); + break; + case IFLA_BROADCAST: + set_l2_bc_addr((uint8_t *)nl_attrdata, nl_attrpayload); + break; + default: + break; + } + nl_attr = RTA_NEXT(nl_attr, nl_attrlen); + } + + /* Valid interface should have at least one IP address */ + set_ip_array(); + if (m_ip.empty()) { + return; + } + + /* Identify device type */ + if ((get_flags() & IFF_MASTER) || check_device_exist(get_ifname_link(), BOND_DEVICE_FILE)) { + verify_bonding_mode(); + } else if (check_netvsc_device_exist(get_ifname_link())) { + m_bond = NETVSC; + } else { + m_bond = NO_BOND; + } + + set_str(); + + nd_logdbg("Check interface '%s' (index=%d addr=%d.%d.%d.%d flags=%X)", + get_ifname(), get_if_idx(), NIPQUAD(get_local_addr()), get_flags()); + + valid = false; + 
ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link()); + switch (m_bond) { + case NETVSC: + if (get_type() == ARPHRD_ETHER) { + char slave_ifname[IFNAMSIZ] = {0}; + unsigned int slave_flags = 0; + /* valid = true; uncomment it is valid flow to operate w/o SRIOV */ + if (get_netvsc_slave(get_ifname_link(), slave_ifname, slave_flags)) { + valid = verify_qp_creation(slave_ifname, IBV_QPT_RAW_PACKET); + } + } + break; + case LAG_8023ad: + case ACTIVE_BACKUP: + // this is a bond interface (or a vlan/alias over bond), find the slaves + valid = verify_bond_ipoib_or_eth_qp_creation(); + break; + default: + valid = (bool)(ib_ctx && verify_ipoib_or_eth_qp_creation(get_ifname_link())); + break; + } + + if (!valid) { + nd_logdbg("Skip interface '%s'", get_ifname()); + return; + } + + if (safe_mce_sys().mtu != 0 && (int)safe_mce_sys().mtu != get_mtu()) { + nd_logwarn("Mismatch between interface %s MTU=%d and VMA_MTU=%d." + "Make sure VMA_MTU and all offloaded interfaces MTUs match.", + get_ifname(), get_mtu(), safe_mce_sys().mtu); + } + + /* Set interface state after all verifications */ + if (m_flags & IFF_RUNNING) { + m_state = RUNNING; + } + else { + if (m_flags & IFF_UP) { + m_state = UP; + } + else { + m_state = DOWN; + } + } + + nd_logdbg("Use interface '%s'", get_ifname()); + if (ib_ctx) { + nd_logdbg("%s ==> %s port %d (%s)", + get_ifname(), + ib_ctx->get_ibname(), get_port_from_ifname(get_ifname_link()), + (ib_ctx->is_active(get_port_from_ifname(get_ifname_link())) ? 
"Up" : "Down")); + } else { + nd_logdbg("%s ==> none", + get_ifname()); + } +} + +net_device_val::~net_device_val() +{ + auto_unlocker lock(m_lock); + + rings_hash_map_t::iterator ring_iter; + while ((ring_iter = m_h_ring_map.begin()) != m_h_ring_map.end()) { + delete THE_RING; + resource_allocation_key *tmp = ring_iter->first; + m_h_ring_map.erase(ring_iter); + delete tmp; + } + + rings_key_redirection_hash_map_t::iterator redirect_iter; + while ((redirect_iter = m_h_ring_key_redirection_map.begin()) != + m_h_ring_key_redirection_map.end()) { + delete redirect_iter->second.first; + m_h_ring_key_redirection_map.erase(redirect_iter); + } + if (m_p_br_addr) { + delete m_p_br_addr; + m_p_br_addr = NULL; + } + + if (m_p_L2_addr) { + delete m_p_L2_addr; + m_p_L2_addr = NULL; + } + + slave_data_vector_t::iterator slave = m_slaves.begin(); + for (; slave != m_slaves.end(); ++slave) { + delete *slave; + } + m_slaves.clear(); + + ip_data_vector_t::iterator ip = m_ip.begin(); + for (; ip != m_ip.end(); ++ip) { + delete *ip; + } + m_ip.clear(); +} + +void net_device_val::set_ip_array() +{ + int rc = 0; + int fd = -1; + struct { + struct nlmsghdr hdr; + struct ifaddrmsg addrmsg; + } nl_req; + struct nlmsghdr *nl_msg; + int nl_msglen = 0; + char nl_res[8096]; + static int _seq = 0; + + /* Set up the netlink socket */ + fd = orig_os_api.socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE); + if (fd < 0) { + nd_logerr("netlink socket() creation"); + return; + } + + /* Prepare RTM_GETADDR request */ + memset(&nl_req, 0, sizeof(nl_req)); + nl_req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(struct ifaddrmsg)); + nl_req.hdr.nlmsg_flags = NLM_F_REQUEST | NLM_F_DUMP; + nl_req.hdr.nlmsg_type = RTM_GETADDR; + nl_req.hdr.nlmsg_seq = _seq++; + nl_req.hdr.nlmsg_pid = getpid(); + nl_req.addrmsg.ifa_family = AF_INET; + nl_req.addrmsg.ifa_index = m_if_idx; + + /* Send the netlink request */ + rc = orig_os_api.send(fd, &nl_req, nl_req.hdr.nlmsg_len, 0); + if (rc < 0) { + nd_logerr("netlink send() operation"); + 
goto ret; + } + + do { + /* Receive the netlink reply */ + rc = orig_os_api.recv(fd, nl_res, sizeof(nl_res), 0); + if (rc < 0) { + nd_logerr("netlink recv() operation"); + goto ret; + } + + nl_msg = (struct nlmsghdr *)nl_res; + nl_msglen = rc; + while (NLMSG_OK(nl_msg, (size_t)nl_msglen) && (nl_msg->nlmsg_type != NLMSG_ERROR)) { + int nl_attrlen; + struct ifaddrmsg *nl_msgdata; + struct rtattr *nl_attr; + ip_data_t* p_val = NULL; + + nl_msgdata = (struct ifaddrmsg *)NLMSG_DATA(nl_msg); + + /* Process just specific if index */ + if ((int)nl_msgdata->ifa_index == m_if_idx) { + nl_attr = (struct rtattr *)IFA_RTA(nl_msgdata); + nl_attrlen = IFA_PAYLOAD(nl_msg); + + p_val = new ip_data_t; + p_val->flags = nl_msgdata->ifa_flags; + memset(&p_val->netmask, 0, sizeof(in_addr_t)); + p_val->netmask = prefix_to_netmask(nl_msgdata->ifa_prefixlen); + while (RTA_OK(nl_attr, nl_attrlen)) { + char *nl_attrdata = (char *)RTA_DATA(nl_attr); + + switch (nl_attr->rta_type) { + case IFA_ADDRESS: + memset(&p_val->local_addr, 0, sizeof(in_addr_t)); + memcpy(&p_val->local_addr, (in_addr_t *)nl_attrdata, sizeof(in_addr_t)); + break; + default: + break; + } + nl_attr = RTA_NEXT(nl_attr, nl_attrlen); + } + + m_ip.push_back(p_val); + } + + /* Check if it is the last message */ + if(nl_msg->nlmsg_type == NLMSG_DONE) { + goto ret; + } + nl_msg = NLMSG_NEXT(nl_msg, nl_msglen); + } + } while (1); + +ret: + orig_os_api.close(fd); +} + +void net_device_val::set_str() +{ + char str_x[BUFF_SIZE] = {0}; + + m_str[0] = '\0'; + + str_x[0] = '\0'; + sprintf(str_x, "%d:", m_if_idx); + strcat(m_str, str_x); + + str_x[0] = '\0'; + if (!strcmp(get_ifname(), get_ifname_link())) { + sprintf(str_x, " %s:", get_ifname()); + } else { + sprintf(str_x, " %s@%s:", get_ifname(), get_ifname_link()); + } + strcat(m_str, str_x); + + str_x[0] = '\0'; + sprintf(str_x, " <%s%s%s%s%s%s%s%s%s%s%s>:", + (m_flags & IFF_UP ? "UP," : ""), + (m_flags & IFF_RUNNING ? "RUNNING," : ""), + (m_flags & IFF_NOARP ? 
"NO_ARP," : ""), + (m_flags & IFF_LOOPBACK ? "LOOPBACK," : ""), + (m_flags & IFF_BROADCAST ? "BROADCAST," : ""), + (m_flags & IFF_MULTICAST ? "MULTICAST," : ""), + (m_flags & IFF_MASTER ? "MASTER," : ""), + (m_flags & IFF_SLAVE ? "SLAVE," : ""), + (m_flags & IFF_LOWER_UP ? "LOWER_UP," : ""), + (m_flags & IFF_DEBUG ? "DEBUG," : ""), + (m_flags & IFF_PROMISC ? "PROMISC," : "")); + strcat(m_str, str_x); + + str_x[0] = '\0'; + sprintf(str_x, " mtu %d", m_mtu); + strcat(m_str, str_x); + + str_x[0] = '\0'; + switch (m_type) { + case ARPHRD_LOOPBACK: + sprintf(str_x, " type %s", "loopback"); + break; + case ARPHRD_ETHER: + sprintf(str_x, " type %s", "ether"); + break; + case ARPHRD_INFINIBAND: + sprintf(str_x, " type %s", "infiniband"); + break; + default: + sprintf(str_x, " type %s", "unknown"); + break; + } + + str_x[0] = '\0'; + switch (m_bond) { + case NETVSC: + sprintf(str_x, " (%s)", "netvsc"); + break; + case LAG_8023ad: + sprintf(str_x, " (%s)", "lag 8023ad"); + break; + case ACTIVE_BACKUP: + sprintf(str_x, " (%s)", "active backup"); + break; + default: + sprintf(str_x, " (%s)", "normal"); + break; + } + strcat(m_str, str_x); +} + +void net_device_val::print_val() +{ + size_t i = 0; + rings_hash_map_t::iterator ring_iter; + + set_str(); + nd_logdbg("%s", m_str); + + nd_logdbg(" ip list: %s", (m_ip.empty() ? "empty " : "")); + for (i = 0; i < m_ip.size(); i++) { + nd_logdbg(" inet: %d.%d.%d.%d netmask: %d.%d.%d.%d flags: 0x%X", + NIPQUAD(m_ip[i]->local_addr), NIPQUAD(m_ip[i]->netmask), m_ip[i]->flags); + } + + nd_logdbg(" slave list: %s", (m_slaves.empty() ? "empty " : "")); + for (i = 0; i < m_slaves.size(); i++) { + char if_name[IFNAMSIZ] = {0}; + + if_name[0] = '\0'; + if_indextoname(m_slaves[i]->if_index, if_name); + nd_logdbg(" %d: %s: %s active: %d", + m_slaves[i]->if_index, if_name, m_slaves[i]->p_L2_addr->to_str().c_str(), m_slaves[i]->active); + } + + nd_logdbg(" ring list: %s", (m_h_ring_map.empty() ? 
"empty " : "")); + for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { + ring *cur_ring = ring_iter->second.first; + NOT_IN_USE(cur_ring); // Suppress --enable-opt-log=high warning + nd_logdbg(" %d: 0x%X: parent 0x%X ref %d", + cur_ring->get_if_index(), cur_ring, cur_ring->get_parent(), ring_iter->second.second); + } +} + +void net_device_val::set_slave_array() +{ + char active_slave[IFNAMSIZ] = {0}; // gather the slave data (only for active-backup)- + + nd_logdbg(""); + + if (m_bond == NETVSC) { + slave_data_t* s = NULL; + unsigned int slave_flags = 0; + if (get_netvsc_slave(get_ifname_link(), active_slave, slave_flags)) { + if ((slave_flags & IFF_UP) && + verify_qp_creation(active_slave, IBV_QPT_RAW_PACKET)) { + s = new slave_data_t(if_nametoindex(active_slave)); + m_slaves.push_back(s); + } + } + } else if (m_bond == NO_BOND) { + slave_data_t* s = new slave_data_t(if_nametoindex(get_ifname())); + m_slaves.push_back(s); + } else { + // bond device + + // get list of all slave devices + char slaves_list[IFNAMSIZ * MAX_SLAVES] = {0}; + if (get_bond_slaves_name_list(get_ifname_link(), slaves_list, sizeof(slaves_list))) { + char* slave = strtok(slaves_list, " "); + while (slave) { + char* p = strchr(slave, '\n'); + if (p) *p = '\0'; // Remove the tailing 'new line" char + + slave_data_t* s = new slave_data_t(if_nametoindex(slave)); + m_slaves.push_back(s); + slave = strtok(NULL, " "); + } + } + + // find the active slave + if (get_bond_active_slave_name(get_ifname_link(), active_slave, sizeof(active_slave))) { + m_if_active = if_nametoindex(active_slave); + nd_logdbg("found the active slave: %d: '%s'", m_if_active, active_slave); + } + else { + nd_logdbg("failed to find the active slave, Moving to LAG state"); + } + } + + bool up_and_active_slaves[m_slaves.size()]; + + memset(up_and_active_slaves, 0, sizeof(up_and_active_slaves)); + + if (m_bond == LAG_8023ad) { + get_up_and_active_slaves(up_and_active_slaves, m_slaves.size()); + } 
+ + for (uint16_t i = 0; i < m_slaves.size(); i++) { + char if_name[IFNAMSIZ] = {0}; + char base_ifname[IFNAMSIZ]; + + if (!if_indextoname(m_slaves[i]->if_index, if_name)) { + nd_logerr("Can not find interface name by index=%d", m_slaves[i]->if_index); + continue; + } + get_base_interface_name((const char*)if_name, base_ifname, sizeof(base_ifname)); + + // Save L2 address + m_slaves[i]->p_L2_addr = create_L2_address(if_name); + m_slaves[i]->active = false; + + if (m_bond == ACTIVE_BACKUP && m_if_active == m_slaves[i]->if_index) { + m_slaves[i]->active = true; + } + + if (m_bond == LAG_8023ad) { + if (up_and_active_slaves[i]) { + m_slaves[i]->active = true; + } + } + + if (m_bond == NETVSC) { + m_slaves[i]->active = true; + } + + if (m_bond == NO_BOND) { + m_slaves[i]->active = true; + } + + m_slaves[i]->p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(base_ifname); + m_slaves[i]->port_num = get_port_from_ifname(base_ifname); + if (m_slaves[i]->port_num < 1) { + nd_logdbg("Error: port %d ==> ifname=%s base_ifname=%s", + m_slaves[i]->port_num, if_name, base_ifname); + } + } + + if (m_slaves.empty() && NETVSC != m_bond) { + m_state = INVALID; + nd_logpanic("No slave found."); + } +} + +const slave_data_t* net_device_val::get_slave(int if_index) +{ + auto_unlocker lock(m_lock); + + slave_data_vector_t::iterator iter; + for (iter = m_slaves.begin(); iter != m_slaves.end(); iter++) { + slave_data_t *cur_slave = *iter; + if (cur_slave->if_index == if_index) { + return cur_slave; + } + } + return NULL; +} + +void net_device_val::verify_bonding_mode() +{ + // this is a bond interface, lets get its mode. 
+ char bond_mode_file_content[FILENAME_MAX]; + char bond_failover_mac_file_content[FILENAME_MAX]; + char bond_mode_param_file[FILENAME_MAX]; + char bond_failover_mac_param_file[FILENAME_MAX]; + char bond_xmit_hash_policy_file_content[FILENAME_MAX]; + char bond_xmit_hash_policy_param_file[FILENAME_MAX]; + + memset(bond_mode_file_content, 0, FILENAME_MAX); + sprintf(bond_mode_param_file, BONDING_MODE_PARAM_FILE, get_ifname_link()); + sprintf(bond_failover_mac_param_file, BONDING_FAILOVER_MAC_PARAM_FILE, get_ifname_link()); + + if (priv_safe_read_file(bond_mode_param_file, bond_mode_file_content, FILENAME_MAX) > 0) { + char *bond_mode = NULL; + bond_mode = strtok(bond_mode_file_content, " "); + if (bond_mode) { + if (!strcmp(bond_mode, "active-backup")) { + m_bond = ACTIVE_BACKUP; + } else if (strstr(bond_mode, "802.3ad")) { + m_bond = LAG_8023ad; + } + if (priv_safe_read_file(bond_failover_mac_param_file, bond_failover_mac_file_content, FILENAME_MAX) > 0) { + if(strstr(bond_failover_mac_file_content, "0")){ + m_bond_fail_over_mac = 0; + } else if(strstr(bond_failover_mac_file_content, "1")){ + m_bond_fail_over_mac = 1; + } else if(strstr(bond_failover_mac_file_content, "2")){ + m_bond_fail_over_mac = 2; + } + } + } + } + + memset(bond_xmit_hash_policy_file_content, 0, FILENAME_MAX); + sprintf(bond_xmit_hash_policy_param_file, BONDING_XMIT_HASH_POLICY_PARAM_FILE, get_ifname_link()); + if (priv_safe_try_read_file(bond_xmit_hash_policy_param_file, bond_xmit_hash_policy_file_content, FILENAME_MAX) > 0) { + char *bond_xhp = NULL; + char *saveptr = NULL; + + bond_xhp = strtok_r(bond_xmit_hash_policy_file_content, " ", &saveptr); + if (NULL == bond_xhp) { + nd_logdbg("could not parse bond xmit hash policy, staying with default (L2)\n"); + } else { + bond_xhp = strtok_r(NULL, " ", &saveptr); + if (bond_xhp) { + m_bond_xmit_hash_policy = (bond_xmit_hash_policy)strtol(bond_xhp, NULL , 10); + if (m_bond_xmit_hash_policy < XHP_LAYER_2 || m_bond_xmit_hash_policy > XHP_ENCAP_3_4) 
{ + vlog_printf(VLOG_WARNING,"VMA does not support xmit hash policy = %d\n", m_bond_xmit_hash_policy); + m_bond_xmit_hash_policy = XHP_LAYER_2; + } + } + nd_logdbg("got bond xmit hash policy = %d\n", m_bond_xmit_hash_policy); + } + } else { + nd_logdbg("could not read bond xmit hash policy, staying with default (L2)\n"); + } + + if (m_bond == NO_BOND || m_bond_fail_over_mac > 1) { + vlog_printf(VLOG_WARNING,"******************************************************************************\n"); + vlog_printf(VLOG_WARNING,"VMA doesn't support current bonding configuration of %s.\n", get_ifname_link()); + vlog_printf(VLOG_WARNING,"The only supported bonding mode is \"802.3ad 4(#4)\" or \"active-backup(#1)\"\n"); + vlog_printf(VLOG_WARNING,"with \"fail_over_mac=1\" or \"fail_over_mac=0\".\n"); + vlog_printf(VLOG_WARNING,"The effect of working in unsupported bonding mode is undefined.\n"); + vlog_printf(VLOG_WARNING,"Read more about Bonding in the VMA's User Manual\n"); + vlog_printf(VLOG_WARNING,"******************************************************************************\n"); + } +} + +/** + * only for active-backup bond + */ +bool net_device_val::update_active_backup_slaves() +{ + // update the active slave + // /sys/class/net/bond0/bonding/active_slave + char active_slave[IFNAMSIZ*MAX_SLAVES] = {0}; + int if_active_slave = 0; + + if (!get_bond_active_slave_name(get_ifname_link(), active_slave, IFNAMSIZ)) { + nd_logdbg("failed to find the active slave!"); + return 0; + } + + //nothing changed + if_active_slave = if_nametoindex(active_slave); + if (m_if_active == if_active_slave) { + return 0; + } + + m_p_L2_addr = create_L2_address(get_ifname()); + bool found_active_slave = false; + for (size_t i = 0; i < m_slaves.size(); i++) { + if (if_active_slave == m_slaves[i]->if_index) { + m_slaves[i]->active = true; + found_active_slave = true; + nd_logdbg("Slave changed old=%d new=%d", m_if_active, if_active_slave); + m_if_active = if_active_slave; + } else { + 
m_slaves[i]->active = false; + } + } + if (!found_active_slave) { + nd_logdbg("Failed to locate new active slave details"); + return 0; + } + // restart rings + rings_hash_map_t::iterator ring_iter; + for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { + THE_RING->restart(); + } + return 1; +} + +/* + * this function assume m_slaves[i]->if_name and m_slaves.size() are already set. + */ +bool net_device_val::get_up_and_active_slaves(bool* up_and_active_slaves, size_t size) +{ + bool up_slaves[m_slaves.size()]; + int num_up = 0; + bool active_slaves[m_slaves.size()]; + int num_up_and_active = 0; + size_t i = 0; + + if (size != m_slaves.size()) { + nd_logwarn("programmer error! array size is not correct"); + return false; + } + + /* get slaves operstate and active state */ + for (i = 0; i < m_slaves.size(); i++) { + char oper_state[5] = {0}; + char slave_state[10] = {0}; + char if_name[IFNAMSIZ] = {0}; + + if (!if_indextoname(m_slaves[i]->if_index, if_name)) { + nd_logerr("Can not find interface name by index=%d", m_slaves[i]->if_index); + continue; + } + + // get interface operstate + get_interface_oper_state(if_name, oper_state, sizeof(oper_state)); + if (strstr(oper_state, "up")) { + num_up++; + up_slaves[i] = true; + } else { + up_slaves[i] = false; + } + + active_slaves[i] = true; + // get slave state + if (get_bond_slave_state(if_name, slave_state, sizeof(slave_state))){ + if (!strstr(slave_state, "active")) + active_slaves[i] = false; + } + + if (active_slaves[i] && up_slaves[i]) { + up_and_active_slaves[i] = true; + num_up_and_active++; + } else { + up_and_active_slaves[i] = false; + } + } + + /* make sure at least one up interface is active */ + if (!num_up_and_active && num_up) { + for (i = 0; i < m_slaves.size(); i++) { + if (up_slaves[i]) { + up_and_active_slaves[i] = true; + break; + } + } + } + + return true; +} + +bool net_device_val::update_active_slaves() +{ + bool changed = false; + bool 
up_and_active_slaves[m_slaves.size()]; + size_t i = 0; + + memset(&up_and_active_slaves, 0, m_slaves.size() * sizeof(bool)); + get_up_and_active_slaves(up_and_active_slaves, m_slaves.size()); + + /* compare to current status and prepare for restart */ + for (i = 0; i< m_slaves.size(); i++) { + if (up_and_active_slaves[i]) { + //slave came up + if (!m_slaves[i]->active) { + nd_logdbg("slave %d is up ", m_slaves[i]->if_index); + m_slaves[i]->active = true; + changed = true; + } + } + else { + //slave went down + if (m_slaves[i]->active) { + nd_logdbg("slave %d is down ", m_slaves[i]->if_index); + m_slaves[i]->active = false; + changed = true; + } + } + } + + /* restart if status changed */ + if (changed) { + m_p_L2_addr = create_L2_address(get_ifname()); + // restart rings + rings_hash_map_t::iterator ring_iter; + for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { + THE_RING->restart(); + } + return 1; + } + return 0; +} + +void net_device_val::update_netvsc_slaves(int if_index, int if_flags) +{ + slave_data_t* s = NULL; + bool found = false; + ib_ctx_handler *ib_ctx = NULL, *up_ib_ctx = NULL; + char if_name[IFNAMSIZ] = {0}; + + m_lock.lock(); + + if (if_indextoname(if_index, if_name) && (if_flags & IFF_UP) && (if_flags & IFF_RUNNING)) { + nd_logdbg("slave %d is up", if_index); + + g_p_ib_ctx_handler_collection->update_tbl(if_name); + if ((up_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(if_name))) { + s = new slave_data_t(if_index); + s->active = true; + s->p_ib_ctx = up_ib_ctx; + s->p_L2_addr = create_L2_address(if_name); + s->port_num = get_port_from_ifname(if_name); + m_slaves.push_back(s); + + up_ib_ctx->set_ctx_time_converter_status(g_p_net_device_table_mgr->get_ctx_time_conversion_mode()); + g_buffer_pool_rx->register_memory(s->p_ib_ctx); + g_buffer_pool_tx->register_memory(s->p_ib_ctx); + found = true; + } + } else { + if (!m_slaves.empty()) { + s = m_slaves.back(); + m_slaves.pop_back(); + + nd_logdbg("slave %d is 
down ", s->if_index); + + ib_ctx = s->p_ib_ctx; + delete s; + found = true; + } + } + + m_lock.unlock(); + + if (!found) { + nd_logdbg("Unable to detect any changes for interface %d. ignoring", if_index); + return; + } + + /* restart if status changed */ + m_p_L2_addr = create_L2_address(get_ifname()); + // restart rings + rings_hash_map_t::iterator ring_iter; + for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { + THE_RING->restart(); + } + + if (ib_ctx) { + g_p_ib_ctx_handler_collection->del_ib_ctx(ib_ctx); + } +} + +std::string net_device_val::to_str() +{ + return std::string("Net Device: " + m_name); +} + +ring* net_device_val::reserve_ring(resource_allocation_key *key) +{ + nd_logfunc(""); + auto_unlocker lock(m_lock); + key = ring_key_redirection_reserve(key); + ring* the_ring = NULL; + rings_hash_map_t::iterator ring_iter = m_h_ring_map.find(key); + + if (m_h_ring_map.end() == ring_iter) { + nd_logdbg("Creating new RING for %s", key->to_str()); + // copy key since we keep pointer and socket can die so map will lose pointer + resource_allocation_key *new_key = new resource_allocation_key(*key); + the_ring = create_ring(new_key); + if (!the_ring) { + return NULL; + } + m_h_ring_map[new_key] = std::make_pair(the_ring, 0); // each ring is born with ref_count = 0 + ring_iter = m_h_ring_map.find(new_key); + epoll_event ev = {0, {0}}; + int num_ring_rx_fds = the_ring->get_num_resources(); + int *ring_rx_fds_array = the_ring->get_rx_channel_fds(); + ev.events = EPOLLIN; + for (int i = 0; i < num_ring_rx_fds; i++) { + int cq_ch_fd = ring_rx_fds_array[i]; + ev.data.fd = cq_ch_fd; + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely( orig_os_api.epoll_ctl(g_p_net_device_table_mgr->global_ring_epfd_get(), + EPOLL_CTL_ADD, cq_ch_fd, &ev))) { + nd_logerr("Failed to add RING notification fd to global_table_mgr_epfd (errno=%d %m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + g_p_net_device_table_mgr->global_ring_wakeup(); + } + // now we are 
sure the ring is in the map + + ADD_RING_REF_CNT; + the_ring = GET_THE_RING(key); + + nd_logdbg("0x%X: if_index %d parent 0x%X ref %d key %s", + the_ring, the_ring->get_if_index(), + the_ring->get_parent(), RING_REF_CNT, key->to_str()); + + return the_ring; +} + +bool net_device_val::release_ring(resource_allocation_key *key) +{ + nd_logfunc(""); + + resource_allocation_key *red_key; + + auto_unlocker lock(m_lock); + red_key = get_ring_key_redirection(key); + ring* the_ring = NULL; + rings_hash_map_t::iterator ring_iter = m_h_ring_map.find(red_key); + + if (m_h_ring_map.end() != ring_iter) { + DEC_RING_REF_CNT; + the_ring = GET_THE_RING(red_key); + + nd_logdbg("0x%X: if_index %d parent 0x%X ref %d key %s", + the_ring, the_ring->get_if_index(), + the_ring->get_parent(), RING_REF_CNT, red_key->to_str()); + + if ( TEST_REF_CNT_ZERO ) { + int num_ring_rx_fds = the_ring->get_num_resources(); + int *ring_rx_fds_array = the_ring->get_rx_channel_fds(); + nd_logdbg("Deleting RING %p for key %s and removing notification fd from global_table_mgr_epfd (epfd=%d)", + the_ring, red_key->to_str(), g_p_net_device_table_mgr->global_ring_epfd_get()); + for (int i = 0; i < num_ring_rx_fds; i++) { + int cq_ch_fd = ring_rx_fds_array[i]; + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely(orig_os_api.epoll_ctl(g_p_net_device_table_mgr->global_ring_epfd_get(), + EPOLL_CTL_DEL, cq_ch_fd, NULL))) { + nd_logerr("Failed to delete RING notification fd to global_table_mgr_epfd (errno=%d %m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + ring_key_redirection_release(key); + + delete the_ring; + delete ring_iter->first; + m_h_ring_map.erase(ring_iter); + } + return true; + } + return false; +} + +/* + * this function maps key to new keys that it created + * the key that it creates is the size of the map + */ +resource_allocation_key* net_device_val::ring_key_redirection_reserve(resource_allocation_key *key) +{ + // if allocation logic is usr idx feature disabled + if 
(!safe_mce_sys().ring_limit_per_interface || + key->get_ring_alloc_logic() == RING_LOGIC_PER_USER_ID) + return key; + + if (m_h_ring_key_redirection_map.find(key) != m_h_ring_key_redirection_map.end()) { + m_h_ring_key_redirection_map[key].second++; + nd_logdbg("redirecting key=%s (ref-count:%d) to key=%s", key->to_str(), + m_h_ring_key_redirection_map[key].second, + m_h_ring_key_redirection_map[key].first->to_str()); + return m_h_ring_key_redirection_map[key].first; + } + + int ring_map_size = (int)m_h_ring_map.size(); + if (safe_mce_sys().ring_limit_per_interface > ring_map_size) { + resource_allocation_key *key2 = new resource_allocation_key(*key); + // replace key to redirection key + key2->set_user_id_key(ring_map_size); + m_h_ring_key_redirection_map[key] = std::make_pair(key2, 1); + nd_logdbg("redirecting key=%s (ref-count:1) to key=%s", + key->to_str(), key2->to_str()); + return key2; + } + + rings_hash_map_t::iterator ring_iter = m_h_ring_map.begin(); + int min_ref_count = ring_iter->second.second; + resource_allocation_key *min_key = ring_iter->first; + while (ring_iter != m_h_ring_map.end()) { + // redirect only to ring with the same profile + if (ring_iter->first->get_ring_profile_key() == + key->get_ring_profile_key() && + ring_iter->second.second < min_ref_count) { + min_ref_count = ring_iter->second.second; + min_key = ring_iter->first; + } + ring_iter++; + } + m_h_ring_key_redirection_map[key] = std::make_pair(new resource_allocation_key(*min_key), 1); + nd_logdbg("redirecting key=%s (ref-count:1) to key=%s", + key->to_str(), min_key->to_str()); + return min_key; +} + +resource_allocation_key* net_device_val::get_ring_key_redirection(resource_allocation_key *key) +{ + if (!safe_mce_sys().ring_limit_per_interface) return key; + + if (m_h_ring_key_redirection_map.find(key) == m_h_ring_key_redirection_map.end()) { + nd_logdbg("key = %s is not found in the redirection map", + key->to_str()); + return key; + } + + return 
m_h_ring_key_redirection_map[key].first; +} + +void net_device_val::ring_key_redirection_release(resource_allocation_key *key) +{ + if (safe_mce_sys().ring_limit_per_interface && m_h_ring_key_redirection_map.find(key) != m_h_ring_key_redirection_map.end() + && --m_h_ring_key_redirection_map[key].second == 0) { + // this is allocated in ring_key_redirection_reserve + nd_logdbg("release redirecting key=%s (ref-count:%d) to key=%s", key->to_str(), + m_h_ring_key_redirection_map[key].second, + m_h_ring_key_redirection_map[key].first->to_str()); + delete m_h_ring_key_redirection_map[key].first; + m_h_ring_key_redirection_map.erase(key); + } +} + +int net_device_val::global_ring_poll_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array /*=NULL*/) +{ + nd_logfuncall(""); + int ret_total = 0; + auto_unlocker lock(m_lock); + rings_hash_map_t::iterator ring_iter; + for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { + int ret = THE_RING->poll_and_process_element_rx(p_poll_sn, pv_fd_ready_array); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0 && errno != EAGAIN) { + nd_logerr("Error in ring->poll_and_process_element() of %p (errno=%d %m)", THE_RING, errno); + return ret; + } + BULLSEYE_EXCLUDE_BLOCK_END + if (ret > 0) + nd_logfunc("ring[%p] Returned with: %d (sn=%d)", THE_RING, ret, *p_poll_sn); + ret_total += ret; + } + return ret_total; +} + +int net_device_val::global_ring_request_notification(uint64_t poll_sn) +{ + int ret_total = 0; + auto_unlocker lock(m_lock); + rings_hash_map_t::iterator ring_iter; + for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) { + int ret = THE_RING->request_notification(CQT_RX, poll_sn); + if (ret < 0) { + nd_logerr("Error ring[%p]->request_notification() (errno=%d %m)", THE_RING, errno); + return ret; + } + nd_logfunc("ring[%p] Returned with: %d (sn=%d)", THE_RING, ret, poll_sn); + ret_total += ret; + } + return ret_total; +} + +int 
net_device_val::ring_drain_and_proccess()
+{
+	// Drain and process every ring of this device under the device lock.
+	// Aborts on the first ring error; otherwise returns the summed count.
+	nd_logfuncall();
+	int ret_total = 0;
+
+	auto_unlocker lock(m_lock);
+	rings_hash_map_t::iterator ring_iter;
+	for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) {
+		int ret = THE_RING->drain_and_proccess();
+		if (ret < 0)
+			return ret;
+		if (ret > 0)
+			nd_logfunc("cq[%p] Returned with: %d", THE_RING, ret);
+		ret_total += ret;
+	}
+	return ret_total;
+}
+
+// Ask every ring of this device to adapt its CQ moderation settings.
+void net_device_val::ring_adapt_cq_moderation()
+{
+	nd_logfuncall();
+
+	auto_unlocker lock(m_lock);
+	rings_hash_map_t::iterator ring_iter;
+	for (ring_iter = m_h_ring_map.begin(); ring_iter != m_h_ring_map.end(); ring_iter++) {
+		THE_RING->adapt_cq_moderation();
+	}
+}
+
+// Register the given handler for ibverbs async events on each slave's
+// device context, de-duplicated per ib_ctx.
+void net_device_val::register_to_ibverbs_events(event_handler_ibverbs *handler) {
+	for (size_t i = 0; i < m_slaves.size(); i++) {
+		// Skip slaves whose ib_ctx was already handled by an earlier slave.
+		bool found = false;
+		for (size_t j = 0; j < i; j++) {
+			if (m_slaves[i]->p_ib_ctx == m_slaves[j]->p_ib_ctx) {
+				found = true; //two slaves might be on two ports of the same device, register only once
+				break;
+			}
+		}
+		if (found)
+			continue;
+		nd_logfunc("registering slave to ibverbs events slave=%p", m_slaves[i]);
+		g_p_event_handler_manager->register_ibverbs_event(m_slaves[i]->p_ib_ctx->get_ibv_context()->async_fd, handler, m_slaves[i]->p_ib_ctx->get_ibv_context(), 0);
+	}
+}
+
+// Mirror of register_to_ibverbs_events(): remove the handler once per
+// distinct ib_ctx among the slaves.
+void net_device_val::unregister_to_ibverbs_events(event_handler_ibverbs *handler) {
+	for (size_t i = 0; i < m_slaves.size(); i++) {
+		bool found = false;
+		for (size_t j = 0; j < i; j++) {
+			if (m_slaves[i]->p_ib_ctx == m_slaves[j]->p_ib_ctx) {
+				found = true; //two slaves might be on two ports of the same device, unregister only once
+				break;
+			}
+		}
+		if (found)
+			continue;
+		nd_logfunc("unregistering slave to ibverbs events slave=%p", m_slaves[i]);
+		g_p_event_handler_manager->unregister_ibverbs_event(m_slaves[i]->p_ib_ctx->get_ibv_context()->async_fd, handler);
+	}
+}
+
+void net_device_val_eth::configure()
+{
+	m_p_L2_addr =
create_L2_address(get_ifname()); + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_p_L2_addr == NULL) { + nd_logpanic("m_p_L2_addr allocation error"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + create_br_address(get_ifname()); + + m_vlan = get_vlan_id_from_ifname(get_ifname()); + if (m_vlan) { + parse_prio_egress_map(); + } + if (m_vlan && m_bond != NO_BOND && m_bond_fail_over_mac == 1) { + vlog_printf(VLOG_WARNING, " ******************************************************************\n"); + vlog_printf(VLOG_WARNING, "%s: vlan over bond while fail_over_mac=1 is not offloaded\n", get_ifname()); + vlog_printf(VLOG_WARNING, " ******************************************************************\n"); + m_state = INVALID; + } + if(!m_vlan && (get_flags() & IFF_MASTER)) { + char if_name[IFNAMSIZ] = {0}; + + if (!if_indextoname(m_slaves[0]->if_index, if_name)) { + nd_logerr("Can not find interface name by index=%d", m_slaves[0]->if_index); + } + + //in case vlan is configured on slave + m_vlan = get_vlan_id_from_ifname(if_name); + } +} + +int net_device_val::get_priority_by_tc_class(uint32_t tc_class) +{ + tc_class_priority_map::iterator it = m_class_prio_map.find(tc_class); + if (it == m_class_prio_map.end()) { + return VMA_DEFAULT_ENGRESS_MAP_PRIO; + } + return it->second; +} + +void net_device_val_eth::parse_prio_egress_map() +{ +#ifdef HAVE_LIBNL3 + int len, ret; + nl_cache *cache = NULL; + rtnl_link *link; + vlan_map *map; + + nl_socket_handle *nl_socket = nl_socket_handle_alloc(); + if (!nl_socket) { + nd_logdbg("unable to allocate socket socket %m", errno); + goto out; + } + nl_socket_set_local_port(nl_socket, 0); + ret = nl_connect(nl_socket, NETLINK_ROUTE); + if (ret < 0) { + nd_logdbg("unable to connect to libnl socket %d %m", ret, errno); + goto out; + } + ret = rtnl_link_alloc_cache(nl_socket, AF_UNSPEC, &cache); + if (!cache) { + nd_logdbg("unable to create libnl cache %d %m", ret, errno); + goto out; + } + link = rtnl_link_get_by_name(cache, get_ifname()); + if (!link) { + 
nd_logdbg("unable to get libnl link %d %m", ret, errno); + goto out; + } + map = rtnl_link_vlan_get_egress_map(link, &len); + if (!map || !len) { + nd_logdbg("no egress map found %d %p",len, map); + goto out; + } + for (int i = 0; i < len; i++) { + m_class_prio_map[map[i].vm_from] = map[i].vm_to; + } +out: + if (cache) { + nl_cache_free(cache); + } + if (nl_socket) { + nl_socket_handle_free(nl_socket); + } +#else + nd_logdbg("libnl3 not found, cannot read engress map, " + "SO_PRIORITY will not work properly"); +#endif +} + +ring* net_device_val_eth::create_ring(resource_allocation_key *key) +{ + ring* ring = NULL; + + // if this is a ring profile key get the profile from the global map + if (key->get_ring_profile_key()) { + if (!g_p_ring_profile) { + nd_logdbg("could not find ring profile"); + return NULL; + } + ring_profile *prof = + g_p_ring_profile->get_profile(key->get_ring_profile_key()); + if (prof == NULL) { + nd_logerr("could not find ring profile %d", + key->get_ring_profile_key()); + return NULL; + } + try { + switch (prof->get_ring_type()) { +#ifdef HAVE_MP_RQ + case VMA_RING_CYCLIC_BUFFER: + ring = new ring_eth_cb(get_if_idx(), + &prof->get_desc()->ring_cyclicb, + key->get_memory_descriptor()); + break; +#endif + case VMA_RING_EXTERNAL_MEM: + ring = new ring_eth_direct(get_if_idx(), + &prof->get_desc()->ring_ext); + break; + default: + nd_logdbg("Unknown ring type"); + break; + } + } catch (vma_error &error) { + nd_logdbg("failed creating ring %s", error.message); + } + } else { + try { + switch (m_bond) { + case NO_BOND: + ring = new ring_eth(get_if_idx()); + break; + case ACTIVE_BACKUP: + case LAG_8023ad: + ring = new ring_bond_eth(get_if_idx()); + break; + case NETVSC: + ring = new ring_bond_netvsc(get_if_idx()); + break; + default: + nd_logdbg("Unknown ring type"); + break; + } + } catch (vma_error &error) { + nd_logdbg("failed creating ring %s", error.message); + } + } + return ring; +} + +L2_address* net_device_val_eth::create_L2_address(const 
char* ifname)
+{
+	// Build a fresh ETH L2 (MAC) address object for ifname, releasing any
+	// previously cached address first. Caller owns the returned object.
+	if (m_p_L2_addr) {
+		delete m_p_L2_addr;
+		m_p_L2_addr = NULL;
+	}
+	unsigned char hw_addr[ETH_ALEN];
+	get_local_ll_addr(ifname, hw_addr, ETH_ALEN, false);
+	return new ETH_addr(hw_addr);
+}
+
+// (Re)build the cached broadcast L2 address for this ETH interface.
+void net_device_val_eth::create_br_address(const char* ifname)
+{
+	if (m_p_br_addr) {
+		delete m_p_br_addr;
+		m_p_br_addr = NULL;
+	}
+	uint8_t hw_addr[ETH_ALEN];
+	get_local_ll_addr(ifname, hw_addr, ETH_ALEN, true);
+	m_p_br_addr = new ETH_addr(hw_addr);
+
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (m_p_br_addr == NULL) {
+		nd_logpanic("m_p_br_addr allocation error");
+	}
+	BULLSEYE_EXCLUDE_BLOCK_END
+}
+
+std::string net_device_val_eth::to_str()
+{
+	return std::string("ETH: " + net_device_val::to_str());
+}
+
+net_device_val_ib::~net_device_val_ib()
+{
+	// Drop the broadcast-neighbour observer registered in configure().
+	struct in_addr in;
+	if (1 == inet_pton(AF_INET, BROADCAST_IP, &in)) {
+		g_p_neigh_table_mgr->unregister_observer(neigh_key(ip_address(in.s_addr), this), this);
+	}
+}
+
+void net_device_val_ib::configure()
+{
+	ib_ctx_handler* p_ib_ctx = NULL;
+	struct in_addr in;
+
+	m_p_L2_addr = create_L2_address(get_ifname());
+
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (m_p_L2_addr == NULL) {
+		nd_logpanic("m_p_L2_addr allocation error");
+	}
+	BULLSEYE_EXCLUDE_BLOCK_END
+
+	create_br_address(get_ifname());
+
+	// Unregister first so a repeated configure() cannot leave a duplicate
+	// observer registration behind.
+	if (1 == inet_pton(AF_INET, BROADCAST_IP, &in)) {
+		g_p_neigh_table_mgr->unregister_observer(neigh_key(ip_address(in.s_addr), this), this);
+	}
+
+	//Register to IB BR neigh
+	// NOTE(review): the template arguments on the next lines were stripped in
+	// the incoming text ("cache_entry_subject* p_ces" / "dynamic_cast(p_ces)",
+	// which is not valid C++); restored to match neigh_table_mgr's observer
+	// interface - confirm against the project headers.
+	cache_entry_subject<neigh_key, class neigh_val*>* p_ces = NULL;
+	if (1 == inet_pton(AF_INET, BROADCAST_IP, &in)) {
+		g_p_neigh_table_mgr->register_observer(neigh_key(ip_address(in.s_addr), this), this, &p_ces);
+	}
+	m_br_neigh = dynamic_cast<neigh_ib_broadcast*>(p_ces);
+
+	p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link());
+	if (!p_ib_ctx || ibv_query_pkey(p_ib_ctx->get_ibv_context(), get_port_from_ifname(get_ifname_link()), 0, &m_pkey)) {
+		nd_logerr("failed querying pkey");
+	}
+	nd_logdbg("pkey: %d", m_pkey);
+}
+
+ring*
net_device_val_ib::create_ring(resource_allocation_key *key) +{ + ring* ring = NULL; + + NOT_IN_USE(key); + try { + switch (m_bond) { + case NO_BOND: + ring = new ring_ib(get_if_idx()); + break; + case ACTIVE_BACKUP: + case LAG_8023ad: + ring = new ring_bond_ib(get_if_idx()); + break; + default: + nd_logdbg("Unknown ring type"); + break; + } + } catch (vma_error &error) { + nd_logdbg("failed creating ring %s", error.message); + } + + return ring; +} + +L2_address* net_device_val_ib::create_L2_address(const char* ifname) +{ + if (m_p_L2_addr) { + delete m_p_L2_addr; + m_p_L2_addr = NULL; + } + unsigned char hw_addr[IPOIB_HW_ADDR_LEN]; + get_local_ll_addr(ifname, hw_addr, IPOIB_HW_ADDR_LEN, false); + return new IPoIB_addr(hw_addr); +} + +void net_device_val_ib::create_br_address(const char* ifname) +{ + if (m_p_br_addr) { + delete m_p_br_addr; + m_p_br_addr = NULL; + } + unsigned char hw_addr[IPOIB_HW_ADDR_LEN]; + get_local_ll_addr(ifname, hw_addr, IPOIB_HW_ADDR_LEN, true); + m_p_br_addr = new IPoIB_addr(hw_addr); + BULLSEYE_EXCLUDE_BLOCK_START + if (m_p_br_addr == NULL) { + nd_logpanic("m_p_br_addr allocation error"); + } + BULLSEYE_EXCLUDE_BLOCK_END +} + +std::string net_device_val_ib::to_str() +{ + return std::string("IB: " + net_device_val::to_str()); +} + + +bool net_device_val::verify_bond_ipoib_or_eth_qp_creation() +{ + char slaves[IFNAMSIZ * MAX_SLAVES] = {0}; + if (!get_bond_slaves_name_list(get_ifname_link(), slaves, sizeof slaves)) { + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded, slave list or bond name could not be found\n", get_ifname()); + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + return false; + } + //go over all slaves and check preconditions + bool bond_ok = true; + char* slave_name; + char* save_ptr; + 
slave_name = strtok_r(slaves, " ", &save_ptr); + while (slave_name != NULL) + { + char* p = strchr(slave_name, '\n'); + if (p) *p = '\0'; // Remove the tailing 'new line" char + if (!verify_ipoib_or_eth_qp_creation(slave_name)) { + //check all slaves but print only once for bond + bond_ok = false; + } + slave_name = strtok_r(NULL, " ", &save_ptr); + } + if (!bond_ok) { + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + vlog_printf(VLOG_WARNING,"* Bond %s will not be offloaded due to problem with its slaves.\n", get_ifname()); + vlog_printf(VLOG_WARNING,"* Check warning messages for more information.\n"); + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + } else { + /* + * Print warning message while bond device contains two slaves of the same HCA + * while RoCE LAG is enabled for both slaves. + */ + sys_image_guid_map_t::iterator guid_iter; + for (guid_iter = m_sys_image_guid_map.begin(); guid_iter != m_sys_image_guid_map.end(); guid_iter++) { + char bond_roce_lag_path[256] = {0}; + if (guid_iter->second.size() > 1 && + check_bond_roce_lag_exist(bond_roce_lag_path, sizeof(bond_roce_lag_path), guid_iter->second.front().c_str()) && + check_bond_roce_lag_exist(bond_roce_lag_path, sizeof(bond_roce_lag_path), guid_iter->second.back().c_str())) { + print_roce_lag_warnings(get_ifname_link(), bond_roce_lag_path, guid_iter->second.front().c_str(), guid_iter->second.back().c_str()); + } + } + } + return bond_ok; +} + +//interface name can be slave while ifa struct can describe bond +bool net_device_val::verify_ipoib_or_eth_qp_creation(const char* interface_name) +{ + if (m_type == ARPHRD_INFINIBAND) { + if (verify_enable_ipoib(interface_name) && verify_qp_creation(interface_name, IBV_QPT_UD)) { + return true; + } + } else { + if (verify_qp_creation(interface_name, IBV_QPT_RAW_PACKET)) { + return true; 
+ } + } + return false; +} + +bool net_device_val::verify_enable_ipoib(const char* interface_name) +{ + char filename[256] = "\0"; + char ifname[IFNAMSIZ] = "\0"; + NOT_IN_USE(interface_name); // Suppress --enable-opt-log=high warning + + if(!safe_mce_sys().enable_ipoib) { + nd_logdbg("Blocking offload: IPoIB interfaces ('%s')", interface_name); + return false; + } + +#ifndef DEFINED_IBV_QP_INIT_SOURCE_QPN + // Note: mlx4 does not support this capability + ib_ctx_handler* ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link()); + if (!ib_ctx->is_mlx4()) { + nd_logwarn("Blocking offload: SOURCE_QPN is not supported for this driver ('%s')", interface_name); + return false; + } +#endif + + // Verify IPoIB is in 'datagram mode' for proper VMA with flow steering operation + if (validate_ipoib_prop(get_ifname(), m_flags, IPOIB_MODE_PARAM_FILE, "datagram", 8, filename, ifname)) { + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + vlog_printf(VLOG_WARNING,"* IPoIB mode of interface '%s' is \"connected\" !\n", get_ifname()); + vlog_printf(VLOG_WARNING,"* Please change it to datagram: \"echo datagram > %s\" before loading your application with VMA library\n", filename); + vlog_printf(VLOG_WARNING,"* VMA doesn't support IPoIB in connected mode.\n"); + vlog_printf(VLOG_WARNING,"* Please refer to VMA Release Notes for more information\n"); + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + return false; + } + else { + nd_logdbg("verified interface '%s' is running in datagram mode", get_ifname()); + } + + // Verify umcast is disabled for IB flow + if (validate_ipoib_prop(get_ifname(), m_flags, UMCAST_PARAM_FILE, "0", 1, filename, ifname)) { // Extract UMCAST flag (only for IB transport types) + 
vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + vlog_printf(VLOG_WARNING,"* UMCAST flag is Enabled for interface %s !\n", get_ifname()); + vlog_printf(VLOG_WARNING,"* Please disable it: \"echo 0 > %s\" before loading your application with VMA library\n", filename); + vlog_printf(VLOG_WARNING,"* This option in no longer needed in this version\n"); + vlog_printf(VLOG_WARNING,"* Please refer to Release Notes for more information\n"); + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + return false; + } + else { + nd_logdbg("verified interface '%s' is running with umcast disabled", get_ifname()); + } + + return true; +} + +//ifname should point to a physical device +bool net_device_val::verify_qp_creation(const char* ifname, enum ibv_qp_type qp_type) +{ + bool success = false; + char bond_roce_lag_path[256] = {0}; + struct ibv_cq* cq = NULL; + struct ibv_comp_channel *channel = NULL; + struct ibv_qp* qp = NULL; + + vma_ibv_qp_init_attr qp_init_attr; + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + + vma_ibv_cq_init_attr attr; + memset(&attr, 0, sizeof(attr)); + + qp_init_attr.cap.max_send_wr = MCE_DEFAULT_TX_NUM_WRE; + qp_init_attr.cap.max_recv_wr = MCE_DEFAULT_RX_NUM_WRE; + qp_init_attr.cap.max_inline_data = MCE_DEFAULT_TX_MAX_INLINE; + qp_init_attr.cap.max_send_sge = MCE_DEFAULT_TX_NUM_SGE; + qp_init_attr.cap.max_recv_sge = MCE_DEFAULT_RX_NUM_SGE; + qp_init_attr.sq_sig_all = 0; + qp_init_attr.qp_type = qp_type; + + //find ib_cxt + char base_ifname[IFNAMSIZ]; + get_base_interface_name((const char*)(ifname), base_ifname, sizeof(base_ifname)); + int port_num = get_port_from_ifname(base_ifname); + ib_ctx_handler* p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(base_ifname); + + if (!p_ib_ctx) { + nd_logdbg("Cant find ib_ctx for interface %s", base_ifname); + if (qp_type == 
IBV_QPT_RAW_PACKET && m_bond != NO_BOND) { + if (check_bond_roce_lag_exist(bond_roce_lag_path, sizeof(bond_roce_lag_path), ifname)) { + print_roce_lag_warnings(get_ifname_link(), bond_roce_lag_path); + } else if ((p_ib_ctx = g_p_ib_ctx_handler_collection->get_ib_ctx(get_ifname_link())) + && strstr(p_ib_ctx->get_ibname(), "bond")) { + print_roce_lag_warnings(get_ifname_link()); + } + } + goto release_resources; + } else if (port_num > p_ib_ctx->get_ibv_device_attr()->phys_port_cnt) { + nd_logdbg("Invalid port for interface %s", base_ifname); + if (qp_type == IBV_QPT_RAW_PACKET && m_bond != NO_BOND && p_ib_ctx->is_mlx4()) { + print_roce_lag_warnings(get_ifname_link()); + } + goto release_resources; + } + + // Add to guid map in order to detect roce lag issue + if (qp_type == IBV_QPT_RAW_PACKET && m_bond != NO_BOND) { + m_sys_image_guid_map[p_ib_ctx->get_ibv_device_attr()->sys_image_guid].push_back(base_ifname); + } + + //create qp resources + channel = ibv_create_comp_channel(p_ib_ctx->get_ibv_context()); + if (!channel) { + nd_logdbg("channel creation failed for interface %s (errno=%d %m)", ifname, errno); + goto release_resources; + } + VALGRIND_MAKE_MEM_DEFINED(channel, sizeof(ibv_comp_channel)); + cq = vma_ibv_create_cq(p_ib_ctx->get_ibv_context(), safe_mce_sys().tx_num_wr, (void*)this, channel, 0, &attr); + if (!cq) { + nd_logdbg("cq creation failed for interface %s (errno=%d %m)", ifname, errno); + goto release_resources; + } + + vma_ibv_qp_init_attr_comp_mask(p_ib_ctx->get_ibv_pd(), qp_init_attr); + qp_init_attr.recv_cq = cq; + qp_init_attr.send_cq = cq; + + // Set source qpn for non mlx4 IPoIB devices + if (qp_type == IBV_QPT_UD && !p_ib_ctx->is_mlx4()) { + unsigned char hw_addr[IPOIB_HW_ADDR_LEN]; + get_local_ll_addr(ifname, hw_addr, IPOIB_HW_ADDR_LEN, false); + IPoIB_addr ipoib_addr(hw_addr); + ibv_source_qpn_set(qp_init_attr, ipoib_addr.get_qpn()); + } + + qp = vma_ibv_create_qp(p_ib_ctx->get_ibv_pd(), &qp_init_attr); + if (qp) { + if (qp_type == 
IBV_QPT_UD && priv_ibv_create_flow_supported(qp, port_num) == -1) { + nd_logdbg("Create_ibv_flow failed on interface %s (errno=%d %m), Traffic will not be offloaded", ifname, errno); + goto qp_failure; + } else { + success = true; + + if (qp_type == IBV_QPT_RAW_PACKET && !priv_ibv_query_flow_tag_supported(qp, port_num)) { + p_ib_ctx->set_flow_tag_capability(true); + } + nd_logdbg("verified interface %s for flow tag capabilities : %s", ifname, p_ib_ctx->get_flow_tag_capability() ? "enabled" : "disabled"); + + if (qp_type == IBV_QPT_RAW_PACKET && p_ib_ctx->is_packet_pacing_supported() && !priv_ibv_query_burst_supported(qp, port_num)) { + p_ib_ctx->set_burst_capability(true); + } + nd_logdbg("verified interface %s for burst capabilities : %s", ifname, p_ib_ctx->get_burst_capability() ? "enabled" : "disabled"); + } + } else { + nd_logdbg("QP creation failed on interface %s (errno=%d %m), Traffic will not be offloaded", ifname, errno); +qp_failure: + int err = errno; //verify_raw_qp_privliges can overwrite errno so keep it before the call + if (validate_raw_qp_privliges() == 0) { + // MLNX_OFED raw_qp_privliges file exist with bad value + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded.\n", ifname); + vlog_printf(VLOG_WARNING,"* Working in this mode might causes VMA malfunction over Ethernet/InfiniBand interfaces\n"); + vlog_printf(VLOG_WARNING,"* WARNING: the following steps will restart your network interface!\n"); + vlog_printf(VLOG_WARNING,"* 1. \"echo options ib_uverbs disable_raw_qp_enforcement=1 > /etc/modprobe.d/ib_uverbs.conf\"\n"); + vlog_printf(VLOG_WARNING,"* 2. 
Restart openibd or rdma service depending on your system configuration\n"); + vlog_printf(VLOG_WARNING,"* Read the RAW_PACKET QP root access enforcement section in the VMA's User Manual for more information\n"); + vlog_printf(VLOG_WARNING,"******************************************************************************************************\n"); + } + else if (validate_user_has_cap_net_raw_privliges() == 0 || err == EPERM) { + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded.\n", ifname); + vlog_printf(VLOG_WARNING,"* Offloaded resources are restricted to root or user with CAP_NET_RAW privileges\n"); + vlog_printf(VLOG_WARNING,"* Read the CAP_NET_RAW and root access section in the VMA's User Manual for more information\n"); + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + } else { + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded.\n", ifname); + vlog_printf(VLOG_WARNING,"* VMA was not able to create QP for this device (errno = %d).\n", err); + vlog_printf(VLOG_WARNING,"*******************************************************************************************************\n"); + } + } + +release_resources: + if(qp) { + IF_VERBS_FAILURE(ibv_destroy_qp(qp)) { + nd_logdbg("qp destroy failed on interface %s (errno=%d %m)", ifname, errno); + success = false; + } ENDIF_VERBS_FAILURE; + } + if (cq) { + IF_VERBS_FAILURE(ibv_destroy_cq(cq)) { + nd_logdbg("cq destroy failed on interface %s (errno=%d %m)", ifname, errno); + success = false; + } ENDIF_VERBS_FAILURE; + } + if (channel) { + IF_VERBS_FAILURE(ibv_destroy_comp_channel(channel)) { + nd_logdbg("channel destroy failed on interface %s 
(errno=%d %m)", ifname, errno); + success = false; + } ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(channel, sizeof(ibv_comp_channel)); + } + return success; +} diff --git a/src/vma/dev/net_device_val.h b/src/vma/dev/net_device_val.h new file mode 100644 index 0000000..46b5bdf --- /dev/null +++ b/src/vma/dev/net_device_val.h @@ -0,0 +1,379 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef NET_DEVICE_VAL_H +#define NET_DEVICE_VAL_H + + +#include +#include +#include +#include +#include +#include + +#include "utils/lock_wrapper.h" +#include "vma/util/sys_vars.h" +#include "vma/event/event_handler_ibverbs.h" +#include "vma/event/event_handler_rdma_cm.h" +#include "vma/dev/ib_ctx_handler.h" +#include "vma/proto/neighbour_observer.h" +#include "vma/proto/L2_address.h" +#include "vma/infra/cache_subject_observer.h" + + +class L2_address; +class ring; +class ib_ctx_handler; +class neigh_ib_broadcast; + +#define RING_ALLOC_STR_SIZE 256 +class ring_alloc_logic_attr +{ +public: + ring_alloc_logic_attr(); + ring_alloc_logic_attr(ring_logic_t ring_logic); + ring_alloc_logic_attr(const ring_alloc_logic_attr &other); + void set_ring_alloc_logic(ring_logic_t logic); + void set_ring_profile_key(vma_ring_profile_key profile); + void set_memory_descriptor(iovec &mem_desc); + void set_user_id_key(uint64_t user_id_key); + inline ring_logic_t get_ring_alloc_logic() { return m_ring_alloc_logic;} + inline vma_ring_profile_key get_ring_profile_key() { return m_ring_profile_key;} + inline iovec* get_memory_descriptor() { return &m_mem_desc;} + inline uint64_t get_user_id_key() { return m_user_id_key;} + + bool operator==(const ring_alloc_logic_attr& other) const + { + return (m_ring_alloc_logic == other.m_ring_alloc_logic && + m_ring_profile_key == other.m_ring_profile_key && + m_user_id_key == other.m_user_id_key && + m_mem_desc.iov_base == other.m_mem_desc.iov_base && + m_mem_desc.iov_len == other.m_mem_desc.iov_len); + } + + bool operator!=(const ring_alloc_logic_attr& other) const + { + return !(*this == other); + } + + ring_alloc_logic_attr& operator=(const ring_alloc_logic_attr& other) + { + if (this != &other) { + m_ring_alloc_logic = other.m_ring_alloc_logic; + m_ring_profile_key = other.m_ring_profile_key; + m_user_id_key = other.m_user_id_key; + m_hash = other.m_hash; + m_mem_desc.iov_base = other.m_mem_desc.iov_base; + m_mem_desc.iov_len = 
other.m_mem_desc.iov_len; + snprintf(m_str, RING_ALLOC_STR_SIZE, "%s", other.m_str); + } + return *this; + } + + const char* to_str() const + { + + return m_str; + } + + size_t operator()(const ring_alloc_logic_attr *key) const + { + return key->m_hash; + } + + bool operator()(const ring_alloc_logic_attr *k1, const ring_alloc_logic_attr *k2) const + { + return *k1 == *k2; + } +private: + size_t m_hash; + /* ring allocation logic , per thread per fd ... */ + ring_logic_t m_ring_alloc_logic; + /* key in g_p_ring_profile */ + vma_ring_profile_key m_ring_profile_key; + /* either user_idx or key as defined in ring_logic_t */ + uint64_t m_user_id_key; + char m_str[RING_ALLOC_STR_SIZE]; + iovec m_mem_desc; + void init(); +}; + +typedef ring_alloc_logic_attr resource_allocation_key; +// each ring has a ref count +typedef std::tr1::unordered_map, ring_alloc_logic_attr, ring_alloc_logic_attr> rings_hash_map_t; +typedef std::tr1::unordered_map > sys_image_guid_map_t; +typedef std::tr1::unordered_map ,ring_alloc_logic_attr, ring_alloc_logic_attr> rings_key_redirection_hash_map_t; + +#define THE_RING ring_iter->second.first +#define GET_THE_RING(key) m_h_ring_map[key].first +#define RING_REF_CNT ring_iter->second.second +#define ADD_RING_REF_CNT RING_REF_CNT++ +#define DEC_RING_REF_CNT RING_REF_CNT-- +#define TEST_REF_CNT_ZERO RING_REF_CNT==0 + +#define MAX_SLAVES 16 + +typedef struct slave_data { + int if_index; + ib_ctx_handler* p_ib_ctx; + int port_num; + L2_address* p_L2_addr; + bool active; + slave_data(int _if_index) : + if_index(_if_index), p_ib_ctx(NULL), port_num(-1), p_L2_addr(NULL), active(false) {} + ~slave_data() { + delete p_L2_addr; + p_L2_addr = NULL; + } +} slave_data_t; + +typedef std::vector slave_data_vector_t; + +typedef struct ip_data { + int flags; + in_addr_t local_addr; + in_addr_t netmask; + ip_data() { + flags = 0; + local_addr = 0; + netmask = 0; + } + ~ip_data() { + flags = 0; + local_addr = 0; + netmask = 0; + } +} ip_data_t; + +typedef std::vector 
ip_data_vector_t; + +#define VMA_DEFAULT_ENGRESS_MAP_PRIO (0) +typedef std::tr1::unordered_map tc_class_priority_map; +/* + * Represents Offloading capable device such as eth4, ib1, eth3.5, eth5:6 + */ +class net_device_val +{ +public: + enum state { + DOWN, + UP, + RUNNING, + INVALID + }; + enum bond_type { + NO_BOND, + ACTIVE_BACKUP, + LAG_8023ad, + NETVSC + }; + enum bond_xmit_hash_policy { + XHP_LAYER_2, + XHP_LAYER_3_4, + XHP_LAYER_2_3, + XHP_ENCAP_2_3, + XHP_ENCAP_3_4 + }; + struct net_device_val_desc { + struct nlmsghdr *nl_msg; + }; +public: + + net_device_val(struct net_device_val_desc *desc); + /* on init: + * get ibv, sys channel handlers from the relevant collections. + * register to ibv_ctx, rdma_cm and sys_net_channel + * + * */ + virtual ~net_device_val(); + + inline void set_type(int type) { m_type = type; } + inline void set_if_idx(int if_idx) { m_if_idx = if_idx; } + inline void set_flags(int flags) { m_flags = flags; } + inline void set_mtu(int mtu) { m_mtu = mtu; } + inline void set_if_link(int if_link) { m_if_link = if_link; } + inline void set_ifname(char *ifname) { + m_name = ifname; + get_base_interface_name(ifname, m_base_name, sizeof(m_base_name)); + } + inline void set_l2_if_addr(uint8_t *addr, size_t size) { + memcpy(m_l2_if_addr, addr, std::min(sizeof(m_l2_if_addr), size)); + } + inline void set_l2_bc_addr(uint8_t *addr, size_t size) { + memcpy(m_l2_bc_addr, addr, std::min(sizeof(m_l2_bc_addr), size)); + } + void set_ip_array(); + + inline int get_type() { return m_type; } + inline int get_if_idx() { return m_if_idx; } + inline int get_flags() { return m_flags; } + inline int get_mtu() { return m_mtu; } + inline char* get_ifname() { return (char *)m_name.c_str(); } + inline char* get_ifname_link() { return m_base_name; } + inline uint8_t* get_l2_if_addr() { return m_l2_if_addr; } + const ip_data_vector_t& get_ip_array() const { return m_ip; } + const slave_data_vector_t& get_slave_array() const { return m_slaves; } + const slave_data_t* 
get_slave(int if_index); + + void set_str(); + void print_val(); + + ring* reserve_ring(resource_allocation_key*); // create if not exists + bool release_ring(resource_allocation_key*); // delete from m_hash if ref_cnt == 0 + state get_state() const { return m_state; } // not sure, look at state init at c'tor + virtual std::string to_str(); + inline void set_transport_type(transport_type_t value) { m_transport_type = value; } + transport_type_t get_transport_type() const { return m_transport_type; } + bool update_active_backup_slaves(); + in_addr_t get_local_addr() { return m_ip[0]->local_addr; } // Valid object must have at least one address + int global_ring_poll_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array = NULL); + int global_ring_request_notification(uint64_t poll_sn) ; + int ring_drain_and_proccess(); + void ring_adapt_cq_moderation(); + L2_address* get_l2_address() { return m_p_L2_addr; }; + L2_address* get_br_address() { return m_p_br_addr; }; + inline bond_type get_is_bond() { return m_bond; } + inline bond_xmit_hash_policy get_bond_xmit_hash_policy() { return m_bond_xmit_hash_policy; } + bool update_active_slaves(); + void update_netvsc_slaves(int if_index, int if_flags); + void register_to_ibverbs_events(event_handler_ibverbs *handler); + void unregister_to_ibverbs_events(event_handler_ibverbs *handler); + int get_priority_by_tc_class(uint32_t tc_class); +protected: + + void set_slave_array(); + virtual ring* create_ring(resource_allocation_key *key) = 0; + virtual void create_br_address(const char* ifname) = 0; + virtual L2_address* create_L2_address(const char* ifname) = 0; + + L2_address* m_p_L2_addr; + L2_address* m_p_br_addr; + transport_type_t m_transport_type; + lock_mutex_recursive m_lock; + rings_hash_map_t m_h_ring_map; + sys_image_guid_map_t m_sys_image_guid_map; + rings_key_redirection_hash_map_t m_h_ring_key_redirection_map; + + state m_state; /* device current state */ + bond_type m_bond; /* type of the device as 
simple, bond, etc */ + slave_data_vector_t m_slaves; /* array of slaves */ + int m_if_active; /* ifindex of active slave (only for active-backup) */ + bond_xmit_hash_policy m_bond_xmit_hash_policy; + int m_bond_fail_over_mac; + tc_class_priority_map m_class_prio_map; + +private: + void verify_bonding_mode(); + bool verify_qp_creation(const char* ifname, enum ibv_qp_type qp_type); + bool verify_bond_ipoib_or_eth_qp_creation(); + bool verify_ipoib_or_eth_qp_creation(const char* interface_name); + bool verify_enable_ipoib(const char* ifname); + + resource_allocation_key* ring_key_redirection_reserve(resource_allocation_key *key); + resource_allocation_key* get_ring_key_redirection(resource_allocation_key *key); + void ring_key_redirection_release(resource_allocation_key *key); + + bool get_up_and_active_slaves(bool* up_and_active_slaves, size_t size); + + /* See: RFC 3549 2.3.3.1. */ + int m_if_idx; /* Uniquely identifies interface (not unique: eth4 and eth4:5 has the same idx) */ + int m_type; /* This defines the type of the link. */ + int m_flags; /* Device Flags (IFF_x). */ + int m_mtu; /* MTU of the device. */ + int m_if_link; /* ifindex of link to which this device is bound */ + uint8_t m_l2_if_addr[20]; /* hardware L2 interface address */ + uint8_t m_l2_bc_addr[20]; /* hardware L2 broadcast address */ + + /* See: RFC 3549 2.3.3.2. 
*/ + ip_data_vector_t m_ip; /* vector of ip addresses */ + + std::string m_name; /* container for ifname */ + char m_str[BUFF_SIZE]; /* detailed information about device */ + char m_base_name[IFNAMSIZ]; /* base name of device basing ifname */ +}; + +class net_device_val_eth : public net_device_val +{ +public: + net_device_val_eth(struct net_device_val_desc *desc) : net_device_val(desc), m_vlan(0) { + set_transport_type(VMA_TRANSPORT_ETH); + if (INVALID != get_state()) { + set_slave_array(); + configure(); + } + } + uint16_t get_vlan() {return m_vlan;} + std::string to_str(); + +protected: + virtual ring* create_ring(resource_allocation_key *key); + void parse_prio_egress_map(); +private: + void configure(); + L2_address* create_L2_address(const char* ifname); + void create_br_address(const char* ifname); + uint16_t m_vlan; +}; + + +class net_device_val_ib : public net_device_val, public neigh_observer, public cache_observer +{ +public: + net_device_val_ib(struct net_device_val_desc *desc) : net_device_val(desc), m_pkey(0), m_br_neigh(NULL) { + set_transport_type(VMA_TRANSPORT_IB); + if (INVALID != get_state()) { + set_slave_array(); + configure(); + } + } + ~net_device_val_ib(); + + std::string to_str(); + uint16_t get_pkey() { return m_pkey; } + const neigh_ib_broadcast* get_br_neigh() {return m_br_neigh;} + virtual transport_type_t get_obs_transport_type() const {return get_transport_type();} + +protected: + ring* create_ring(resource_allocation_key *key); + +private: + void configure(); + L2_address* create_L2_address(const char* ifname); + void create_br_address(const char* ifname); + uint16_t m_pkey; + neigh_ib_broadcast* m_br_neigh; +}; + + +#endif diff --git a/src/vma/dev/qp_mgr.cpp b/src/vma/dev/qp_mgr.cpp new file mode 100644 index 0000000..4a78bbe --- /dev/null +++ b/src/vma/dev/qp_mgr.cpp @@ -0,0 +1,870 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "qp_mgr.h" +#include "utils/bullseye.h" +#include "vma/util/utils.h" +#include "vma/util/valgrind.h" +#include "vma/util/instrumentation.h" +#include "vma/iomux/io_mux_call.h" +#include "buffer_pool.h" +#include "cq_mgr.h" +#include "ring_simple.h" +#include "util/valgrind.h" + +#undef MODULE_NAME +#define MODULE_NAME "qpm" + +#define qp_logpanic __log_info_panic +#define qp_logerr __log_info_err +#define qp_logwarn __log_info_warn +#define qp_loginfo __log_info_info +#define qp_logdbg __log_info_dbg +#define qp_logfunc __log_info_func +#define qp_logfuncall __log_info_funcall + + +//#define ALIGN_WR_UP(_num_wr_) (max(32, ((_num_wr_ + 0xf) & ~(0xf)))) +#define ALIGN_WR_DOWN(_num_wr_) (max(32, ((_num_wr_ ) & ~(0xf)))) + +#define FICTIVE_REMOTE_QPN 0x48 +#define FICTIVE_REMOTE_QKEY 0x01234567 +#define FICTIVE_AH_SL 5 +#define FICTIVE_AH_DLID 0x3 + +#define MAX_UPSTREAM_CQ_MSHV_SIZE 8192 + +qp_mgr::qp_mgr(const ring_simple* p_ring, const ib_ctx_handler* p_context, + const uint8_t port_num, const uint32_t tx_num_wr): + m_qp(NULL) + ,m_rq_wqe_idx_to_wrid(NULL) + ,m_p_ring((ring_simple*)p_ring) + ,m_port_num((uint8_t)port_num) + ,m_p_ib_ctx_handler((ib_ctx_handler*)p_context) + ,m_max_qp_wr(0) + ,m_p_cq_mgr_rx(NULL) + ,m_p_cq_mgr_tx(NULL) + ,m_rx_num_wr(safe_mce_sys().rx_num_wr) + ,m_tx_num_wr(tx_num_wr) + ,m_hw_dummy_send_support(false) + ,m_n_sysvar_rx_num_wr_to_post_recv(safe_mce_sys().rx_num_wr_to_post_recv) + ,m_n_sysvar_tx_num_wr_to_signal(safe_mce_sys().tx_num_wr_to_signal) + ,m_n_sysvar_rx_prefetch_bytes_before_poll(safe_mce_sys().rx_prefetch_bytes_before_poll) + ,m_curr_rx_wr(0) + ,m_last_posted_rx_wr_id(0) + ,m_n_unsignaled_count(0) + ,m_p_last_tx_mem_buf_desc(NULL) + ,m_p_prev_rx_desc_pushed(NULL) + ,m_n_ip_id_base(0) + ,m_n_ip_id_offset(0) +{ +#ifdef DEFINED_TSO + memset(&m_qp_cap, 0, sizeof(m_qp_cap)); + m_qp_cap.max_inline_data = safe_mce_sys().tx_max_inline; + m_qp_cap.max_send_sge = (m_p_ring->is_tso() ? 
+ m_p_ib_ctx_handler->get_ibv_device_attr()->max_sge : MCE_DEFAULT_TX_NUM_SGE); + m_qp_cap.max_recv_sge = (m_p_ring->is_socketxtreme()) ? 1 : MCE_DEFAULT_RX_NUM_SGE; +#else + m_max_inline_data = 0; +#endif /* DEFINED_TSO */ + + m_ibv_rx_sg_array = new ibv_sge[m_n_sysvar_rx_num_wr_to_post_recv]; + m_ibv_rx_wr_array = new ibv_recv_wr[m_n_sysvar_rx_num_wr_to_post_recv]; + + set_unsignaled_count(); + memset(&m_rate_limit, 0, sizeof(struct vma_rate_limit_t)); + + qp_logfunc(""); +} + +qp_mgr::~qp_mgr() +{ + qp_logfunc(""); + + qp_logdbg("calling ibv_destroy_qp(qp=%p)", m_qp); + if (m_qp) { + IF_VERBS_FAILURE_EX(ibv_destroy_qp(m_qp), EIO) { + qp_logdbg("QP destroy failure (errno = %d %m)", -errno); + } ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(m_qp, sizeof(ibv_qp)); + } + m_qp = NULL; + + if (m_p_cq_mgr_tx) { + delete m_p_cq_mgr_tx; + m_p_cq_mgr_tx = NULL; + } + if (m_p_cq_mgr_rx) { + delete m_p_cq_mgr_rx; + m_p_cq_mgr_rx = NULL; + } + + delete[] m_ibv_rx_sg_array; + delete[] m_ibv_rx_wr_array; + + qp_logdbg("Rx buffer poll: %d free global buffers available", g_buffer_pool_rx->get_free_count()); + qp_logdbg("delete done"); +} + +cq_mgr* qp_mgr::handle_cq_initialization(uint32_t *num_wr, struct ibv_comp_channel* comp_event_channel, bool is_rx) +{ + qp_logfunc(""); + cq_mgr* cq = NULL; + + try { + cq = new cq_mgr(m_p_ring, m_p_ib_ctx_handler, *num_wr, comp_event_channel, is_rx); + } catch (vma_exception& e) { + // This is a workaround for an issue with cq creation of mlx4 devices on + // upstream-driver VMs over Windows Hypervisor. + if (safe_mce_sys().hypervisor == mce_sys_var::HYPER_MSHV && m_p_ib_ctx_handler->is_mlx4() && + *num_wr > MAX_UPSTREAM_CQ_MSHV_SIZE) { + qp_logdbg("cq creation failed with cq_size of %d. 
retrying with size of %d", *num_wr, MAX_UPSTREAM_CQ_MSHV_SIZE); + *num_wr = MAX_UPSTREAM_CQ_MSHV_SIZE; + try { + cq = new cq_mgr(m_p_ring, m_p_ib_ctx_handler, *num_wr, comp_event_channel, is_rx); + } catch (vma_exception&) { + } + } + + if (!cq) { + qp_logerr("%s", e.message); + } + } + + return cq; +} + +cq_mgr* qp_mgr::init_rx_cq_mgr(struct ibv_comp_channel* p_rx_comp_event_channel) +{ + return handle_cq_initialization(&m_rx_num_wr, p_rx_comp_event_channel, true); +} + +cq_mgr* qp_mgr::init_tx_cq_mgr() +{ + return handle_cq_initialization(&m_tx_num_wr, m_p_ring->get_tx_comp_event_channel(), false); +} + +int qp_mgr::configure(struct ibv_comp_channel* p_rx_comp_event_channel) +{ + qp_logdbg("Creating QP of transport type '%s' on ibv device '%s' [%p] on port %d", + priv_vma_transport_type_str(m_p_ring->get_transport_type()), + m_p_ib_ctx_handler->get_ibname(), m_p_ib_ctx_handler->get_ibv_device(), m_port_num); + + // Check device capabilities for max QP work requests + m_max_qp_wr = ALIGN_WR_DOWN(m_p_ib_ctx_handler->get_ibv_device_attr()->max_qp_wr - 1); + if (m_rx_num_wr > m_max_qp_wr) { + qp_logwarn("Allocating only %d Rx QP work requests while user " + "requested %s=%d for QP on <%p, %d>", + m_max_qp_wr, SYS_VAR_RX_NUM_WRE, m_rx_num_wr, + m_p_ib_ctx_handler, m_port_num); + m_rx_num_wr = m_max_qp_wr; + } + + qp_logdbg("HW Dummy send support for QP = %d", m_hw_dummy_send_support); + + // Create associated Tx & Rx cq_mgrs + m_p_cq_mgr_tx = init_tx_cq_mgr(); + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_cq_mgr_tx) { + qp_logerr("Failed allocating m_p_cq_mgr_tx (errno=%d %m)", errno); + return -1; + } + m_p_cq_mgr_rx = init_rx_cq_mgr(p_rx_comp_event_channel); + if (!m_p_cq_mgr_rx) { + qp_logerr("Failed allocating m_p_cq_mgr_rx (errno=%d %m)", errno); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + // Modify the Rx and Tx cq_mgr to use a non-blocking event channel + set_fd_block_mode(m_p_cq_mgr_rx->get_channel_fd(), false); + 
set_fd_block_mode(m_p_cq_mgr_tx->get_channel_fd(), false); + + qp_logdbg("cq tx: %p rx: %p", m_p_cq_mgr_tx, m_p_cq_mgr_rx); + + // Create QP + vma_ibv_qp_init_attr qp_init_attr; + memset(&qp_init_attr, 0, sizeof(qp_init_attr)); + +#ifdef DEFINED_TSO + // TODO: m_tx_num_wr and m_rx_num_wr should be part of m_qp_cap + // and assigned as a result of ibv_query_qp() + m_qp_cap.max_send_wr = m_tx_num_wr; + m_qp_cap.max_recv_wr = m_rx_num_wr; + + memcpy(&qp_init_attr.cap, &m_qp_cap, sizeof(qp_init_attr.cap)); + qp_init_attr.recv_cq = m_p_cq_mgr_rx->get_ibv_cq_hndl(); + qp_init_attr.send_cq = m_p_cq_mgr_tx->get_ibv_cq_hndl(); + qp_init_attr.sq_sig_all = 0; + + // In case of enabled TSO we need to take into account amount of SGE together with header inline + // Per PRM maximum of CTRL + ETH + ETH_HEADER_INLINE+DATA_PTR*NUM_SGE+MAX_INLINE+INLINE_SIZE + // MLX5 return 32678 WQEBBs at max so minimal number + int max_wqe_sz = 16+14+m_p_ring->m_tso.max_header_sz+16*qp_init_attr.cap.max_send_sge+qp_init_attr.cap.max_inline_data+4; + int num_wr = 32678*64/max_wqe_sz; + qp_logdbg("calculated max_wqe_sz=%d num_wr=%d", max_wqe_sz, num_wr); + if (num_wr < (signed)m_tx_num_wr) { + qp_init_attr.cap.max_send_wr = num_wr; // force min for create_qp or you will have error of memory allocation + } + + qp_logdbg("Requested QP parameters: " + "wre: tx = %d rx = %d " + "sge: tx = %d rx = %d " + "inline: %d", + qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_recv_wr, + qp_init_attr.cap.max_send_sge, qp_init_attr.cap.max_recv_sge, + qp_init_attr.cap.max_inline_data); + + // Create the QP + if (prepare_ibv_qp(qp_init_attr)) { + return -1; + } + + qp_logdbg("Configured QP parameters: " + "wre: tx = %d rx = %d " + "sge: tx = %d rx = %d " + "inline: %d", + qp_init_attr.cap.max_send_wr, qp_init_attr.cap.max_recv_wr, + qp_init_attr.cap.max_send_sge, qp_init_attr.cap.max_recv_sge, + qp_init_attr.cap.max_inline_data); + + /* Check initial parameters with actual */ + enum ibv_qp_attr_mask attr_mask = 
IBV_QP_CAP; + struct ibv_qp_attr tmp_ibv_qp_attr; + struct ibv_qp_init_attr tmp_ibv_qp_init_attr; + IF_VERBS_FAILURE(ibv_query_qp(m_qp, &tmp_ibv_qp_attr, attr_mask, + &tmp_ibv_qp_init_attr)) { + qp_logerr("ibv_query_qp failed (errno=%d %m)", errno); + return -1; + } ENDIF_VERBS_FAILURE; + m_qp_cap.max_send_wr = min(tmp_ibv_qp_attr.cap.max_send_wr, m_qp_cap.max_send_wr); + m_qp_cap.max_recv_wr = min(tmp_ibv_qp_attr.cap.max_recv_wr, m_qp_cap.max_recv_wr); + m_qp_cap.max_send_sge = min(tmp_ibv_qp_attr.cap.max_send_sge, m_qp_cap.max_send_sge); + m_qp_cap.max_recv_sge = min(tmp_ibv_qp_attr.cap.max_recv_sge, m_qp_cap.max_recv_sge); + m_qp_cap.max_inline_data = min(tmp_ibv_qp_attr.cap.max_inline_data, m_qp_cap.max_inline_data); + + if ( m_qp_cap.max_send_wr < m_tx_num_wr ) { + qp_logwarn("Amount of requested TX_WRE %d lowered to %d", m_tx_num_wr, m_qp_cap.max_send_wr); + m_tx_num_wr = m_qp_cap.max_send_wr; + m_p_ring->set_tx_num_wr( m_tx_num_wr ); + } + + qp_logdbg("Used QP (num=%d) " + "wre: tx = %d rx = %d " + "sge: tx = %d rx = %d " + "inline: %d", + m_qp->qp_num, + m_qp_cap.max_send_wr, m_qp_cap.max_recv_wr, + m_qp_cap.max_send_sge, m_qp_cap.max_recv_sge, + m_qp_cap.max_inline_data); +#else + // Check device capabilities for max SG elements + uint32_t tx_max_inline = safe_mce_sys().tx_max_inline; + uint32_t rx_num_sge = (m_p_ring->is_socketxtreme() ? 
1 : MCE_DEFAULT_RX_NUM_SGE); + uint32_t tx_num_sge = MCE_DEFAULT_TX_NUM_SGE; + + qp_init_attr.cap.max_send_wr = m_tx_num_wr; + qp_init_attr.cap.max_recv_wr = m_rx_num_wr; + qp_init_attr.cap.max_inline_data = tx_max_inline; + qp_init_attr.cap.max_send_sge = tx_num_sge; + qp_init_attr.cap.max_recv_sge = rx_num_sge; + qp_init_attr.recv_cq = m_p_cq_mgr_rx->get_ibv_cq_hndl(); + qp_init_attr.send_cq = m_p_cq_mgr_tx->get_ibv_cq_hndl(); + qp_init_attr.sq_sig_all = 0; + + // Create the QP + if (prepare_ibv_qp(qp_init_attr)) { + return -1; + } + + qp_logdbg("Created QP (num=%d) with %d tx wre and inline=%d and %d rx " + "wre and %d sge", m_qp->qp_num, m_tx_num_wr, m_max_inline_data, + m_rx_num_wr, rx_num_sge); +#endif /* DEFINED_TSO */ + + // All buffers will be allocated from this qp_mgr buffer pool so we can already set the Rx & Tx lkeys + for (uint32_t wr_idx = 0; wr_idx < m_n_sysvar_rx_num_wr_to_post_recv; wr_idx++) { + m_ibv_rx_wr_array[wr_idx].sg_list = &m_ibv_rx_sg_array[wr_idx]; + m_ibv_rx_wr_array[wr_idx].num_sge = 1; + m_ibv_rx_wr_array[wr_idx].next = &m_ibv_rx_wr_array[wr_idx+1]; // pre-define the linked list + } + m_ibv_rx_wr_array[m_n_sysvar_rx_num_wr_to_post_recv-1].next = NULL; // end linked list + + m_curr_rx_wr = 0; + + if (m_p_cq_mgr_tx) { + m_p_cq_mgr_tx->add_qp_tx(this); + } + + return 0; +} + +void qp_mgr::up() +{ + // Add buffers + qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_qp)); + release_rx_buffers(); // We might have old flushed cqe's in our CQ still from previous HA event + release_tx_buffers(); + + /* clean any link to completions with error we might have */ + set_unsignaled_count(); + m_p_last_tx_mem_buf_desc = NULL; + + modify_qp_to_ready_state(); + m_p_cq_mgr_rx->add_qp_rx(this); +} + +void qp_mgr::down() +{ + qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_qp)); + modify_qp_to_error_state(); + + // free buffers from current active resource iterator + trigger_completion_for_all_sent_packets(); + + // let the QP drain 
all wqe's to flushed cqe's now that we moved + // it to error state and post_sent final trigger for completion + usleep(1000); + + release_tx_buffers(); + release_rx_buffers(); + m_p_cq_mgr_rx->del_qp_rx(this); +} + +void qp_mgr::modify_qp_to_error_state() +{ + qp_logdbg(""); + + BULLSEYE_EXCLUDE_BLOCK_START + if (priv_ibv_modify_qp_to_err(m_qp)) { + qp_logdbg("ibv_modify_qp failure (errno = %d %m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END +} + +void qp_mgr::release_rx_buffers() +{ + int total_ret = m_curr_rx_wr; + if (m_curr_rx_wr) { + qp_logdbg("Returning %d pending post_recv buffers to CQ owner", m_curr_rx_wr); + while (m_curr_rx_wr) { + --m_curr_rx_wr; + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(uintptr_t)m_ibv_rx_wr_array[m_curr_rx_wr].wr_id; + if (p_mem_buf_desc && p_mem_buf_desc->p_desc_owner) { + m_p_ring->mem_buf_desc_return_to_owner_rx(p_mem_buf_desc); + } + else { + g_buffer_pool_rx->put_buffers_thread_safe(p_mem_buf_desc); + } + } + } + // Wait for all FLUSHed WQE on Rx CQ + qp_logdbg("draining rx cq_mgr %p (last_posted_rx_wr_id = %p)", m_p_cq_mgr_rx, m_last_posted_rx_wr_id); + uintptr_t last_polled_rx_wr_id = 0; + while (m_p_cq_mgr_rx && last_polled_rx_wr_id != m_last_posted_rx_wr_id && + errno != EIO && !m_p_ib_ctx_handler->is_removed()) { + + // Process the FLUSH'ed WQE's + int ret = m_p_cq_mgr_rx->drain_and_proccess(&last_polled_rx_wr_id); + qp_logdbg("draining completed on rx cq_mgr (%d wce) last_polled_rx_wr_id = %p", ret, last_polled_rx_wr_id); + + total_ret += ret; + + if (!ret) { + // Query context for ib_verbs events (especially for IBV_EVENT_DEVICE_FATAL) + g_p_event_handler_manager->query_for_ibverbs_event(m_p_ib_ctx_handler->get_ibv_context()->async_fd); + } + + // Add short delay (500 usec) to allow for WQE's to be flushed to CQ every poll cycle + const struct timespec short_sleep = {0, 500000}; // 500 usec + nanosleep(&short_sleep, NULL); + } + m_last_posted_rx_wr_id = 0; // Clear the posted WR_ID flag, we just clear the entire 
RQ + qp_logdbg("draining completed with a total of %d wce's on rx cq_mgr", total_ret); +} + +void qp_mgr::release_tx_buffers() +{ + int ret = 0; + uint64_t poll_sn = 0; + qp_logdbg("draining tx cq_mgr %p", m_p_cq_mgr_tx); + while (m_p_cq_mgr_tx && m_qp && + ((ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn)) > 0) && + (errno != EIO && !m_p_ib_ctx_handler->is_removed())) { + qp_logdbg("draining completed on tx cq_mgr (%d wce)", ret); + } +} + +void qp_mgr::trigger_completion_for_all_sent_packets() +{ + vma_ibv_send_wr send_wr; + ibv_sge sge[1]; + + // Handle releasing of Tx buffers + // Single post send with SIGNAL of a dummy packet + + // NOTE: Since the QP is in ERROR state no packets will be sent on the wire! + // So we can post_send anything we want :) + + qp_logdbg("unsignaled count=%d, last=%p", m_n_unsignaled_count, m_p_last_tx_mem_buf_desc); + if (m_p_last_tx_mem_buf_desc) { // Meaning that there is at least one post_send in the QP mem_buf_desc that wasn't signaled for completion + qp_logdbg("Need to send closing tx wr..."); + // Allocate new send buffer + mem_buf_desc_t* p_mem_buf_desc = m_p_ring->mem_buf_tx_get(0, true); + m_p_ring->m_missing_buf_ref_count--; // Align Tx buffer accounting since we will be bypassing the normal send calls + if (!p_mem_buf_desc) { + qp_logerr("no buffer in pool"); + return; + } + p_mem_buf_desc->p_next_desc = m_p_last_tx_mem_buf_desc; + + // Prepare dummy packet: zeroed payload ('0000'). + // For ETH it replaces the MAC header!! (Nothing is going on the wire, QP in error state) + // For IB it replaces the IPoIB header. 
+ + /* need to send at least eth+ip, since libmlx5 will drop just eth header */ + ethhdr* p_buffer_ethhdr = (ethhdr *)p_mem_buf_desc->p_buffer; + memset(p_buffer_ethhdr, 0, sizeof(*p_buffer_ethhdr)); + p_buffer_ethhdr->h_proto = htons(ETH_P_IP); + iphdr* p_buffer_iphdr = (iphdr *)(p_mem_buf_desc->p_buffer + sizeof(*p_buffer_ethhdr)); + memset(p_buffer_iphdr, 0, sizeof(*p_buffer_iphdr)); + sge[0].length = sizeof(ethhdr) + sizeof(iphdr); + sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer); + sge[0].lkey = m_p_ring->m_tx_lkey; + + struct ibv_ah *p_ah = NULL; + ibv_ah_attr ah_attr; + + if (m_p_ring->get_transport_type() == VMA_TRANSPORT_IB) { + memset(&ah_attr, 0, sizeof(ah_attr)); + ah_attr.dlid = FICTIVE_AH_DLID; + ah_attr.sl = FICTIVE_AH_SL; + ah_attr.src_path_bits = 0; + ah_attr.static_rate = 0; + ah_attr.is_global = 0; + ah_attr.port_num = m_port_num; // Do we need it? + + p_ah = ibv_create_ah(m_p_ib_ctx_handler->get_ibv_pd(), &ah_attr); + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_ah && (errno != EIO)) { + qp_logpanic("failed creating address handler (errno=%d %m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + // Prepare send wr for (does not care if it is UD/IB or RAW/ETH) + // UD requires AH+qkey, RAW requires minimal payload instead of MAC header. 
+ + memset(&send_wr, 0, sizeof(send_wr)); + send_wr.wr_id = (uintptr_t)p_mem_buf_desc; + send_wr.wr.ud.ah = p_ah; + send_wr.wr.ud.remote_qpn = FICTIVE_REMOTE_QPN; + send_wr.wr.ud.remote_qkey = FICTIVE_REMOTE_QKEY; + send_wr.sg_list = sge; + send_wr.num_sge = 1; + send_wr.next = NULL; + vma_send_wr_opcode(send_wr) = VMA_IBV_WR_SEND; + qp_logdbg("IBV_SEND_SIGNALED"); + + // Close the Tx unsignaled send list + set_unsignaled_count(); + m_p_last_tx_mem_buf_desc = NULL; + + if (!m_p_ring->m_tx_num_wr_free) { + qp_logdbg("failed to trigger completion for all packets due to no available wr"); + return; + } + m_p_ring->m_tx_num_wr_free--; + + send_to_wire(&send_wr, (vma_wr_tx_packet_attr)(VMA_TX_PACKET_L3_CSUM|VMA_TX_PACKET_L4_CSUM), true); + if (p_ah) { + IF_VERBS_FAILURE_EX(ibv_destroy_ah(p_ah), EIO) + { + qp_logpanic("failed destroying address handle (errno=%d %m)", errno); + }ENDIF_VERBS_FAILURE; + } + } +} + +uint32_t qp_mgr::get_rx_max_wr_num() +{ + return m_rx_num_wr; +} + +void qp_mgr::post_recv_buffer(mem_buf_desc_t* p_mem_buf_desc) +{ + if (m_n_sysvar_rx_prefetch_bytes_before_poll) { + if (m_p_prev_rx_desc_pushed) + m_p_prev_rx_desc_pushed->p_prev_desc = p_mem_buf_desc; + m_p_prev_rx_desc_pushed = p_mem_buf_desc; + } + + m_ibv_rx_wr_array[m_curr_rx_wr].wr_id = (uintptr_t)p_mem_buf_desc; + m_ibv_rx_sg_array[m_curr_rx_wr].addr = (uintptr_t)p_mem_buf_desc->p_buffer; + m_ibv_rx_sg_array[m_curr_rx_wr].length = p_mem_buf_desc->sz_buffer; + m_ibv_rx_sg_array[m_curr_rx_wr].lkey = p_mem_buf_desc->lkey; + + if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) { + + m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; + + m_p_prev_rx_desc_pushed = NULL; + p_mem_buf_desc->p_prev_desc = NULL; + + m_curr_rx_wr = 0; + struct ibv_recv_wr *bad_wr = NULL; + IF_VERBS_FAILURE(ibv_post_recv(m_qp, &m_ibv_rx_wr_array[0], &bad_wr)) { + uint32_t n_pos_bad_rx_wr = ((uint8_t*)bad_wr - (uint8_t*)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); + qp_logerr("failed posting list (errno=%d 
%m)", errno); + qp_logerr("bad_wr is %d in submitted list (bad_wr=%p, m_ibv_rx_wr_array=%p, size=%d)", n_pos_bad_rx_wr, bad_wr, m_ibv_rx_wr_array, sizeof(struct ibv_recv_wr)); + qp_logerr("bad_wr info: wr_id=%#x, next=%p, addr=%#x, length=%d, lkey=%#x", bad_wr[0].wr_id, bad_wr[0].next, bad_wr[0].sg_list[0].addr, bad_wr[0].sg_list[0].length, bad_wr[0].sg_list[0].lkey); + qp_logerr("QP current state: %d", priv_ibv_query_qp_state(m_qp)); + + // Fix broken linked list of rx_wr + if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { + m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = &m_ibv_rx_wr_array[n_pos_bad_rx_wr+1]; + } + throw; + } ENDIF_VERBS_FAILURE; + qp_logfunc("Successful ibv_post_recv"); + } + else { + m_curr_rx_wr++; + } +} + +void qp_mgr::post_recv_buffers(descq_t* p_buffers, size_t count) +{ + qp_logfuncall(""); + // Called from cq_mgr context under cq_mgr::LOCK! + while (count--) { + post_recv_buffer(p_buffers->get_and_pop_front()); + } +} + +inline int qp_mgr::send_to_wire(vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr, bool request_comp) +{ + NOT_IN_USE(attr); + int ret = 0; + vma_ibv_send_wr *bad_wr = NULL; + + if (request_comp) { + vma_send_wr_send_flags(*p_send_wqe) = (vma_ibv_send_flags)(vma_send_wr_send_flags(*p_send_wqe) | VMA_IBV_SEND_SIGNALED); + } + + IF_VERBS_FAILURE(vma_ibv_post_send(m_qp, p_send_wqe, &bad_wr)) { + qp_logerr("failed post_send%s (errno=%d %m)\n", ((vma_send_wr_send_flags(*p_send_wqe) & VMA_IBV_SEND_INLINE)?"(+inline)":""), errno); + if (bad_wr) { + qp_logerr("bad_wr info: wr_id=%#x, send_flags=%#x, addr=%#x, length=%d, lkey=%#x, max_inline_data=%d", + bad_wr->wr_id, vma_send_wr_send_flags(*bad_wr), bad_wr->sg_list[0].addr, bad_wr->sg_list[0].length, bad_wr->sg_list[0].lkey, get_max_inline_data()); + } + ret = -1; + } ENDIF_VERBS_FAILURE; + + // Clear the SINGAL request + vma_send_wr_send_flags(*p_send_wqe) = (vma_ibv_send_flags)(vma_send_wr_send_flags(*p_send_wqe) & ~VMA_IBV_SEND_SIGNALED); + + return ret; +} + 
+int qp_mgr::send(vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) +{ + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t *)p_send_wqe->wr_id; + + qp_logfunc("VERBS send, unsignaled_count: %d", m_n_unsignaled_count); + bool request_comp = is_completion_need(); + +#ifdef VMA_TIME_MEASURE + TAKE_T_TX_POST_SEND_START; +#endif + +#ifdef RDTSC_MEASURE_TX_VERBS_POST_SEND + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_TX_VERBS_POST_SEND]); +#endif //RDTSC_MEASURE_TX_SENDTO_TO_AFTER_POST_SEND + + if (send_to_wire(p_send_wqe, attr, request_comp)) { +#ifdef VMA_TIME_MEASURE + INC_ERR_TX_COUNT; +#endif + return -1; + } + +#ifdef RDTSC_MEASURE_TX_VERBS_POST_SEND + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_TX_VERBS_POST_SEND]); +#endif //RDTSC_MEASURE_TX_SENDTO_TO_AFTER_POST_SEND + +#ifdef RDTSC_MEASURE_TX_SENDTO_TO_AFTER_POST_SEND + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_SENDTO_TO_AFTER_POST_SEND]); +#endif //RDTSC_MEASURE_TX_SENDTO_TO_AFTER_POST_SEND + +#ifdef VMA_TIME_MEASURE + TAKE_T_TX_POST_SEND_END; +#endif + // Link this new mem_buf_desc to the previous one sent + p_mem_buf_desc->p_next_desc = m_p_last_tx_mem_buf_desc; + + if (request_comp) { + int ret; + + set_unsignaled_count(); + m_p_last_tx_mem_buf_desc = NULL; + + // Poll the Tx CQ + uint64_t dummy_poll_sn = 0; + ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&dummy_poll_sn); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + qp_logerr("error from cq_mgr_tx->process_next_element (ret=%d %m)", ret); + } + BULLSEYE_EXCLUDE_BLOCK_END + qp_logfunc("polling succeeded on tx cq_mgr (%d wce)", ret); + } else { + m_n_unsignaled_count--; + m_p_last_tx_mem_buf_desc = p_mem_buf_desc; + } + + return 0; +} + +void qp_mgr_eth::modify_qp_to_ready_state() +{ + qp_logdbg(""); + int ret = 0; + int qp_state = priv_ibv_query_qp_state(m_qp); + if (qp_state != IBV_QPS_INIT) { + BULLSEYE_EXCLUDE_BLOCK_START + if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_qp, m_port_num)) != 0) { + qp_logpanic("failed 
to modify QP from %d to RTS state (ret = %d)", qp_state, ret); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + BULLSEYE_EXCLUDE_BLOCK_START + if ((ret = priv_ibv_modify_qp_from_init_to_rts(m_qp)) != 0) { + qp_logpanic("failed to modify QP from INIT to RTS state (ret = %d)", ret); + } + + BULLSEYE_EXCLUDE_BLOCK_END +} + +int qp_mgr_eth::prepare_ibv_qp(vma_ibv_qp_init_attr& qp_init_attr) +{ + qp_logdbg(""); + int ret = 0; + + qp_init_attr.qp_type = IBV_QPT_RAW_PACKET; + vma_ibv_qp_init_attr_comp_mask(m_p_ib_ctx_handler->get_ibv_pd(), qp_init_attr); + +#ifdef DEFINED_TSO + if (m_p_ring->is_tso()) { + vma_ibv_qp_init_attr_tso(qp_init_attr, m_p_ring->get_max_header_sz()); + qp_logdbg("create qp with max_tso_header = %d", m_p_ring->get_max_header_sz()); + } +#endif /* DEFINED_TSO */ + + m_qp = vma_ibv_create_qp(m_p_ib_ctx_handler->get_ibv_pd(), &qp_init_attr); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_qp) { + qp_logerr("ibv_create_qp failed (errno=%d %m)", errno); + return -1; + } + VALGRIND_MAKE_MEM_DEFINED(m_qp, sizeof(ibv_qp)); + if ((ret = priv_ibv_modify_qp_from_err_to_init_raw(m_qp, m_port_num)) != 0) { + qp_logerr("failed to modify QP from ERR to INIT state (ret = %d)", ret); + return ret; + } + BULLSEYE_EXCLUDE_BLOCK_END + +#ifdef DEFINED_TSO +#else + enum ibv_qp_attr_mask attr_mask = IBV_QP_CAP; + struct ibv_qp_attr tmp_ibv_qp_attr; + struct ibv_qp_init_attr tmp_ibv_qp_init_attr; + IF_VERBS_FAILURE(ibv_query_qp(m_qp, &tmp_ibv_qp_attr, attr_mask, + &tmp_ibv_qp_init_attr)) { + qp_logerr("ibv_query_qp failed (errno=%d %m)", errno); + return -1; + } ENDIF_VERBS_FAILURE; + uint32_t tx_max_inline = safe_mce_sys().tx_max_inline; + m_max_inline_data = min(tmp_ibv_qp_attr.cap.max_inline_data, tx_max_inline); + qp_logdbg("requested max inline = %d QP, actual max inline = %d, " + "VMA max inline set to %d, max_send_wr=%d, max_recv_wr=%d, " + "max_recv_sge=%d, max_send_sge=%d", + tx_max_inline, tmp_ibv_qp_init_attr.cap.max_inline_data, + m_max_inline_data, 
tmp_ibv_qp_attr.cap.max_send_wr, + tmp_ibv_qp_attr.cap.max_recv_wr, tmp_ibv_qp_attr.cap.max_recv_sge, + tmp_ibv_qp_attr.cap.max_send_sge); +#endif /* DEFINED_TSO */ + return 0; +} + +void qp_mgr_ib::modify_qp_to_ready_state() +{ + qp_logdbg(""); + int ret = 0; + int qp_state = priv_ibv_query_qp_state(m_qp); + + BULLSEYE_EXCLUDE_BLOCK_START + if (qp_state != IBV_QPS_INIT) { + if ((ret = priv_ibv_modify_qp_from_err_to_init_ud(m_qp, m_port_num, m_pkey_index, m_underly_qpn)) != 0) { + qp_logpanic("failed to modify QP from %d to RTS state (ret = %d)", qp_state, ret); + } + } + if ((ret = priv_ibv_modify_qp_from_init_to_rts(m_qp, m_underly_qpn)) != 0) { + qp_logpanic("failed to modify QP from INIT to RTS state (ret = %d)", ret); + } + BULLSEYE_EXCLUDE_BLOCK_END +} + +int qp_mgr_ib::prepare_ibv_qp(vma_ibv_qp_init_attr& qp_init_attr) +{ + qp_logdbg(""); + int ret = 0; + + qp_init_attr.qp_type = IBV_QPT_UD; + vma_ibv_qp_init_attr_comp_mask(m_p_ib_ctx_handler->get_ibv_pd(), qp_init_attr); + +#ifdef DEFINED_TSO + if (m_p_ring->is_tso()) { + vma_ibv_qp_init_attr_tso(qp_init_attr, m_p_ring->get_max_header_sz()); + qp_logdbg("create qp with max_tso_header = %d", m_p_ring->get_max_header_sz()); + } +#endif /* DEFINED_TSO */ + + if (m_underly_qpn) { + ibv_source_qpn_set(qp_init_attr, m_underly_qpn); + qp_logdbg("create qp using underly qpn = 0x%X", m_underly_qpn); + } + + m_qp = vma_ibv_create_qp(m_p_ib_ctx_handler->get_ibv_pd(), &qp_init_attr); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_qp) { + qp_logerr("ibv_create_qp failed (errno=%d %m)", errno); + return -1; + } + + if ((ret = priv_ibv_modify_qp_from_err_to_init_ud(m_qp, m_port_num, + m_pkey_index, + m_underly_qpn)) != 0) { + VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS( + VLOG_ERROR, VLOG_DEBUG, + "failed to modify QP from ERR to INIT state (ret = %d) check number of available fds (ulimit -n)", + ret, errno); + return ret; + } + BULLSEYE_EXCLUDE_BLOCK_END + +#ifdef DEFINED_TSO +#else + enum ibv_qp_attr_mask attr_mask = IBV_QP_CAP; + 
struct ibv_qp_attr tmp_ibv_qp_attr; + struct ibv_qp_init_attr tmp_ibv_qp_init_attr; + IF_VERBS_FAILURE(ibv_query_qp(m_qp, &tmp_ibv_qp_attr, attr_mask, + &tmp_ibv_qp_init_attr)) { + qp_logerr("ibv_query_qp failed (errno=%d %m)", errno); + return -1; + } ENDIF_VERBS_FAILURE; + uint32_t tx_max_inline = safe_mce_sys().tx_max_inline; + m_max_inline_data = min(tmp_ibv_qp_attr.cap.max_inline_data, tx_max_inline); + qp_logdbg("requested max inline = %d QP, actual max inline = %d, " + "VMA max inline set to %d, max_send_wr=%d, max_recv_wr=%d, " + "max_recv_sge=%d, max_send_sge=%d", + tx_max_inline, tmp_ibv_qp_init_attr.cap.max_inline_data, + m_max_inline_data, tmp_ibv_qp_attr.cap.max_send_wr, + tmp_ibv_qp_attr.cap.max_recv_wr, tmp_ibv_qp_attr.cap.max_recv_sge, + tmp_ibv_qp_attr.cap.max_send_sge); +#endif /* DEFINED_TSO */ + + return 0; +} + +void qp_mgr_ib::update_pkey_index() +{ + qp_logdbg(""); + VALGRIND_MAKE_MEM_DEFINED(&m_pkey, sizeof(m_pkey)); + if (priv_ibv_find_pkey_index(m_p_ib_ctx_handler->get_ibv_context(), get_port_num(), m_pkey, &m_pkey_index)) { + qp_logdbg("IB: Can't find correct pkey_index for pkey '%d'", m_pkey); + m_pkey_index = (uint16_t)-1; + } + else { + qp_logdbg("IB: Found correct pkey_index (%d) for pkey '%d'", m_pkey_index, m_pkey); + } +#ifdef DEFINED_IBV_QP_INIT_SOURCE_QPN + /* m_underly_qpn is introduced to detect if current qp_mgr is able to + * use associated qp. + * It is set to non zero value if OFED supports such possibility only but final + * decision can be made just after attempt to create qp. The value of + * m_underly_qpn is reverted to zero if function to qp creation returns + * failure. + * So zero value for this field means no such capability. + * Note: mlx4 does not support this capability. 
Disable it explicitly because dynamic check + * using ibv_create_qp does not help + */ + if (!m_p_ib_ctx_handler->is_mlx4()) { + m_underly_qpn = m_p_ring->get_qpn(); + } + qp_logdbg("IB: Use qpn = 0x%X for device: %s", m_underly_qpn, m_p_ib_ctx_handler->get_ibname()); +#endif /* DEFINED_IBV_QP_INIT_SOURCE_QPN */ +} + +uint32_t qp_mgr::is_ratelimit_change(struct vma_rate_limit_t &rate_limit) +{ + uint32_t rl_changes = 0; + + if (m_rate_limit.rate != rate_limit.rate) { + rl_changes |= RL_RATE; + } + if (m_rate_limit.max_burst_sz != rate_limit.max_burst_sz) { + rl_changes |= RL_BURST_SIZE; + } + if (m_rate_limit.typical_pkt_sz != rate_limit.typical_pkt_sz) { + rl_changes |= RL_PKT_SIZE; + } + + return rl_changes; +} + +int qp_mgr::modify_qp_ratelimit(struct vma_rate_limit_t &rate_limit, uint32_t rl_changes) +{ + int ret; + + ret = priv_ibv_modify_qp_ratelimit(m_qp, rate_limit, rl_changes); + if (ret) { + qp_logdbg("failed to modify qp ratelimit ret %d (errno=%d %m)", ret, errno); + return -1; + } + + m_rate_limit = rate_limit; + return 0; +} diff --git a/src/vma/dev/qp_mgr.h b/src/vma/dev/qp_mgr.h new file mode 100644 index 0000000..9a7b52c --- /dev/null +++ b/src/vma/dev/qp_mgr.h @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef QP_MGR_H +#define QP_MGR_H + +#include +#include + +#include "vma/ib/base/verbs_extra.h" +#include "vma/proto/vma_lwip.h" +#include "vlogger/vlogger.h" +#include "utils/atomic.h" +#include "vma/util/vtypes.h" +#include "vma/util/sys_vars.h" +#include "vma/util/libvma.h" +#include "vma/util/if.h" +#include "vma/util/hash_map.h" +#include "vma/lwip/opt.h" +#include "vma/proto/mem_buf_desc.h" +#include "vma/infra/sender.h" +#include "vma/dev/ib_ctx_handler.h" +#include "vma/dev/cq_mgr.h" + +class buffer_pool; +class cq_mgr; +class ring; +class ring_simple; +class ring_eth_cb; + +#ifndef MAX_SUPPORTED_IB_INLINE_SIZE +#define MAX_SUPPORTED_IB_INLINE_SIZE 884 +#endif + +/** + * @class qp_mgr + * + * Object to manages the QP operation + * This object is used for Rx & Tx at the same time + * Once created it requests from the system a CQ to work with (for Rx & Tx separately) + * + * The qp_mgr object will manage the memory data buffers to be used for Rx & Tx. + * A descriptor (mem_buf_desc_t) is used to point to each memory data buffers which is also menaged by the qm_mgr. 
+ * + * NOTE: + * The idea here is to use the rmda_cma_id object to manage the QP + * all we need is to rdma_resolve_addr() so we have the correct pkey in the cma_id object + * the rest is a simple transition of the QP states that is hidden inside the rdma_cm + * + */ +class qp_mgr +{ +friend class cq_mgr; +friend class cq_mgr_mlx5; +friend class cq_mgr_mp; +public: + qp_mgr(const ring_simple* p_ring, const ib_ctx_handler* p_context, const uint8_t port_num, const uint32_t tx_num_wr); + virtual ~qp_mgr(); + + virtual void up(); + virtual void down(); + + virtual void post_recv_buffer(mem_buf_desc_t* p_mem_buf_desc); // Post for receive single mem_buf_desc + void post_recv_buffers(descq_t* p_buffers, size_t count); // Post for receive a list of mem_buf_desc + int send(vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr); + +#ifdef DEFINED_TSO + inline uint32_t get_max_inline_data() const { + return m_qp_cap.max_inline_data; + } + inline uint32_t get_max_send_sge() const { + return m_qp_cap.max_send_sge; + } +#else + uint32_t get_max_inline_data() const {return m_max_inline_data; } +#endif /* DEFINED_TSO */ + int get_port_num() const { return m_port_num; } + virtual uint16_t get_partiton() const { return 0; }; + virtual uint32_t get_underly_qpn() const { return 0; }; + struct ibv_qp* get_ibv_qp() const { return m_qp; }; + class cq_mgr* get_tx_cq_mgr() const { return m_p_cq_mgr_tx; } + class cq_mgr* get_rx_cq_mgr() const { return m_p_cq_mgr_rx; } + virtual uint32_t get_rx_max_wr_num(); + // This function can be replaced with a parameter during ring creation. + // chain of calls may serve as cache warm for dummy send feature. 
+ inline bool get_hw_dummy_send_support() {return m_hw_dummy_send_support; } + + virtual void modify_qp_to_ready_state() = 0; + void modify_qp_to_error_state(); + + void release_rx_buffers(); + void release_tx_buffers(); + virtual void trigger_completion_for_all_sent_packets(); + uint32_t is_ratelimit_change(struct vma_rate_limit_t &rate_limit); + int modify_qp_ratelimit(struct vma_rate_limit_t &rate_limit, uint32_t rl_changes); + static inline bool is_lib_mlx5(const char* device_name) {return strstr(device_name, "mlx5");} + virtual void dm_release_data(mem_buf_desc_t* buff) { NOT_IN_USE(buff); } + virtual bool fill_hw_descriptors(vma_mlx_hw_device_data &data) {NOT_IN_USE(data);return false;}; +protected: + struct ibv_qp* m_qp; + uint64_t* m_rq_wqe_idx_to_wrid; + + ring_simple* m_p_ring; + uint8_t m_port_num; + ib_ctx_handler* m_p_ib_ctx_handler; + +#ifdef DEFINED_TSO + struct ibv_qp_cap m_qp_cap; +#else + uint32_t m_max_inline_data; +#endif /* DEFINED_TSO */ + uint32_t m_max_qp_wr; + + cq_mgr* m_p_cq_mgr_rx; + cq_mgr* m_p_cq_mgr_tx; + + uint32_t m_rx_num_wr; + uint32_t m_tx_num_wr; + + bool m_hw_dummy_send_support; + + uint32_t m_n_sysvar_rx_num_wr_to_post_recv; + const uint32_t m_n_sysvar_tx_num_wr_to_signal; + const uint32_t m_n_sysvar_rx_prefetch_bytes_before_poll; + + // recv_wr + ibv_sge* m_ibv_rx_sg_array; + ibv_recv_wr* m_ibv_rx_wr_array; + uint32_t m_curr_rx_wr; + uintptr_t m_last_posted_rx_wr_id; // Remember so in case we flush RQ we know to wait until this WR_ID is received + + // send wr + uint32_t m_n_unsignaled_count; + mem_buf_desc_t* m_p_last_tx_mem_buf_desc; // Remembered so we can list several mem_buf_desc_t on a single notification request + + mem_buf_desc_t* m_p_prev_rx_desc_pushed; + + // generating packet IDs + uint16_t m_n_ip_id_base; + uint16_t m_n_ip_id_offset; + struct vma_rate_limit_t m_rate_limit; + + int configure(struct ibv_comp_channel* p_rx_comp_event_channel); + virtual int prepare_ibv_qp(vma_ibv_qp_init_attr& qp_init_attr) = 0; + 
inline void set_unsignaled_count(void) { m_n_unsignaled_count = m_n_sysvar_tx_num_wr_to_signal - 1; } + + virtual cq_mgr* init_rx_cq_mgr(struct ibv_comp_channel* p_rx_comp_event_channel); + virtual cq_mgr* init_tx_cq_mgr(void); + + cq_mgr* handle_cq_initialization(uint32_t *num_wr, struct ibv_comp_channel* comp_event_channel, bool is_rx); + + virtual int send_to_wire(vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr, bool request_comp); + virtual bool is_completion_need() { return !m_n_unsignaled_count; }; +}; + +class qp_mgr_eth : public qp_mgr +{ +public: + qp_mgr_eth(const ring_simple* p_ring, const ib_ctx_handler* p_context, + const uint8_t port_num, + struct ibv_comp_channel* p_rx_comp_event_channel, + const uint32_t tx_num_wr, const uint16_t vlan, + bool call_configure = true): + qp_mgr(p_ring, p_context, port_num, tx_num_wr), m_vlan(vlan) { + if(call_configure && configure(p_rx_comp_event_channel)) + throw_vma_exception("failed creating qp"); + }; + + virtual ~qp_mgr_eth() {} + + virtual void modify_qp_to_ready_state(); + virtual uint16_t get_partiton() const { return m_vlan; }; + +protected: + virtual int prepare_ibv_qp(vma_ibv_qp_init_attr& qp_init_attr); +private: + const uint16_t m_vlan; +}; + +class qp_mgr_ib : public qp_mgr +{ +public: + qp_mgr_ib(const ring_simple* p_ring, const ib_ctx_handler* p_context, const uint8_t port_num, + struct ibv_comp_channel* p_rx_comp_event_channel, const uint32_t tx_num_wr, const uint16_t pkey): + qp_mgr(p_ring, p_context, port_num, tx_num_wr), m_pkey(pkey), m_underly_qpn(0) { + update_pkey_index(); + if(configure(p_rx_comp_event_channel)) throw_vma_exception("failed creating qp"); }; + + virtual void modify_qp_to_ready_state(); + virtual uint16_t get_partiton() const { return m_pkey; }; + virtual uint32_t get_underly_qpn() const { return m_underly_qpn; }; + +protected: + virtual int prepare_ibv_qp(vma_ibv_qp_init_attr& qp_init_attr); + +private: + const uint16_t m_pkey; + uint16_t m_pkey_index; + uint32_t 
m_underly_qpn; + + void update_pkey_index(); +}; + +#endif diff --git a/src/vma/dev/qp_mgr_eth_direct.cpp b/src/vma/dev/qp_mgr_eth_direct.cpp new file mode 100644 index 0000000..0b161fa --- /dev/null +++ b/src/vma/dev/qp_mgr_eth_direct.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "qp_mgr_eth_direct.h" +#include "vlogger/vlogger.h" +#include "vma/util/valgrind.h" +#include "cq_mgr_mlx5.h" +#include "ring_simple.h" + +#if defined(DEFINED_DIRECT_VERBS) + +#undef MODULE_NAME +#define MODULE_NAME "qp_mgr_direct" +#define qp_logpanic __log_info_panic +#define qp_logerr __log_info_err +#define qp_logwarn __log_info_warn +#define qp_loginfo __log_info_info +#define qp_logdbg __log_info_dbg +#define qp_logfunc __log_info_func +#define qp_logfuncall __log_info_funcall + +qp_mgr_eth_direct::qp_mgr_eth_direct(const ring_simple* p_ring, + const ib_ctx_handler* p_context, const uint8_t port_num, + ibv_comp_channel* p_rx_comp_event_channel, + const uint32_t tx_num_wr, const uint16_t vlan): + qp_mgr_eth_mlx5(p_ring, p_context, port_num, + p_rx_comp_event_channel, tx_num_wr, vlan, false) +{ + // must be called from this class to call derived prepare_ibv_qp + if (configure(p_rx_comp_event_channel)) { + throw_vma_exception("failed creating qp_mgr_eth"); + } + + qp_logfunc("m_p_qp= %p", m_qp); +} + +cq_mgr* qp_mgr_eth_direct::init_tx_cq_mgr() +{ + m_tx_num_wr = m_p_ib_ctx_handler->get_ibv_device_attr()->max_qp_wr; + return new cq_mgr_mlx5(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, m_p_ring->get_tx_comp_event_channel(), false); +} + +int qp_mgr_eth_direct::prepare_ibv_qp(vma_ibv_qp_init_attr& qp_init_attr) +{ + qp_init_attr.cap.max_send_wr = m_p_ib_ctx_handler->get_ibv_device_attr()->max_qp_wr; + qp_init_attr.cap.max_send_sge = 1; + qp_init_attr.cap.max_recv_sge = 1; + qp_init_attr.cap.max_inline_data = 0; +#if defined(DEFINED_IBV_DEVICE_CROSS_CHANNEL) + qp_init_attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS; + qp_init_attr.exp_create_flags |= IBV_EXP_QP_CREATE_CROSS_CHANNEL; + qp_logdbg("Cross-Channel is in qp"); +#else + qp_logdbg("Cross-Channel is not supported in qp"); +#endif /* DEFINED_IBV_DEVICE_CROSS_CHANNEL */ + return qp_mgr_eth_mlx5::prepare_ibv_qp(qp_init_attr); +} + +void qp_mgr_eth_direct::up() +{ + init_sq(); + 
m_p_last_tx_mem_buf_desc = NULL; + modify_qp_to_ready_state(); + m_p_cq_mgr_rx->add_qp_rx(this); +} + +void qp_mgr_eth_direct::down() +{ + qp_logdbg("QP current state: %d", priv_ibv_query_qp_state(m_qp)); + modify_qp_to_error_state(); + + // let the QP drain all wqe's to flushed cqe's now that we moved + // it to error state and post_sent final trigger for completion + usleep(1000); + + m_p_cq_mgr_rx->del_qp_rx(this); +} + +bool qp_mgr_eth_direct::fill_hw_descriptors(vma_mlx_hw_device_data &data) +{ + qp_logdbg("QPN: %d dbrec: %p QP.info.SQ. buf: %p wqe_cnt: %d " + "stride: %d bf.reg: %p", + m_mlx5_qp.qpn, m_mlx5_qp.sq.dbrec, m_mlx5_qp.sq.buf, m_mlx5_qp.sq.wqe_cnt, + m_mlx5_qp.sq.stride, m_mlx5_qp.bf.reg); + + data.sq_data.sq_num = m_mlx5_qp.qpn; + data.sq_data.wq_data.dbrec = m_mlx5_qp.sq.dbrec; + data.sq_data.wq_data.buf = m_mlx5_qp.sq.buf; + data.sq_data.wq_data.stride = m_mlx5_qp.sq.stride; + data.sq_data.wq_data.wqe_cnt = m_mlx5_qp.sq.wqe_cnt; + + data.sq_data.bf.reg = m_mlx5_qp.bf.reg; + data.sq_data.bf.offset = m_mlx5_qp.bf.offset; + data.sq_data.bf.size = m_mlx5_qp.bf.size; + + data.rq_data.wq_data.buf = m_mlx5_qp.rq.buf; + data.rq_data.wq_data.dbrec = m_mlx5_qp.rq.dbrec; + data.rq_data.wq_data.stride = m_mlx5_qp.rq.stride; + data.rq_data.wq_data.wqe_cnt = m_mlx5_qp.rq.wqe_cnt; + data.rq_data.head = &m_mlx5_qp.rq.head; + data.rq_data.tail = &m_mlx5_qp.rq.tail; + + return true; +} + +qp_mgr_eth_direct::~qp_mgr_eth_direct() +{ + if (m_qp) { + IF_VERBS_FAILURE(ibv_destroy_qp(m_qp)) { + qp_logdbg("QP destroy failure (errno = %d %m)", -errno); + } ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(m_qp, sizeof(ibv_qp)); + } + m_qp = NULL; + delete m_p_cq_mgr_tx; + m_p_cq_mgr_tx = NULL; + delete m_p_cq_mgr_rx; + m_p_cq_mgr_rx = NULL; +} + +#endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/vma/dev/qp_mgr_eth_direct.h b/src/vma/dev/qp_mgr_eth_direct.h new file mode 100644 index 0000000..fa34d45 --- /dev/null +++ b/src/vma/dev/qp_mgr_eth_direct.h @@ -0,0 +1,57 @@ 
+/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef SRC_VMA_DEV_QP_MGR_ETH_DIRECT_H_ +#define SRC_VMA_DEV_QP_MGR_ETH_DIRECT_H_ + +#include "qp_mgr_eth_mlx5.h" + +#if defined(DEFINED_DIRECT_VERBS) + +class qp_mgr_eth_direct: public qp_mgr_eth_mlx5 +{ +public: + qp_mgr_eth_direct(const ring_simple* p_ring, const ib_ctx_handler* p_context, + const uint8_t port_num, ibv_comp_channel* p_rx_comp_event_channel, + const uint32_t tx_num_wr, const uint16_t vlan); + virtual ~qp_mgr_eth_direct(); + virtual cq_mgr* init_tx_cq_mgr(void); + virtual void up(); + virtual void down(); + virtual uint32_t get_rx_max_wr_num() { return 0;}; + virtual bool fill_hw_descriptors(vma_mlx_hw_device_data &data); +protected: + virtual int prepare_ibv_qp(vma_ibv_qp_init_attr& qp_init_attr); +}; + +#endif /* DEFINED_DIRECT_VERBS */ + +#endif /* SRC_VMA_DEV_QP_MGR_ETH_DIRECT_H_ */ diff --git a/src/vma/dev/qp_mgr_eth_mlx5.cpp b/src/vma/dev/qp_mgr_eth_mlx5.cpp new file mode 100644 index 0000000..3557629 --- /dev/null +++ b/src/vma/dev/qp_mgr_eth_mlx5.cpp @@ -0,0 +1,903 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "qp_mgr_eth_mlx5.h" + +#if defined(DEFINED_DIRECT_VERBS) + +#include +#include "cq_mgr_mlx5.h" +#include "vma/util/utils.h" +#include "vlogger/vlogger.h" +#include "ring_simple.h" + +#undef MODULE_NAME +#define MODULE_NAME "qpm_mlx5" +#define qp_logpanic __log_info_panic +#define qp_logerr __log_info_err +#define qp_logwarn __log_info_warn +#define qp_loginfo __log_info_info +#define qp_logdbg __log_info_dbg +#define qp_logfunc __log_info_func +#define qp_logfuncall __log_info_funcall + +#if !defined(MLX5_ETH_INLINE_HEADER_SIZE) +#define MLX5_ETH_INLINE_HEADER_SIZE 18 +#endif + +#define OCTOWORD 16 +#define WQEBB 64 + + +//#define DBG_DUMP_WQE 1 + +#ifdef DBG_DUMP_WQE +#define dbg_dump_wqe(_addr, _size) { \ + uint32_t* _wqe = _addr; \ + qp_logfunc("Dumping %d bytes from %p", _size, _wqe); \ + for (int i = 0; i < (int)_size / 4; i += 4) { \ + qp_logfunc("%08x %08x %08x %08x", ntohl(_wqe[i+0]), ntohl(_wqe[i+1]), ntohl(_wqe[i+2]), ntohl(_wqe[i+3])); \ + } \ +} +#else +#define dbg_dump_wqe(_addr, _size) +#endif + +static inline uint64_t align_to_octoword_up(uint64_t val) +{ + return ((val+16-1)>>4)<<4; +} + +static inline uint64_t align_to_WQEBB_up(uint64_t val) +{ + return ((val+4-1)>>2)<<2; +} + +static bool is_bf(struct ibv_context *ib_ctx) +{ +#define VMA_MLX5_MMAP_GET_WC_PAGES_CMD 2 // Corresponding to MLX5_MMAP_GET_WC_PAGES_CMD +#define VMA_MLX5_IB_MMAP_CMD_SHIFT 8 // Corresponding to MLX5_IB_MMAP_CMD_SHIFT + static int page_size = sysconf(_SC_PAGESIZE); + 
static off_t offset = VMA_MLX5_MMAP_GET_WC_PAGES_CMD << VMA_MLX5_IB_MMAP_CMD_SHIFT; + char *env; + + /* This limitation is done for RM: 1557652, 1894523, 1914464, 2069198 */ + if (safe_mce_sys().hypervisor != mce_sys_var::HYPER_NONE) { + return false; + } + + env = getenv("MLX5_SHUT_UP_BF"); + if (!env || !strcmp(env, "0")) { + /* + * The following logic was taken from libmlx5 library and its purpose is to check whether + * the use of BF is supported for the device. + */ + void *addr = mmap(NULL, page_size, PROT_WRITE, MAP_SHARED, + ib_ctx->cmd_fd, page_size * offset); + if (addr != MAP_FAILED) { + (void)munmap(addr, page_size); + return true; + } + } + return false; +} + +//! Maps vma_ibv_wr_opcode to real MLX5 opcode. +// +static inline uint32_t get_mlx5_opcode(vma_ibv_wr_opcode verbs_opcode) +{ + switch (verbs_opcode) { + case VMA_IBV_WR_SEND: + return MLX5_OPCODE_SEND; +#ifdef DEFINED_TSO + case VMA_IBV_WR_TSO: + return MLX5_OPCODE_TSO; +#endif /* DEFINED_TSO */ + case VMA_IBV_WR_NOP: + return MLX5_OPCODE_NOP; + default: + return MLX5_OPCODE_SEND; + } +} + +qp_mgr_eth_mlx5::qp_mgr_eth_mlx5(const ring_simple* p_ring, + const ib_ctx_handler* p_context, const uint8_t port_num, + struct ibv_comp_channel* p_rx_comp_event_channel, + const uint32_t tx_num_wr, const uint16_t vlan, bool call_configure): + qp_mgr_eth(p_ring, p_context, port_num, p_rx_comp_event_channel, tx_num_wr, vlan, false) + ,m_sq_wqe_idx_to_wrid(NULL) + ,m_rq_wqe_counter(0) + ,m_sq_wqes(NULL) + ,m_sq_wqe_hot(NULL) + ,m_sq_wqes_end(NULL) + ,m_sq_wqe_hot_index(0) + ,m_sq_wqe_counter(0) + ,m_dm_enabled(0) +{ + // Check device capabilities for dummy send support + m_hw_dummy_send_support = vma_is_nop_supported(m_p_ib_ctx_handler->get_ibv_device_attr()); + + if (call_configure && configure(p_rx_comp_event_channel)) { + throw_vma_exception("failed creating qp_mgr_eth"); + } + + memset(&m_mlx5_qp, 0, sizeof(m_mlx5_qp)); + m_db_method = (is_bf(((ib_ctx_handler*)p_context)->get_ibv_context()) ? 
MLX5_DB_METHOD_BF : MLX5_DB_METHOD_DB); + + qp_logdbg("m_db_method=%d", m_db_method); +} + +void qp_mgr_eth_mlx5::init_sq() +{ + if (0 != vma_ib_mlx5_get_qp(m_qp, &m_mlx5_qp)) { + qp_logpanic("vma_ib_mlx5_get_qp failed (errno=%d %m)", errno); + } + + m_sq_wqes = (struct mlx5_wqe64 (*)[])(uintptr_t)m_mlx5_qp.sq.buf; + m_sq_wqe_hot = &(*m_sq_wqes)[0]; + m_sq_wqes_end = (uint8_t*)((uintptr_t)m_mlx5_qp.sq.buf + m_mlx5_qp.sq.wqe_cnt * m_mlx5_qp.sq.stride); + m_sq_wqe_counter = 0; + + m_sq_wqe_hot_index = 0; + + m_tx_num_wr = (m_sq_wqes_end-(uint8_t *)m_sq_wqe_hot)/WQEBB; + /* Maximum BF inlining consists of: + * - CTRL: + * - 1st WQEBB is mostly used for CTRL and ETH segment (where ETH header is inlined) + * - 4 bytes for size of inline data + * - DATA: + * - 1 OCTOWORD from 1st WQEBB is used for data inlining, except for + * the 4 bytes used for stating the inline data size + * - 3 WQEBB are fully available for data inlining + */ +#ifdef DEFINED_TSO + m_qp_cap.max_inline_data = OCTOWORD - 4 + 3 * WQEBB; +#else + m_max_inline_data = OCTOWORD-4 + 3*WQEBB; +#endif /* DEFINED_TSO */ + + if (m_sq_wqe_idx_to_wrid == NULL) { + m_sq_wqe_idx_to_wrid = (uint64_t*)mmap(NULL, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_wrid), + PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (m_sq_wqe_idx_to_wrid == MAP_FAILED) { + qp_logerr("Failed allocating m_sq_wqe_idx_to_wrid (errno=%d %m)", errno); + return; + } + } + + qp_logfunc("m_tx_num_wr=%d max_inline_data: %d m_sq_wqe_idx_to_wrid=%p", + m_tx_num_wr, get_max_inline_data(), m_sq_wqe_idx_to_wrid); + + memset((void *)(uintptr_t)m_sq_wqe_hot, 0, sizeof(struct mlx5_wqe64)); + m_sq_wqe_hot->ctrl.data[0] = htonl(MLX5_OPCODE_SEND); + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | 4); + m_sq_wqe_hot->ctrl.data[2] = 0; + m_sq_wqe_hot->eseg.inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); + m_sq_wqe_hot->eseg.cs_flags = VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM; + + qp_logfunc("%p allocated for %d QPs sq_wqes:%p 
sq_wqes_end: %p and configured %d WRs BlueFlame: %p buf_size: %d offset: %d", + m_qp, m_mlx5_qp.qpn, m_sq_wqes, m_sq_wqes_end, m_tx_num_wr, m_mlx5_qp.bf.reg, m_mlx5_qp.bf.size, m_mlx5_qp.bf.offset); +} + +void qp_mgr_eth_mlx5::up() +{ + init_sq(); + qp_mgr::up(); + + /* This limitation is done because of a observation + * that dm_copy takes a lot of time on VMs w/o BF (RM:1542628) + */ + if (m_p_ib_ctx_handler->get_on_device_memory_size() > 0) { + if (m_db_method == MLX5_DB_METHOD_BF) { + m_dm_enabled = m_dm_mgr.allocate_resources(m_p_ib_ctx_handler, m_p_ring->m_p_ring_stat); + + } else { +#if defined(DEFINED_IBV_DM) + VLOG_PRINTF_ONCE_THEN_DEBUG(VLOG_WARNING, "Device Memory functionality is not used on devices w/o Blue Flame support\n"); +#endif /* DEFINED_IBV_DM */ + } + } +} + +void qp_mgr_eth_mlx5::down() +{ + if (m_dm_enabled) { + m_dm_mgr.release_resources(); + } + + qp_mgr::down(); +} + +//! Cleanup resources QP itself will be freed by base class DTOR +qp_mgr_eth_mlx5::~qp_mgr_eth_mlx5() +{ + if (m_rq_wqe_idx_to_wrid) { + if (0 != munmap(m_rq_wqe_idx_to_wrid, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid))) { + qp_logerr("Failed deallocating memory with munmap m_rq_wqe_idx_to_wrid (errno=%d %m)", errno); + } + + m_rq_wqe_idx_to_wrid = NULL; + } + if (m_sq_wqe_idx_to_wrid) { + if (0 != munmap(m_sq_wqe_idx_to_wrid, m_tx_num_wr * sizeof(*m_sq_wqe_idx_to_wrid))) { + qp_logerr("Failed deallocating memory with munmap m_sq_wqe_idx_to_wrid (errno=%d %m)", errno); + } + + m_sq_wqe_idx_to_wrid = NULL; + } +} + +void qp_mgr_eth_mlx5::post_recv_buffer(mem_buf_desc_t* p_mem_buf_desc) +{ + if (m_n_sysvar_rx_prefetch_bytes_before_poll) { + if (m_p_prev_rx_desc_pushed) + m_p_prev_rx_desc_pushed->p_prev_desc = p_mem_buf_desc; + m_p_prev_rx_desc_pushed = p_mem_buf_desc; + } + + m_ibv_rx_wr_array[m_curr_rx_wr].wr_id = (uintptr_t)p_mem_buf_desc; + m_ibv_rx_sg_array[m_curr_rx_wr].addr = (uintptr_t)p_mem_buf_desc->p_buffer; + m_ibv_rx_sg_array[m_curr_rx_wr].length = 
p_mem_buf_desc->sz_buffer; + m_ibv_rx_sg_array[m_curr_rx_wr].lkey = p_mem_buf_desc->lkey; + + if (m_rq_wqe_idx_to_wrid) { + uint32_t index = m_rq_wqe_counter & (m_rx_num_wr - 1); + m_rq_wqe_idx_to_wrid[index] = (uintptr_t)p_mem_buf_desc; + ++m_rq_wqe_counter; + } + + if (m_curr_rx_wr == m_n_sysvar_rx_num_wr_to_post_recv - 1) { + + m_last_posted_rx_wr_id = (uintptr_t)p_mem_buf_desc; + + m_p_prev_rx_desc_pushed = NULL; + p_mem_buf_desc->p_prev_desc = NULL; + + m_curr_rx_wr = 0; + struct ibv_recv_wr *bad_wr = NULL; + IF_VERBS_FAILURE(vma_ib_mlx5_post_recv(&m_mlx5_qp, &m_ibv_rx_wr_array[0], &bad_wr)) { + uint32_t n_pos_bad_rx_wr = ((uint8_t*)bad_wr - (uint8_t*)m_ibv_rx_wr_array) / sizeof(struct ibv_recv_wr); + qp_logerr("failed posting list (errno=%d %m)", errno); + qp_logerr("bad_wr is %d in submitted list (bad_wr=%p, m_ibv_rx_wr_array=%p, size=%d)", n_pos_bad_rx_wr, bad_wr, m_ibv_rx_wr_array, sizeof(struct ibv_recv_wr)); + qp_logerr("bad_wr info: wr_id=%#x, next=%p, addr=%#x, length=%d, lkey=%#x", bad_wr[0].wr_id, bad_wr[0].next, bad_wr[0].sg_list[0].addr, bad_wr[0].sg_list[0].length, bad_wr[0].sg_list[0].lkey); + qp_logerr("QP current state: %d", priv_ibv_query_qp_state(m_qp)); + + // Fix broken linked list of rx_wr + if (n_pos_bad_rx_wr != (m_n_sysvar_rx_num_wr_to_post_recv - 1)) { + m_ibv_rx_wr_array[n_pos_bad_rx_wr].next = &m_ibv_rx_wr_array[n_pos_bad_rx_wr+1]; + } + throw; + } ENDIF_VERBS_FAILURE; + qp_logfunc("Successful ibv_post_recv"); + } + else { + m_curr_rx_wr++; + } +} + +cq_mgr* qp_mgr_eth_mlx5::init_rx_cq_mgr(struct ibv_comp_channel* p_rx_comp_event_channel) +{ + m_rx_num_wr = align32pow2(m_rx_num_wr); + + m_rq_wqe_idx_to_wrid = (uint64_t*)mmap(NULL, m_rx_num_wr * sizeof(*m_rq_wqe_idx_to_wrid), + PROT_READ | PROT_WRITE, MAP_ANONYMOUS | MAP_PRIVATE, -1, 0); + if (m_rq_wqe_idx_to_wrid == MAP_FAILED) { + qp_logerr("Failed allocating m_rq_wqe_idx_to_wrid (errno=%d %m)", errno); + return NULL; + } + + return new cq_mgr_mlx5(m_p_ring, m_p_ib_ctx_handler, 
m_rx_num_wr, p_rx_comp_event_channel, true); +} + +cq_mgr* qp_mgr_eth_mlx5::init_tx_cq_mgr() +{ + m_tx_num_wr = align32pow2(m_tx_num_wr); + return new cq_mgr_mlx5(m_p_ring, m_p_ib_ctx_handler, m_tx_num_wr, m_p_ring->get_tx_comp_event_channel(), false); +} + +inline void qp_mgr_eth_mlx5::set_signal_in_next_send_wqe() +{ + volatile struct mlx5_wqe64 *wqe = &(*m_sq_wqes)[m_sq_wqe_counter & (m_tx_num_wr - 1)]; + wqe->ctrl.data[2] = htonl(8); +} + +#ifdef DEFINED_TSO +inline void qp_mgr_eth_mlx5::ring_doorbell(uint64_t* wqe, int db_method, int num_wqebb, int num_wqebb_top) +{ + uint64_t* dst = (uint64_t*)((uint8_t*)m_mlx5_qp.bf.reg + m_mlx5_qp.bf.offset); + uint64_t* src = wqe; + + m_sq_wqe_counter = (m_sq_wqe_counter + num_wqebb + num_wqebb_top) & 0xFFFF; + + // Make sure that descriptors are written before + // updating doorbell record and ringing the doorbell + wmb(); + *m_mlx5_qp.sq.dbrec = htonl(m_sq_wqe_counter); + + // This wc_wmb ensures ordering between DB record and BF copy */ + wc_wmb(); + if (likely(db_method == MLX5_DB_METHOD_BF)) { + /* Copying src to BlueFlame register buffer by Write Combining cnt WQEBBs + * Avoid using memcpy() to copy to BlueFlame page, since memcpy() + * implementations may use move-string-buffer assembler instructions, + * which do not guarantee order of copying. + */ + while (num_wqebb--) { + COPY_64B_NT(dst, src); + } + src = (uint64_t*)m_sq_wqes; + while (num_wqebb_top--) { + COPY_64B_NT(dst, src); + } + } else { + *dst = *src; + } + + /* Use wc_wmb() to ensure write combining buffers are flushed out + * of the running CPU. 
+ * sfence instruction affects only the WC buffers of the CPU that executes it + */ + wc_wmb(); + m_mlx5_qp.bf.offset ^= m_mlx5_qp.bf.size; +} +#else +inline void qp_mgr_eth_mlx5::ring_doorbell(uint64_t* wqe, int num_wqebb, int num_wqebb_top) +{ + uint64_t* dst = (uint64_t*)((uint8_t*)m_mlx5_qp.bf.reg + m_mlx5_qp.bf.offset); + uint64_t* src = wqe; + + m_sq_wqe_counter = (m_sq_wqe_counter + num_wqebb + num_wqebb_top) & 0xFFFF; + + // Make sure that descriptors are written before + // updating doorbell record and ringing the doorbell + wmb(); + *m_mlx5_qp.sq.dbrec = htonl(m_sq_wqe_counter); + + // This wc_wmb ensures ordering between DB record and BF copy */ + wc_wmb(); + if (likely(m_db_method == MLX5_DB_METHOD_BF)) { + /* Copying src to BlueFlame register buffer by Write Combining cnt WQEBBs + * Avoid using memcpy() to copy to BlueFlame page, since memcpy() + * implementations may use move-string-buffer assembler instructions, + * which do not guarantee order of copying. + */ + while (num_wqebb--) { + COPY_64B_NT(dst, src); + } + src = (uint64_t*)m_sq_wqes; + while (num_wqebb_top--) { + COPY_64B_NT(dst, src); + } + } else { + *dst = *src; + } + + /* Use wc_wmb() to ensure write combining buffers are flushed out + * of the running CPU. 
+ * sfence instruction affects only the WC buffers of the CPU that executes it + */ + wc_wmb(); + m_mlx5_qp.bf.offset ^= m_mlx5_qp.bf.size; +} +#endif /* DEFINED_TSO */ + +inline int qp_mgr_eth_mlx5::fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t* data_addr, + int max_inline_len, int inline_len) +{ + int wqe_inline_size = 0; + while ((data_addr!=NULL) && inline_len) { + dbg_dump_wqe((uint32_t*)data_addr, inline_len); + memcpy(cur_seg, data_addr, inline_len); + wqe_inline_size += inline_len; + cur_seg += inline_len; + inline_len = max_inline_len-wqe_inline_size; + data_addr = sga.get_data(&inline_len); + qp_logfunc("data_addr:%p cur_seg: %p inline_len: %d wqe_inline_size: %d", + data_addr, cur_seg, inline_len, wqe_inline_size); + + } + return wqe_inline_size; +} + +inline int qp_mgr_eth_mlx5::fill_ptr_segment(sg_array &sga, struct mlx5_wqe_data_seg* dp_seg, uint8_t* data_addr, + int data_len, mem_buf_desc_t* buffer) +{ + int wqe_seg_size = 0; + int len = data_len; + + // Currently, a maximum of 2 data pointer segments are utilized by + // VMA. This is enforced by the dst layer during l2 header + // configuration. + while ((data_addr!=NULL) && data_len) { + wqe_seg_size += sizeof(struct mlx5_wqe_data_seg); + data_addr = sga.get_data(&len); + dp_seg->byte_count = htonl(len); + + // Try to copy data to On Device Memory + if (!(m_dm_enabled && m_dm_mgr.copy_data(dp_seg, data_addr, data_len, buffer))) { + // Use the registered buffer if copying did not succeed + dp_seg->lkey = htonl(sga.get_current_lkey()); + dp_seg->addr = htonll((uint64_t)data_addr); + } + + data_len -= len; + qp_logfunc("data_addr:%llx data_len: %d len: %d lkey: %x", data_addr, data_len, len, dp_seg->lkey); + dp_seg++; + } + return wqe_seg_size; +} + +//! 
Fill WQE dynamically, based on amount of free WQEBB in SQ +#ifdef DEFINED_TSO +inline int qp_mgr_eth_mlx5::fill_wqe(vma_ibv_send_wr *pswr) +{ + // control segment is mostly filled by preset after previous packet + // we always inline ETH header + sg_array sga(pswr->sg_list, pswr->num_sge); + uint8_t* cur_seg = (uint8_t*)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg); + int inline_len = MLX5_ETH_INLINE_HEADER_SIZE; + int data_len = sga.length(); + int wqe_size = sizeof(struct mlx5_wqe_ctrl_seg) / OCTOWORD; + int max_inline_len = get_max_inline_data(); + + // assume packet is full inline + if (likely(data_len <= max_inline_len)) { + uint8_t* data_addr = sga.get_data(&inline_len); // data for inlining in ETH header + data_len -= inline_len; + qp_logfunc("wqe_hot:%p num_sge: %d data_addr: %p data_len: %d max_inline_len: %d inline_len: %d", + m_sq_wqe_hot, pswr->num_sge, data_addr, data_len, max_inline_len, inline_len); + + // Fill Ethernet segment with header inline, static data + // were populated in preset after previous packet send + memcpy(cur_seg+offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start), data_addr, MLX5_ETH_INLINE_HEADER_SIZE); + cur_seg += sizeof(struct mlx5_wqe_eth_seg); + wqe_size += sizeof(struct mlx5_wqe_eth_seg)/OCTOWORD; + + max_inline_len = data_len; + // Filling inline data segment + // size of BlueFlame buffer is 4*WQEBBs, 3*OCTOWORDS of the first + // was allocated for control and ethernet segment so we have 3*WQEBB+16-4 + int rest_space = std::min((int)(m_sq_wqes_end-cur_seg-4), (3*WQEBB+OCTOWORD-4)); + // Filling till the end of inline WQE segment or + // to end of WQEs + if (likely(max_inline_len <= rest_space)) { + inline_len = max_inline_len; + qp_logfunc("data_addr:%p cur_seg: %p rest_space: %d inline_len: %d wqe_size: %d", + data_addr, cur_seg, rest_space, inline_len, wqe_size); + //bypass inline size and fill inline data segment + data_addr = sga.get_data(&inline_len); + inline_len = fill_inl_segment(sga, cur_seg+4, data_addr, 
max_inline_len, inline_len); + + // store inline data size and mark the data as inlined + *(uint32_t*)((uint8_t*)m_sq_wqe_hot+sizeof(struct mlx5_wqe_ctrl_seg)+sizeof(struct mlx5_wqe_eth_seg)) + = htonl(0x80000000|inline_len); + rest_space = align_to_octoword_up(inline_len+4); // align to OCTOWORDs + wqe_size += rest_space/OCTOWORD; + //assert((data_len-inline_len)==0); + // configuring control + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + rest_space = align_to_WQEBB_up(wqe_size)/4; + qp_logfunc("data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", + data_len-inline_len, inline_len, wqe_size, rest_space); + ring_doorbell((uint64_t *)m_sq_wqe_hot, m_db_method, rest_space); + return rest_space; + } else { + // wrap around case, first filling till the end of m_sq_wqes + int wrap_up_size = max_inline_len-rest_space; + inline_len = rest_space; + qp_logfunc("WRAP_UP_SIZE: %d data_addr:%p cur_seg: %p rest_space: %d inline_len: %d wqe_size: %d", + wrap_up_size, data_addr, cur_seg, rest_space, inline_len, wqe_size); + + data_addr = sga.get_data(&inline_len); + inline_len = fill_inl_segment(sga, cur_seg+4, data_addr, rest_space, inline_len); + data_len -= inline_len; + rest_space = align_to_octoword_up(inline_len+4); + wqe_size += rest_space/OCTOWORD; + rest_space = align_to_WQEBB_up(rest_space/OCTOWORD)/4;// size of 1st chunk at the end + + qp_logfunc("END chunk data_addr: %p data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", + data_addr, data_len, inline_len, wqe_size, rest_space); + // Wrap around + // + cur_seg = (uint8_t*)m_sq_wqes; + data_addr = sga.get_data(&wrap_up_size); + + wrap_up_size = fill_inl_segment(sga, cur_seg, data_addr, data_len, wrap_up_size); + inline_len += wrap_up_size; + max_inline_len = align_to_octoword_up(wrap_up_size); + wqe_size += max_inline_len/OCTOWORD; + max_inline_len = align_to_WQEBB_up(max_inline_len/OCTOWORD)/4; + // store inline data size + *(uint32_t*)((uint8_t* )m_sq_wqe_hot+sizeof(struct 
mlx5_wqe_ctrl_seg)+sizeof(struct mlx5_wqe_eth_seg)) + = htonl(0x80000000|inline_len); + qp_logfunc("BEGIN_CHUNK data_addr: %p data_len: %d wqe_size: %d inline_len: %d end_wqebbs: %d wqebbs: %d", + data_addr, data_len-wrap_up_size, wqe_size, inline_len+wrap_up_size, rest_space, max_inline_len); + //assert((data_len-wrap_up_size)==0); + // configuring control + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + + dbg_dump_wqe((uint32_t*)m_sq_wqe_hot, rest_space*4*16); + dbg_dump_wqe((uint32_t*)m_sq_wqes, max_inline_len*4*16); + + ring_doorbell((uint64_t*)m_sq_wqe_hot, m_db_method, rest_space, max_inline_len); + return rest_space+max_inline_len; + } + } else { + // data is bigger than max to inline we inlined only ETH header + uint from IP (18 bytes) + // the rest will be in data pointer segment + // adding data seg with pointer if there still data to transfer + if (vma_send_wr_opcode(*pswr) == VMA_IBV_WR_SEND ) { + uint8_t* data_addr = sga.get_data(&inline_len); // data for inlining in ETH header + + qp_logfunc("wqe_hot:%p num_sge: %d data_addr: %p data_len: %d max_inline_len: %d inline_len$ %d", + m_sq_wqe_hot, pswr->num_sge, data_addr, data_len, max_inline_len, inline_len); + + // Fill Ethernet segment with header inline, static data + // were populated in preset after previous packet send + memcpy(cur_seg+offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start), data_addr, MLX5_ETH_INLINE_HEADER_SIZE); + data_addr += MLX5_ETH_INLINE_HEADER_SIZE; + data_len -= MLX5_ETH_INLINE_HEADER_SIZE; + cur_seg += sizeof(struct mlx5_wqe_eth_seg); + wqe_size += sizeof(struct mlx5_wqe_eth_seg)/OCTOWORD; + inline_len = fill_ptr_segment(sga, (struct mlx5_wqe_data_seg*)cur_seg, data_addr, data_len, (mem_buf_desc_t *)pswr->wr_id); + wqe_size += inline_len/OCTOWORD; + qp_logfunc("data_addr: %p data_len: %d rest_space: %d wqe_size: %d", + data_addr, data_len, inline_len, wqe_size); + // configuring control + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | 
wqe_size); + inline_len = align_to_WQEBB_up(wqe_size)/4; + ring_doorbell((uint64_t*)m_sq_wqe_hot, m_db_method, inline_len); + } else { + // We supporting also VMA_IBV_WR_SEND_TSO, it is the case + wqe_size = fill_wqe_lso(pswr); + return wqe_size; + } + } + return 1; +} +#else +inline int qp_mgr_eth_mlx5::fill_wqe(vma_ibv_send_wr *pswr) +{ + // control segment is mostly filled by preset after previous packet + // we always inline ETH header + sg_array sga(pswr->sg_list, pswr->num_sge); + int inline_len = MLX5_ETH_INLINE_HEADER_SIZE; + int data_len = sga.length()-inline_len; + int max_inline_len = m_max_inline_data; + int wqe_size = sizeof(struct mlx5_wqe_ctrl_seg)/OCTOWORD + sizeof(struct mlx5_wqe_eth_seg)/OCTOWORD; + + uint8_t* cur_seg = (uint8_t*)m_sq_wqe_hot+sizeof(struct mlx5_wqe_ctrl_seg); + uint8_t* data_addr = sga.get_data(&inline_len); // data for inlining in ETH header + + qp_logfunc("wqe_hot:%p num_sge: %d data_addr: %p data_len: %d max_inline_len: %d inline_len$ %d", + m_sq_wqe_hot, pswr->num_sge, data_addr, data_len, max_inline_len, inline_len); + + // Fill Ethernet segment with header inline, static data + // were populated in preset after previous packet send + memcpy(cur_seg+offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start), data_addr, MLX5_ETH_INLINE_HEADER_SIZE); + data_addr += MLX5_ETH_INLINE_HEADER_SIZE; + cur_seg += sizeof(struct mlx5_wqe_eth_seg); + + if (likely(data_len <= max_inline_len)) { + max_inline_len = data_len; + // Filling inline data segment + // size of BlueFlame buffer is 4*WQEBBs, 3*OCTOWORDS of the first + // was allocated for control and ethernet segment so we have 3*WQEBB+16-4 + int rest_space = std::min((int)(m_sq_wqes_end-cur_seg-4), (3*WQEBB+OCTOWORD-4)); + // Filling till the end of inline WQE segment or + // to end of WQEs + if (likely(max_inline_len <= rest_space)) { + inline_len = max_inline_len; + qp_logfunc("NO WRAP data_addr:%p cur_seg: %p rest_space: %d inline_len: %d wqe_size: %d", + data_addr, cur_seg, 
rest_space, inline_len, wqe_size); + //bypass inline size and fill inline data segment + data_addr = sga.get_data(&inline_len); + inline_len = fill_inl_segment(sga, cur_seg+4, data_addr, max_inline_len, inline_len); + + // store inline data size and mark the data as inlined + *(uint32_t*)((uint8_t*)m_sq_wqe_hot+sizeof(struct mlx5_wqe_ctrl_seg)+sizeof(struct mlx5_wqe_eth_seg)) + = htonl(0x80000000|inline_len); + rest_space = align_to_octoword_up(inline_len+4); // align to OCTOWORDs + wqe_size += rest_space/OCTOWORD; + //assert((data_len-inline_len)==0); + // configuring control + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + rest_space = align_to_WQEBB_up(wqe_size)/4; + qp_logfunc("data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", + data_len-inline_len, inline_len, wqe_size, rest_space); + ring_doorbell((uint64_t *)m_sq_wqe_hot, rest_space); + dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, wqe_size*16); + return rest_space; + } else { + // wrap around case, first filling till the end of m_sq_wqes + int wrap_up_size = max_inline_len-rest_space; + inline_len = rest_space; + qp_logfunc("WRAP_UP_SIZE: %d data_addr:%p cur_seg: %p rest_space: %d inline_len: %d wqe_size: %d", + wrap_up_size, data_addr, cur_seg, rest_space, inline_len, wqe_size); + + data_addr = sga.get_data(&inline_len); + inline_len = fill_inl_segment(sga, cur_seg+4, data_addr, rest_space, inline_len); + data_len -= inline_len; + rest_space = align_to_octoword_up(inline_len+4); + wqe_size += rest_space/OCTOWORD; + rest_space = align_to_WQEBB_up(rest_space/OCTOWORD)/4;// size of 1st chunk at the end + + qp_logfunc("END chunk data_addr: %p data_len: %d inline_len: %d wqe_size: %d wqebbs: %d", + data_addr, data_len, inline_len, wqe_size, rest_space); + // Wrap around + // + cur_seg = (uint8_t*)m_sq_wqes; + data_addr = sga.get_data(&wrap_up_size); + + wrap_up_size = fill_inl_segment(sga, cur_seg, data_addr, data_len, wrap_up_size); + inline_len += wrap_up_size; + max_inline_len = 
align_to_octoword_up(wrap_up_size); + wqe_size += max_inline_len/OCTOWORD; + max_inline_len = align_to_WQEBB_up(max_inline_len/OCTOWORD)/4; + // store inline data size + *(uint32_t*)((uint8_t* )m_sq_wqe_hot+sizeof(struct mlx5_wqe_ctrl_seg)+sizeof(struct mlx5_wqe_eth_seg)) + = htonl(0x80000000|inline_len); + qp_logfunc("BEGIN_CHUNK data_addr: %p data_len: %d wqe_size: %d inline_len: %d end_wqebbs: %d wqebbs: %d", + data_addr, data_len-wrap_up_size, wqe_size, inline_len+wrap_up_size, rest_space, max_inline_len); + //assert((data_len-wrap_up_size)==0); + // configuring control + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + + dbg_dump_wqe((uint32_t*)m_sq_wqe_hot, rest_space*4*16); + dbg_dump_wqe((uint32_t*)m_sq_wqes, max_inline_len*4*16); + + ring_doorbell((uint64_t*)m_sq_wqe_hot, rest_space, max_inline_len); + return rest_space+max_inline_len; + } + } else { + // data is bigger than max to inline we inlined only ETH header + uint from IP (18 bytes) + // the rest will be in data pointer segment + // adding data seg with pointer if there still data to transfer + inline_len = fill_ptr_segment(sga, (struct mlx5_wqe_data_seg*)cur_seg, data_addr, data_len, (mem_buf_desc_t *)pswr->wr_id); + wqe_size += inline_len/OCTOWORD; + qp_logfunc("data_addr: %p data_len: %d rest_space: %d wqe_size: %d", + data_addr, data_len, inline_len, wqe_size); + // configuring control + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + inline_len = align_to_WQEBB_up(wqe_size)/4; + ring_doorbell((uint64_t*)m_sq_wqe_hot, inline_len); + dbg_dump_wqe((uint32_t *)m_sq_wqe_hot, wqe_size*16); + } + return 1; +} +#endif /* DEFINED_TSO */ + +#ifdef DEFINED_TSO +//! 
Filling wqe for LSO +int qp_mgr_eth_mlx5::fill_wqe_lso(vma_ibv_send_wr* pswr) +{ + struct mlx5_wqe_eth_seg* eth_seg = (struct mlx5_wqe_eth_seg*)((uint8_t*)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg)); + struct mlx5_wqe_data_seg* dp_seg = NULL; + uint8_t* cur_seg = (uint8_t*)eth_seg; + uint8_t* p_hdr = (uint8_t*)pswr->tso.hdr; + int wqe_size = sizeof(struct mlx5_wqe_ctrl_seg) / OCTOWORD; + // For TSO we fully inline headers in Ethernet segment + // + int inline_len = pswr->tso.hdr_sz; + int max_inline_len = align_to_octoword_up(sizeof(struct mlx5_wqe_eth_seg) + inline_len - MLX5_ETH_INLINE_HEADER_SIZE); + eth_seg->mss = htons(pswr->tso.mss); + eth_seg->inline_hdr_sz = htons(inline_len); + int rest = (int)(m_sq_wqes_end - (uint8_t*)eth_seg); + int bottom_hdr_sz = 0; + int i = 0; + + if (likely(max_inline_len < rest)) { + // Fill Ethernet segment with full header inline + memcpy(cur_seg+offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start), p_hdr, inline_len); + cur_seg += max_inline_len; + } else { + // wrap around SQ on inline ethernet header + bottom_hdr_sz = rest - offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start); + memcpy(cur_seg + offsetof(struct mlx5_wqe_eth_seg, inline_hdr_start), p_hdr, bottom_hdr_sz); + memcpy(m_sq_wqes, p_hdr + bottom_hdr_sz, inline_len - bottom_hdr_sz); + max_inline_len = align_to_octoword_up(inline_len - bottom_hdr_sz); + cur_seg = (uint8_t*)m_sq_wqes + max_inline_len; + wqe_size += rest / OCTOWORD; + bottom_hdr_sz = align_to_WQEBB_up(wqe_size) / 4; + } + wqe_size += max_inline_len / OCTOWORD; + qp_logfunc("TSO: num_sge: %d max_inline_len: %d inline_len: %d rest: %d", + pswr->num_sge, max_inline_len, inline_len, rest); + // Filling data pointer segments with payload by scatter-gather list elements + dp_seg = (struct mlx5_wqe_data_seg*)cur_seg; + for (i = 0; i < pswr->num_sge; i++) { + if (unlikely((uintptr_t)dp_seg >= (uintptr_t)m_sq_wqes_end)) { + dp_seg = (struct mlx5_wqe_data_seg *)m_sq_wqes; + bottom_hdr_sz = 
align_to_WQEBB_up(wqe_size)/4; + } + dp_seg->addr = htonll((uint64_t)pswr->sg_list[i].addr); + dp_seg->lkey = htonl(pswr->sg_list[i].lkey); + dp_seg->byte_count = htonl(pswr->sg_list[i].length); + + qp_logfunc("DATA_SEG: addr:%llx len: %d lkey: %x dp_seg: %p wqe_size: %d", + pswr->sg_list[i].addr, pswr->sg_list[i].length, dp_seg->lkey, dp_seg, wqe_size); + + dp_seg ++; + wqe_size += sizeof(struct mlx5_wqe_data_seg)/OCTOWORD; + } + inline_len = align_to_WQEBB_up(wqe_size) / 4; + m_sq_wqe_hot->ctrl.data[1] = htonl((m_mlx5_qp.qpn << 8) | wqe_size); + // sending by BlueFlame or DoorBell covering wrap around + if (likely(inline_len <= 4)) { + if (likely(bottom_hdr_sz == 0)) { + ring_doorbell((uint64_t*)m_sq_wqe_hot, MLX5_DB_METHOD_DB, inline_len); + } else { + ring_doorbell((uint64_t*)m_sq_wqe_hot, MLX5_DB_METHOD_DB, bottom_hdr_sz, inline_len - bottom_hdr_sz); + } + } else { + ring_doorbell((uint64_t*)m_sq_wqe_hot, MLX5_DB_METHOD_DB, inline_len); + } + return wqe_size; +} +#endif /* DEFINED_TSO */ + +//! Send one RAW packet by MLX5 BlueFlame +// +#ifdef DEFINED_TSO +int qp_mgr_eth_mlx5::send_to_wire(vma_ibv_send_wr *p_send_wqe, vma_wr_tx_packet_attr attr, bool request_comp) +{ + struct mlx5_wqe_ctrl_seg *ctrl = NULL; + struct mlx5_wqe_eth_seg *eseg = NULL; + + ctrl = (struct mlx5_wqe_ctrl_seg *)m_sq_wqe_hot; + eseg = (struct mlx5_wqe_eth_seg *)((uint8_t *)m_sq_wqe_hot + sizeof(*ctrl)); + + /* Configure ctrl segment + * qpn_ds or ctrl.data[1] is set inside fill_wqe() + */ + ctrl->opmod_idx_opcode = htonl(((m_sq_wqe_counter & 0xffff) << 8) | (get_mlx5_opcode(vma_send_wr_opcode(*p_send_wqe)) & 0xff)); + m_sq_wqe_hot->ctrl.data[2] = 0; + ctrl->fm_ce_se = (request_comp ? 
(uint8_t)MLX5_WQE_CTRL_CQ_UPDATE : 0); + ctrl->imm = 0; + + /* Configure eth segment + * reset rsvd0, cs_flags, rsvd1, mss and rsvd2 fields + * checksum flags are set here + */ + *((uint64_t *)eseg) = 0; + eseg->rsvd2 = 0; + eseg->cs_flags = (uint8_t)(attr & (VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM) & 0xff); + + /* Complete WQE */ + fill_wqe(p_send_wqe); + + /* Store buffer descriptor */ + m_sq_wqe_idx_to_wrid[m_sq_wqe_hot_index] = (uintptr_t)p_send_wqe->wr_id; + + /* Preparing next WQE and index */ + m_sq_wqe_hot = &(*m_sq_wqes)[m_sq_wqe_counter & (m_tx_num_wr - 1)]; + qp_logfunc("m_sq_wqe_hot: %p m_sq_wqe_hot_index: %d wqe_counter: %d new_hot_index: %d wr_id: %llx", + m_sq_wqe_hot, m_sq_wqe_hot_index, m_sq_wqe_counter, (m_sq_wqe_counter&(m_tx_num_wr-1)), p_send_wqe->wr_id); + m_sq_wqe_hot_index = m_sq_wqe_counter & (m_tx_num_wr - 1); + + memset((void*)m_sq_wqe_hot, 0, sizeof(struct mlx5_wqe64)); + + /* Fill Ethernet segment with header inline */ + eseg = (struct mlx5_wqe_eth_seg*)((uint8_t*)m_sq_wqe_hot + sizeof(struct mlx5_wqe_ctrl_seg)); + eseg->inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); + + return 0; +} +#else +int qp_mgr_eth_mlx5::send_to_wire(vma_ibv_send_wr *p_send_wqe, vma_wr_tx_packet_attr attr, bool request_comp) +{ + // Set current WQE's ethernet segment checksum flags + struct mlx5_wqe_eth_seg* eth_seg = (struct mlx5_wqe_eth_seg*)((uint8_t*)m_sq_wqe_hot+sizeof(struct mlx5_wqe_ctrl_seg)); + eth_seg->cs_flags = (uint8_t)(attr & (VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM) & 0xff); + + m_sq_wqe_hot->ctrl.data[0] = htonl((m_sq_wqe_counter << 8) | (get_mlx5_opcode(vma_send_wr_opcode(*p_send_wqe)) & 0xff) ); + m_sq_wqe_hot->ctrl.data[2] = request_comp ? 
htonl(8) : 0 ; + + fill_wqe(p_send_wqe); + m_sq_wqe_idx_to_wrid[m_sq_wqe_hot_index] = (uintptr_t)p_send_wqe->wr_id; + + // Preparing next WQE and index + m_sq_wqe_hot = &(*m_sq_wqes)[m_sq_wqe_counter & (m_tx_num_wr - 1)]; + qp_logfunc("m_sq_wqe_hot: %p m_sq_wqe_hot_index: %d wqe_counter: %d new_hot_index: %d wr_id: %llx", + m_sq_wqe_hot, m_sq_wqe_hot_index, m_sq_wqe_counter, (m_sq_wqe_counter&(m_tx_num_wr-1)), p_send_wqe->wr_id); + m_sq_wqe_hot_index = m_sq_wqe_counter & (m_tx_num_wr - 1); + + memset((void*)(uintptr_t)m_sq_wqe_hot, 0, sizeof(struct mlx5_wqe64)); + + // Fill Ethernet segment with header inline + eth_seg = (struct mlx5_wqe_eth_seg*)((uint8_t*)m_sq_wqe_hot+sizeof(struct mlx5_wqe_ctrl_seg)); + eth_seg->inline_hdr_sz = htons(MLX5_ETH_INLINE_HEADER_SIZE); + + return 0; +} +#endif /* DEFINED_TSO */ + +//! Handle releasing of Tx buffers +// Single post send with SIGNAL of a dummy packet +// NOTE: Since the QP is in ERROR state no packets will be sent on the wire! +// So we can post_send anything we want :) +void qp_mgr_eth_mlx5::trigger_completion_for_all_sent_packets() +{ + qp_logfunc("unsignaled count=%d, last=%p", m_n_unsignaled_count, m_p_last_tx_mem_buf_desc); + + if (m_p_last_tx_mem_buf_desc) { // Meaning that there is at least one post_send in the QP mem_buf_desc that wasn't signaled for completion + qp_logdbg("Need to send closing tx wr..."); + // Allocate new send buffer + mem_buf_desc_t* p_mem_buf_desc = m_p_ring->mem_buf_tx_get(0, true); + m_p_ring->m_missing_buf_ref_count--; // Align Tx buffer accounting since we will be bypassing the normal send calls + if (!p_mem_buf_desc) { + qp_logerr("no buffer in pool"); + return; + } + p_mem_buf_desc->p_next_desc = m_p_last_tx_mem_buf_desc; + + // Prepare dummy packet: zeroed payload ('0000'). + // For ETH it replaces the MAC header!! 
(Nothing is going on the wire, QP in error state) + /* need to send at least eth+ip, since libmlx5 will drop just eth header */ + ethhdr* p_buffer_ethhdr = (ethhdr *)p_mem_buf_desc->p_buffer; + memset(p_buffer_ethhdr, 0, sizeof(*p_buffer_ethhdr)); + p_buffer_ethhdr->h_proto = htons(ETH_P_IP); + iphdr* p_buffer_iphdr = (iphdr *)(p_mem_buf_desc->p_buffer + sizeof(*p_buffer_ethhdr)); + memset(p_buffer_iphdr, 0, sizeof(*p_buffer_iphdr)); + + ibv_sge sge[1]; + sge[0].length = sizeof(ethhdr) + sizeof(iphdr); + sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer); + sge[0].lkey = m_p_ring->m_tx_lkey; + + // Prepare send wr for (does not care if it is UD/IB or RAW/ETH) + // UD requires AH+qkey, RAW requires minimal payload instead of MAC header. + vma_ibv_send_wr send_wr; + + memset(&send_wr, 0, sizeof(send_wr)); + send_wr.wr_id = (uintptr_t)p_mem_buf_desc; + send_wr.wr.ud.ah = NULL; + send_wr.sg_list = sge; + send_wr.num_sge = 1; + send_wr.next = NULL; + vma_send_wr_opcode(send_wr) = VMA_IBV_WR_SEND; + + // Close the Tx unsignaled send list + set_unsignaled_count(); + m_p_last_tx_mem_buf_desc = NULL; + + if (!m_p_ring->m_tx_num_wr_free) { + qp_logdbg("failed to trigger completion for all packets due to no available wr"); + return; + } + m_p_ring->m_tx_num_wr_free--; + + set_signal_in_next_send_wqe(); + send_to_wire(&send_wr, (vma_wr_tx_packet_attr)(VMA_TX_PACKET_L3_CSUM|VMA_TX_PACKET_L4_CSUM), true); + } +} + +#endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/vma/dev/qp_mgr_eth_mlx5.h b/src/vma/dev/qp_mgr_eth_mlx5.h new file mode 100644 index 0000000..6dcaa62 --- /dev/null +++ b/src/vma/dev/qp_mgr_eth_mlx5.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef QP_MGR_ETH_MLX5_H +#define QP_MGR_ETH_MLX5_H + +#include "qp_mgr.h" +#include "vma/util/sg_array.h" +#include "vma/dev/dm_mgr.h" + +#if defined(DEFINED_DIRECT_VERBS) + + +struct mlx5_wqe64 { + union { + struct mlx5_wqe_ctrl_seg ctrl; + uint32_t data[4]; + } ctrl; + struct mlx5_wqe_eth_seg eseg; + struct mlx5_wqe_data_seg dseg; +}; + +class qp_mgr_eth_mlx5 : public qp_mgr_eth +{ +friend class cq_mgr_mlx5; +public: + qp_mgr_eth_mlx5(const ring_simple* p_ring, const ib_ctx_handler* p_context, const uint8_t port_num, + struct ibv_comp_channel* p_rx_comp_event_channel, const uint32_t tx_num_wr, + const uint16_t vlan, bool call_configure = true); + virtual ~qp_mgr_eth_mlx5(); + virtual void up(); + virtual void down(); + virtual void post_recv_buffer(mem_buf_desc_t* p_mem_buf_desc); // Post for receive single mem_buf_desc + vma_ib_mlx5_qp_t m_mlx5_qp; +protected: + void trigger_completion_for_all_sent_packets(); + void init_sq(); + + uint64_t* m_sq_wqe_idx_to_wrid; + uint64_t m_rq_wqe_counter; +private: + cq_mgr* init_rx_cq_mgr(struct ibv_comp_channel* p_rx_comp_event_channel); + virtual cq_mgr* init_tx_cq_mgr(void); + virtual bool is_completion_need() { return !m_n_unsignaled_count || (m_dm_enabled && m_dm_mgr.is_completion_need()); }; + virtual void dm_release_data(mem_buf_desc_t* buff) { m_dm_mgr.release_data(buff); } + + inline void set_signal_in_next_send_wqe(); + int send_to_wire(vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr, bool request_comp); + inline int fill_wqe(vma_ibv_send_wr* p_send_wqe); +#ifdef DEFINED_TSO + int fill_wqe_lso(vma_ibv_send_wr* pswr); + inline void ring_doorbell(uint64_t* wqe, int db_method, int num_wqebb, int num_wqebb_top = 0); +#else + inline void ring_doorbell(uint64_t* wqe, int num_wqebb, int num_wqebb_top = 0); +#endif /* DEFINED_TSO */ + inline int fill_inl_segment(sg_array &sga, uint8_t *cur_seg, uint8_t* data_addr, int max_inline_len, int inline_len); + inline int fill_ptr_segment(sg_array &sga, struct 
mlx5_wqe_data_seg* dp_seg, uint8_t* data_addr, int data_len, mem_buf_desc_t* buffer); + + struct mlx5_wqe64 (*m_sq_wqes)[]; + struct mlx5_wqe64* m_sq_wqe_hot; + uint8_t* m_sq_wqes_end; + enum { + MLX5_DB_METHOD_BF, + MLX5_DB_METHOD_DB + } m_db_method; + + int m_sq_wqe_hot_index; + uint16_t m_sq_wqe_counter; + dm_mgr m_dm_mgr; + bool m_dm_enabled; +}; +#endif //defined(DEFINED_DIRECT_VERBS) +#endif //QP_MGR_ETH_MLX5_H diff --git a/src/vma/dev/qp_mgr_mp.cpp b/src/vma/dev/qp_mgr_mp.cpp new file mode 100644 index 0000000..295de25 --- /dev/null +++ b/src/vma/dev/qp_mgr_mp.cpp @@ -0,0 +1,290 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include "dev/cq_mgr_mp.h" + +#undef MODULE_NAME +#define MODULE_NAME "qpmp" + +#define qp_logpanic __log_info_panic +#define qp_logerr __log_info_err +#define qp_logwarn __log_info_warn +#define qp_loginfo __log_info_info +#define qp_logdbg __log_info_dbg +#define qp_logfunc __log_info_func +#define qp_logfuncall __log_info_funcall + + +#ifdef HAVE_MP_RQ + + +cq_mgr* qp_mgr_mp::init_rx_cq_mgr(struct ibv_comp_channel* p_rx_comp_event_channel) +{ + // CQ size should be aligned to power of 2 due to PRM + // also it size is the max CQs we can hold at once + // this equals to number of strides in WQe * WQ's + uint32_t cq_size = align32pow2((m_p_mp_ring->get_strides_num() * + m_p_mp_ring->get_wq_count())); + return new cq_mgr_mp(m_p_mp_ring, m_p_ib_ctx_handler, cq_size, + p_rx_comp_event_channel, true, m_external_mem); +} + +int qp_mgr_mp::prepare_ibv_qp(vma_ibv_qp_init_attr& qp_init_attr) +{ + NOT_IN_USE(qp_init_attr); + struct ibv_exp_rx_hash_conf rx_hash_conf; + struct ibv_exp_query_intf_params query_intf_params; + struct ibv_exp_release_intf_params rel_intf_params; + struct ibv_exp_rwq_ind_table_init_attr rwq_ind_table_init_attr; + struct ibv_exp_qp_init_attr exp_qp_init_attr; + enum ibv_exp_query_intf_status intf_status; + uint32_t lkey; + uint8_t *ptr; + uint32_t size; + uint8_t toeplitz_key[] = { 0x6d, 0x5a, 0x56, 0xda, 0x25, 0x5b, 0x0e, 0xc2, + 0x41, 0x67, 0x25, 0x3d, 0x43, 0xa3, 0x8f, 0xb0, + 0xd0, 0xca, 0x2b, 0xcb, 0xae, 0x7b, 0x30, 0xb4, + 0x77, 0xcb, 0x2d, 0xa3, 0x80, 0x30, 0xf2, 0x0c, + 0x6a, 0x42, 0xb7, 0x3b, 0xbe, 0xac, 0x01, 0xfa }; + const int TOEPLITZ_RX_HASH_KEY_LEN = + sizeof(toeplitz_key)/sizeof(toeplitz_key[0]); + // create RX resources + // create WQ + struct ibv_exp_wq_init_attr wq_init_attr; + memset(&wq_init_attr, 0, sizeof(wq_init_attr)); + + wq_init_attr.wq_type = IBV_EXP_WQT_RQ; + wq_init_attr.max_recv_wr = m_p_mp_ring->get_wq_count(); + wq_init_attr.max_recv_sge = 1; + wq_init_attr.pd = m_p_ib_ctx_handler->get_ibv_pd(); + 
wq_init_attr.cq = m_p_cq_mgr_rx->get_ibv_cq_hndl(); + wq_init_attr.comp_mask |= IBV_EXP_CREATE_WQ_RES_DOMAIN; + wq_init_attr.res_domain = m_p_mp_ring->get_res_domain(); + + wq_init_attr.comp_mask |= IBV_EXP_CREATE_WQ_MP_RQ; + wq_init_attr.mp_rq.use_shift = IBV_EXP_MP_RQ_NO_SHIFT; + wq_init_attr.mp_rq.single_wqe_log_num_of_strides = + m_p_mp_ring->get_single_wqe_log_num_of_strides(); + wq_init_attr.mp_rq.single_stride_log_num_of_bytes = + m_p_mp_ring->get_single_stride_log_num_of_bytes(); + + m_p_wq = ibv_exp_create_wq(m_p_ib_ctx_handler->get_ibv_context(), + &wq_init_attr); + if (!m_p_wq) { + qp_logerr("ibv_exp_create_wq failed (errno=%d %m)", errno); + return -1; + } + + // change WQ to ready state + struct ibv_exp_wq_attr wq_attr; + + memset(&wq_attr, 0, sizeof(wq_attr)); + wq_attr.attr_mask = IBV_EXP_WQ_ATTR_STATE; + wq_attr.wq_state = IBV_EXP_WQS_RDY; + + if (ibv_exp_modify_wq(m_p_wq, &wq_attr)) { + qp_logerr("failed changing WQ state (errno=%d %m)", errno); + goto err; + } + + intf_status = IBV_EXP_INTF_STAT_OK; + + memset(&query_intf_params, 0, sizeof(query_intf_params)); + query_intf_params.intf_scope = IBV_EXP_INTF_GLOBAL; + query_intf_params.intf = IBV_EXP_INTF_WQ; + query_intf_params.obj = m_p_wq; + m_p_wq_family = (struct ibv_exp_wq_family *) + ibv_exp_query_intf(m_p_ib_ctx_handler->get_ibv_context(), + &query_intf_params, &intf_status); + if (!m_p_wq_family) { + qp_logerr("ibv_exp_query_intf failed (errno=%m) status %d ", + errno, intf_status); + goto err; + } + // create indirect table + rwq_ind_table_init_attr.pd = m_p_ib_ctx_handler->get_ibv_pd(); + rwq_ind_table_init_attr.log_ind_tbl_size = 0; // ignore hash + rwq_ind_table_init_attr.ind_tbl = &m_p_wq; + rwq_ind_table_init_attr.comp_mask = 0; + m_p_rwq_ind_tbl = + ibv_exp_create_rwq_ind_table(m_p_ib_ctx_handler->get_ibv_context(), + &rwq_ind_table_init_attr); + if (!m_p_rwq_ind_tbl) { + qp_logerr("ibv_exp_create_rwq_ind_table failed (errno=%d %m)", errno); + goto err; + } + + // Create rx_hash_conf 
+ memset(&rx_hash_conf, 0, sizeof(rx_hash_conf)); + rx_hash_conf.rx_hash_function = IBV_EXP_RX_HASH_FUNC_TOEPLITZ; + rx_hash_conf.rx_hash_key_len = TOEPLITZ_RX_HASH_KEY_LEN; + rx_hash_conf.rx_hash_key = toeplitz_key; + rx_hash_conf.rx_hash_fields_mask = IBV_EXP_RX_HASH_DST_PORT_UDP; + rx_hash_conf.rwq_ind_tbl = m_p_rwq_ind_tbl; + + memset(&exp_qp_init_attr, 0, sizeof(exp_qp_init_attr)); + + exp_qp_init_attr.comp_mask = IBV_EXP_QP_INIT_ATTR_CREATE_FLAGS | + IBV_EXP_QP_INIT_ATTR_PD | + IBV_EXP_QP_INIT_ATTR_RX_HASH | + IBV_EXP_QP_INIT_ATTR_RES_DOMAIN; + exp_qp_init_attr.rx_hash_conf = &rx_hash_conf; + exp_qp_init_attr.res_domain = m_p_mp_ring->get_res_domain(); + exp_qp_init_attr.pd = m_p_ib_ctx_handler->get_ibv_pd(); + exp_qp_init_attr.qp_type = IBV_QPT_RAW_PACKET; + // Create the QP + m_qp = ibv_exp_create_qp(m_p_ib_ctx_handler->get_ibv_context(), + &exp_qp_init_attr); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_qp) { + qp_logerr("ibv_create_qp failed (errno=%d %m)", errno); + goto err; + } + BULLSEYE_EXCLUDE_BLOCK_END + // initlize the sge, the same sg will be used for all operations + ptr = (uint8_t *)m_buff_data.addr; + lkey = m_buff_data.lkey; + size = m_buff_data.length; + // initlize the sge, the same sg will be used for all operations + for (uint32_t i = 0; i < m_n_sysvar_rx_num_wr_to_post_recv; i++) { + m_ibv_rx_sg_array[i].addr = (uint64_t)ptr; + m_ibv_rx_sg_array[i].length = size; + m_ibv_rx_sg_array[i].lkey = lkey; + qp_logdbg("sge %u addr %p - %p size %d lkey %u", + i, ptr, ptr + size, size, lkey); + ptr += size; + } + return 0; +err: + if (m_qp) { + IF_VERBS_FAILURE(ibv_destroy_qp(m_qp)) { + qp_logerr("TX QP destroy failure (errno = %d %m)", -errno); + } ENDIF_VERBS_FAILURE; + } + if (m_p_rwq_ind_tbl) { + IF_VERBS_FAILURE(ibv_exp_destroy_rwq_ind_table(m_p_rwq_ind_tbl)) { + qp_logerr("ibv_exp_destroy_rwq_ind_table " + "failed (errno = %d %m)", -errno); + } ENDIF_VERBS_FAILURE; + } + if (m_p_wq_family) { + memset(&rel_intf_params, 0, 
sizeof(rel_intf_params)); + IF_VERBS_FAILURE(ibv_exp_release_intf(m_p_ib_ctx_handler->get_ibv_context(), + m_p_wq_family, &rel_intf_params)) { + qp_logerr("ibv_exp_release_intf failed (errno = %d %m)", -errno); + } ENDIF_VERBS_FAILURE; + } + if (m_p_wq) { + IF_VERBS_FAILURE(ibv_exp_destroy_wq(m_p_wq)) { + qp_logerr("ibv_exp_destroy_wq failed (errno = %d %m)", -errno); + } ENDIF_VERBS_FAILURE; + } + return -1; +} + + +void qp_mgr_mp::up() +{ + m_p_cq_mgr_rx->add_qp_rx(this); +} + +int qp_mgr_mp::post_recv(uint32_t sg_index, uint32_t num_of_sge) +{ + // this function always return 0 + qp_logdbg("calling recv_burst with index %d, num_of_sge %d", + sg_index, num_of_sge); + if (unlikely(num_of_sge + sg_index > m_p_mp_ring->get_wq_count())) { + qp_logdbg("not enough WQE to post"); + return -1; + } + return m_p_wq_family->recv_burst(m_p_wq, &m_ibv_rx_sg_array[sg_index], + num_of_sge); +} + +bool qp_mgr_mp::fill_hw_descriptors(vma_mlx_hw_device_data &data) +{ + struct mlx5_rwq *mrwq = container_of(m_p_wq, struct mlx5_rwq, wq); + + data.rq_data.wq_data.buf = (uint8_t *)mrwq->buf.buf + mrwq->rq.offset; + data.rq_data.wq_data.dbrec = mrwq->db; + data.rq_data.wq_data.wqe_cnt = mrwq->rq.wqe_cnt; + data.rq_data.wq_data.stride = (1 << mrwq->rq.wqe_shift); + + qp_logdbg("QP: %d WQ: dbrec: %p buf: %p wqe_cnt: %d stride: %d ", + m_qp->qp_num, data.rq_data.wq_data.dbrec, + data.rq_data.wq_data.buf, data.rq_data.wq_data.wqe_cnt, + data.rq_data.wq_data.stride); + return true; +} + +qp_mgr_mp::~qp_mgr_mp() +{ + // destroy RX QP + if (m_qp) { + IF_VERBS_FAILURE(ibv_destroy_qp(m_qp)) { + qp_logerr("TX QP destroy failure (errno = %d %m)", -errno); + } ENDIF_VERBS_FAILURE; + m_qp = NULL; + } + + if (m_p_wq_family) { + ibv_exp_release_intf_params params; + memset(¶ms, 0, sizeof(params)); + IF_VERBS_FAILURE(ibv_exp_release_intf(m_p_ib_ctx_handler->get_ibv_context(), + m_p_wq_family, ¶ms)) { + qp_logerr("ibv_exp_release_intf failed (errno = %d %m)", -errno); + } ENDIF_VERBS_FAILURE; + } + + if 
(m_p_rwq_ind_tbl) { + IF_VERBS_FAILURE(ibv_exp_destroy_rwq_ind_table(m_p_rwq_ind_tbl)) { + qp_logerr("ibv_exp_destroy_rwq_ind_table failed (errno = %d %m)", -errno); + } ENDIF_VERBS_FAILURE; + } + + if (m_p_wq) { + IF_VERBS_FAILURE(ibv_exp_destroy_wq(m_p_wq)) { + qp_logerr("ibv_exp_destroy_wq failed (errno = %d %m)", -errno); + } ENDIF_VERBS_FAILURE; + } + + delete m_p_cq_mgr_tx; + m_p_cq_mgr_tx = NULL; + + delete m_p_cq_mgr_rx; + m_p_cq_mgr_rx = NULL; +} +#endif //HAVE_MP_RQ + + diff --git a/src/vma/dev/qp_mgr_mp.h b/src/vma/dev/qp_mgr_mp.h new file mode 100644 index 0000000..798b929 --- /dev/null +++ b/src/vma/dev/qp_mgr_mp.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRC_VMA_DEV_QP_MGR_MP_H_ +#define SRC_VMA_DEV_QP_MGR_MP_H_ + +#include "dev/qp_mgr.h" +#include "dev/ring_eth_cb.h" + +#ifdef HAVE_MP_RQ + +class qp_mgr_mp : public qp_mgr_eth +{ +public: + qp_mgr_mp(const ring_eth_cb *p_ring, const ib_ctx_handler *p_context, + const uint8_t port_num, + struct ibv_comp_channel *p_rx_comp_event_channel, + const uint32_t tx_num_wr, const uint16_t vlan, ibv_sge &buff_d, + bool external_mem) : + qp_mgr_eth(p_ring, p_context, port_num, + p_rx_comp_event_channel, tx_num_wr, vlan, false), + m_p_wq(NULL), m_p_wq_family(NULL), m_p_rwq_ind_tbl(NULL), + m_buff_data(buff_d), m_external_mem(external_mem) { + m_p_mp_ring = p_ring; + m_n_sysvar_rx_num_wr_to_post_recv = m_p_mp_ring->get_wq_count(); + if (configure(p_rx_comp_event_channel)) + throw_vma_exception("failed creating mp qp"); + }; + bool fill_hw_descriptors(vma_mlx_hw_device_data &data); + virtual ~qp_mgr_mp(); + virtual void up(); + int post_recv(uint32_t sg_index, uint32_t num_of_sge); + int get_wq_count() {return m_p_mp_ring->get_wq_count();} + ibv_exp_wq* get_wq() {return m_p_wq;} +protected: + virtual cq_mgr* init_rx_cq_mgr(struct ibv_comp_channel* p_rx_comp_event_channel); + virtual int prepare_ibv_qp(vma_ibv_qp_init_attr& qp_init_attr); +private: + // override parent ring + const ring_eth_cb* m_p_mp_ring; + struct ibv_exp_wq* m_p_wq; + struct ibv_exp_wq_family* m_p_wq_family; + struct ibv_exp_rwq_ind_table* m_p_rwq_ind_tbl; + ibv_sge m_buff_data; + bool m_external_mem; +}; +#endif /* HAVE_MP_RQ */ + +#endif /* SRC_VMA_DEV_QP_MGR_MP_H_ */ diff --git a/src/vma/dev/rfs.cpp b/src/vma/dev/rfs.cpp new file mode 100644 index 0000000..86ee636 --- /dev/null +++ b/src/vma/dev/rfs.cpp @@ -0,0 +1,307 @@ 
+/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "utils/bullseye.h" +#include "vma/dev/rfs.h" +#include "vma/dev/qp_mgr.h" +#include "vma/dev/ring_simple.h" + +#define MODULE_NAME "rfs" + + +/**/ +/** inlining functions can only help if they are implemented before their usage **/ +/**/ +inline void rfs::prepare_filter_attach(int& filter_counter, rule_filter_map_t::iterator& filter_iter) +{ + // If filter flow, need to attach flow only if this is the first request for this specific group (i.e. 
counter == 1) + if (!m_p_rule_filter) return; + + filter_iter = m_p_rule_filter->m_map.find(m_p_rule_filter->m_key); + if (filter_iter == m_p_rule_filter->m_map.end()) { + rfs_logdbg("No matching counter for filter"); + return; + } + + filter_counter = filter_iter->second.counter; + m_b_tmp_is_attached = (filter_counter > 1) || m_b_tmp_is_attached; +} + +inline void rfs::filter_keep_attached(rule_filter_map_t::iterator& filter_iter) +{ + if (!m_p_rule_filter || filter_iter == m_p_rule_filter->m_map.end()) return; + + //save all ibv_flow rules only for filter + for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { + filter_iter->second.ibv_flows.push_back(m_attach_flow_data_vector[i]->ibv_flow); + } +} + +inline void rfs::prepare_filter_detach(int& filter_counter, bool decrease_counter) +{ + // If filter, need to detach flow only if this is the last attached rule for this specific group (i.e. counter == 0) + if (!m_p_rule_filter) return; + + rule_filter_map_t::iterator filter_iter = m_p_rule_filter->m_map.find(m_p_rule_filter->m_key); + if (filter_iter == m_p_rule_filter->m_map.end()) { + rfs_logdbg("No matching counter for filter"); + return; + } + + if (decrease_counter) { + filter_iter->second.counter = filter_iter->second.counter > 0 ? 
filter_iter->second.counter - 1 : 0; + } + + filter_counter = filter_iter->second.counter; + //if we do not need to detach_ibv_flow, still mark this rfs as detached + m_b_tmp_is_attached = (filter_counter == 0) && m_b_tmp_is_attached; + if (filter_counter != 0 || filter_iter->second.ibv_flows.empty()) return; + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_attach_flow_data_vector.size() != filter_iter->second.ibv_flows.size()) { + //sanity check for having the same number of qps on all rfs objects + rfs_logerr("all rfs objects in the ring should have the same number of elements"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { + BULLSEYE_EXCLUDE_BLOCK_START + if (m_attach_flow_data_vector[i]->ibv_flow && m_attach_flow_data_vector[i]->ibv_flow != filter_iter->second.ibv_flows[i]) { + rfs_logerr("our assumption that there should be only one rule for filter group is wrong"); + } else if (filter_iter->second.ibv_flows[i]) { + m_attach_flow_data_vector[i]->ibv_flow = filter_iter->second.ibv_flows[i]; + } + BULLSEYE_EXCLUDE_BLOCK_END + } +} + +rfs::rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter* rule_filter /*= NULL*/, uint32_t flow_tag_id /*=0*/): + m_flow_tuple(rule_filter ? rule_filter->m_flow_tuple : *flow_spec_5t), m_p_ring(p_ring), + m_p_rule_filter(rule_filter), m_n_sinks_list_entries(0), m_n_sinks_list_max_length(RFS_SINKS_LIST_DEFAULT_LEN), + m_flow_tag_id(flow_tag_id), m_b_tmp_is_attached(false) +{ + m_sinks_list = new pkt_rcvr_sink*[m_n_sinks_list_max_length]; + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_sinks_list == NULL) { + rfs_logpanic("sinks list allocation failed!"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + memset(m_sinks_list, 0, sizeof(pkt_rcvr_sink*)*m_n_sinks_list_max_length); +} + +rfs::~rfs() +{ + // If filter, need to detach flow only if this is the last attached rule for this specific filter group (i.e. 
counter == 0) + if (m_p_rule_filter && m_b_tmp_is_attached) { + int counter = 0; + prepare_filter_detach(counter, true); + if (counter == 0) { + if (m_p_ring->is_simple()) { + destroy_ibv_flow(); + } + m_p_rule_filter->m_map.erase(m_p_rule_filter->m_key); + } + } else if (m_b_tmp_is_attached) { + if (m_p_ring->is_simple()) { + destroy_ibv_flow(); + } + } + + if (m_p_rule_filter) { + delete m_p_rule_filter; + m_p_rule_filter = NULL; + } + delete[] m_sinks_list; + + while (m_attach_flow_data_vector.size() > 0) { + delete m_attach_flow_data_vector.back(); + m_attach_flow_data_vector.pop_back(); + } +} + +bool rfs::add_sink(pkt_rcvr_sink* p_sink) +{ + uint32_t i; + + rfs_logfunc("called with sink (%p)", p_sink); + + // Check all sinks list array if already exists. + for (i = 0; i < m_n_sinks_list_entries; ++i) { + if (m_sinks_list[i] == p_sink) { + rfs_logdbg("sink (%p) already registered!!!", p_sink); + return true; + } + } + if (m_n_sinks_list_entries == m_n_sinks_list_max_length) { // Sinks list array is full + // Reallocate a new array with double size + uint32_t tmp_sinks_list_length = 2*m_n_sinks_list_max_length; + pkt_rcvr_sink** tmp_sinks_list = new pkt_rcvr_sink*[tmp_sinks_list_length]; + + BULLSEYE_EXCLUDE_BLOCK_START + if (tmp_sinks_list == NULL) { + rfs_logerr("sinks list allocation failed!"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + memcpy(tmp_sinks_list, m_sinks_list, sizeof(pkt_rcvr_sink*)*m_n_sinks_list_max_length); + delete[] m_sinks_list; + m_sinks_list = tmp_sinks_list; + m_n_sinks_list_max_length = tmp_sinks_list_length; + } + + m_sinks_list[m_n_sinks_list_entries] = p_sink; + ++m_n_sinks_list_entries; + + rfs_logdbg("Added new sink (%p), num of sinks is now: %d", p_sink, m_n_sinks_list_entries); + return true; +} + +bool rfs::del_sink(pkt_rcvr_sink* p_sink) +{ + uint32_t i; + + rfs_logdbg("called with sink (%p)", p_sink); + + // Find and remove sink + for (i = 0; i < m_n_sinks_list_entries; ++i) { + if (m_sinks_list[i] == p_sink) { + + 
// Found the sink location to remove + // Remove this sink from list by shrinking it and keeping it in order + for (/*continue i*/; i < (m_n_sinks_list_entries-1); ++i) { + m_sinks_list[i] = m_sinks_list[i+1]; + } + m_sinks_list[i] = NULL; + + m_n_sinks_list_entries--; + rfs_logdbg("Removed sink (%p), num of sinks is now: %d", p_sink, m_n_sinks_list_entries); + + if (m_n_sinks_list_entries == 0) { + rfs_logdbg("rfs sinks list is now empty"); + } + return true; + } + } + rfs_logdbg("sink (%p) not found", p_sink); + return false; +} + +bool rfs::attach_flow(pkt_rcvr_sink *sink) +{ + bool ret; + int filter_counter = 1; + rule_filter_map_t::iterator filter_iter; + + prepare_filter_attach(filter_counter, filter_iter); + + // We also check if this is the FIRST sink so we need to call ibv_attach_flow + if ((m_n_sinks_list_entries == 0) && (!m_b_tmp_is_attached) && (filter_counter == 1)) { + if (m_p_ring->is_simple() && + !create_ibv_flow()) { + return false; + } + filter_keep_attached(filter_iter); + } + + if (sink) { + ret = add_sink(sink); + } else { + rfs_logdbg("rfs: Attach flow was called with sink == NULL"); + ret = true; + } + + return ret; +} + +bool rfs::detach_flow(pkt_rcvr_sink *sink) +{ + bool ret = false; + int filter_counter = 0; + + BULLSEYE_EXCLUDE_BLOCK_START + if (sink) { + ret = del_sink(sink); + } else { + rfs_logwarn("detach_flow() was called with sink == NULL"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + prepare_filter_detach(filter_counter, false); + + // We also need to check if this is the LAST sink so we need to call ibv_attach_flow + if (m_p_ring->is_simple() && + (m_n_sinks_list_entries == 0) && (filter_counter == 0)) { + ret = destroy_ibv_flow(); + } + + return ret; +} + +bool rfs::create_ibv_flow() +{ + for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { + attach_flow_data_t* iter = m_attach_flow_data_vector[i]; + iter->ibv_flow = vma_ibv_create_flow(iter->p_qp_mgr->get_ibv_qp(), &(iter->ibv_flow_attr)); + if (!iter->ibv_flow) { + 
rfs_logerr("Create of QP flow ID (tag: %d) failed with flow %s (errno=%d - %m)", + m_flow_tag_id, m_flow_tuple.to_str(), errno); //TODO ALEXR - Add info about QP, spec, priority into log msg + return false; + } + } + + m_b_tmp_is_attached = true; + rfs_logdbg("ibv_create_flow succeeded with flow %s, tag_id: %d", m_flow_tuple.to_str(), m_flow_tag_id); + return true; +} + +bool rfs::destroy_ibv_flow() +{ + for (size_t i = 0; i < m_attach_flow_data_vector.size(); i++) { + attach_flow_data_t* iter = m_attach_flow_data_vector[i]; + if (unlikely(!iter->ibv_flow)) { + rfs_logdbg("Destroy of QP flow ID failed - QP flow ID that was not created. This is OK for MC same ip diff port scenario."); //TODO ALEXR - Add info about QP, spec, priority into log msg + } + if (iter->ibv_flow) { + IF_VERBS_FAILURE_EX(vma_ibv_destroy_flow(iter->ibv_flow), EIO) { + rfs_logerr("Destroy of QP flow ID failed"); //TODO ALEXR - Add info about QP, spec, priority into log msg + } ENDIF_VERBS_FAILURE; + } + } + + m_b_tmp_is_attached = false; + rfs_logdbg("ibv_destroy_flow with flow %s", m_flow_tuple.to_str()); + return true; +} diff --git a/src/vma/dev/rfs.h b/src/vma/dev/rfs.h new file mode 100644 index 0000000..a5384c9 --- /dev/null +++ b/src/vma/dev/rfs.h @@ -0,0 +1,266 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef RFS_H +#define RFS_H + +#include + +#include "vma/ib/base/verbs_extra.h" +#include "vma/util/vtypes.h" +#include "vma/dev/ring_simple.h" +#include "vma/proto/mem_buf_desc.h" +#include "vma/proto/flow_tuple.h" + + +#define rfs_logpanic __log_info_panic +#define rfs_logerr __log_info_err +#define rfs_logwarn __log_info_warn +#define rfs_loginfo __log_info_info +#define rfs_logdbg __log_info_dbg +#define rfs_logfunc __log_info_func +#define rfs_logfuncall __log_info_funcall + +#define RFS_SINKS_LIST_DEFAULT_LEN 32 + +class qp_mgr; +class pkt_rcvr_sink; + +/* ETHERNET + */ +typedef struct attach_flow_data_eth_ipv4_tcp_udp_t { + struct ibv_flow * ibv_flow; + qp_mgr* p_qp_mgr; + struct ibv_flow_attr_eth_ipv4_tcp_udp { + vma_ibv_flow_attr attr; + vma_ibv_flow_spec_eth eth; + vma_ibv_flow_spec_ipv4 ipv4; + vma_ibv_flow_spec_tcp_udp tcp_udp; + vma_ibv_flow_spec_action_tag flow_tag; // must be the last as struct can be used without it + + ibv_flow_attr_eth_ipv4_tcp_udp(uint8_t port) { + memset(this, 0, sizeof(*this)); + attr.size = sizeof(struct ibv_flow_attr_eth_ipv4_tcp_udp) - sizeof(flow_tag); + attr.num_of_specs = 3; + attr.type = VMA_IBV_FLOW_ATTR_NORMAL; + attr.priority = 1; // almost highest priority, 0 is used for 5-tuple later + attr.port = 
port; + } + inline void add_flow_tag_spec(void) { + attr.num_of_specs++; + attr.size += sizeof(flow_tag); + } + } ibv_flow_attr; + attach_flow_data_eth_ipv4_tcp_udp_t(qp_mgr* qp_mgr) : + ibv_flow(NULL), + p_qp_mgr(qp_mgr), + ibv_flow_attr(qp_mgr->get_port_num()) {} +} attach_flow_data_eth_ipv4_tcp_udp_t; + +/* IPOIB (MC) + */ +typedef struct attach_flow_data_ib_v2_t { + struct ibv_flow * ibv_flow; + qp_mgr* p_qp_mgr; + struct ibv_flow_attr_ib_v2 { + vma_ibv_flow_attr attr; + vma_ibv_flow_spec_ipv4 ipv4; + vma_ibv_flow_spec_tcp_udp tcp_udp; + + ibv_flow_attr_ib_v2(uint8_t port) { + memset(this, 0, sizeof(*this)); + attr.size = sizeof(struct ibv_flow_attr_ib_v2); + attr.num_of_specs = 2; + attr.type = VMA_IBV_FLOW_ATTR_NORMAL; + attr.priority = 1; // almost highest priority, 0 is used for 5-tuple later + attr.port = port; + } + } ibv_flow_attr; + attach_flow_data_ib_v2_t(qp_mgr* qp_mgr) : + ibv_flow(NULL), + p_qp_mgr(qp_mgr), + ibv_flow_attr(qp_mgr->get_port_num()) {} + +} attach_flow_data_ib_v2_t; + +#ifdef DEFINED_IBV_FLOW_SPEC_IB +typedef struct attach_flow_data_ib_v1_t { + struct ibv_flow * ibv_flow; + qp_mgr* p_qp_mgr; + struct ibv_flow_attr_ib_v1 { + vma_ibv_flow_attr attr; + vma_ibv_flow_spec_ib ib; + + ibv_flow_attr_ib_v1(uint8_t port) { + memset(this, 0, sizeof(*this)); + attr.size = sizeof(struct ibv_flow_attr_ib_v1); + attr.num_of_specs = 1; + attr.type = VMA_IBV_FLOW_ATTR_NORMAL; + attr.priority = 1; // almost highest priority, 0 is used for 5-tuple later + attr.port = port; + } + } ibv_flow_attr; + attach_flow_data_ib_v1_t(qp_mgr* qp_mgr) : + ibv_flow(NULL), + p_qp_mgr(qp_mgr), + ibv_flow_attr(qp_mgr->get_port_num()) {} + +} attach_flow_data_ib_v1_t; +#endif + +/* IPOIB (UC) + */ +typedef struct attach_flow_data_ib_ipv4_tcp_udp_v2_t { + struct ibv_flow * ibv_flow; + qp_mgr* p_qp_mgr; + struct ibv_flow_attr_ib_ipv4_tcp_udp_v2 { + + vma_ibv_flow_attr attr; + vma_ibv_flow_spec_ipv4 ipv4; + vma_ibv_flow_spec_tcp_udp tcp_udp; + + 
ibv_flow_attr_ib_ipv4_tcp_udp_v2(uint8_t port) { + memset(this, 0, sizeof(*this)); + attr.size = sizeof(struct ibv_flow_attr_ib_ipv4_tcp_udp_v2); + attr.num_of_specs = 2; + attr.type = VMA_IBV_FLOW_ATTR_NORMAL; + attr.priority = 1; // almost highest priority, 0 is used for 5-tuple later + attr.port = port; + } + } ibv_flow_attr; + attach_flow_data_ib_ipv4_tcp_udp_v2_t(qp_mgr* qp_mgr) : + ibv_flow(NULL), + p_qp_mgr(qp_mgr), + ibv_flow_attr(qp_mgr->get_port_num()) {} +} attach_flow_data_ib_ipv4_tcp_udp_v2_t; + +#ifdef DEFINED_IBV_FLOW_SPEC_IB +typedef struct attach_flow_data_ib_ipv4_tcp_udp_v1_t { + struct ibv_flow * ibv_flow; + qp_mgr* p_qp_mgr; + struct ibv_flow_attr_ib_ipv4_tcp_udp_v1 { + + vma_ibv_flow_attr attr; + vma_ibv_flow_spec_ib ib; + vma_ibv_flow_spec_ipv4 ipv4; + vma_ibv_flow_spec_tcp_udp tcp_udp; + + ibv_flow_attr_ib_ipv4_tcp_udp_v1(uint8_t port) { + memset(this, 0, sizeof(*this)); + attr.size = sizeof(struct ibv_flow_attr_ib_ipv4_tcp_udp_v1); + attr.num_of_specs = 3; + attr.type = VMA_IBV_FLOW_ATTR_NORMAL; + attr.priority = 1; // almost highest priority, 0 is used for 5-tuple later + attr.port = port; + } + } ibv_flow_attr; + attach_flow_data_ib_ipv4_tcp_udp_v1_t(qp_mgr* qp_mgr) : + ibv_flow(NULL), + p_qp_mgr(qp_mgr), + ibv_flow_attr(qp_mgr->get_port_num()) {} +} attach_flow_data_ib_ipv4_tcp_udp_v1_t; +#endif /* DEFINED_IBV_FLOW_SPEC_IB */ + +typedef struct attach_flow_data_t { + vma_ibv_flow * ibv_flow; + qp_mgr* p_qp_mgr; + vma_ibv_flow_attr ibv_flow_attr; +} attach_flow_data_t; + +typedef std::vector attach_flow_data_vector_t; + + +class rfs_rule_filter +{ +public: + rfs_rule_filter(rule_filter_map_t& map, uint64_t key, flow_tuple& flow_tuple) : m_map(map), m_key(key), m_flow_tuple(flow_tuple) {} + rule_filter_map_t& m_map; + uint64_t m_key; + flow_tuple m_flow_tuple; +}; + +/** + * @class rfs + * + * Object to manages the sink list + * This object is used for maintaining the sink list and dispatching packets + * + */ + + +class rfs +{ +public: + 
rfs(flow_tuple *flow_spec_5t, ring_slave *p_ring, + rfs_rule_filter* rule_filter = NULL, uint32_t flow_tag_id = 0); + virtual ~rfs(); + + /** + * Register/Unregister a sink with this rfs object + * Get notifications about incoming packets using the pkt_rcvr_sink callback api + * The rfs will call ibv_attach on the QP once when at least one receiver sink is registered + * An ibv_detach is called when the last receiver sink is deleted from the registered list + * + */ + bool attach_flow(pkt_rcvr_sink *sink); // Add a sink. If this is the first sink --> map the sink and attach flow to QP + bool detach_flow(pkt_rcvr_sink *sink); // Delete a sink. If this is the last sink --> delete it and detach flow from QP + + uint32_t get_num_of_sinks() const { return m_n_sinks_list_entries; } + virtual bool rx_dispatch_packet(mem_buf_desc_t* p_rx_wc_buf_desc, void* pv_fd_ready_array) = 0; + +protected: + flow_tuple m_flow_tuple; + ring_slave* m_p_ring; + rfs_rule_filter* m_p_rule_filter; + attach_flow_data_vector_t m_attach_flow_data_vector; + pkt_rcvr_sink** m_sinks_list; + uint32_t m_n_sinks_list_entries; // Number of actual sinks in the array (we shrink the array if a sink is removed) + uint32_t m_n_sinks_list_max_length; + uint32_t m_flow_tag_id; // Associated with this rule, set by attach_flow() + bool m_b_tmp_is_attached; // Only temporary, while ibcm calls attach_flow with no sinks... 
+ + bool create_ibv_flow(); // Attach flow to all qps + bool destroy_ibv_flow(); // Detach flow from all qps + bool add_sink(pkt_rcvr_sink* p_sink); + bool del_sink(pkt_rcvr_sink* p_sink); + virtual bool prepare_flow_spec() = 0; + +private: + rfs(); // I don't want anyone to use the default constructor + inline void prepare_filter_attach(int& filter_counter, rule_filter_map_t::iterator& filter_iter); + inline void filter_keep_attached(rule_filter_map_t::iterator& filter_iter); + inline void prepare_filter_detach(int& filter_counter, bool decrease_counter); + +}; + +#endif /* RFS_H */ diff --git a/src/vma/dev/rfs_mc.cpp b/src/vma/dev/rfs_mc.cpp new file mode 100644 index 0000000..d0b2773 --- /dev/null +++ b/src/vma/dev/rfs_mc.cpp @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "utils/bullseye.h" +#include "vma/util/utils.h" +#include "vma/dev/rfs_mc.h" +#include "vma/dev/ring_simple.h" + +#define MODULE_NAME "rfs_mc" + + +rfs_mc::rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter* rule_filter /*= NULL*/, int flow_tag_id /*=0*/): + rfs (flow_spec_5t, p_ring, rule_filter, flow_tag_id) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_flow_tuple.is_udp_mc()) { + throw_vma_exception("rfs_mc called with non mc destination ip"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + if (m_p_ring->is_simple() && !prepare_flow_spec()) { + throw_vma_exception("IB multicast offload is not supported"); + } +} + +bool rfs_mc::prepare_flow_spec() +{ + ring_simple* p_ring = dynamic_cast(m_p_ring); + + if (!p_ring) { + rfs_logpanic("Incompatible ring type"); + } + + transport_type_t type = p_ring->get_transport_type(); + + /* + * todo note that ring is not locked here. + * we touch members that should not change during the ring life. + * the ring will not be deleted as we increased refcnt. + * if one of these assumptions change, we must lock. 
+ */ + attach_flow_data_t* p_attach_flow_data = NULL; + + switch (type) { + case VMA_TRANSPORT_IB: + { + attach_flow_data_ib_v2_t* attach_flow_data_ib_v2 = NULL; + + if (0 == p_ring->m_p_qp_mgr->get_underly_qpn()) { + // IB MC flow steering is done only on L2 --> need to zero other fields to get correct behaviour + // CX3 HW does not support L3+L4 MC flow steering rule +#ifdef DEFINED_IBV_FLOW_SPEC_IB + attach_flow_data_ib_v1_t* attach_flow_data_ib_v1 = NULL; + + attach_flow_data_ib_v1 = new attach_flow_data_ib_v1_t(p_ring->m_p_qp_mgr); + + uint8_t dst_gid[16]; + create_mgid_from_ipv4_mc_ip(dst_gid, p_ring->m_p_qp_mgr->get_partiton(), m_flow_tuple.get_dst_ip()); + ibv_flow_spec_ib_set_by_dst_gid(&(attach_flow_data_ib_v1->ibv_flow_attr.ib), + dst_gid); + + p_attach_flow_data = (attach_flow_data_t*)attach_flow_data_ib_v1; + break; +#else + return false; +#endif + } + + attach_flow_data_ib_v2 = new attach_flow_data_ib_v2_t(p_ring->m_p_qp_mgr); + + ibv_flow_spec_ipv4_set(&(attach_flow_data_ib_v2->ibv_flow_attr.ipv4), + m_flow_tuple.get_dst_ip(), + 0); + + ibv_flow_spec_tcp_udp_set(&(attach_flow_data_ib_v2->ibv_flow_attr.tcp_udp), + (m_flow_tuple.get_protocol() == PROTO_TCP), + m_flow_tuple.get_dst_port(), + m_flow_tuple.get_src_port()); + + p_attach_flow_data = (attach_flow_data_t*)attach_flow_data_ib_v2; + break; + } + case VMA_TRANSPORT_ETH: + { + attach_flow_data_eth_ipv4_tcp_udp_t* attach_flow_data_eth = NULL; + + attach_flow_data_eth = new attach_flow_data_eth_ipv4_tcp_udp_t(p_ring->m_p_qp_mgr); + + uint8_t dst_mac[6]; + create_multicast_mac_from_ip(dst_mac, m_flow_tuple.get_dst_ip()); + ibv_flow_spec_eth_set(&(attach_flow_data_eth->ibv_flow_attr.eth), + dst_mac, + htons(p_ring->m_p_qp_mgr->get_partiton())); + + if (safe_mce_sys().eth_mc_l2_only_rules) { + ibv_flow_spec_ipv4_set(&(attach_flow_data_eth->ibv_flow_attr.ipv4), 0, 0); + ibv_flow_spec_tcp_udp_set(&(attach_flow_data_eth->ibv_flow_attr.tcp_udp), 0, 0, 0); + p_attach_flow_data = 
(attach_flow_data_t*)attach_flow_data_eth; + break; + } + + ibv_flow_spec_ipv4_set(&(attach_flow_data_eth->ibv_flow_attr.ipv4), + m_flow_tuple.get_dst_ip(), + 0); + + ibv_flow_spec_tcp_udp_set(&(attach_flow_data_eth->ibv_flow_attr.tcp_udp), + (m_flow_tuple.get_protocol() == PROTO_TCP), + m_flow_tuple.get_dst_port(), + m_flow_tuple.get_src_port()); + + if (m_flow_tag_id) { // Will not attach flow_tag spec to rule for tag_id==0 + ibv_flow_spec_flow_tag_set(&attach_flow_data_eth->ibv_flow_attr.flow_tag, m_flow_tag_id); + attach_flow_data_eth->ibv_flow_attr.add_flow_tag_spec(); + rfs_logdbg("Adding flow_tag spec to MC rule, num_of_specs: %d flow_tag_id: %d", + attach_flow_data_eth->ibv_flow_attr.attr.num_of_specs, m_flow_tag_id); + } + + p_attach_flow_data = (attach_flow_data_t*)attach_flow_data_eth; + break; + } + BULLSEYE_EXCLUDE_BLOCK_START + default: + rfs_logpanic("Incompatible transport type = %d", type); + return false; + break; + BULLSEYE_EXCLUDE_BLOCK_END + } + + m_attach_flow_data_vector.push_back(p_attach_flow_data); + return true; +} + +bool rfs_mc::rx_dispatch_packet(mem_buf_desc_t* p_rx_wc_buf_desc, void* pv_fd_ready_array) +{ + // Dispatching: Notify new packet to all registered receivers + p_rx_wc_buf_desc->reset_ref_count(); + p_rx_wc_buf_desc->inc_ref_count(); + + for (uint32_t i=0; i < m_n_sinks_list_entries; ++i) { + if (m_sinks_list[i]) { + m_sinks_list[i]->rx_input_cb(p_rx_wc_buf_desc, pv_fd_ready_array); + } + } + + // Check packet ref_count to see if any receivers are interested in this packet + if (p_rx_wc_buf_desc->dec_ref_count() > 1) { + // The sink will be responsible to return the buffer to CQ for reuse + return true; + } + + // Reuse this data buffer & mem_buf_desc + return false; +} diff --git a/src/vma/dev/rfs_mc.h b/src/vma/dev/rfs_mc.h new file mode 100644 index 0000000..6bd572d --- /dev/null +++ b/src/vma/dev/rfs_mc.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef RFS_MC_H +#define RFS_MC_H + +#include "vma/dev/rfs.h" + + +/** + * @class rfs_mc + * + * Object to manages the sink list of a MC flow + * This object is used for maintaining the sink list and dispatching packets + * + */ + + +class rfs_mc : public rfs +{ +public: + rfs_mc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter* rule_filter = NULL, int32_t flow_tag_id = 0); + + virtual bool rx_dispatch_packet(mem_buf_desc_t* p_rx_wc_buf_desc, void* pv_fd_ready_array); + +protected: + virtual bool prepare_flow_spec(); +}; + + +#endif /* RFS_MC_H */ diff --git a/src/vma/dev/rfs_uc.cpp b/src/vma/dev/rfs_uc.cpp new file mode 100644 index 0000000..3717fee --- /dev/null +++ b/src/vma/dev/rfs_uc.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "utils/bullseye.h" +#include "vma/dev/rfs_uc.h" +#include "vma/proto/L2_address.h" +#include "vma/dev/ring_simple.h" +#include "util/instrumentation.h" + +#define MODULE_NAME "rfs_uc" + + +rfs_uc::rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter* rule_filter, uint32_t flow_tag_id) : + rfs(flow_spec_5t, p_ring, rule_filter, flow_tag_id) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (m_flow_tuple.is_udp_mc()) { + throw_vma_exception("rfs_uc called with MC destination ip"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + if (m_p_ring->is_simple() && !prepare_flow_spec()) { + throw_vma_exception("rfs_uc: Incompatible transport type"); + } +} + +bool rfs_uc::prepare_flow_spec() +{ + ring_simple* p_ring = dynamic_cast(m_p_ring); + + if (!p_ring) { + rfs_logpanic("Incompatible ring type"); + } + + transport_type_t type = p_ring->get_transport_type(); + /* + * todo note that ring is not locked here. + * we touch members that should not change during the ring life. + * the ring will not be deleted as we increased refcnt. + * if one of these assumptions change, we must lock. 
+ */ + attach_flow_data_t* p_attach_flow_data = NULL; + vma_ibv_flow_spec_ipv4* p_ipv4 = NULL; + vma_ibv_flow_spec_tcp_udp* p_tcp_udp = NULL; + vma_ibv_flow_spec_action_tag* p_flow_tag = NULL; + + attach_flow_data_eth_ipv4_tcp_udp_t* attach_flow_data_eth = NULL; + + switch (type) { + case VMA_TRANSPORT_IB: + { + attach_flow_data_ib_ipv4_tcp_udp_v2_t* attach_flow_data_ib_v2 = NULL; + +#ifdef DEFINED_IBV_FLOW_SPEC_IB + if (0 == p_ring->m_p_qp_mgr->get_underly_qpn()) { + attach_flow_data_ib_ipv4_tcp_udp_v1_t* attach_flow_data_ib_v1 = NULL; + + attach_flow_data_ib_v1 = new attach_flow_data_ib_ipv4_tcp_udp_v1_t(p_ring->m_p_qp_mgr); + ibv_flow_spec_ib_set_by_dst_qpn(&(attach_flow_data_ib_v1->ibv_flow_attr.ib), + htonl(((IPoIB_addr*)p_ring->m_p_l2_addr)->get_qpn())); + p_ipv4 = &(attach_flow_data_ib_v1->ibv_flow_attr.ipv4); + p_tcp_udp = &(attach_flow_data_ib_v1->ibv_flow_attr.tcp_udp); + p_attach_flow_data = (attach_flow_data_t*)attach_flow_data_ib_v1; + break; + } +#endif + attach_flow_data_ib_v2 = new attach_flow_data_ib_ipv4_tcp_udp_v2_t(p_ring->m_p_qp_mgr); + + p_ipv4 = &(attach_flow_data_ib_v2->ibv_flow_attr.ipv4); + p_tcp_udp = &(attach_flow_data_ib_v2->ibv_flow_attr.tcp_udp); + p_attach_flow_data = (attach_flow_data_t*)attach_flow_data_ib_v2; + break; + } + case VMA_TRANSPORT_ETH: + { + attach_flow_data_eth = new attach_flow_data_eth_ipv4_tcp_udp_t(p_ring->m_p_qp_mgr); + + ibv_flow_spec_eth_set(&(attach_flow_data_eth->ibv_flow_attr.eth), + p_ring->m_p_l2_addr->get_address(), + htons(p_ring->m_p_qp_mgr->get_partiton())); + p_ipv4 = &(attach_flow_data_eth->ibv_flow_attr.ipv4); + p_tcp_udp = &(attach_flow_data_eth->ibv_flow_attr.tcp_udp); + p_flow_tag = &(attach_flow_data_eth->ibv_flow_attr.flow_tag); + p_attach_flow_data = (attach_flow_data_t*)attach_flow_data_eth; + break; + } + BULLSEYE_EXCLUDE_BLOCK_START + default: + return false; + break; + BULLSEYE_EXCLUDE_BLOCK_END + } + + ibv_flow_spec_ipv4_set(p_ipv4, + m_flow_tuple.get_dst_ip(), + 
m_flow_tuple.get_src_ip()); + + ibv_flow_spec_tcp_udp_set(p_tcp_udp, + (m_flow_tuple.get_protocol() == PROTO_TCP), + m_flow_tuple.get_dst_port(), + m_flow_tuple.get_src_port()); + + if (m_flow_tuple.get_src_port() || m_flow_tuple.get_src_ip()) { + // set priority of 5-tuple to be higher than 3-tuple + // to make sure 5-tuple have higher priority on ConnectX-4 + p_attach_flow_data->ibv_flow_attr.priority = 0; + } + + if (m_flow_tag_id && attach_flow_data_eth) { // Will not attach flow_tag spec to rule for tag_id==0 + ibv_flow_spec_flow_tag_set(p_flow_tag, m_flow_tag_id); + attach_flow_data_eth->ibv_flow_attr.add_flow_tag_spec(); + rfs_logdbg("Adding flow_tag spec to rule, num_of_specs: %d flow_tag_id: %d", + attach_flow_data_eth->ibv_flow_attr.attr.num_of_specs, + m_flow_tag_id); + } + rfs_logfunc("transport type: %d, num_of_specs: %d flow_tag_id: %d", type, + attach_flow_data_eth->ibv_flow_attr.attr.num_of_specs, + m_flow_tag_id); + + m_attach_flow_data_vector.push_back(p_attach_flow_data); + return true; +} + +bool rfs_uc::rx_dispatch_packet(mem_buf_desc_t* p_rx_wc_buf_desc, void* pv_fd_ready_array) +{ + static int enable_socketxtreme = safe_mce_sys().enable_socketxtreme; + uint32_t num_sinks = (enable_socketxtreme ? 
+ 1 : m_n_sinks_list_entries); + + p_rx_wc_buf_desc->reset_ref_count(); + + for (uint32_t i=0; i < num_sinks; ++i) { + if (likely(m_sinks_list[i])) { +#ifdef RDTSC_MEASURE_RX_DISPATCH_PACKET + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_DISPATCH_PACKET]); +#endif //RDTSC_MEASURE_RX_DISPATCH_PACKET + p_rx_wc_buf_desc->inc_ref_count(); + m_sinks_list[i]->rx_input_cb(p_rx_wc_buf_desc, pv_fd_ready_array); +#ifdef RDTSC_MEASURE_RX_DISPATCH_PACKET + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_DISPATCH_PACKET]); +#endif //RDTSC_MEASURE_RX_DISPATCH_PACKET + // Check packet ref_count to see the last receiver is interested in this packet + if (p_rx_wc_buf_desc->dec_ref_count() > 1) { + // The sink will be responsible to return the buffer to CQ for reuse + return true; + } + } + } + // Reuse this data buffer & mem_buf_desc + return false; +} diff --git a/src/vma/dev/rfs_uc.h b/src/vma/dev/rfs_uc.h new file mode 100644 index 0000000..71916a3 --- /dev/null +++ b/src/vma/dev/rfs_uc.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef RFS_UC_H +#define RFS_UC_H + +#include "vma/dev/rfs.h" + + +/** + * @class rfs_uc + * + * Object to manages the sink list of a UC flow + * This object is used for maintaining the sink list and dispatching packets + * + */ + + +class rfs_uc : public rfs +{ +public: + rfs_uc(flow_tuple *flow_spec_5t, ring_slave *p_ring, + rfs_rule_filter* rule_filter = NULL, uint32_t flow_tag_id = 0); + + virtual bool rx_dispatch_packet(mem_buf_desc_t* p_rx_wc_buf_desc, void* pv_fd_ready_array); + +protected: + virtual bool prepare_flow_spec(); +}; + + +#endif /* RFS_UC_H */ diff --git a/src/vma/dev/rfs_uc_tcp_gro.cpp b/src/vma/dev/rfs_uc_tcp_gro.cpp new file mode 100644 index 0000000..9ee526e --- /dev/null +++ b/src/vma/dev/rfs_uc_tcp_gro.cpp @@ -0,0 +1,267 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "utils/bullseye.h" +#include "vma/dev/rfs_uc_tcp_gro.h" +#include "vma/dev/gro_mgr.h" +#include "vma/dev/ring_simple.h" +#include "vma/proto/route_rule_table_key.h" + +#define MODULE_NAME "rfs_uc_tcp_gro" + +#define IP_H_LEN_NO_OPTIONS 5 +#define TCP_H_LEN_NO_OPTIONS 5 +#define TCP_H_LEN_TIMESTAMP 8 + + +rfs_uc_tcp_gro::rfs_uc_tcp_gro(flow_tuple *flow_spec_5t, ring_slave *p_ring, rfs_rule_filter* rule_filter, uint32_t flow_tag_id) : + rfs_uc(flow_spec_5t, p_ring, rule_filter, flow_tag_id), + m_b_active(false), m_b_reserved(false) +{ + ring_simple* p_check_ring = dynamic_cast(p_ring); + + if (!p_check_ring) { + rfs_logpanic("Incompatible ring type"); + } + + m_p_gro_mgr = &(p_check_ring->m_gro_mgr); + m_n_buf_max = m_p_gro_mgr->get_buf_max(); + uint32_t mtu = p_check_ring->get_mtu(); + m_n_byte_max = m_p_gro_mgr->get_byte_max() - mtu; + memset(&m_gro_desc, 0, sizeof(m_gro_desc)); +} + +bool rfs_uc_tcp_gro::rx_dispatch_packet(mem_buf_desc_t* p_rx_pkt_mem_buf_desc_info, void* pv_fd_ready_array /* = NULL */) +{ + struct iphdr* p_ip_h = p_rx_pkt_mem_buf_desc_info->rx.tcp.p_ip_h; + struct tcphdr* p_tcp_h = p_rx_pkt_mem_buf_desc_info->rx.tcp.p_tcp_h; + + if (!m_b_active) { + if (!m_b_reserved && m_p_gro_mgr->is_stream_max()) { + goto out; + } + } 
+ + if (!tcp_ip_check(p_rx_pkt_mem_buf_desc_info, p_ip_h, p_tcp_h)) { + if (m_b_active) { + flush_gro_desc(pv_fd_ready_array); + } + goto out; + } + + if (!m_b_active) { + if (!m_b_reserved) { + m_b_reserved = m_p_gro_mgr->reserve_stream(this); + } + init_gro_desc(p_rx_pkt_mem_buf_desc_info, p_ip_h, p_tcp_h); + m_b_active = true; + } else { + if (ntohl(p_tcp_h->seq) != m_gro_desc.next_seq) { + flush_gro_desc(pv_fd_ready_array); + goto out; + } + + if (!timestamp_check(p_tcp_h)) { + flush_gro_desc(pv_fd_ready_array); + goto out; + } + + add_packet(p_rx_pkt_mem_buf_desc_info, p_ip_h, p_tcp_h); + } + + if (m_gro_desc.buf_count >= m_n_buf_max || m_gro_desc.ip_tot_len >= m_n_byte_max) { + flush_gro_desc(pv_fd_ready_array); + } + + return true; + +out: + return rfs_uc::rx_dispatch_packet(p_rx_pkt_mem_buf_desc_info, pv_fd_ready_array); +} + +void rfs_uc_tcp_gro::add_packet(mem_buf_desc_t* mem_buf_desc, struct iphdr* p_ip_h, tcphdr* p_tcp_h) +{ + m_gro_desc.buf_count++; + m_gro_desc.ip_tot_len += mem_buf_desc->rx.sz_payload; + m_gro_desc.next_seq += mem_buf_desc->rx.sz_payload; + m_gro_desc.wnd = p_tcp_h->window; + m_gro_desc.ack = p_tcp_h->ack_seq; + + uint32_t* topt; + if (m_gro_desc.ts_present) { + topt = (uint32_t *) (p_tcp_h + 1); + m_gro_desc.tsecr = *(topt + 2); + } + + mem_buf_desc->reset_ref_count(); + + mem_buf_desc->lwip_pbuf.pbuf.flags = PBUF_FLAG_IS_CUSTOM; + mem_buf_desc->lwip_pbuf.pbuf.len = mem_buf_desc->lwip_pbuf.pbuf.tot_len = mem_buf_desc->rx.sz_payload; + mem_buf_desc->lwip_pbuf.pbuf.ref = 1; + mem_buf_desc->lwip_pbuf.pbuf.type = PBUF_REF; + mem_buf_desc->lwip_pbuf.pbuf.next = NULL; + mem_buf_desc->lwip_pbuf.pbuf.payload = (u8_t *)mem_buf_desc->p_buffer + mem_buf_desc->rx.tcp.n_transport_header_len + ntohs(p_ip_h->tot_len) - mem_buf_desc->rx.sz_payload; + + + m_gro_desc.p_last->lwip_pbuf.pbuf.next = &(mem_buf_desc->lwip_pbuf.pbuf); + m_gro_desc.p_last->p_next_desc = NULL; + mem_buf_desc->p_prev_desc = m_gro_desc.p_last; + m_gro_desc.p_last = 
mem_buf_desc;
+}
+
+void rfs_uc_tcp_gro::flush(void* pv_fd_ready_array)
+{
+	flush_gro_desc(pv_fd_ready_array);
+	m_b_reserved = false;
+}
+
+/* Overlay for a TCP header followed by the 12-byte timestamp option block. */
+struct __attribute__((packed)) tcphdr_ts
+{
+	tcphdr p_tcp_h;
+	uint32_t popts[3];
+};
+
+void rfs_uc_tcp_gro::flush_gro_desc(void* pv_fd_ready_array)
+{
+	/* Fix: the dynamic_cast template argument was lost in extraction (bare
+	 * "dynamic_cast(m_p_ring)" does not compile). */
+	ring_simple* p_ring = dynamic_cast<ring_simple*>(m_p_ring);
+
+	if (!p_ring) {
+		rfs_logpanic("Incompatible ring type");
+	}
+
+	if (!m_b_active) return;
+
+	if (m_gro_desc.buf_count > 1) {
+		// patch the first segment's headers to describe the aggregate
+		m_gro_desc.p_ip_h->tot_len = htons(m_gro_desc.ip_tot_len);
+		m_gro_desc.p_tcp_h->ack_seq = m_gro_desc.ack;
+		m_gro_desc.p_tcp_h->window = m_gro_desc.wnd;
+
+		if (m_gro_desc.ts_present) {
+			tcphdr_ts* p_tcp_ts_h = (tcphdr_ts*) m_gro_desc.p_tcp_h;
+			p_tcp_ts_h->popts[2] = m_gro_desc.tsecr;
+		}
+
+		m_gro_desc.p_first->rx.tcp.gro = 1;
+
+		m_gro_desc.p_first->lwip_pbuf.pbuf.flags = PBUF_FLAG_IS_CUSTOM;
+		m_gro_desc.p_first->lwip_pbuf.pbuf.tot_len = m_gro_desc.p_first->lwip_pbuf.pbuf.len = (m_gro_desc.p_first->sz_data - m_gro_desc.p_first->rx.tcp.n_transport_header_len);
+		m_gro_desc.p_first->lwip_pbuf.pbuf.ref = 1;
+		m_gro_desc.p_first->lwip_pbuf.pbuf.type = PBUF_REF;
+		m_gro_desc.p_first->lwip_pbuf.pbuf.payload = (u8_t *)(m_gro_desc.p_first->p_buffer + m_gro_desc.p_first->rx.tcp.n_transport_header_len);
+		m_gro_desc.p_first->rx.is_vma_thr = m_gro_desc.p_last->rx.is_vma_thr;
+
+		// accumulate tot_len back-to-front so each pbuf sees the sum of its tail
+		for (mem_buf_desc_t* p_desc = m_gro_desc.p_last; p_desc != m_gro_desc.p_first; p_desc = p_desc->p_prev_desc) {
+			p_desc->p_prev_desc->lwip_pbuf.pbuf.tot_len += p_desc->lwip_pbuf.pbuf.tot_len;
+		}
+	}
+
+	__log_func("Rx LRO TCP segment info: src_port=%d, dst_port=%d, flags='%s%s%s%s%s%s' seq=%u, ack=%u, win=%u, payload_sz=%u, num_bufs=%u",
+		   ntohs(m_gro_desc.p_tcp_h->source), ntohs(m_gro_desc.p_tcp_h->dest),
+		   m_gro_desc.p_tcp_h->urg?"U":"", m_gro_desc.p_tcp_h->ack?"A":"", m_gro_desc.p_tcp_h->psh?"P":"",
+		   m_gro_desc.p_tcp_h->rst?"R":"", m_gro_desc.p_tcp_h->syn?"S":"", m_gro_desc.p_tcp_h->fin?"F":"",
ntohl(m_gro_desc.p_tcp_h->seq), ntohl(m_gro_desc.p_tcp_h->ack_seq), ntohs(m_gro_desc.p_tcp_h->window), + m_gro_desc.ip_tot_len - 40, m_gro_desc.buf_count); + + if (!rfs_uc::rx_dispatch_packet(m_gro_desc.p_first, pv_fd_ready_array)) { + p_ring->reclaim_recv_buffers_no_lock(m_gro_desc.p_first); + } + + m_b_active = false; +} + +void rfs_uc_tcp_gro::init_gro_desc(mem_buf_desc_t* mem_buf_desc, iphdr* p_ip_h, tcphdr* p_tcp_h) +{ + m_gro_desc.p_first = m_gro_desc.p_last = mem_buf_desc; + m_gro_desc.buf_count = 1; + m_gro_desc.p_ip_h = p_ip_h; + m_gro_desc.p_tcp_h = p_tcp_h; + m_gro_desc.ip_tot_len = ntohs(p_ip_h->tot_len); + m_gro_desc.ack = p_tcp_h->ack_seq; + m_gro_desc.next_seq = ntohl(p_tcp_h->seq) + mem_buf_desc->rx.sz_payload; + m_gro_desc.wnd = p_tcp_h->window; + m_gro_desc.ts_present = 0; + if (p_tcp_h->doff == TCP_H_LEN_TIMESTAMP) { + uint32_t* topt = (uint32_t*)(p_tcp_h + 1); + m_gro_desc.ts_present = 1; + m_gro_desc.tsval = *(topt+1); + m_gro_desc.tsecr = *(topt+2); + } +} + +bool rfs_uc_tcp_gro::tcp_ip_check(mem_buf_desc_t* mem_buf_desc, iphdr* p_ip_h, tcphdr* p_tcp_h) +{ + + if (mem_buf_desc->rx.sz_payload == 0) { + return false; + } + + if (p_ip_h->ihl != IP_H_LEN_NO_OPTIONS) { + return false; + } + + if (p_tcp_h->urg || !p_tcp_h->ack || p_tcp_h->rst || p_tcp_h->syn || p_tcp_h->fin) { + return false; + } + + if (p_tcp_h->doff != TCP_H_LEN_NO_OPTIONS && p_tcp_h->doff != TCP_H_LEN_TIMESTAMP) { + return false; + } + + return true; +} + +bool rfs_uc_tcp_gro::timestamp_check(tcphdr* p_tcp_h) +{ + if (p_tcp_h->doff == TCP_H_LEN_TIMESTAMP) { + uint32_t* topt = (uint32_t*)(p_tcp_h + 1); + if (*topt != htonl((TCPOPT_NOP << 24) | + (TCPOPT_NOP << 16) | + (TCPOPT_TIMESTAMP << 8) | + TCPOLEN_TIMESTAMP)) { + return false; + } + + topt++; + + if (ntohl(*topt) < ntohl(m_gro_desc.tsval)) { + + } + + topt++; + + if (*topt == 0) { + return false; + } + + } + return true; +} diff --git a/src/vma/dev/rfs_uc_tcp_gro.h b/src/vma/dev/rfs_uc_tcp_gro.h new file mode 100644 index 
0000000..de21ec0 --- /dev/null +++ b/src/vma/dev/rfs_uc_tcp_gro.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef RFS_UC_TCP_GRO_H +#define RFS_UC_TCP_GRO_H + +#include "vma/dev/rfs_uc.h" +#include +/** + * @class rfs_uc_tcp_gro + * + * Object to manages the sink list of a UC TCP GRO flow + * This object is used for maintaining the sink list and dispatching packets + * + */ + +struct gro_mem_buf_desc { + mem_buf_desc_t* p_first; + mem_buf_desc_t* p_last; + iphdr* p_ip_h; + tcphdr* p_tcp_h; + uint32_t buf_count; + uint32_t next_seq; + uint32_t ack; + uint32_t ts_present; + uint32_t tsval; + uint32_t tsecr; + uint16_t ip_tot_len; + uint16_t wnd; +} typedef gro_mem_buf_desc_t; + +class gro_mgr; + +class rfs_uc_tcp_gro : public rfs_uc +{ +public: + rfs_uc_tcp_gro(flow_tuple *flow_spec_5t, ring_slave *p_ring, + rfs_rule_filter* rule_filter = NULL, uint32_t flow_tag_id = 0); + + virtual bool rx_dispatch_packet(mem_buf_desc_t* p_rx_wc_buf_desc, void* pv_fd_ready_array); + + void flush(void* pv_fd_ready_array); + +private: + + inline void flush_gro_desc(void* pv_fd_ready_array); + inline void add_packet(mem_buf_desc_t* mem_buf_desc, iphdr* p_ip_h, tcphdr* p_tcp_h); + inline void init_gro_desc(mem_buf_desc_t* mem_buf_desc, iphdr* p_ip_h, tcphdr* p_tcp_h); + inline bool tcp_ip_check(mem_buf_desc_t* mem_buf_desc, iphdr* p_ip_h, tcphdr* p_tcp_h); + inline bool timestamp_check(tcphdr* p_tcp_h); + + gro_mgr* m_p_gro_mgr; + bool m_b_active; + bool m_b_reserved; + struct gro_mem_buf_desc m_gro_desc; + uint32_t m_n_buf_max; + uint32_t m_n_byte_max; +}; + + +#endif /* RFS_UC_TCP_GRO_H */ diff --git a/src/vma/dev/ring.cpp b/src/vma/dev/ring.cpp new file mode 100644 index 0000000..d1bdcc7 --- /dev/null +++ b/src/vma/dev/ring.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "ring.h" +#include "vma/proto/route_table_mgr.h" + +#undef MODULE_NAME +#define MODULE_NAME "ring" +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + +ring::ring() : + m_p_n_rx_channel_fds(NULL), m_parent(NULL) +{ + m_if_index = 0; + + print_val(); +} + +ring::~ring() +{ +} + +void ring::print_val() +{ + ring_logdbg("%d: 0x%X: parent 0x%X", + m_if_index, this, ((uintptr_t)this == (uintptr_t)m_parent ? 0 : m_parent)); +} diff --git a/src/vma/dev/ring.h b/src/vma/dev/ring.h new file mode 100644 index 0000000..6e8ad64 --- /dev/null +++ b/src/vma/dev/ring.h @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef RING_H +#define RING_H + +#include "vma/ib/base/verbs_extra.h" +#include "vma/proto/flow_tuple.h" +#include "vma/sock/socket_fd_api.h" + +class pkt_rcvr_sink; + +#define ring_logpanic __log_info_panic +#define ring_logerr __log_info_err +#define ring_logwarn __log_info_warn +#define ring_loginfo __log_info_info +#define ring_logdbg __log_info_dbg +#define ring_logfunc __log_info_func +#define ring_logfuncall __log_info_funcall +#define ring_logfine __log_info_fine + +typedef enum { + CQT_RX, + CQT_TX +} cq_type_t; + +typedef int ring_user_id_t; + +/* Ring event completion */ +struct ring_ec { + struct list_head list; + struct vma_completion_t completion; + struct vma_buff_t* last_buff_lst; + + inline void clear() + { + INIT_LIST_HEAD(&list); + memset(&completion, 0, sizeof(completion)); + last_buff_lst = NULL; + } +}; + +class ring +{ +public: + ring(); + + virtual ~ring(); + + virtual void print_val(); + + virtual bool attach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink) = 0; + virtual bool detach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink) = 0; + + virtual void restart() = 0; + + // Funcs taken from qp_mgr.h + // Get/Release memory buffer descriptor with a linked data memory buffer + virtual mem_buf_desc_t* mem_buf_tx_get(ring_user_id_t id, bool b_block, int n_num_mem_bufs = 1) = 0; + virtual int mem_buf_tx_release(mem_buf_desc_t* p_mem_buf_desc_list, bool b_accounting, bool trylock = false) = 0; + virtual void send_ring_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) = 0; + virtual void send_lwip_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) = 0; + + // Funcs taken from cq_mgr.h + virtual int get_num_resources() const = 0; + int* get_rx_channel_fds() const { return m_p_n_rx_channel_fds; }; + virtual int get_tx_channel_fd() const { return -1; }; + virtual bool get_hw_dummy_send_support(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe) = 0; + virtual int 
request_notification(cq_type_t cq_type, uint64_t poll_sn) = 0; + virtual bool reclaim_recv_buffers(descq_t *rx_reuse) = 0; + virtual bool reclaim_recv_buffers(mem_buf_desc_t* rx_reuse_lst) = 0; + virtual int drain_and_proccess() = 0; + virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL) = 0; + virtual int poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL) = 0; + virtual void adapt_cq_moderation() = 0; + virtual void mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t* p_mem_buf_desc) = 0; + + virtual void inc_tx_retransmissions_stats(ring_user_id_t id) = 0; + virtual bool is_member(ring_slave* rng) = 0; + virtual bool is_active_member(ring_slave* rng, ring_user_id_t id) = 0; + ring* get_parent() { return m_parent; }; + ring_user_id_t generate_id() { return 0; }; + virtual ring_user_id_t generate_id(const address_t src_mac, const address_t dst_mac, uint16_t eth_proto, uint16_t encap_proto, uint32_t src_ip, uint32_t dst_ip, uint16_t src_port, uint16_t dst_port) = 0; + virtual int modify_ratelimit(struct vma_rate_limit_t &rate_limit) = 0; + virtual uint32_t get_max_inline_data() = 0; +#ifdef DEFINED_TSO + virtual uint32_t get_max_send_sge(void) = 0; + virtual uint32_t get_max_payload_sz(void) = 0; + virtual uint16_t get_max_header_sz(void) = 0; + virtual uint32_t get_tx_lkey(ring_user_id_t id) = 0; + virtual bool is_tso(void) = 0; +#endif /* DEFINED_TSO */ + virtual int reg_mr(void *addr, size_t length, uint32_t &lkey) { NOT_IN_USE(addr); NOT_IN_USE(length); NOT_IN_USE(lkey); return -1;}; + virtual int dereg_mr(void *addr, size_t length) { NOT_IN_USE(addr);NOT_IN_USE(length); return -1;}; + + virtual int socketxtreme_poll(struct vma_completion_t *vma_completions, unsigned int ncompletions, int flags) = 0; + + virtual bool is_socketxtreme(void) = 0; + virtual void put_ec(struct ring_ec *ec) = 0; + virtual void del_ec(struct ring_ec *ec) = 0; + virtual struct 
vma_completion_t *get_comp(void) = 0; + + inline int get_if_index() { return m_if_index; } + +protected: + inline void set_parent(ring* parent) { m_parent = ( parent ? parent : this); } + inline void set_if_index(int if_index) { m_if_index = if_index; } + + int* m_p_n_rx_channel_fds; + ring* m_parent; + + int m_if_index; /* Interface index */ +}; + +#endif /* RING_H */ diff --git a/src/vma/dev/ring_allocation_logic.cpp b/src/vma/dev/ring_allocation_logic.cpp new file mode 100644 index 0000000..3bc7b25 --- /dev/null +++ b/src/vma/dev/ring_allocation_logic.cpp @@ -0,0 +1,257 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include "vma/dev/ring_allocation_logic.h" + + +#define MODULE_NAME "ral" + +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "%s:%d:%s() " +#undef __INFO__ +#define __INFO__ m_tostr.c_str() + +#define ral_logpanic __log_info_panic +#define ral_logerr __log_info_err +#define ral_logwarn __log_info_warn +#define ral_loginfo __log_info_info +#define ral_logdbg __log_info_dbg +#define ral_logfunc __log_info_func +#define ral_logfuncall __log_info_funcall + +ring_allocation_logic::ring_allocation_logic():m_ring_migration_ratio(0), + m_source(-1), + m_migration_try_count(0), + m_migration_candidate(0), + m_active(true), + m_res_key() {} + +ring_allocation_logic::ring_allocation_logic(ring_logic_t allocation_logic, + int ring_migration_ratio, source_t source, + resource_allocation_key &ring_profile): + m_tostr("base"), m_ring_migration_ratio(ring_migration_ratio), + m_source(source), m_migration_try_count(ring_migration_ratio) +{ + if (ring_profile.get_ring_alloc_logic() == RING_LOGIC_PER_INTERFACE && + ring_profile.get_ring_profile_key() < START_RING_INDEX) { + ring_profile.set_ring_alloc_logic(allocation_logic); + } + m_res_key = resource_allocation_key(ring_profile); + m_migration_candidate = 0; + m_res_key.set_user_id_key(calc_res_key_by_logic()); + + m_active = true; +} + +/** + * + * @return the key that is part of a unique id in rings map + */ +uint64_t ring_allocation_logic::calc_res_key_by_logic() +{ + uint64_t res_key = 0; + switch (m_res_key.get_ring_alloc_logic()) { + case RING_LOGIC_PER_INTERFACE: + res_key = 0; + if (safe_mce_sys().tcp_ctl_thread > CTL_THREAD_DISABLE) + res_key = 1; + break; + case RING_LOGIC_PER_IP: + res_key = m_source.m_ip; + break; + case RING_LOGIC_PER_SOCKET: + res_key = m_source.m_fd; + break; + case RING_LOGIC_PER_USER_ID: + res_key = m_res_key.get_user_id_key(); + break; + case RING_LOGIC_PER_THREAD: + res_key = pthread_self(); + break; + case RING_LOGIC_PER_CORE: + case 
RING_LOGIC_PER_CORE_ATTACH_THREADS: + res_key = sched_getcpu(); + break; + BULLSEYE_EXCLUDE_BLOCK_START + default: + //not suppose to get here + ral_logdbg("non-valid ring logic = %d", m_res_key.get_ring_alloc_logic()); + break; + BULLSEYE_EXCLUDE_BLOCK_END + } + return res_key; +} + +resource_allocation_key* ring_allocation_logic::create_new_key(in_addr_t addr, int suggested_cpu /* = NO_CPU */) +{ + if (m_res_key.get_ring_alloc_logic() == RING_LOGIC_PER_CORE_ATTACH_THREADS) { + pthread_t tid = pthread_self(); + int cpu = g_cpu_manager.reserve_cpu_for_thread(tid, suggested_cpu); + if (cpu >= 0) { + m_res_key.set_user_id_key(cpu); + return &m_res_key; + } + } + + if (m_res_key.get_ring_alloc_logic() == RING_LOGIC_PER_IP) { + m_source.m_ip = addr; + } + + m_res_key.set_user_id_key(calc_res_key_by_logic()); + return &m_res_key; +} + +/* + * return true if ring migration is recommended. + */ +bool ring_allocation_logic::should_migrate_ring() +{ + ral_logfuncall("currently accessed from thread=%lu, cpu=%d", pthread_self(), sched_getcpu()); + + if (false == m_active) { + return false; + } + + int count_max = m_ring_migration_ratio; + if (m_migration_candidate) { + count_max = CANDIDATE_STABILITY_ROUNDS; + uint64_t new_id = calc_res_key_by_logic(); + if (m_migration_candidate != new_id) { + m_migration_candidate = 0; + m_migration_try_count = 0; + return false; + } + } + + + if (m_migration_try_count < count_max) { + m_migration_try_count++; + return false; + } else { + m_migration_try_count = 0; + } + + if (!m_migration_candidate) { + // save current used allocation key + // no need to save profile, and allocation logic + uint64_t curr_id = m_res_key.get_user_id_key(); + // calc new key + uint64_t new_id = calc_res_key_by_logic(); + if (new_id == curr_id || g_n_internal_thread_id == curr_id) { + return false; + } + m_migration_candidate = new_id; + return false; + } + + ral_logdbg("migrating from ring of id=%s to ring of id=%lu", + m_res_key.to_str(), 
m_migration_candidate); + m_migration_candidate = 0; + + return true; +} + +cpu_manager g_cpu_manager; +__thread int g_n_thread_cpu_core = NO_CPU; + +cpu_manager::cpu_manager() +{ + reset(); +} + +void cpu_manager::reset() +{ + memset(m_cpu_thread_count, 0, sizeof(m_cpu_thread_count)); +} + +int cpu_manager::reserve_cpu_for_thread(pthread_t tid, int suggested_cpu /* = NO_CPU */) +{ + lock(); + int cpu = g_n_thread_cpu_core; + if (cpu != NO_CPU) { //already reserved + unlock(); + return cpu; + } + + cpu_set_t cpu_set; + CPU_ZERO(&cpu_set); + + int ret = pthread_getaffinity_np(tid, sizeof(cpu_set_t), &cpu_set); + if (ret) { + unlock(); + __log_err("pthread_getaffinity_np failed for tid=%lu, ret=%d (errno=%d %m)", tid, ret, errno); + return -1; + } + + int avail_cpus = CPU_COUNT(&cpu_set); + if (avail_cpus == 0) { + unlock(); + __log_err("no cpu available for tid=%lu", tid); + return -1; + } + + if (avail_cpus == 1) { //already attached + for (cpu = 0; cpu < MAX_CPU && !CPU_ISSET(cpu, &cpu_set); cpu++) {} + } else { //need to choose one cpu to attach to + int min_cpu_count = -1; + for (int i = 0, j = 0; i < MAX_CPU && j < avail_cpus; i++) { + if (!CPU_ISSET(i, &cpu_set)) continue; + j++; + if (min_cpu_count < 0 || m_cpu_thread_count[i] < min_cpu_count) { + min_cpu_count = m_cpu_thread_count[i]; + cpu = i; + } + } + if (suggested_cpu >= 0 + && CPU_ISSET(suggested_cpu, &cpu_set) + && m_cpu_thread_count[suggested_cpu] <= min_cpu_count + 1 ) { + cpu = suggested_cpu; + } + CPU_ZERO(&cpu_set); + CPU_SET(cpu, &cpu_set); + __log_dbg("attach tid=%lu running on cpu=%d to cpu=%d", tid, sched_getcpu(), cpu); + ret = pthread_setaffinity_np(tid, sizeof(cpu_set_t), &cpu_set); + if (ret) { + unlock(); + __log_err("pthread_setaffinity_np failed for tid=%lu to cpu=%d, ret=%d (errno=%d %m)", tid, cpu, ret, errno); + return -1; + } + } + + g_n_thread_cpu_core = cpu; + if (cpu > NO_CPU && cpu < MAX_CPU) + m_cpu_thread_count[cpu]++; + unlock(); + return cpu; +} diff --git 
a/src/vma/dev/ring_allocation_logic.h b/src/vma/dev/ring_allocation_logic.h new file mode 100644 index 0000000..6e35946 --- /dev/null +++ b/src/vma/dev/ring_allocation_logic.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef RING_ALLOCATION_LOGIC_H_ +#define RING_ALLOCATION_LOGIC_H_ + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "vma/dev/net_device_table_mgr.h" +#include "vma/util/sys_vars.h" +#include "vma_extra.h" + +#define CANDIDATE_STABILITY_ROUNDS 20 + +#define RAL_TOSTR(to, type, owner) {char buf[100];sprintf(buf, "[%s=%p]",(type),(owner));(to) = buf;} + +#define MAX_CPU CPU_SETSIZE +#define NO_CPU -1 + +class source_t { +public: + + int m_fd; + in_addr_t m_ip; + + source_t(int fd) : m_fd(fd), m_ip(INADDR_ANY) {} + source_t(in_addr_t ip) : m_fd(-1), m_ip(ip) {} +}; + +/** + * this class is responsible for the AL (allocation logic). + * i gets the AL from the socket\environment variable and return + * a key which represent the resource behind the allocation logic, it can + * be the cpu witch the thread runs on or the threadID... + * this key is part of the ring key configured in ring_alloc_logic_attr + */ +class ring_allocation_logic +{ +protected: + ring_allocation_logic(); + ring_allocation_logic(ring_logic_t ring_allocation_logic, + int ring_migration_ratio, source_t source, + resource_allocation_key &ring_profile); + + +public: + /* careful, you'll lose the previous key !! 
*/ + resource_allocation_key* create_new_key(in_addr_t addr, int suggested_cpu = NO_CPU); + + resource_allocation_key* get_key() { return &m_res_key; } + + bool should_migrate_ring(); + bool is_logic_support_migration() { return m_res_key.get_ring_alloc_logic() >= RING_LOGIC_PER_THREAD && m_ring_migration_ratio > 0;} + uint64_t calc_res_key_by_logic(); + inline void enable_migration(bool active) { m_active = active; } +protected: + string m_tostr; + +private: + int m_ring_migration_ratio; + source_t m_source; + int m_migration_try_count; + uint64_t m_migration_candidate; + bool m_active; + resource_allocation_key m_res_key; +}; + +class ring_allocation_logic_rx : public ring_allocation_logic +{ +public: + ring_allocation_logic_rx():ring_allocation_logic(){} + ring_allocation_logic_rx(source_t source, resource_allocation_key &ring_profile, const void* owner): + ring_allocation_logic(safe_mce_sys().ring_allocation_logic_rx, + safe_mce_sys().ring_migration_ratio_rx, + source, ring_profile) { + RAL_TOSTR(m_tostr, "Rx", owner); + } +}; + +class ring_allocation_logic_tx : public ring_allocation_logic +{ +public: + ring_allocation_logic_tx():ring_allocation_logic(){} + ring_allocation_logic_tx(source_t source, resource_allocation_key &ring_profile, const void* owner): + ring_allocation_logic(safe_mce_sys().ring_allocation_logic_tx, + safe_mce_sys().ring_migration_ratio_tx, + source, ring_profile) { + RAL_TOSTR(m_tostr, "Tx",owner); + } +}; + + +class cpu_manager; +extern cpu_manager g_cpu_manager; + +class cpu_manager : public lock_mutex +{ +public: + cpu_manager(); + void reset(); + int reserve_cpu_for_thread(pthread_t tid, int suggested_cpu = NO_CPU); + +private: + int m_cpu_thread_count[MAX_CPU]; +}; + +#endif /* RING_ALLOCATION_LOGIC_H_ */ diff --git a/src/vma/dev/ring_bond.cpp b/src/vma/dev/ring_bond.cpp new file mode 100644 index 0000000..cd5745a --- /dev/null +++ b/src/vma/dev/ring_bond.cpp @@ -0,0 +1,881 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, 
Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ring_bond.h" + +#include "vma/sock/sockinfo.h" +#include "vma/dev/ring_simple.h" +#include "vma/dev/ring_tap.h" + +#undef MODULE_NAME +#define MODULE_NAME "ring_bond" +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + +/* Set limitation for number of rings for bonding device */ +#define MAX_NUM_RING_RESOURCES 10 + +ring_bond::ring_bond(int if_index) : + ring(), + m_lock_ring_rx("ring_bond:lock_rx"), m_lock_ring_tx("ring_bond:lock_tx") +{ + net_device_val* p_ndev = NULL; + + /* Configure ring() fields */ + set_parent(this); + set_if_index(if_index); + + /* Sanity check */ + p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + if (NULL == p_ndev) { + ring_logpanic("Invalid if_index = %d", if_index); + } + + /* Configure ring_bond() fields */ + m_bond_rings.clear(); + m_type = p_ndev->get_is_bond(); + m_xmit_hash_policy = p_ndev->get_bond_xmit_hash_policy(); + m_max_inline_data = 0; +#ifdef DEFINED_TSO + m_max_send_sge = 0; +#endif /* DEFINED_TSO */ + + print_val(); +} + +ring_bond::~ring_bond() +{ + print_val(); + + m_rx_flows.clear(); + + ring_slave_vector_t::iterator iter = m_bond_rings.begin(); + for (; iter != m_bond_rings.end(); iter++) { + delete *iter; + } + m_bond_rings.clear(); + + if (m_p_n_rx_channel_fds) { + delete[] m_p_n_rx_channel_fds; + } +} + +void ring_bond::print_val() +{ + ring_logdbg("%d: 0x%X: parent 0x%X type %s", + m_if_index, this, + ((uintptr_t)this == (uintptr_t)m_parent ? 
0 : m_parent), + "bond"); +} + +bool ring_bond::attach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink) +{ + bool ret = true; + struct flow_sink_t value = {flow_spec_5t, sink}; + + auto_unlocker lock(m_lock_ring_rx); + + /* Map flow in local map */ + m_rx_flows.push_back(value); + + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + bool step_ret = m_bond_rings[i]->attach_flow(flow_spec_5t, sink); + ret = ret && step_ret; + } + + return ret; +} + +bool ring_bond::detach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink) +{ + bool ret = true; + struct flow_sink_t value = {flow_spec_5t, sink}; + + auto_unlocker lock(m_lock_ring_rx); + + std::vector::iterator iter; + for (iter = m_rx_flows.begin(); iter != m_rx_flows.end(); iter++) { + struct flow_sink_t cur = *iter; + if ((cur.flow == value.flow) && (cur.sink == value.sink)) { + m_rx_flows.erase(iter); + break; + } + } + + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + bool step_ret = m_bond_rings[i]->detach_flow(flow_spec_5t, sink); + ret = ret && step_ret; + } + + return ret; +} + +void ring_bond::restart() +{ + net_device_val* p_ndev = + g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + + if (NULL == p_ndev) { + return; + } + const slave_data_vector_t& slaves = p_ndev->get_slave_array(); + + ring_logdbg("*** ring restart! 
***"); + + m_lock_ring_rx.lock(); + m_lock_ring_tx.lock(); + + if(p_ndev->get_is_bond() == net_device_val::NETVSC) { + ring_bond_netvsc* p_ring_bond_netvsc = dynamic_cast(this); + if (p_ring_bond_netvsc) { + ring_tap* p_ring_tap = dynamic_cast(p_ring_bond_netvsc->m_tap_ring); + if (p_ring_tap) { + size_t num_ring_rx_fds = 0; + int *ring_rx_fds_array = NULL; + int epfd = -1; + int fd = -1; + int rc = 0; + size_t i, j, k; + NOT_IN_USE(rc); // Suppress --enable-opt-log=high warning + + if (slaves.empty()) { + num_ring_rx_fds = p_ring_bond_netvsc->m_vf_ring->get_num_resources(); + ring_rx_fds_array = p_ring_bond_netvsc->m_vf_ring->get_rx_channel_fds(); + + for (k = 0; k < num_ring_rx_fds; k++ ) { + epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); + if (epfd > 0) { + fd = ring_rx_fds_array[k]; + rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); + ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); + } + } + for (j = 0; j < m_rx_flows.size(); j++) { + sockinfo* si = static_cast (m_rx_flows[j].sink); + for (k = 0; k < num_ring_rx_fds; k++ ) { + epfd = si->get_rx_epfd(); + if (epfd > 0) { + fd = ring_rx_fds_array[k]; + rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); + ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); + } + epfd = si->get_epoll_context_fd(); + if (epfd > 0) { + fd = ring_rx_fds_array[k]; + rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_DEL, fd, NULL); + ring_logdbg("Remove fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); + } + } + } + + p_ring_tap->m_active = true; + p_ring_tap->inc_vf_plugouts(); + p_ring_bond_netvsc->slave_destroy(p_ring_bond_netvsc->m_vf_ring->get_if_index()); + p_ring_bond_netvsc->m_vf_ring = NULL; + p_ring_tap->set_vf_ring(NULL); + } else { + for (i = 0; i < slaves.size(); i++) { + if (slaves[i]->if_index != p_ring_tap->get_if_index()) { + p_ring_tap->m_active = false; + slave_create(slaves[i]->if_index); + 
p_ring_tap->set_vf_ring(p_ring_bond_netvsc->m_vf_ring); + + num_ring_rx_fds = p_ring_bond_netvsc->m_vf_ring->get_num_resources(); + ring_rx_fds_array = p_ring_bond_netvsc->m_vf_ring->get_rx_channel_fds(); + + for (k = 0; k < num_ring_rx_fds; k++ ) { + epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); + if (epfd > 0) { + epoll_event ev = {0, {0}}; + fd = ring_rx_fds_array[k]; + ev.events = EPOLLIN; + ev.data.fd = fd; + rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); + } + } + for (j = 0; j < m_rx_flows.size(); j++) { + sockinfo* si = static_cast (m_rx_flows[j].sink); + p_ring_bond_netvsc->m_vf_ring->attach_flow(m_rx_flows[j].flow, m_rx_flows[j].sink); + for (k = 0; k < num_ring_rx_fds; k++ ) { + epfd = si->get_rx_epfd(); + if (epfd > 0) { + epoll_event ev = {0, {0}}; + fd = ring_rx_fds_array[k]; + ev.events = EPOLLIN; + ev.data.fd = fd; + rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); + } + epfd = si->get_epoll_context_fd(); + if (epfd > 0) { + #define CQ_FD_MARK 0xabcd /* see socket_fd_api */ + epoll_event ev = {0, {0}}; + fd = ring_rx_fds_array[k]; + ev.events = EPOLLIN | EPOLLPRI; + ev.data.u64 = (((uint64_t)CQ_FD_MARK << 32) | fd); + rc = orig_os_api.epoll_ctl(epfd, EPOLL_CTL_ADD, fd, &ev); + ring_logdbg("Add fd=%d from epfd=%d rc=%d errno=%d", fd, epfd, rc, errno); + } + } + } + break; + } + } + } + } + } + } else { + /* for active-backup mode + * It is guaranteed that the first slave is active by popup_active_rings() + */ + ring_simple* previously_active = dynamic_cast(m_bond_rings[0]); + + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + ring_simple* tmp_ring = dynamic_cast(m_bond_rings[i]); + + if (!tmp_ring) { + continue; + } + + for (uint32_t j = 0; j < slaves.size() ; j ++) { + + if (slaves[j]->if_index != m_bond_rings[i]->get_if_index()) { + continue; + } + + if 
(slaves[j]->active) { + ring_logdbg("ring %d active", i); + tmp_ring->start_active_qp_mgr(); + m_bond_rings[i]->m_active = true; + } else { + ring_logdbg("ring %d not active", i); + tmp_ring->stop_active_qp_mgr(); + m_bond_rings[i]->m_active = false; + } + } + } + popup_active_rings(); + + int ret = 0; + uint64_t poll_sn = cq_mgr::m_n_global_sn; + ret = request_notification(CQT_RX, poll_sn); + if (ret < 0) { + ring_logdbg("failed arming rx cq_mgr (errno=%d %m)", errno); + } + ret = request_notification(CQT_TX, poll_sn); + if (ret < 0) { + ring_logdbg("failed arming tx cq_mgr (errno=%d %m)", errno); + } + + if (m_type == net_device_val::ACTIVE_BACKUP) { + ring_simple* currently_active = dynamic_cast(m_bond_rings[0]); + if (currently_active && safe_mce_sys().cq_moderation_enable) { + if (likely(previously_active)) { + currently_active->m_cq_moderation_info.period = previously_active->m_cq_moderation_info.period; + currently_active->m_cq_moderation_info.count = previously_active->m_cq_moderation_info.count; + } + else { + currently_active->m_cq_moderation_info.period = safe_mce_sys().cq_moderation_period_usec; + currently_active->m_cq_moderation_info.count = safe_mce_sys().cq_moderation_count; + } + + currently_active->modify_cq_moderation(safe_mce_sys().cq_moderation_period_usec, safe_mce_sys().cq_moderation_count); + } + } + } + + m_lock_ring_tx.unlock(); + m_lock_ring_rx.unlock(); + + ring_logdbg("*** ring restart done! 
***"); +} + +void ring_bond::adapt_cq_moderation() +{ + if (m_lock_ring_rx.trylock()) { + return ; + } + + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + if (m_bond_rings[i]->is_up()) + m_bond_rings[i]->adapt_cq_moderation(); + } + + m_lock_ring_rx.unlock(); +} + +mem_buf_desc_t* ring_bond::mem_buf_tx_get(ring_user_id_t id, bool b_block, int n_num_mem_bufs /* default = 1 */) +{ + mem_buf_desc_t* ret = NULL; + + auto_unlocker lock(m_lock_ring_tx); + ret = m_bond_rings[id]->mem_buf_tx_get(id, b_block, n_num_mem_bufs); + + return ret; +} + +int ring_bond::mem_buf_tx_release(mem_buf_desc_t* p_mem_buf_desc_list, bool b_accounting, bool trylock/*=false*/) +{ + mem_buf_desc_t* buffer_per_ring[MAX_NUM_RING_RESOURCES]; + int ret = 0; + uint32_t i = 0; + + auto_unlocker lock(m_lock_ring_tx); + + memset(buffer_per_ring, 0, sizeof(buffer_per_ring)); + ret = devide_buffers_helper(p_mem_buf_desc_list, buffer_per_ring); + + for (i = 0; i < m_bond_rings.size(); i++) { + if (buffer_per_ring[i]) { + ret += m_bond_rings[i]->mem_buf_tx_release(buffer_per_ring[i], b_accounting, trylock); + } + } + return ret; +} + +void ring_bond::mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t* p_mem_buf_desc) +{ + p_mem_buf_desc->p_desc_owner->mem_buf_desc_return_single_to_owner_tx(p_mem_buf_desc); +} + +void ring_bond::send_ring_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) +{ + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(p_send_wqe->wr_id); + + auto_unlocker lock(m_lock_ring_tx); + ring_slave* active_ring = m_bond_rings[id]; + + if (is_active_member(p_mem_buf_desc->p_desc_owner, id)) { + active_ring->send_ring_buffer(id, p_send_wqe, attr); + } else { + ring_logfunc("active ring=%p, silent packet drop (%p), (HA event?)", active_ring, p_mem_buf_desc); + p_mem_buf_desc->p_next_desc = NULL; + if (likely(p_mem_buf_desc->p_desc_owner == active_ring)) { + active_ring->mem_buf_tx_release(p_mem_buf_desc, true); + } else { + 
mem_buf_tx_release(p_mem_buf_desc, true); + } + } +} + +void ring_bond::send_lwip_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) +{ + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(p_send_wqe->wr_id); + + auto_unlocker lock(m_lock_ring_tx); + ring_slave* active_ring = m_bond_rings[id]; + + if (is_active_member(p_mem_buf_desc->p_desc_owner, id)) { + active_ring->send_lwip_buffer(id, p_send_wqe, attr); + } else { + ring_logfunc("active ring=%p, silent packet drop (%p), (HA event?)", active_ring, p_mem_buf_desc); + p_mem_buf_desc->p_next_desc = NULL; + /* no need to free the buffer here, as for lwip buffers we have 2 ref counts, */ + /* one for caller, and one for completion. for completion, we ref count in */ + /* send_lwip_buffer(). Since we are not going in, the caller will free the */ + /* buffer. */ + } +} + +bool ring_bond::get_hw_dummy_send_support(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe) +{ + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(p_send_wqe->wr_id); + + auto_unlocker lock(m_lock_ring_tx); + ring_slave* active_ring = m_bond_rings[id]; + + if (is_active_member(p_mem_buf_desc->p_desc_owner, id)) { + return active_ring->get_hw_dummy_send_support(id, p_send_wqe); + } else { + if (likely(p_mem_buf_desc->p_desc_owner == active_ring)) { + return active_ring->get_hw_dummy_send_support(id, p_send_wqe); + } + } + + return false; +} + +int ring_bond::poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array /*NULL*/) +{ + if (m_lock_ring_rx.trylock()) { + errno = EAGAIN; + return 0; + } + + int temp = 0; + int ret = 0; + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + if (m_bond_rings[i]->is_up()) { + //TODO consider returning immediately after finding something, continue next time from next ring + temp = m_bond_rings[i]->poll_and_process_element_rx(p_cq_poll_sn, pv_fd_ready_array); + if (temp > 0) { + ret += temp; + } + } + } + m_lock_ring_rx.unlock(); + if (ret > 0) { + return ret; + } 
else { + return temp; + } +} + +int ring_bond::drain_and_proccess() +{ + if (m_lock_ring_rx.trylock()) { + errno = EAGAIN; + return 0; + } + + int temp = 0; + int ret = 0; + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + if (m_bond_rings[i]->is_up()) { + temp = m_bond_rings[i]->drain_and_proccess(); + if (temp > 0) { + ret += temp; + } + } + } + + m_lock_ring_rx.unlock(); + + if (ret > 0) { + return ret; + } else { + return temp; + } +} + +int ring_bond::wait_for_notification_and_process_element(int cq_channel_fd, uint64_t* p_cq_poll_sn, void* pv_fd_ready_array /*NULL*/) { + if(m_lock_ring_rx.trylock()) { + errno = EAGAIN; + return -1; + } + + int temp = 0; + int ret = 0; + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + if (m_bond_rings[i]->is_up()) { + temp = m_bond_rings[i]->wait_for_notification_and_process_element(cq_channel_fd, p_cq_poll_sn, pv_fd_ready_array); + if (temp > 0) { + ret += temp; + } + } + } + m_lock_ring_rx.unlock(); + if (ret > 0) { + return ret; + } else { + return temp; + } +} + +int ring_bond::request_notification(cq_type_t cq_type, uint64_t poll_sn) +{ + if (likely(CQT_RX == cq_type)) { + if (m_lock_ring_rx.trylock()) { + errno = EAGAIN; + return 1; + } + } else { + if (m_lock_ring_tx.trylock()) { + errno = EAGAIN; + return 1; + } + } + int ret = 0; + int temp; + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + if (m_bond_rings[i]->is_up()) { + temp = m_bond_rings[i]->request_notification(cq_type, poll_sn); + if (temp < 0) { + ret = temp; + break; + } else { + ret += temp; + } + } + } + if (likely(CQT_RX == cq_type)) + m_lock_ring_rx.unlock(); + else + m_lock_ring_tx.unlock(); + return ret; +} + +void ring_bond::inc_tx_retransmissions_stats(ring_user_id_t id) +{ + auto_unlocker lock(m_lock_ring_tx); + ring_slave* active_ring = m_bond_rings[id]; + if (likely(active_ring->m_active)) { + active_ring->inc_tx_retransmissions_stats(id); + } +} + +bool ring_bond::reclaim_recv_buffers(descq_t *rx_reuse) +{ + /* use this local 
array to avoid locking mechanism + * for threads synchronization. So every thread should use + * own array. Set hardcoded number to meet C++11 + * VLA is not an official part of C++11. + */ + descq_t buffer_per_ring[MAX_NUM_RING_RESOURCES]; + uint32_t i = 0; + + if(m_lock_ring_rx.trylock()) { + errno = EAGAIN; + return false; + } + + devide_buffers_helper(rx_reuse, buffer_per_ring); + + for (i = 0; i < m_bond_rings.size(); i++) { + if (buffer_per_ring[i].size() > 0) { + if (!m_bond_rings[i]->reclaim_recv_buffers(&buffer_per_ring[i])) { + g_buffer_pool_rx->put_buffers_after_deref_thread_safe(&buffer_per_ring[i]); + } + } + } + + if (buffer_per_ring[m_bond_rings.size()].size() > 0) { + g_buffer_pool_rx->put_buffers_after_deref_thread_safe(&buffer_per_ring[m_bond_rings.size()]); + } + + m_lock_ring_rx.unlock(); + + return true; +} + +bool ring_bond::reclaim_recv_buffers(mem_buf_desc_t*) +{ + /* TODO: not supported */ + return false; +} + +void ring_bond::update_cap(ring_slave *slave) +{ + if (NULL == slave) { + m_max_inline_data = (uint32_t)(-1); +#ifdef DEFINED_TSO + m_max_send_sge = (uint32_t)(-1); +#endif /* DEFINED_TSO */ + return ; + } + + m_max_inline_data = (m_max_inline_data == (uint32_t)(-1) ? + slave->get_max_inline_data() : + min(m_max_inline_data, slave->get_max_inline_data())); + +#ifdef DEFINED_TSO + m_max_send_sge = (m_max_send_sge == (uint32_t)(-1) ? 
+ slave->get_max_send_sge() : + min(m_max_send_sge, slave->get_max_send_sge())); +#endif /* DEFINED_TSO */ +} + +void ring_bond::devide_buffers_helper(descq_t *rx_reuse, descq_t* buffer_per_ring) +{ + int last_found_index = 0; + while (!rx_reuse->empty()) { + mem_buf_desc_t* buff = rx_reuse->get_and_pop_front(); + uint32_t checked = 0; + int index = last_found_index; + while (checked < m_bond_rings.size()) { + if (m_bond_rings[index] == buff->p_desc_owner) { + buffer_per_ring[index].push_back(buff); + last_found_index = index; + break; + } + checked++; + index++; + index = index % m_bond_rings.size(); + } + //no owner + if (checked == m_bond_rings.size()) { + ring_logfunc("No matching ring %p to return buffer", buff->p_desc_owner); + buffer_per_ring[m_bond_rings.size()].push_back(buff); + } + } +} + +int ring_bond::devide_buffers_helper(mem_buf_desc_t *p_mem_buf_desc_list, mem_buf_desc_t **buffer_per_ring) +{ + mem_buf_desc_t* buffers_last[MAX_NUM_RING_RESOURCES]; + mem_buf_desc_t *head, *current, *temp; + ring_slave* last_owner; + int count = 0; + int ret = 0; + + memset(buffers_last, 0, sizeof(buffers_last)); + head = p_mem_buf_desc_list; + while (head) { + last_owner = head->p_desc_owner; + current = head; + count = 1; + while(head && head->p_next_desc && head->p_next_desc->p_desc_owner == last_owner) { + head = head->p_next_desc; + count++; + } + uint32_t i = 0; + for (i = 0; i < m_bond_rings.size(); i++) { + if (m_bond_rings[i] == last_owner) { + if (buffers_last[i]) { + buffers_last[i]->p_next_desc = current; + buffers_last[i] = head; + } else { + buffer_per_ring[i] = current; + buffers_last[i] = head; + } + break; + } + } + temp = head->p_next_desc; + head->p_next_desc = NULL; + if (i == m_bond_rings.size()) { + //handle no owner + ring_logdbg("No matching ring %p to return buffer", current->p_desc_owner); + g_buffer_pool_tx->put_buffers_thread_safe(current); + ret += count; + } + + head = temp; + } + + return ret; +} + +void ring_bond::popup_active_rings() 
+{ + ring_slave *cur_slave = NULL; + int i, j; + + for (i = 0; i < (int)m_bond_rings.size(); i++) { + for (j = i + 1; j < (int)m_bond_rings.size(); j++) { + if (!m_bond_rings[i]->m_active && m_bond_rings[j]->m_active) { + cur_slave = m_bond_rings[i]; + m_bond_rings[i] = m_bond_rings[j]; + m_bond_rings[j] = cur_slave; + } + } + } +} + +void ring_bond::update_rx_channel_fds() +{ + if (m_p_n_rx_channel_fds) { + delete[] m_p_n_rx_channel_fds; + } + m_p_n_rx_channel_fds = new int[m_bond_rings.size()]; + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + m_p_n_rx_channel_fds[i] = m_bond_rings[i]->get_rx_channel_fds()[0]; + } +} + +bool ring_bond::is_active_member(ring_slave* rng, ring_user_id_t id) +{ + return (m_bond_rings[id] == rng && m_bond_rings[id]->m_active); +} + +bool ring_bond::is_member(ring_slave* rng) +{ + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + if (m_bond_rings[i]->is_member(rng)) { + return true; + } + } + return false; +} + +ring_user_id_t ring_bond::generate_id(const address_t src_mac, const address_t dst_mac, uint16_t eth_proto, uint16_t encap_proto, uint32_t src_ip, uint32_t dst_ip, uint16_t src_port, uint16_t dst_port) { + + if (m_type != net_device_val::LAG_8023ad) + return 0; + + ring_logdbg("generate_id for policy %d from src_mac=" ETH_HW_ADDR_PRINT_FMT ", dst_mac=" ETH_HW_ADDR_PRINT_FMT ", eth_proto=%#x, encap_proto=%#x, src_ip=%d.%d.%d.%d, dst_ip=%d.%d.%d.%d, src_port=%d, dst_port=%d", + m_xmit_hash_policy, ETH_HW_ADDR_PRINT_ADDR(src_mac), ETH_HW_ADDR_PRINT_ADDR(dst_mac), ntohs(eth_proto), ntohs(encap_proto), NIPQUAD(src_ip), NIPQUAD(dst_ip), ntohs(src_port), ntohs(dst_port)); + + uint32_t user_id = 0; + + if (m_xmit_hash_policy > net_device_val::XHP_LAYER_2_3 && eth_proto == htons(ETH_P_8021Q)) { + eth_proto = encap_proto; + } + + if (eth_proto != htons(ETH_P_IP)) { + user_id = dst_mac[5] ^ src_mac[5] ^ eth_proto; + return user_id % m_bond_rings.size(); + } + + switch (m_xmit_hash_policy) { + case(net_device_val::XHP_LAYER_2): 
+ user_id = dst_mac[5] ^ src_mac[5] ^ eth_proto; + break; + case(net_device_val::XHP_LAYER_2_3): + case(net_device_val::XHP_ENCAP_2_3): + user_id = dst_mac[5] ^ src_mac[5] ^ eth_proto; + user_id ^= dst_ip ^ src_ip; + user_id ^= (user_id >> 16); + user_id ^= (user_id >> 8); + break; + case(net_device_val::XHP_LAYER_3_4): + case(net_device_val::XHP_ENCAP_3_4): + user_id = src_port | (dst_port << 16); + user_id ^= dst_ip ^ src_ip; + user_id ^= (user_id >> 16); + user_id ^= (user_id >> 8); + break; + default: + return ring::generate_id(); + } + + return user_id % m_bond_rings.size(); +} + +int ring_bond::modify_ratelimit(struct vma_rate_limit_t &rate_limit) { + for (uint32_t i = 0; i < m_bond_rings.size(); i++) { + if (m_bond_rings[i]) { + m_bond_rings[i]->modify_ratelimit(rate_limit); + } + } + return 0; +} + +uint32_t ring_bond::get_max_inline_data() +{ + return m_max_inline_data; +} + +#ifdef DEFINED_TSO +uint32_t ring_bond::get_max_send_sge(void) +{ + return m_max_send_sge; +} + +uint32_t ring_bond::get_max_payload_sz(void) +{ + return 0; +} + +uint16_t ring_bond::get_max_header_sz(void) +{ + return 0; +} + +bool ring_bond::is_tso(void) +{ + return false; +} +#endif /* DEFINED_TSO */ + +int ring_bond::socketxtreme_poll(struct vma_completion_t *, unsigned int, int) +{ + return 0; +} + +void ring_bond::slave_destroy(int if_index) +{ + ring_slave *cur_slave = NULL; + ring_slave_vector_t::iterator iter; + + for (iter = m_bond_rings.begin(); iter != m_bond_rings.end(); iter++) { + cur_slave = *iter; + if (cur_slave->get_if_index() == if_index) { + delete cur_slave; + m_bond_rings.erase(iter); + update_rx_channel_fds(); + break; + } + } +} + +void ring_bond_eth::slave_create(int if_index) +{ + ring_slave *cur_slave = NULL; + + cur_slave = new ring_eth(if_index, this); + update_cap(cur_slave); + m_bond_rings.push_back(cur_slave); + + if (m_bond_rings.size() > MAX_NUM_RING_RESOURCES) { + ring_logpanic("Error creating bond ring with more than %d resource", 
MAX_NUM_RING_RESOURCES); + } + + popup_active_rings(); + update_rx_channel_fds(); +} + +void ring_bond_ib::slave_create(int if_index) +{ + ring_slave *cur_slave = NULL; + + cur_slave = new ring_ib(if_index, this); + update_cap(cur_slave); + m_bond_rings.push_back(cur_slave); + + if (m_bond_rings.size() > MAX_NUM_RING_RESOURCES) { + ring_logpanic("Error creating bond ring with more than %d resource", MAX_NUM_RING_RESOURCES); + } + + popup_active_rings(); + update_rx_channel_fds(); +} + +void ring_bond_netvsc::slave_create(int if_index) +{ + ring_slave *cur_slave = NULL; + net_device_val* p_ndev = NULL; + + p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + if (NULL == p_ndev) { + ring_logpanic("Error creating bond ring"); + } + + if (if_index == p_ndev->get_if_idx()) { + cur_slave = new ring_tap(if_index, this); + m_tap_ring = cur_slave; + } else { + cur_slave = new ring_eth(if_index, this); + m_vf_ring = cur_slave; + update_cap(cur_slave); + } + + m_bond_rings.push_back(cur_slave); + + if (m_bond_rings.size() > 2) { + ring_logpanic("Error creating bond ring with more than %d resource", 2); + } + + popup_active_rings(); + update_rx_channel_fds(); +} diff --git a/src/vma/dev/ring_bond.h b/src/vma/dev/ring_bond.h new file mode 100644 index 0000000..adfe556 --- /dev/null +++ b/src/vma/dev/ring_bond.h @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef RING_BOND_H +#define RING_BOND_H + +#include "ring.h" + +#include "vma/dev/ring_tap.h" +#include "vma/dev/net_device_table_mgr.h" + +typedef std::vector ring_slave_vector_t; + +struct flow_sink_t { + flow_tuple flow; + pkt_rcvr_sink *sink; +}; + +class ring_bond : public ring { + +public: + ring_bond(int if_index); + virtual ~ring_bond(); + + virtual void print_val(); + + virtual int request_notification(cq_type_t cq_type, uint64_t poll_sn); + virtual int poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL); + virtual void adapt_cq_moderation(); + virtual bool reclaim_recv_buffers(descq_t *rx_reuse); + virtual bool reclaim_recv_buffers(mem_buf_desc_t* rx_reuse_lst); + virtual int drain_and_proccess(); + virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL); + virtual int get_num_resources() const { return m_bond_rings.size(); }; + virtual bool attach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink); + virtual bool detach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink); + virtual void restart(); + virtual mem_buf_desc_t* mem_buf_tx_get(ring_user_id_t id, bool b_block, int n_num_mem_bufs = 1); + virtual int mem_buf_tx_release(mem_buf_desc_t* p_mem_buf_desc_list, bool b_accounting, bool trylock = false); + virtual void inc_tx_retransmissions_stats(ring_user_id_t id); + virtual void send_ring_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr); + virtual void send_lwip_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr); + virtual void mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t* p_mem_buf_desc); + virtual bool is_member(ring_slave* rng); + virtual bool is_active_member(ring_slave* rng, ring_user_id_t id); + virtual ring_user_id_t generate_id(const address_t src_mac, const address_t dst_mac, uint16_t eth_proto, uint16_t encap_proto, uint32_t src_ip, uint32_t dst_ip, 
uint16_t src_port, uint16_t dst_port); + virtual bool get_hw_dummy_send_support(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe); + virtual int modify_ratelimit(struct vma_rate_limit_t &rate_limit); + virtual uint32_t get_max_inline_data(); +#ifdef DEFINED_TSO + virtual uint32_t get_max_send_sge(void); + virtual uint32_t get_max_payload_sz(void); + virtual uint16_t get_max_header_sz(void); + virtual uint32_t get_tx_lkey(ring_user_id_t id) { return m_bond_rings[id]->get_tx_lkey(id); } + virtual bool is_tso(void); +#endif /* DEFINED_TSO */ + int socketxtreme_poll(struct vma_completion_t *vma_completions, unsigned int ncompletions, int flags); + virtual void slave_create(int if_index) = 0; + virtual void slave_destroy(int if_index); +protected: + void update_cap(ring_slave *slave = NULL); + void update_rx_channel_fds(); + void popup_active_rings(); + + ring_slave_vector_t m_bond_rings; + std::vector m_rx_flows; + uint32_t m_max_inline_data; +#ifdef DEFINED_TSO + uint32_t m_max_send_sge; +#endif /* DEFINED_TSO */ + +private: + void devide_buffers_helper(descq_t *rx_reuse, descq_t *buffer_per_ring); + int devide_buffers_helper(mem_buf_desc_t *p_mem_buf_desc_list, mem_buf_desc_t** buffer_per_ring); + + bool is_socketxtreme(void) { return false; } + void put_ec(struct ring_ec *ec) { NOT_IN_USE(ec); } + void del_ec(struct ring_ec *ec) { NOT_IN_USE(ec); } + struct vma_completion_t *get_comp(void) { return NULL; } + + net_device_val::bond_type m_type; + net_device_val::bond_xmit_hash_policy m_xmit_hash_policy; + lock_mutex_recursive m_lock_ring_rx; + lock_mutex_recursive m_lock_ring_tx; +}; + +class ring_bond_eth : public ring_bond +{ +public: + ring_bond_eth(int if_index): + ring_bond(if_index) { + net_device_val* p_ndev = + g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + if (p_ndev) { + const slave_data_vector_t& slaves = p_ndev->get_slave_array(); + update_cap(); + for (size_t i = 0; i < slaves.size(); i++) { + slave_create(slaves[i]->if_index); + 
} + } + } + +protected: + virtual void slave_create(int if_index); +}; + +class ring_bond_ib : public ring_bond +{ +public: + ring_bond_ib(int if_index): + ring_bond(if_index) { + net_device_val* p_ndev = + g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + if (p_ndev) { + const slave_data_vector_t& slaves = p_ndev->get_slave_array(); + update_cap(); + for (size_t i = 0; i < slaves.size(); i++) { + slave_create(slaves[i]->if_index); + } + } + } + +protected: + virtual void slave_create(int if_index); +}; + +class ring_bond_netvsc : public ring_bond +{ +public: + ring_bond_netvsc(int if_index): + ring_bond(if_index) { + net_device_val* p_ndev = + g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + + m_vf_ring = NULL; + m_tap_ring = NULL; + if (p_ndev) { + const slave_data_vector_t& slaves = p_ndev->get_slave_array(); + update_cap(); + slave_create(p_ndev->get_if_idx()); + for (size_t i = 0; i < slaves.size(); i++) { + slave_create(slaves[i]->if_index); + } + + if (m_tap_ring && m_vf_ring) { + ring_tap* p_ring_tap = dynamic_cast(m_tap_ring); + if (p_ring_tap) { + p_ring_tap->set_vf_ring(m_vf_ring); + } + } + } + } + +protected: + virtual void slave_create(int if_index); + +public: + ring_slave* m_vf_ring; + ring_slave* m_tap_ring; +}; + +#endif /* RING_BOND_H */ diff --git a/src/vma/dev/ring_eth_cb.cpp b/src/vma/dev/ring_eth_cb.cpp new file mode 100644 index 0000000..4d02521 --- /dev/null +++ b/src/vma/dev/ring_eth_cb.cpp @@ -0,0 +1,716 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include + +#undef MODULE_NAME +#define MODULE_NAME "ring_eth_cb" +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + + +#ifdef HAVE_MP_RQ + +#define DUMP_LKEY (0x700) +#define VMA_MP_MIN_LOG_STRIDES (10) +#define MAX_MP_WQES (20) // limit max used memory +#define MIN_MP_WQES (4) + +ring_eth_cb::ring_eth_cb(int if_index, vma_cyclic_buffer_ring_attr *cb_ring, + iovec *mem_desc, ring *parent): + ring_eth(if_index, parent, RING_ETH_CB, false) + ,m_curr_wqe_used_strides(0) + ,m_curr_packets(0) + ,m_padd_mode_used_strides(0) + ,m_all_wqes_used_strides(0) + ,m_packet_receive_mode(cb_ring->packet_receive_mode) + ,m_curr_wq(0) + ,m_curr_payload_addr(NULL) + ,m_curr_hdr_ptr(NULL) + ,m_res_domain(NULL) + ,m_external_mem(cb_ring->comp_mask & VMA_CB_EXTERNAL_MEM) + +{ + struct ibv_exp_res_domain_init_attr res_domain_attr; + + // check MP capabilities currently all caps are 0 due to a bug + vma_ibv_device_attr* r_ibv_dev_attr = m_p_ib_ctx->get_ibv_device_attr(); + + memset(&m_umr_wr, 0, sizeof(m_umr_wr)); + memset(m_sge_ptrs, 0, sizeof(m_sge_ptrs)); + m_p_umr_mr = NULL; + m_hdr_len = 0; + + if (!r_ibv_dev_attr->max_ctx_res_domain) { + ring_logdbg("device doesn't support resource domain"); + throw_vma_exception("device doesn't support resource domain"); + } + + struct ibv_exp_mp_rq_caps *mp_rq_caps = &r_ibv_dev_attr->mp_rq_caps; + if (!(mp_rq_caps->supported_qps & IBV_EXP_QPT_RAW_PACKET)) { + ring_logdbg("mp_rq is not supported"); + throw_vma_exception("device doesn't support RC QP"); + } + + res_domain_attr.comp_mask = IBV_EXP_RES_DOMAIN_THREAD_MODEL | + IBV_EXP_RES_DOMAIN_MSG_MODEL; + + // driver is in charge of locks + res_domain_attr.thread_model = IBV_EXP_THREAD_SAFE; + + // currently have no effect + res_domain_attr.msg_model = IBV_EXP_MSG_HIGH_BW; + + m_res_domain = ibv_exp_create_res_domain(m_p_ib_ctx->get_ibv_context(), + &res_domain_attr); + if (!m_res_domain) { + ring_logdbg("could not create resource domain"); + 
throw_vma_exception("failed creating resource domain"); + } + // stride size is headers + user payload aligned to power of 2 + uint16_t net_len = 0; + if (m_partition) { + net_len = ETH_VLAN_HDR_LEN + sizeof(struct iphdr) + sizeof(struct udphdr); + } else { + net_len = ETH_HDR_LEN + sizeof(struct iphdr) + sizeof(struct udphdr); + } + m_single_stride_log_num_of_bytes = ilog_2(align32pow2( + cb_ring->stride_bytes + cb_ring->hdr_bytes + net_len)); + if (m_single_stride_log_num_of_bytes < mp_rq_caps->min_single_stride_log_num_of_bytes) { + m_single_stride_log_num_of_bytes = mp_rq_caps->min_single_stride_log_num_of_bytes; + } + if (m_single_stride_log_num_of_bytes > mp_rq_caps->max_single_stride_log_num_of_bytes) { + m_single_stride_log_num_of_bytes = mp_rq_caps->max_single_stride_log_num_of_bytes; + } + m_stride_size = 1 << m_single_stride_log_num_of_bytes; + uint32_t max_wqe_size = 1 << mp_rq_caps->max_single_wqe_log_num_of_strides; + uint32_t user_req_wq = cb_ring->num / max_wqe_size; + if (user_req_wq > MIN_MP_WQES) { + m_wq_count = std::min(user_req_wq, MAX_MP_WQES); + m_single_wqe_log_num_of_strides = mp_rq_caps->max_single_wqe_log_num_of_strides; + } else { + m_wq_count = MIN_MP_WQES; + m_single_wqe_log_num_of_strides = ilog_2(align32pow2(cb_ring->num) / m_wq_count); + if (m_single_wqe_log_num_of_strides < VMA_MP_MIN_LOG_STRIDES) { + m_single_wqe_log_num_of_strides = VMA_MP_MIN_LOG_STRIDES; + } + if (m_single_wqe_log_num_of_strides > mp_rq_caps->max_single_wqe_log_num_of_strides) { + m_single_wqe_log_num_of_strides = mp_rq_caps->max_single_wqe_log_num_of_strides; + } + } + m_strides_num = 1 << m_single_wqe_log_num_of_strides; + ring_logdbg("using strides_num %d stride size %d, wqe_count %d stride_bytes " + "%d, hdr_bytes %d num %d rec mode %d", m_strides_num, m_stride_size, + m_wq_count, cb_ring->stride_bytes, cb_ring->hdr_bytes, cb_ring->num, + m_packet_receive_mode); + + memset(&m_curr_hw_timestamp, 0, sizeof(m_curr_hw_timestamp)); + if (m_packet_receive_mode 
== PADDED_PACKET) { + size_t buffer_size = m_stride_size * m_strides_num * m_wq_count; + m_sge_ptrs[CB_UMR_PAYLOAD] = (uint64_t)allocate_memory(mem_desc, buffer_size); + if (unlikely(!m_sge_ptrs[CB_UMR_PAYLOAD])) { + throw_vma_exception("user provided to small memory"); + } + m_buff_data.addr = m_sge_ptrs[CB_UMR_PAYLOAD]; + m_buff_data.length = m_stride_size * m_strides_num; + m_buff_data.lkey = get_mem_lkey(m_p_ib_ctx); + m_packet_size = cb_ring->stride_bytes + net_len; + m_payload_len = m_stride_size; + if (unlikely(m_buff_data.lkey == (uint32_t)(-1))) { + ring_logerr("got invalid lkey for memory %p size %zd", + mem_desc->iov_base, mem_desc->iov_len); + throw_vma_exception("failed retrieving lkey"); + } + ring_logdbg("using buffer size %zd", buffer_size); + } else if (allocate_umr_mem(cb_ring, mem_desc, net_len)) { + ring_logerr("failed creating UMR QP"); + throw_vma_exception("failed creating UMR QP"); + } + + /* Complete resources initialization */ + ring_simple::create_resources(); +} + +void* ring_eth_cb::allocate_memory(iovec *mem_desc, size_t buffer_size) +{ + if (mem_desc && mem_desc->iov_len) { + if (unlikely(mem_desc->iov_len < buffer_size)) { + ring_logerr("user provided to small memory " + "expected %zd but got %zd", + buffer_size, mem_desc->iov_len); + errno = EINVAL; + return NULL; + } + return m_alloc.alloc_and_reg_mr(mem_desc->iov_len, m_p_ib_ctx, + mem_desc->iov_base); + } else { + return m_alloc.alloc_and_reg_mr(buffer_size, m_p_ib_ctx); + } +} + +qp_mgr* ring_eth_cb::create_qp_mgr(const ib_ctx_handler *ib_ctx, + uint8_t port_num, + struct ibv_comp_channel *p_rx_comp_event_channel) +{ + return new qp_mgr_mp(this, ib_ctx, port_num, p_rx_comp_event_channel, + get_tx_num_wr(), m_partition, m_buff_data, + m_external_mem); +} + +int ring_eth_cb::get_mem_info(ibv_sge &mem_info) +{ + if (!m_buff_data.addr) { + ring_logwarn("no valid memory to return"); + return -1; + } + mem_info.addr = m_buff_data.addr; + mem_info.length = m_buff_data.length; + 
mem_info.lkey = m_buff_data.lkey; + ring_logdbg("returning ptr %p, legnth %zd, lkey %u", mem_info.addr, + mem_info.length, mem_info.lkey); + return 0; +} + +/** + * allocate and set UMR addresses + * @return 0 on success -1 on failure + * @note when using UMR memory appears in VMA as follows + * +----------------------------+ + * | WQE0 network headers | + * | WQE1 network headers | + * | ... | + * | WQE0 user headers | + * | WQE1 user headers | + * | ... | + * | WQE0 payload | + * | WQE1 payload | + * | ... | + * | WQE0 padding | + * | WQE1 padding | + * | ... | + * +----------------------------+ + */ +int ring_eth_cb::allocate_umr_mem(vma_cyclic_buffer_ring_attr *cb_ring, + iovec *mem_desc, + uint16_t net_len) +{ + ibv_exp_create_mr_in mrin; + ibv_exp_mem_repeat_block* p_mem_rep_list = NULL; + ibv_mr* mr = NULL, *dump_mr; + size_t curr_data_len = 0, packet_len, pad_len, buffer_size; + size_t packets_num = m_strides_num * m_wq_count; + uint64_t base_ptr, prev_addr, pad_addr; + int index = 0, count = 1, umr_blocks; + const int ndim = 1; // we only use one dimension see UMR docs + int retval = 0; + + // the min mr is two one for padding and one for data + umr_blocks = 2; + if ((cb_ring->comp_mask & VMA_CB_HDR_BYTE) && cb_ring->hdr_bytes && + m_packet_receive_mode == RAW_PACKET) { + ring_logwarn("bad parameters!, you cannot choose " + "RAW_PACKET and define user header " + "the header\n"); + return -1; + } + + if (m_packet_receive_mode != RAW_PACKET) { + umr_blocks++; // add user_hd\netwrok_hdr + if ((cb_ring->comp_mask & VMA_CB_HDR_BYTE) && + cb_ring->hdr_bytes && + m_packet_receive_mode == STRIP_NETWORK_HDRS) { + umr_blocks++; // strip network hdr + } + } + + p_mem_rep_list = new(std::nothrow) ibv_exp_mem_repeat_block[umr_blocks](); + if (p_mem_rep_list == NULL) { + ring_logwarn("failed allocating memory"); + errno = ENOMEM; + return -1; + } + for (int i = 0; i < umr_blocks; i++) { + p_mem_rep_list[i].byte_count = new(std::nothrow) size_t[ndim]; + 
p_mem_rep_list[i].stride = new(std::nothrow) size_t[ndim]; + if (p_mem_rep_list[i].byte_count == NULL || + p_mem_rep_list[i].stride == NULL) { + ring_logwarn("failed allocating memory"); + errno = ENOMEM; + retval = -1; + goto cleanup; + } + } + + m_payload_len = cb_ring->stride_bytes; + m_hdr_len = cb_ring->hdr_bytes; + m_packet_size = m_payload_len + m_hdr_len + net_len; + + // in case stride smaller then packet size + while ((m_stride_size * count) <= m_packet_size) { + ++count; + } + // no need to allocate padding + pad_len = (m_stride_size * count) - m_packet_size; + // allocate buffer + if (m_packet_receive_mode == STRIP_NETWORK_HDRS) { + buffer_size = (m_packet_size - net_len) * packets_num; + } else { + buffer_size = m_packet_size * packets_num; + } + // will raise an exception on failure + base_ptr = (uint64_t)allocate_memory(mem_desc, buffer_size); + if (unlikely(!base_ptr)) { + goto cleanup; + } + ring_logdbg("using buffer parameters, buffer_size %zd " + "pad len %d packet size %d stride size %d", + buffer_size, pad_len, m_packet_size, m_stride_size); + prev_addr = base_ptr; + mr = m_alloc.find_ibv_mr_by_ib_ctx(m_p_ib_ctx); + // redmine.mellanox.com/issues/1379468 + pad_addr = (uint64_t)m_dump_mr.alloc_and_reg_mr(128, m_p_ib_ctx); + dump_mr = m_dump_mr.find_ibv_mr_by_ib_ctx(m_p_ib_ctx); + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely(mr == NULL || dump_mr == NULL)) { + ring_logerr("could not find mr %p, dump mr %p", mr, dump_mr); + retval = -1; + goto cleanup; + } + BULLSEYE_EXCLUDE_BLOCK_END + // no problem overriding lkey since deregmr is not using it + dump_mr->lkey = DUMP_LKEY; + packet_len = net_len; + switch (m_packet_receive_mode) { + case RAW_PACKET: + packet_len += m_payload_len; + // for size calculation in read_cyclic + m_payload_len = packet_len; + m_sge_ptrs[CB_UMR_PAYLOAD] = base_ptr; + p_mem_rep_list[index].base_addr = base_ptr; + p_mem_rep_list[index].byte_count[0] = packet_len; + p_mem_rep_list[index].stride[0] = packet_len; + 
p_mem_rep_list[index].mr = mr; + index++; + break; + case STRIP_NETWORK_HDRS: + // network not accessible to application + p_mem_rep_list[index].base_addr = pad_addr; + p_mem_rep_list[index].byte_count[0] = net_len; + // optimize write header to the same physical address + p_mem_rep_list[index].stride[0] = 0; + p_mem_rep_list[index].mr = dump_mr; + index++; + if (m_hdr_len) { + p_mem_rep_list[index].base_addr = base_ptr; + p_mem_rep_list[index].byte_count[0] = m_hdr_len; + p_mem_rep_list[index].stride[0] = m_hdr_len; + p_mem_rep_list[index].mr = mr; + m_sge_ptrs[CB_UMR_HDR] = base_ptr; + curr_data_len = packets_num * m_hdr_len; + prev_addr += curr_data_len; + index++; + } + p_mem_rep_list[index].base_addr = prev_addr; + p_mem_rep_list[index].byte_count[0] = m_payload_len; + p_mem_rep_list[index].stride[0] = m_payload_len; + p_mem_rep_list[index].mr = mr; + m_sge_ptrs[CB_UMR_PAYLOAD] = prev_addr; + index++; + break; + case SEPERATE_NETWORK_HDRS: + if (m_hdr_len) { + packet_len += m_hdr_len; + // for size calculation in read_cyclic + m_hdr_len = packet_len; + } else { + m_hdr_len = net_len; + } + p_mem_rep_list[index].base_addr = base_ptr; + p_mem_rep_list[index].byte_count[0] = packet_len; + p_mem_rep_list[index].stride[0] = packet_len; + p_mem_rep_list[index].mr = mr; + m_sge_ptrs[CB_UMR_HDR] = base_ptr; + curr_data_len = packets_num * packet_len; + prev_addr += curr_data_len; + index++; + p_mem_rep_list[index].base_addr = prev_addr; + p_mem_rep_list[index].byte_count[0] = m_payload_len; + p_mem_rep_list[index].stride[0] = m_payload_len; + p_mem_rep_list[index].mr = mr; + m_sge_ptrs[CB_UMR_PAYLOAD] = prev_addr; + index++; + break; + default: + ring_logpanic("bad packet_receive_mode\n"); + } + // use base_ptr as base_addr to corrupt user data and prevent stack + // corruption in case of unexpected big packet + p_mem_rep_list[index].base_addr = pad_addr; + p_mem_rep_list[index].byte_count[0] = pad_len; + p_mem_rep_list[index].stride[0] = 0; + p_mem_rep_list[index].mr 
= dump_mr; + + // allocate empty lkey + memset(&mrin, 0, sizeof(mrin)); + mrin.pd = m_p_ib_ctx->get_ibv_pd(); + mrin.attr.create_flags = IBV_EXP_MR_INDIRECT_KLMS; + mrin.attr.exp_access_flags = IBV_EXP_ACCESS_LOCAL_WRITE; + mrin.attr.max_klm_list_size = umr_blocks; + m_p_umr_mr = ibv_exp_create_mr(&mrin); + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_umr_mr) { + ring_logdbg("Failed creating mr %m", errno); + retval = -1; + goto cleanup; + } + BULLSEYE_EXCLUDE_BLOCK_END + memset(&m_umr_wr, 0, sizeof(m_umr_wr)); + m_umr_wr.ext_op.umr.umr_type = IBV_EXP_UMR_REPEAT; + m_umr_wr.ext_op.umr.mem_list.rb.mem_repeat_block_list = p_mem_rep_list; + m_umr_wr.ext_op.umr.mem_list.rb.stride_dim = ndim; + m_umr_wr.ext_op.umr.mem_list.rb.repeat_count = &packets_num; + m_umr_wr.exp_send_flags = IBV_EXP_SEND_INLINE; + m_umr_wr.ext_op.umr.exp_access = IBV_EXP_ACCESS_LOCAL_WRITE; + m_umr_wr.ext_op.umr.modified_mr = m_p_umr_mr; + m_umr_wr.ext_op.umr.base_addr = (uint64_t)mr->addr; + m_umr_wr.ext_op.umr.num_mrs = umr_blocks; + m_umr_wr.exp_send_flags |= IBV_EXP_SEND_SIGNALED; + m_umr_wr.exp_opcode = IBV_EXP_WR_UMR_FILL; + + if (!m_p_ib_ctx->post_umr_wr(m_umr_wr)) { + ring_logerr("Failed in ibv_exp_post_send IBV_EXP_WR_UMR_FILL\n"); + // prevent removal + m_umr_wr.exp_opcode = IBV_EXP_WR_NOP; + retval = -1; + goto cleanup; + } + + m_buff_data.addr = m_umr_wr.ext_op.umr.base_addr; + m_buff_data.length = m_stride_size * m_strides_num; + m_buff_data.lkey = m_p_umr_mr->lkey; +cleanup: + for (int i = 0; i < umr_blocks; i++) { + if (p_mem_rep_list[i].stride) { + delete[] p_mem_rep_list[i].stride; + p_mem_rep_list[i].stride = NULL; + } + if (p_mem_rep_list[i].byte_count) { + delete[] p_mem_rep_list[i].byte_count; + p_mem_rep_list[i].byte_count = NULL; + } + } + + delete[] p_mem_rep_list; + p_mem_rep_list = NULL; + + if (retval == -1) { + remove_umr_res(); + } + return retval; +} + +void ring_eth_cb::remove_umr_res() +{ + if (m_umr_wr.exp_opcode == IBV_EXP_WR_UMR_FILL) { + m_umr_wr.exp_opcode = 
IBV_EXP_WR_UMR_INVALIDATE; + if (m_p_ib_ctx->post_umr_wr(m_umr_wr)) { + ring_logdbg("Releasing UMR failed\n"); + } + } + + if (m_p_umr_mr) { + ibv_dereg_mr(m_p_umr_mr); + m_p_umr_mr = NULL; + } + ring_logdbg("UMR resources removed\n"); +} + +int ring_eth_cb::drain_and_proccess() +{ + return 0; +} + +int ring_eth_cb::poll_and_process_element_rx(uint64_t* p_cq_poll_sn, + void* pv_fd_ready_array) +{ + NOT_IN_USE(p_cq_poll_sn); + NOT_IN_USE(pv_fd_ready_array); + return 0; +} + +/** + * loop poll_cq + * @param limit + * @return TBD about -1 on error, + * 0 if cq is empty + * 1 if done looping + * 2 if need to return due to WQ or filler + */ +inline mp_loop_result ring_eth_cb::mp_loop_padded(size_t limit) +{ + struct mlx5_cqe64 *cqe64; + uint16_t size = 0; + uint32_t flags = 0, used_strides = 0; + + while (m_curr_packets < limit) { + int ret = ((cq_mgr_mp *)m_p_cq_mgr_rx)->poll_mp_cq(size, used_strides, + flags, cqe64); + if (size == 0) { + ring_logfine("no packet found"); + return MP_LOOP_DRAINED; + } + if (unlikely(ret == -1)) { + ring_logdbg("poll_mp_cq failed with errno %m", errno); + return MP_LOOP_RETURN_TO_APP; + } + m_curr_wqe_used_strides += used_strides; + if (unlikely(flags & VMA_MP_RQ_BAD_PACKET)) { + if (m_curr_wqe_used_strides >= m_strides_num) { + reload_wq(); + } + return MP_LOOP_RETURN_TO_APP; + } + m_padd_mode_used_strides += used_strides; + m_p_ring_stat->n_rx_pkt_count++; + m_p_ring_stat->n_rx_byte_count += size; + ++m_curr_packets; + if (unlikely(m_curr_wqe_used_strides >= m_strides_num)) { + if (reload_wq()) { + return MP_LOOP_RETURN_TO_APP; + } + } + } + ring_logfine("mp_loop finished all iterations"); + return MP_LOOP_LIMIT; +} + +/** + * loop poll_cq + * @param limit + * @return TBD about -1 on error, + * 0 if cq is empty + * 1 if done looping + * 2 if need to return due to WQ or filler + */ +inline mp_loop_result ring_eth_cb::mp_loop(size_t limit) +{ + struct mlx5_cqe64 *cqe64; + uint16_t size = 0; + uint32_t flags = 0, used_strides = 0; + + 
while (m_curr_packets < limit) { + int ret = ((cq_mgr_mp *)m_p_cq_mgr_rx)->poll_mp_cq(size, used_strides, + flags, cqe64); + if (size == 0) { + ring_logfine("no packet found"); + return MP_LOOP_DRAINED; + } + if (unlikely(ret == -1)) { + ring_logdbg("poll_mp_cq failed with errno %m", errno); + return MP_LOOP_RETURN_TO_APP; + } + m_curr_wqe_used_strides += used_strides; + if (unlikely(size > m_packet_size)) { + errno = EMSGSIZE; + ring_logerr("got unexpected packet size, expected " + "packet size %u but got %d, user data is " + "corrupted", m_packet_size, size); + return MP_LOOP_RETURN_TO_APP; + } + if (unlikely(flags & VMA_MP_RQ_BAD_PACKET)) { + if (m_curr_wqe_used_strides >= m_strides_num) { + reload_wq(); + } + return MP_LOOP_RETURN_TO_APP; + } + m_p_ring_stat->n_rx_pkt_count++; + m_p_ring_stat->n_rx_byte_count += size; + ++m_curr_packets; + if (unlikely(m_curr_wqe_used_strides >= m_strides_num)) { + if (reload_wq()) { + return MP_LOOP_RETURN_TO_APP; + } + } + } + ring_logfine("mp_loop finished all iterations"); + return MP_LOOP_LIMIT; +} + +/* + * all WQE are contagious in memory so we need to return to the user + * true if last WQE was posted so we're at the end of the buffer + * + */ +inline bool ring_eth_cb::reload_wq() +{ + // in current implementation after each WQe is used by the HW + // the ring reloads it to the HW again that why 1 is used + ((cq_mgr_mp *)m_p_cq_mgr_rx)->update_dbell(); + ((qp_mgr_mp *)m_p_qp_mgr)->post_recv(m_curr_wq, 1); + m_curr_wq = (m_curr_wq + 1) % m_wq_count; + m_curr_wqe_used_strides = 0; + if (m_curr_wq == 0) { + m_all_wqes_used_strides = 0; + return true; + } + m_all_wqes_used_strides += m_strides_num; + return false; +} + +int ring_eth_cb::cyclic_buffer_read(vma_completion_cb_t &completion, + size_t min, size_t max, int flags) +{ + uint32_t poll_flags = 0, used_strides = 0; + uint16_t size; + struct mlx5_cqe64 *cqe64; + + // sanity check + if (unlikely(min > max || max == 0 || flags != MSG_DONTWAIT)) { + errno = EINVAL; + 
ring_logdbg("Illegal values, got min: %d, max: %d, flags %d", + min, max, flags); + if (flags != MSG_DONTWAIT) { + ring_logdbg("only %d flag is currently supported", + MSG_DONTWAIT); + } + return -1; + } + int prev_used_strides = m_curr_wqe_used_strides; + int ret = ((cq_mgr_mp *)m_p_cq_mgr_rx)->poll_mp_cq(size, used_strides, + poll_flags, cqe64); + // empty + if (size == 0) { + return 0; + } + + if (m_packet_receive_mode != PADDED_PACKET && + unlikely(size > m_packet_size)) { + errno = EMSGSIZE; + ring_logerr("got unexpected packet size, expected " + "packet size %u but got %d, user data is " + "corrupted", m_packet_size, size); + return -1; + } + if (unlikely(ret == -1)) { + ring_logdbg("poll_mp_cq failed with errno %m", errno); + return -1; + } + m_curr_wqe_used_strides += used_strides; + m_padd_mode_used_strides += used_strides; + // set it here because we might not have min packets avail in this run + if (likely(!(poll_flags & VMA_MP_RQ_BAD_PACKET))) { + m_p_ring_stat->n_rx_pkt_count++; + m_p_ring_stat->n_rx_byte_count += size; + if (unlikely(m_curr_payload_addr == NULL)) { + // data is in calculated UMR location array + + // number of strides in old WQEs (e.g. 
first WQE that was already consumed) + + // number of used strides in current WQE + prev_used_strides += m_all_wqes_used_strides; + m_curr_payload_addr = (void *)(m_sge_ptrs[CB_UMR_PAYLOAD] + + (uint32_t)m_payload_len * prev_used_strides); + m_curr_hdr_ptr = (void *)(m_sge_ptrs[CB_UMR_HDR] + + (uint32_t)m_hdr_len * prev_used_strides); + if (completion.comp_mask & VMA_CB_MASK_TIMESTAMP) { + convert_hw_time_to_system_time(ntohll(cqe64->timestamp), + &m_curr_hw_timestamp); + } + m_curr_packets = 1; + } else { + m_curr_packets++; + } + bool return_to_app = false; + if (unlikely(m_curr_wqe_used_strides >= m_strides_num)) { + return_to_app = reload_wq(); + } + if (!return_to_app) { + if (m_packet_receive_mode == PADDED_PACKET) { + ret = mp_loop_padded(min); + if (ret == MP_LOOP_LIMIT) { // there might be more to drain + mp_loop_padded(max); + } + } else { + ret = mp_loop(min); + if (ret == MP_LOOP_LIMIT) { // there might be more to drain + mp_loop(max); + } + } + if (ret == MP_LOOP_DRAINED) { // no packets left + ((cq_mgr_mp *)m_p_cq_mgr_rx)->update_max_drain(m_curr_packets); + return 0; + } + } + } + ((cq_mgr_mp *)m_p_cq_mgr_rx)->update_max_drain(m_curr_packets); + completion.payload_ptr = m_curr_payload_addr; + if (m_packet_receive_mode == PADDED_PACKET) { + // support packet taking more then one stride + completion.payload_length = m_padd_mode_used_strides * m_stride_size; + } else { + completion.payload_length = m_payload_len * m_curr_packets; + } + completion.packets = m_curr_packets; + completion.usr_hdr_ptr = m_curr_hdr_ptr; + completion.usr_hdr_ptr_length = m_hdr_len * m_curr_packets; + // hw_timestamp of first packet in batch + completion.hw_timestamp = m_curr_hw_timestamp; + m_curr_payload_addr = 0; + m_padd_mode_used_strides = 0; + ring_logdbg("Returning completion, buffer ptr %p, data size %zd, " + "usr hdr ptr %p usr hdr size %zd, number of packets %zd curr wqe idx %d", + completion.payload_ptr, completion.payload_length, + completion.usr_hdr_ptr, 
completion.usr_hdr_ptr_length, + m_curr_packets, m_curr_wq); + return 0; +} + +ring_eth_cb::~ring_eth_cb() +{ + struct ibv_exp_destroy_res_domain_attr attr; + + m_lock_ring_rx.lock(); + flow_udp_del_all(); + flow_tcp_del_all(); + m_lock_ring_rx.unlock(); + + memset(&attr, 0, sizeof(attr)); + int res = ibv_exp_destroy_res_domain(m_p_ib_ctx->get_ibv_context(), + m_res_domain, &attr); + if (res) { + ring_logdbg("call to ibv_exp_destroy_res_domain returned %d", res); + } + + remove_umr_res(); +} +#endif /* HAVE_MP_RQ */ + diff --git a/src/vma/dev/ring_eth_cb.h b/src/vma/dev/ring_eth_cb.h new file mode 100644 index 0000000..8601c48 --- /dev/null +++ b/src/vma/dev/ring_eth_cb.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRC_VMA_DEV_RING_ETH_CB_H_ +#define SRC_VMA_DEV_RING_ETH_CB_H_ + +#include +#include + +#ifdef HAVE_MP_RQ + +#define VMA_MP_RQ_BAD_PACKET (1 << 31) // last bit + +enum mp_loop_result { + MP_LOOP_DRAINED, + MP_LOOP_LIMIT, + MP_LOOP_RETURN_TO_APP, +}; + +enum RING_CB_UMR_ALLOC_IDX { + CB_UMR_HDR = 0, + CB_UMR_PAYLOAD, + CB_UMR_LAST +}; + +class cq_mgr_mp; + +class ring_eth_cb : public ring_eth +{ +public: + ring_eth_cb(int if_index, + vma_cyclic_buffer_ring_attr *mp_ring, iovec *mem_sec = NULL, + ring *parent = NULL); + virtual ~ring_eth_cb(); + ibv_exp_res_domain* get_res_domain() const {return m_res_domain;}; + uint32_t get_wq_count() const {return m_wq_count;}; + uint8_t get_single_wqe_log_num_of_strides() const {return m_single_wqe_log_num_of_strides;}; + uint32_t get_strides_num() const {return m_strides_num;}; + uint8_t get_single_stride_log_num_of_bytes() const {return m_single_stride_log_num_of_bytes;}; + uint32_t get_stride_size() const {return m_stride_size;}; + uint32_t get_mem_lkey(ib_ctx_handler* ib_ctx) const {return m_alloc.find_lkey_by_ib_ctx(ib_ctx);} + virtual int drain_and_proccess(); + virtual int poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL); + int get_mem_info(ibv_sge &mem_info); + int cyclic_buffer_read(vma_completion_cb_t &completion, + size_t min, size_t max, int flags); + void* allocate_memory(iovec *mem_desc, size_t buffer_size); +protected: + virtual qp_mgr* create_qp_mgr(const ib_ctx_handler* ib_ctx, + uint8_t port_num, + struct ibv_comp_channel* p_rx_comp_event_channel); +private: + uint32_t m_curr_wqe_used_strides; + size_t m_curr_packets; + uint32_t m_padd_mode_used_strides; + uint16_t m_packet_size; + // These 
members are used to store intermediate results before + // returning from the user's call to get the data. + uint32_t m_strides_num; + uint32_t m_stride_size; + uint32_t m_all_wqes_used_strides; + uint8_t m_single_wqe_log_num_of_strides; + uint8_t m_single_stride_log_num_of_bytes; + vma_cb_packet_rec_mode m_packet_receive_mode; + uint16_t m_wq_count; + uint16_t m_curr_wq; + void* m_curr_payload_addr; + void* m_curr_hdr_ptr; + uint64_t m_sge_ptrs[CB_UMR_LAST]; + uint16_t m_hdr_len; // calculate user header offset in buffer + uint16_t m_payload_len; // calculate payload offset in buffer + ibv_sge m_buff_data; + struct timespec m_curr_hw_timestamp; + vma_allocator m_alloc; + vma_allocator m_dump_mr; + struct ibv_exp_send_wr m_umr_wr; + struct ibv_exp_res_domain* m_res_domain; + struct ibv_mr* m_p_umr_mr; + bool m_external_mem; + inline mp_loop_result mp_loop(size_t limit); + inline mp_loop_result mp_loop_padded(size_t limit); + inline bool reload_wq(); + int allocate_umr_mem(vma_cyclic_buffer_ring_attr *cb_ring, + iovec *mem_desc, uint16_t net_len); + void remove_umr_res(); +}; + +#endif /* HAVE_MP_RQ */ +#endif /* SRC_VMA_DEV_RING_ETH_CB_H_ */ diff --git a/src/vma/dev/ring_eth_direct.cpp b/src/vma/dev/ring_eth_direct.cpp new file mode 100644 index 0000000..de32ba8 --- /dev/null +++ b/src/vma/dev/ring_eth_direct.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ring_eth_direct.h" +#include "qp_mgr_eth_direct.h" + + +#undef MODULE_NAME +#define MODULE_NAME "ring_direct" +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + + +ring_eth_direct::ring_eth_direct(int if_index, + vma_external_mem_attr *ext_ring_attr, ring *parent): + ring_eth(if_index, + parent, RING_ETH_DIRECT, false) +{ + m_ring_attr.comp_mask = ext_ring_attr->comp_mask; + + /* Complete resources initialization */ + ring_simple::create_resources(); +} + +qp_mgr* ring_eth_direct::create_qp_mgr(const ib_ctx_handler* ib_ctx, + uint8_t port_num, + struct ibv_comp_channel* p_rx_comp_event_channel) +{ +#if defined(DEFINED_DIRECT_VERBS) + return new qp_mgr_eth_direct(this, ib_ctx, port_num, p_rx_comp_event_channel, + get_tx_num_wr(), m_partition); +#endif + NOT_IN_USE(ib_ctx); + NOT_IN_USE(port_num); + NOT_IN_USE(p_rx_comp_event_channel); + return NULL; +} + +void ring_eth_direct::init_tx_buffers(uint32_t count) +{ + NOT_IN_USE(count); +} + +mem_buf_desc_t* ring_eth_direct::mem_buf_tx_get(ring_user_id_t id, bool b_block, + int n_num_mem_bufs) +{ + NOT_IN_USE(id); + NOT_IN_USE(b_block); + NOT_IN_USE(n_num_mem_bufs); + return NULL; +} + +int ring_eth_direct::drain_and_proccess(cq_type_t cq_type) +{ + NOT_IN_USE(cq_type); + return 0; +} + +int ring_eth_direct::poll_and_process_element_rx(uint64_t* p_cq_poll_sn, + void* pv_fd_ready_array) +{ + NOT_IN_USE(p_cq_poll_sn); + NOT_IN_USE(pv_fd_ready_array); + return 0; +} + +int ring_eth_direct::reg_mr(void *addr, size_t length, uint32_t &lkey) +{ + ring_logdbg("reg_mr()"); + if (unlikely(addr == NULL) || length == 0) { + ring_logdbg("address is %p length is %zd", addr, length); + errno = EINVAL; + return -1; + } + auto_unlocker lock(m_lock_ring_tx); + + addr_len_mr_map_t::iterator it = m_mr_map.find(pair_void_size_t(addr, length)); + if (unlikely(it != m_mr_map.end())) { + ring_logdbg("memory %p is already registered with length %zd", + addr, length); + lkey = it->second.first; + 
it->second.second++; + return 0; + } + lkey = m_p_ib_ctx->mem_reg(addr, length, VMA_IBV_ACCESS_LOCAL_WRITE); + if (lkey == (uint32_t)-1) { + ring_logdbg("failed registering MR"); + return -1; + } + ring_logdbg("registered memory as lkey:%u addr ptr %p length %zd", + lkey, addr, length); + m_mr_map[pair_void_size_t(addr, length)] = pair_mr_ref_t(lkey, 1); + return 0; +} + +int ring_eth_direct::dereg_mr(void *addr, size_t length) +{ + auto_unlocker lock(m_lock_ring_tx); + pair_void_size_t p(addr, length); + + addr_len_mr_map_t::iterator it = m_mr_map.find(p); + if (unlikely(it == m_mr_map.end())) { + ring_logdbg("could not find mr in map, addr is %p, length is %zd", + addr, length); + return -1; + } + if (it->second.second > 1) { + it->second.second--; + ring_logdbg("decreased ref count to %d",it->second.second); + return 0; + } + uint32_t lkey = it->second.first; + ring_logdbg("deregistered memory as lkey:%u addr %p length %zd", + lkey, addr, length); + m_p_ib_ctx->mem_dereg(lkey); + m_mr_map.erase(p); + return 0; +} + +ring_eth_direct::~ring_eth_direct() +{ + addr_len_mr_map_t::iterator it = m_mr_map.begin(); + + for (;it != m_mr_map.end();it++) { + ring_logwarn("resource leak! registered memory was not released," + " addr %p, lenght %zd",it->first.first, + it->first.second); + } + m_mr_map.clear(); +} + diff --git a/src/vma/dev/ring_eth_direct.h b/src/vma/dev/ring_eth_direct.h new file mode 100644 index 0000000..7695fc7 --- /dev/null +++ b/src/vma/dev/ring_eth_direct.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef SRC_VMA_DEV_RING_ETH_DIRECT_H_ +#define SRC_VMA_DEV_RING_ETH_DIRECT_H_ + +#include +#include "dev/ring_simple.h" + + +typedef std::pair pair_void_size_t; +typedef std::pair pair_mr_ref_t; +namespace std { namespace tr1 { +template<> +class hash +{ +public: + size_t operator()(const pair_void_size_t &key) const + { + hash_hash; + return _hash((uint64_t)key.first ^ key.second); + } +}; +}} + +typedef std::tr1::unordered_map addr_len_mr_map_t; + +class ring_eth_direct : public ring_eth +{ +public: + ring_eth_direct(int if_index, + vma_external_mem_attr *ext_ring_attr, ring *parent = NULL); + virtual ~ring_eth_direct(); + virtual qp_mgr* create_qp_mgr(const ib_ctx_handler* ib_ctx, + uint8_t port_num, + struct ibv_comp_channel* p_rx_comp_event_channel); + // memory handler + virtual int reg_mr(void *addr, size_t length, uint32_t &lkey); + virtual int dereg_mr(void *addr, size_t length); + // dummy functions to block memory usage and internal thread + virtual void init_tx_buffers(uint32_t count); + virtual mem_buf_desc_t* mem_buf_tx_get(ring_user_id_t id, bool b_block, int n_num_mem_bufs = 1); + virtual int drain_and_proccess(cq_type_t cq_type); + virtual int poll_and_process_element_rx(uint64_t* p_cq_poll_sn, + void* pv_fd_ready_array); +private: + vma_external_mem_attr m_ring_attr; + addr_len_mr_map_t m_mr_map; +}; + + +#endif /* SRC_VMA_DEV_RING_ETH_DIRECT_H_ */ diff --git a/src/vma/dev/ring_profile.cpp b/src/vma/dev/ring_profile.cpp new file mode 100644 index 0000000..0965636 --- /dev/null +++ b/src/vma/dev/ring_profile.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include + +ring_profiles_collection *g_p_ring_profile = NULL; + + +ring_profile::ring_profile(const vma_ring_type_attr *ring_desc) { + m_ring_desc.comp_mask = ring_desc->comp_mask; + m_ring_desc.ring_type = ring_desc->ring_type; + switch (ring_desc->ring_type) { + case VMA_RING_CYCLIC_BUFFER: { + vma_cyclic_buffer_ring_attr &r = m_ring_desc.ring_cyclicb; + + memset(&r, 0, sizeof(m_ring_desc.ring_cyclicb)); + r.comp_mask = ring_desc->ring_cyclicb.comp_mask; + r.num = ring_desc->ring_cyclicb.num; + r.stride_bytes = ring_desc->ring_cyclicb.stride_bytes; + r.packet_receive_mode = ring_desc->ring_cyclicb.packet_receive_mode; + + if (r.comp_mask & VMA_CB_HDR_BYTE) { + r.hdr_bytes = ring_desc->ring_cyclicb.hdr_bytes; + } + break; + } + case VMA_RING_PACKET: + m_ring_desc.ring_pktq.comp_mask = ring_desc->ring_pktq.comp_mask; + break; + case VMA_RING_EXTERNAL_MEM: + m_ring_desc.ring_ext.comp_mask = ring_desc->ring_ext.comp_mask; + break; + default: + break; + } + create_string(); +}; + +const char* ring_profile::get_vma_ring_type_str() +{ + switch (m_ring_desc.ring_type) { + case VMA_RING_PACKET: return "VMA_PKTS_RING"; + case VMA_RING_CYCLIC_BUFFER: return "VMA_CB_RING"; + case VMA_RING_EXTERNAL_MEM: return "VMA_EXTERNAL_MEM_RING"; + default: return ""; + } +}; + +ring_profile::ring_profile() +{ + m_ring_desc.ring_type = VMA_RING_PACKET; + m_ring_desc.comp_mask = 0; + m_ring_desc.ring_pktq.comp_mask = 0; + create_string(); +}; + + +void ring_profile::create_string() +{ + ostringstream s; + + s<second == *profile) { + return it->first; + } + } + // key 0 is invalid + vma_ring_profile_key key = m_curr_idx; + m_curr_idx++; + ring_profile *prof = new ring_profile(profile); + m_profs_map[key] = prof; + return key; +} + +ring_profile* ring_profiles_collection::get_profile(vma_ring_profile_key key) +{ + ring_profile_map_t::iterator iter = m_profs_map.find(key); + if (iter != m_profs_map.end()) { + return iter->second; + } + return NULL; +} + 
+ring_profiles_collection::~ring_profiles_collection() +{ + ring_profile_map_t::iterator iter; + + while ((iter = m_profs_map.begin()) != m_profs_map.end()) { + delete (iter->second); + m_profs_map.erase(iter); + } +} diff --git a/src/vma/dev/ring_profile.h b/src/vma/dev/ring_profile.h new file mode 100644 index 0000000..6e8e268 --- /dev/null +++ b/src/vma/dev/ring_profile.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef SRC_VMA_DEV_RING_PROFILE_H_ +#define SRC_VMA_DEV_RING_PROFILE_H_ + +#include +#include "net_device_val.h" +#include "vma_extra.h" + +#define START_RING_INDEX 1 // beneath it's not defined + +class ring_profile; +class ring_profiles_collection; + + +typedef std::tr1::unordered_map ring_profile_map_t; + +extern ring_profiles_collection *g_p_ring_profile; + + +class ring_profile +{ +public: + ring_profile(); + ring_profile(const vma_ring_type_attr *ring_desc); + vma_ring_type get_ring_type() {return m_ring_desc.ring_type;} + struct vma_ring_type_attr* get_desc(){return &m_ring_desc;} + bool operator==(const vma_ring_type_attr &p2); + const char* to_str(){ return m_str.c_str();} + const char* get_vma_ring_type_str(); +private: + void create_string(); + std::string m_str; + vma_ring_type_attr m_ring_desc; +}; + +class ring_profiles_collection +{ +public: + ring_profiles_collection(); + ~ring_profiles_collection(); + vma_ring_profile_key add_profile(vma_ring_type_attr *profile); + ring_profile* get_profile(vma_ring_profile_key key); + +private: + ring_profile_map_t m_profs_map; + vma_ring_profile_key m_curr_idx; +}; +#endif /* SRC_VMA_DEV_RING_PROFILE_H_ */ diff --git a/src/vma/dev/ring_simple.cpp b/src/vma/dev/ring_simple.cpp new file mode 100644 index 0000000..db54810 --- /dev/null +++ b/src/vma/dev/ring_simple.cpp @@ -0,0 +1,1088 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "ring_simple.h" + +#include "vma/util/valgrind.h" +#include "vma/util/sg_array.h" +#include "vma/sock/fd_collection.h" +#if defined(DEFINED_DIRECT_VERBS) +#include "vma/dev/qp_mgr_eth_mlx5.h" +#endif + +#undef MODULE_NAME +#define MODULE_NAME "ring_simple" +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + +#define ALIGN_WR_DOWN(_num_wr_) (max(32, ((_num_wr_ ) & ~(0xf)))) +#define RING_TX_BUFS_COMPENSATE 256 + +#define RING_LOCK_AND_RUN(__lock__, __func_and_params__) \ + __lock__.lock(); __func_and_params__; __lock__.unlock(); + +#define RING_LOCK_RUN_AND_UPDATE_RET(__lock__, __func_and_params__) \ + __lock__.lock(); ret = __func_and_params__; __lock__.unlock(); + +#define RING_TRY_LOCK_RUN_AND_UPDATE_RET(__lock__, __func_and_params__) \ + if (!__lock__.trylock()) { ret = __func_and_params__; __lock__.unlock(); } \ + else { errno = EAGAIN; } + +/**/ +/** inlining functions can only help if they are implemented before their usage **/ +/**/ + +inline void ring_simple::send_status_handler(int ret, vma_ibv_send_wr* p_send_wqe) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely(ret)) { + // Error during post_send, reclaim the tx buffer + if(p_send_wqe) { + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(p_send_wqe->wr_id); + mem_buf_tx_release(p_mem_buf_desc, true); + } + } + else { + // Update TX statistics + sg_array sga(p_send_wqe->sg_list, p_send_wqe->num_sge); + m_p_ring_stat->n_tx_byte_count += sga.length(); + ++m_p_ring_stat->n_tx_pkt_count; + + // Decrease counter in order to keep track of how many missing buffers we have when + // doing ring->restart() and then drain_tx_buffers_to_buffer_pool() + m_missing_buf_ref_count--; + } + BULLSEYE_EXCLUDE_BLOCK_END +} + +qp_mgr* ring_eth::create_qp_mgr(const ib_ctx_handler* ib_ctx, uint8_t port_num, struct ibv_comp_channel* p_rx_comp_event_channel) +{ +#if defined(DEFINED_DIRECT_VERBS) + if (qp_mgr::is_lib_mlx5(((ib_ctx_handler*)ib_ctx)->get_ibname())) { + return new qp_mgr_eth_mlx5(this, 
ib_ctx, port_num, p_rx_comp_event_channel, get_tx_num_wr(), m_partition); + } +#endif + return new qp_mgr_eth(this, ib_ctx, port_num, p_rx_comp_event_channel, get_tx_num_wr(), m_partition); +} + +qp_mgr* ring_ib::create_qp_mgr(const ib_ctx_handler* ib_ctx, uint8_t port_num, struct ibv_comp_channel* p_rx_comp_event_channel) +{ + return new qp_mgr_ib(this, ib_ctx, port_num, p_rx_comp_event_channel, get_tx_num_wr(), m_partition); +} + +ring_simple::ring_simple(int if_index, ring* parent, ring_type_t type): + ring_slave(if_index, parent, type), + m_p_ib_ctx(NULL), + m_p_qp_mgr(NULL), + m_p_cq_mgr_rx(NULL), + m_p_cq_mgr_tx(NULL), + m_lock_ring_tx_buf_wait("ring:lock_tx_buf_wait"), m_tx_num_bufs(0), m_tx_num_wr(0), m_tx_num_wr_free(0), + m_b_qp_tx_first_flushed_completion_handled(false), m_missing_buf_ref_count(0), + m_tx_lkey(0), + m_gro_mgr(safe_mce_sys().gro_streams_max, MAX_GRO_BUFS), m_up(false), + m_p_rx_comp_event_channel(NULL), m_p_tx_comp_event_channel(NULL), m_p_l2_addr(NULL) +{ + net_device_val* p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + const slave_data_t * p_slave = p_ndev->get_slave(get_if_index()); + + ring_logdbg("new ring_simple()"); + + /* m_p_ib_ctx, m_tx_lkey should be initialized to be used + * in ring_eth_direct, ring_eth_cb constructors + */ + BULLSEYE_EXCLUDE_BLOCK_START + m_p_ib_ctx = p_slave->p_ib_ctx; + if(m_p_ib_ctx == NULL) { + ring_logpanic("m_p_ib_ctx = NULL. 
It can be related to wrong bonding configuration"); + } + + m_tx_lkey = g_buffer_pool_tx->find_lkey_by_ib_ctx_thread_safe(m_p_ib_ctx); + if (m_tx_lkey == 0) { + __log_info_panic("invalid lkey found %lu", m_tx_lkey); + } + BULLSEYE_EXCLUDE_BLOCK_END + + /* initialization basing on ndev information */ + m_mtu = p_ndev->get_mtu(); + + memset(&m_cq_moderation_info, 0, sizeof(m_cq_moderation_info)); +#ifdef DEFINED_TSO + memset(&m_tso, 0, sizeof(m_tso)); +#endif /* DEFINED_TSO */ + + m_socketxtreme.active = safe_mce_sys().enable_socketxtreme; + INIT_LIST_HEAD(&m_socketxtreme.ec_list); + m_socketxtreme.completion = NULL; +} + +ring_simple::~ring_simple() +{ + ring_logdbg("delete ring_simple()"); + + // Go over all hash and for each flow: 1.Detach from qp 2.Delete related rfs object 3.Remove flow from hash + m_lock_ring_rx.lock(); + flow_udp_del_all(); + flow_tcp_del_all(); + m_lock_ring_rx.unlock(); + + // Allow last few post sends to be sent by HCA. + // Was done in order to allow iperf's FIN packet to be sent. 
+ usleep(25000); + + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_rx.lock(); + m_lock_ring_tx.lock(); + + if (m_p_qp_mgr) { + // 'down' the active QP/CQ + /* TODO: consider avoid using sleep */ + /* coverity[sleep] */ + m_p_qp_mgr->down(); + + // Release QP/CQ resources + delete m_p_qp_mgr; + m_p_qp_mgr = NULL; + } + + delete_l2_address(); + + // Delete the rx channel fd from the global fd collection + if (g_p_fd_collection) { + if (m_p_rx_comp_event_channel) { + g_p_fd_collection->del_cq_channel_fd(m_p_rx_comp_event_channel->fd, true); + } + if (m_p_tx_comp_event_channel) { + g_p_fd_collection->del_cq_channel_fd(m_p_tx_comp_event_channel->fd, true); + } + } + + if (m_p_rx_comp_event_channel) { + IF_VERBS_FAILURE(ibv_destroy_comp_channel(m_p_rx_comp_event_channel)) { + ring_logdbg("destroy comp channel failed (errno=%d %m)", errno); + } ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(m_p_rx_comp_event_channel, sizeof(struct ibv_comp_channel)); + } + + delete[] m_p_n_rx_channel_fds; + + ring_logdbg("Tx buffer poll: free count = %u, sender_has = %d, total = %d, %s (%d)", + m_tx_pool.size(), m_missing_buf_ref_count, m_tx_num_bufs, + ((m_tx_num_bufs - m_tx_pool.size() - m_missing_buf_ref_count) ? + "bad accounting!!" : "good accounting"), + (m_tx_num_bufs - m_tx_pool.size() - m_missing_buf_ref_count)); + ring_logdbg("Tx WR num: free count = %d, total = %d, %s (%d)", + m_tx_num_wr_free, m_tx_num_wr, + ((m_tx_num_wr - m_tx_num_wr_free) ? 
"bad accounting!!":"good accounting"), (m_tx_num_wr - m_tx_num_wr_free)); + ring_logdbg("Rx buffer pool: %d free global buffers available", m_tx_pool.size()); + + // Release verbs resources + if (m_p_tx_comp_event_channel) { + IF_VERBS_FAILURE(ibv_destroy_comp_channel(m_p_tx_comp_event_channel)) { + ring_logdbg("destroy comp channel failed (errno=%d %m)", errno); + } ENDIF_VERBS_FAILURE; + VALGRIND_MAKE_MEM_UNDEFINED(m_p_tx_comp_event_channel, sizeof(struct ibv_comp_channel)); + m_p_tx_comp_event_channel = NULL; + } + + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_ring_rx.unlock(); + m_lock_ring_tx.unlock(); + + ring_logdbg("queue of event completion elements is %s", + (list_empty(&m_socketxtreme.ec_list) ? "empty" : "not empty")); + while (!list_empty(&m_socketxtreme.ec_list)) { + struct ring_ec *ec = NULL; + ec = get_ec(); + if (ec) { + del_ec(ec); + } + } + + ring_logdbg("delete ring_simple() completed"); +} + +void ring_simple::create_resources() +{ + net_device_val* p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + const slave_data_t * p_slave = p_ndev->get_slave(get_if_index()); + + save_l2_address(p_slave->p_L2_addr); + m_p_tx_comp_event_channel = ibv_create_comp_channel(m_p_ib_ctx->get_ibv_context()); + if (m_p_tx_comp_event_channel == NULL) { + VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS(VLOG_ERROR, VLOG_DEBUG, "ibv_create_comp_channel for tx failed. m_p_tx_comp_event_channel = %p (errno=%d %m)", m_p_tx_comp_event_channel, errno); + if (errno == EMFILE) { + VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS(VLOG_ERROR, VLOG_DEBUG, "did we run out of file descriptors? 
traffic may not be offloaded, increase ulimit -n"); + } + throw_vma_exception("create event channel failed"); + } + BULLSEYE_EXCLUDE_BLOCK_END + VALGRIND_MAKE_MEM_DEFINED(m_p_tx_comp_event_channel, sizeof(struct ibv_comp_channel)); + // Check device capabilities for max QP work requests + uint32_t max_qp_wr = ALIGN_WR_DOWN(m_p_ib_ctx->get_ibv_device_attr()->max_qp_wr - 1); + m_tx_num_wr = safe_mce_sys().tx_num_wr; + if (m_tx_num_wr > max_qp_wr) { + ring_logwarn("Allocating only %d Tx QP work requests while user requested %s=%d for QP on interface %d.%d.%d.%d", + max_qp_wr, SYS_VAR_TX_NUM_WRE, m_tx_num_wr); + m_tx_num_wr = max_qp_wr; + } + ring_logdbg("ring attributes: m_tx_num_wr = %d", m_tx_num_wr); + + m_tx_num_wr_free = m_tx_num_wr; + +#ifdef DEFINED_TSO + memset(&m_tso, 0, sizeof(m_tso)); + if (safe_mce_sys().enable_tso && (1 == validate_tso(get_if_index()))) { + if (vma_check_dev_attr_tso(m_p_ib_ctx->get_ibv_device_attr())) { + const vma_ibv_tso_caps *caps = &vma_get_tso_caps(m_p_ib_ctx->get_ibv_device_attr_ex()); + if (ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_RAW_PACKET) || + ibv_is_qpt_supported(caps->supported_qpts, IBV_QPT_UD)) { + m_tso.max_payload_sz = caps->max_tso; + /* ETH(14) + IP(20) + TCP(20) + TCP OPTIONS(40) */ + m_tso.max_header_sz = 94; + } + } + } + ring_logdbg("ring attributes: m_tso = %d", is_tso()); + ring_logdbg("ring attributes: m_tso:max_payload_sz = %d", get_max_payload_sz()); + ring_logdbg("ring attributes: m_tso:max_header_sz = %d", get_max_header_sz()); +#endif /* DEFINED_TSO */ + + m_flow_tag_enabled = m_p_ib_ctx->get_flow_tag_capability(); + ring_logdbg("ring attributes: m_flow_tag_enabled = %d", m_flow_tag_enabled); + + m_p_rx_comp_event_channel = ibv_create_comp_channel(m_p_ib_ctx->get_ibv_context()); // ODED TODO: Adjust the ibv_context to be the exact one in case of different devices + BULLSEYE_EXCLUDE_BLOCK_START + if (m_p_rx_comp_event_channel == NULL) { + VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS(VLOG_ERROR, VLOG_DEBUG, 
"ibv_create_comp_channel for rx failed. p_rx_comp_event_channel = %p (errno=%d %m)", m_p_rx_comp_event_channel, errno); + if (errno == EMFILE) { + VLOG_PRINTF_INFO_ONCE_THEN_ALWAYS(VLOG_ERROR, VLOG_DEBUG, "did we run out of file descriptors? traffic may not be offloaded, increase ulimit -n"); + } + throw_vma_exception("create event channel failed"); + } + BULLSEYE_EXCLUDE_BLOCK_END + VALGRIND_MAKE_MEM_DEFINED(m_p_rx_comp_event_channel, sizeof(struct ibv_comp_channel)); + m_p_n_rx_channel_fds = new int[1]; + m_p_n_rx_channel_fds[0] = m_p_rx_comp_event_channel->fd; + // Add the rx channel fd to the global fd collection + if (g_p_fd_collection) { + // Create new cq_channel info in the global fd collection + g_p_fd_collection->add_cq_channel_fd(m_p_n_rx_channel_fds[0], this); + g_p_fd_collection->add_cq_channel_fd(m_p_tx_comp_event_channel->fd, this); + } + + m_p_qp_mgr = create_qp_mgr(m_p_ib_ctx, p_slave->port_num, m_p_rx_comp_event_channel); + BULLSEYE_EXCLUDE_BLOCK_START + if (m_p_qp_mgr == NULL) { + ring_logerr("Failed to allocate qp_mgr!"); + throw_vma_exception("create qp failed"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + // save cq_mgr pointers + m_p_cq_mgr_rx = m_p_qp_mgr->get_rx_cq_mgr(); + m_p_cq_mgr_tx = m_p_qp_mgr->get_tx_cq_mgr(); + + init_tx_buffers(RING_TX_BUFS_COMPENSATE); + + if (safe_mce_sys().cq_moderation_enable) { + modify_cq_moderation(safe_mce_sys().cq_moderation_period_usec, safe_mce_sys().cq_moderation_count); + } + + if (p_slave->active) { + // 'up' the active QP/CQ resource + m_up = true; + m_p_qp_mgr->up(); + } + + ring_logdbg("new ring_simple() completed"); +} + +int ring_simple::request_notification(cq_type_t cq_type, uint64_t poll_sn) +{ + int ret = 1; + if (likely(CQT_RX == cq_type)) { + RING_TRY_LOCK_RUN_AND_UPDATE_RET(m_lock_ring_rx, + m_p_cq_mgr_rx->request_notification(poll_sn); + ++m_p_ring_stat->simple.n_rx_interrupt_requests); + } else { + RING_TRY_LOCK_RUN_AND_UPDATE_RET(m_lock_ring_tx, m_p_cq_mgr_tx->request_notification(poll_sn)); 
+ } + + return ret; +} + +int ring_simple::ack_and_arm_cq(cq_type_t cq_type) +{ + if (CQT_RX == cq_type) { + return m_p_cq_mgr_rx->ack_and_request_notification(); + } + return m_p_cq_mgr_tx->ack_and_request_notification(); +} + +int ring_simple::poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array /*NULL*/) +{ + int ret = 0; + RING_TRY_LOCK_RUN_AND_UPDATE_RET(m_lock_ring_rx, m_p_cq_mgr_rx->poll_and_process_element_rx(p_cq_poll_sn, pv_fd_ready_array)); + return ret; +} + +int ring_simple::socketxtreme_poll(struct vma_completion_t *vma_completions, unsigned int ncompletions, int flags) +{ + int ret = 0; + int i = 0; + + NOT_IN_USE(flags); + + if (likely(vma_completions) && ncompletions) { + struct ring_ec *ec = NULL; + + m_socketxtreme.completion = vma_completions; + + while (!g_b_exit && (i < (int)ncompletions)) { + m_socketxtreme.completion->events = 0; + /* Check list size to avoid locking */ + if (!list_empty(&m_socketxtreme.ec_list)) { + ec = get_ec(); + if (ec) { + memcpy(m_socketxtreme.completion, &ec->completion, sizeof(ec->completion)); + ec->clear(); + m_socketxtreme.completion++; + i++; + } + } else { + /* Internal thread can raise event on this stage before we + * start rx processing. In this case we can return event + * in right order. 
It is done to avoid locking and + * may be it is not so critical + */ + mem_buf_desc_t *desc; + if (likely(m_p_cq_mgr_rx->poll_and_process_element_rx(&desc))) { + desc->rx.socketxtreme_polled = true; + rx_process_buffer(desc, NULL); + if (m_socketxtreme.completion->events) { + m_socketxtreme.completion++; + i++; + } + } else { + break; + } + } + } + + m_socketxtreme.completion = NULL; + + ret = i; + } + else { + ret = -1; + errno = EINVAL; + } + + return ret; +} + +int ring_simple::wait_for_notification_and_process_element(int cq_channel_fd, uint64_t* p_cq_poll_sn, void* pv_fd_ready_array /*NULL*/) +{ + int ret = -1; + if (m_p_cq_mgr_rx != NULL) { + RING_TRY_LOCK_RUN_AND_UPDATE_RET(m_lock_ring_rx, + m_p_cq_mgr_rx->wait_for_notification_and_process_element(p_cq_poll_sn, pv_fd_ready_array); + ++m_p_ring_stat->simple.n_rx_interrupt_received); + } else { + ring_logerr("Can't find rx_cq for the rx_comp_event_channel_fd (= %d)", cq_channel_fd); + } + + return ret; +} + +bool ring_simple::reclaim_recv_buffers(descq_t *rx_reuse) +{ + bool ret = false; + RING_TRY_LOCK_RUN_AND_UPDATE_RET(m_lock_ring_rx, m_p_cq_mgr_rx->reclaim_recv_buffers(rx_reuse)); + return ret; +} + +bool ring_simple::reclaim_recv_buffers(mem_buf_desc_t* rx_reuse_lst) +{ + bool ret = false; + RING_TRY_LOCK_RUN_AND_UPDATE_RET(m_lock_ring_rx, m_p_cq_mgr_rx->reclaim_recv_buffers(rx_reuse_lst)); + return ret; +} + +bool ring_simple::reclaim_recv_buffers_no_lock(mem_buf_desc_t* rx_reuse_lst) +{ + return m_p_cq_mgr_rx->reclaim_recv_buffers_no_lock(rx_reuse_lst); +} + +int ring_simple::reclaim_recv_single_buffer(mem_buf_desc_t* rx_reuse) +{ + return m_p_cq_mgr_rx->reclaim_recv_single_buffer(rx_reuse); +} + +void ring_simple::mem_buf_desc_completion_with_error_rx(mem_buf_desc_t* p_rx_wc_buf_desc) +{ + m_p_cq_mgr_rx->mem_buf_desc_completion_with_error(p_rx_wc_buf_desc); +} + +void ring_simple::mem_buf_desc_completion_with_error_tx(mem_buf_desc_t* p_tx_wc_buf_desc) +{ + if 
(m_b_qp_tx_first_flushed_completion_handled) { + p_tx_wc_buf_desc->p_next_desc = NULL; // All wr are flushed so we need to disconnect the Tx list + } + else { + m_b_qp_tx_first_flushed_completion_handled = true; // This is true for all wr except for the first one which might point to already sent wr + } + m_tx_num_wr_free += mem_buf_tx_release(p_tx_wc_buf_desc, false, false); +} + +void ring_simple::mem_buf_desc_return_to_owner_rx(mem_buf_desc_t* p_mem_buf_desc, void* pv_fd_ready_array /*NULL*/) +{ + ring_logfuncall(""); + RING_LOCK_AND_RUN(m_lock_ring_rx, m_p_cq_mgr_rx->mem_buf_desc_return_to_owner(p_mem_buf_desc, pv_fd_ready_array)); +} + +void ring_simple::mem_buf_desc_return_to_owner_tx(mem_buf_desc_t* p_mem_buf_desc) +{ + ring_logfuncall(""); + RING_LOCK_AND_RUN(m_lock_ring_tx, m_tx_num_wr_free += put_tx_buffers(p_mem_buf_desc)); +} + +void ring_simple::mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t* p_mem_buf_desc) +{ + ring_logfuncall(""); + RING_LOCK_AND_RUN(m_lock_ring_tx, put_tx_single_buffer(p_mem_buf_desc)); +} + +int ring_simple::drain_and_proccess() +{ + int ret = 0; + RING_TRY_LOCK_RUN_AND_UPDATE_RET(m_lock_ring_rx, m_p_cq_mgr_rx->drain_and_proccess()); + return ret; +} + +mem_buf_desc_t* ring_simple::mem_buf_tx_get(ring_user_id_t id, bool b_block, int n_num_mem_bufs /* default = 1 */) +{ + NOT_IN_USE(id); + int ret = 0; + mem_buf_desc_t* buff_list = NULL; + uint64_t poll_sn = 0; + + ring_logfuncall("n_num_mem_bufs=%d", n_num_mem_bufs); + + m_lock_ring_tx.lock(); + buff_list = get_tx_buffers(n_num_mem_bufs); + while (!buff_list) { + + // Try to poll once in the hope that we get a few freed tx mem_buf_desc + ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); + if (ret < 0) { + ring_logdbg("failed polling on tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (ret=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, ret); + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_ring_tx.unlock(); + return NULL; + } + else if (ret > 0) { + ring_logfunc("polling 
succeeded on tx cq_mgr (%d wce)", ret); + buff_list = get_tx_buffers(n_num_mem_bufs); + } + else if (b_block) { // (ret == 0) + // Arm & Block on tx cq_mgr notification channel + // until we get a few freed tx mem_buf_desc & data buffers + + // Only a single thread should block on next Tx cqe event, hence the dedicated lock! + /* coverity[double_unlock] coverity[unlock] TODO: RM#1049980 */ + m_lock_ring_tx.unlock(); + m_lock_ring_tx_buf_wait.lock(); + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_tx.lock(); + + // poll once more (in the hope that we get a few freed tx mem_buf_desc) + buff_list = get_tx_buffers(n_num_mem_bufs); + if (!buff_list) { + // Arm the CQ event channel for next Tx buffer release (tx cqe) + ret = m_p_cq_mgr_tx->request_notification(poll_sn); + if (ret < 0) { + // this is most likely due to cq_poll_sn out of sync, need to poll_cq again + ring_logdbg("failed arming tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, errno); + } + else if (ret == 0) { + + // prepare to block + // CQ is armed, block on the CQ's Tx event channel (fd) + struct pollfd poll_fd = { /*.fd=*/ 0, /*.events=*/ POLLIN, /*.revents=*/ 0}; + poll_fd.fd = get_tx_comp_event_channel()->fd; + + // Now it is time to release the ring lock (for restart events to be handled while this thread block on CQ channel) + /* coverity[double_unlock] coverity[unlock] TODO: RM#1049980 */ + m_lock_ring_tx.unlock(); + + ret = orig_os_api.poll(&poll_fd, 1, 100); + if (ret == 0) { + m_lock_ring_tx_buf_wait.unlock(); + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_tx.lock(); + buff_list = get_tx_buffers(n_num_mem_bufs); + continue; + } else if (ret < 0) { + ring_logdbg("failed blocking on tx cq_mgr (errno=%d %m)", errno); + m_lock_ring_tx_buf_wait.unlock(); + return NULL; + } + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_tx.lock(); + + // Find the correct Tx cq_mgr from the CQ event, + // It might not be the active_cq object since 
we have a single TX CQ comp channel for all cq_mgr's + cq_mgr* p_cq_mgr_tx = get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); + if (p_cq_mgr_tx) { + + // Allow additional CQ arming now + p_cq_mgr_tx->m_b_notification_armed = false; + + // Perform a non blocking event read, clear the fd channel + ret = p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); + if (ret < 0) { + ring_logdbg("failed handling Tx cq_mgr channel (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, errno); + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_ring_tx.unlock(); + m_lock_ring_tx_buf_wait.unlock(); + return NULL; + } + ring_logfunc("polling/blocking succeeded on tx cq_mgr (we got %d wce)", ret); + } + } + buff_list = get_tx_buffers(n_num_mem_bufs); + } + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_ring_tx.unlock(); + m_lock_ring_tx_buf_wait.unlock(); + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_tx.lock(); + } + else { + // get out on non blocked socket + m_lock_ring_tx.unlock(); + return NULL; + } + } + + // We got the buffers + // Increase counter in order to keep track of how many buffers ring is missing when reclaiming them during ring->restart() + m_missing_buf_ref_count += n_num_mem_bufs; + + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_ring_tx.unlock(); + return buff_list; +} + +int ring_simple::mem_buf_tx_release(mem_buf_desc_t* p_mem_buf_desc_list, bool b_accounting, bool trylock/*=false*/) +{ + ring_logfuncall(""); + + if (!trylock) + m_lock_ring_tx.lock(); + else if (m_lock_ring_tx.trylock()) + return 0; + + int accounting = put_tx_buffers(p_mem_buf_desc_list); + if (b_accounting) + m_missing_buf_ref_count -= accounting; + m_lock_ring_tx.unlock(); + return accounting; +} + +/* note that this function is inline, so keep it above the functions using it */ +inline int ring_simple::send_buffer(vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) +{ + //Note: this is debatable logic as it count of WQEs 
waiting completion but + //our SQ is cyclic buffer so in reality only last WQE is still being sent + //and other SQ is mostly free to work on. + int ret = 0; + if (likely(m_tx_num_wr_free > 0)) { + ret = m_p_qp_mgr->send(p_send_wqe, attr); + --m_tx_num_wr_free; + } else if (is_available_qp_wr(is_set(attr, VMA_TX_PACKET_BLOCK))) { + ret = m_p_qp_mgr->send(p_send_wqe, attr); + } else { + ring_logdbg("silent packet drop, no available WR in QP!"); + ret = -1; + if(p_send_wqe) { + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(p_send_wqe->wr_id); + p_mem_buf_desc->p_next_desc = NULL; + } + } + return ret; +} + +bool ring_simple::get_hw_dummy_send_support(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe) +{ + NOT_IN_USE(id); + NOT_IN_USE(p_send_wqe); + + return m_p_qp_mgr->get_hw_dummy_send_support(); +} + +void ring_simple::send_ring_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) +{ + NOT_IN_USE(id); + +#ifdef DEFINED_SW_CSUM + { +#else + if (attr & VMA_TX_SW_CSUM) { +#endif + compute_tx_checksum((mem_buf_desc_t*)(p_send_wqe->wr_id), attr & VMA_TX_PACKET_L3_CSUM, attr & VMA_TX_PACKET_L4_CSUM); + attr = (vma_wr_tx_packet_attr) (attr & ~(VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM)); + } + + auto_unlocker lock(m_lock_ring_tx); +#ifdef DEFINED_TSO +#else + p_send_wqe->sg_list[0].lkey = m_tx_lkey; +#endif /* DEFINED_TSO */ + int ret = send_buffer(p_send_wqe, attr); + send_status_handler(ret, p_send_wqe); +} + +void ring_simple::send_lwip_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) +{ + NOT_IN_USE(id); + +#ifdef DEFINED_SW_CSUM + compute_tx_checksum((mem_buf_desc_t*)(p_send_wqe->wr_id), attr & VMA_TX_PACKET_L3_CSUM, attr & VMA_TX_PACKET_L4_CSUM); + attr = (vma_wr_tx_packet_attr) (attr & ~(VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM)); +#endif + + auto_unlocker lock(m_lock_ring_tx); +#ifdef DEFINED_TSO +#else + p_send_wqe->sg_list[0].lkey = m_tx_lkey; + mem_buf_desc_t* p_mem_buf_desc = 
(mem_buf_desc_t*)(p_send_wqe->wr_id); + p_mem_buf_desc->lwip_pbuf.pbuf.ref++; +#endif /* DEFINED_TSO */ + int ret = send_buffer(p_send_wqe, attr); + send_status_handler(ret, p_send_wqe); +} + +/* + * called under m_lock_ring_tx lock + */ +bool ring_simple::is_available_qp_wr(bool b_block) +{ + int ret = 0; + uint64_t poll_sn = 0; + + while (m_tx_num_wr_free <= 0) { + // Try to poll once in the hope that we get a few freed tx mem_buf_desc + ret = m_p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); + if (ret < 0) { + ring_logdbg("failed polling on tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (ret=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, ret); + /* coverity[missing_unlock] */ + return false; + } else if (ret > 0) { + ring_logfunc("polling succeeded on tx cq_mgr (%d wce)", ret); + } else if (b_block){ + // Arm & Block on tx cq_mgr notification channel + // until we get a few freed tx mem_buf_desc & data buffers + + // Only a single thread should block on next Tx cqe event, hence the dedicated lock! + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_ring_tx.unlock(); + m_lock_ring_tx_buf_wait.lock(); + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_tx.lock(); + + if (m_tx_num_wr_free <= 0) { + // Arm the CQ event channel for next Tx buffer release (tx cqe) + ret = m_p_cq_mgr_tx->request_notification(poll_sn); + if (ret < 0) { + // this is most likely due to cq_poll_sn out of sync, need to poll_cq again + ring_logdbg("failed arming tx cq_mgr (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, errno); + } + else if (ret == 0) { + + // prepare to block + // CQ is armed, block on the CQ's Tx event channel (fd) + struct pollfd poll_fd = { /*.fd=*/ 0, /*.events=*/ POLLIN, /*.revents=*/ 0}; + poll_fd.fd = get_tx_comp_event_channel()->fd; + + // Now it is time to release the ring lock (for restart events to be handled while this thread block on CQ channel) + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_ring_tx.unlock(); + + ret = 
orig_os_api.poll(&poll_fd, 1, -1); + if (ret <= 0) { + ring_logdbg("failed blocking on tx cq_mgr (errno=%d %m)", errno); + m_lock_ring_tx_buf_wait.unlock(); + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_tx.lock(); + /* coverity[missing_unlock] */ + return false; + } + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_tx.lock(); + + // Find the correct Tx cq_mgr from the CQ event, + // It might not be the active_cq object since we have a single TX CQ comp channel for all cq_mgr's + cq_mgr* p_cq_mgr_tx = get_cq_mgr_from_cq_event(get_tx_comp_event_channel()); + if (p_cq_mgr_tx) { + + // Allow additional CQ arming now + p_cq_mgr_tx->m_b_notification_armed = false; + + // Perform a non blocking event read, clear the fd channel + ret = p_cq_mgr_tx->poll_and_process_element_tx(&poll_sn); + if (ret < 0) { + ring_logdbg("failed handling Tx cq_mgr channel (qp_mgr=%p, cq_mgr_tx=%p) (errno=%d %m)", m_p_qp_mgr, m_p_cq_mgr_tx, errno); + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_ring_tx.unlock(); + m_lock_ring_tx_buf_wait.unlock(); + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_tx.lock(); + return false; + } + ring_logfunc("polling/blocking succeeded on tx cq_mgr (we got %d wce)", ret); + } + } + } + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_ring_tx.unlock(); + m_lock_ring_tx_buf_wait.unlock(); + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_tx.lock(); + } else { + return false; + } + } + + --m_tx_num_wr_free; + return true; +} + +void ring_simple::init_tx_buffers(uint32_t count) +{ + request_more_tx_buffers(count, m_tx_lkey); + m_tx_num_bufs = m_tx_pool.size(); +} + +void ring_simple::inc_cq_moderation_stats(size_t sz_data) +{ + m_cq_moderation_info.bytes += sz_data; + ++m_cq_moderation_info.packets; +} + +//call under m_lock_ring_tx lock +mem_buf_desc_t* ring_simple::get_tx_buffers(uint32_t n_num_mem_bufs) +{ + mem_buf_desc_t* head = NULL; + if (unlikely(m_tx_pool.size() < n_num_mem_bufs)) { + int 
count = MAX(RING_TX_BUFS_COMPENSATE, n_num_mem_bufs); + if (request_more_tx_buffers(count, m_tx_lkey)) { + m_tx_num_bufs += count; + } + + if (unlikely(m_tx_pool.size() < n_num_mem_bufs)) { + return head; + } + } + + head = m_tx_pool.get_and_pop_back(); + head->lwip_pbuf.pbuf.ref = 1; + n_num_mem_bufs--; + + mem_buf_desc_t* next = head; + while (n_num_mem_bufs) { + next->p_next_desc = m_tx_pool.get_and_pop_back(); + next = next->p_next_desc; + next->lwip_pbuf.pbuf.ref = 1; + n_num_mem_bufs--; + } + + return head; +} + +void ring_simple::return_to_global_pool() +{ + if (unlikely(m_tx_pool.size() > (m_tx_num_bufs / 2) && m_tx_num_bufs >= RING_TX_BUFS_COMPENSATE * 2)) { + int return_bufs = m_tx_pool.size() / 2; + m_tx_num_bufs -= return_bufs; + g_buffer_pool_tx->put_buffers_thread_safe(&m_tx_pool, return_bufs); + } +} + +//call under m_lock_ring_tx lock +int ring_simple::put_tx_buffers(mem_buf_desc_t* buff_list) +{ + int count = 0, freed=0; + mem_buf_desc_t *next; + + while (buff_list) { + next = buff_list->p_next_desc; + buff_list->p_next_desc = NULL; + + if (buff_list->tx.dev_mem_length) + m_p_qp_mgr->dm_release_data(buff_list); + + //potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & sockinfo_tcp by tcp lock + if (likely(buff_list->lwip_pbuf.pbuf.ref)) + buff_list->lwip_pbuf.pbuf.ref--; + else + ring_logerr("ref count of %p is already zero, double free??", buff_list); + + if (buff_list->lwip_pbuf.pbuf.ref == 0) { + free_lwip_pbuf(&buff_list->lwip_pbuf); + m_tx_pool.push_back(buff_list); + freed++; + } + count++; + buff_list = next; + } + ring_logfunc("buf_list: %p count: %d freed: %d\n", buff_list, count, freed); + + return_to_global_pool(); + + return count; +} + +//call under m_lock_ring_tx lock +int ring_simple::put_tx_single_buffer(mem_buf_desc_t* buff) +{ + int count = 0; + + if (likely(buff)) { + + if (buff->tx.dev_mem_length) + m_p_qp_mgr->dm_release_data(buff); + + //potential race, ref is protected here by ring_tx lock, and in 
dst_entry_tcp & sockinfo_tcp by tcp lock + if (likely(buff->lwip_pbuf.pbuf.ref)) + buff->lwip_pbuf.pbuf.ref--; + else + ring_logerr("ref count of %p is already zero, double free??", buff); + + if (buff->lwip_pbuf.pbuf.ref == 0) { + buff->p_next_desc = NULL; + free_lwip_pbuf(&buff->lwip_pbuf); + m_tx_pool.push_back(buff); + count++; + } + } + + return_to_global_pool(); + + return count; +} + +void ring_simple::modify_cq_moderation(uint32_t period, uint32_t count) +{ + uint32_t period_diff = period > m_cq_moderation_info.period ? + period - m_cq_moderation_info.period : m_cq_moderation_info.period - period; + uint32_t count_diff = count > m_cq_moderation_info.count ? + count - m_cq_moderation_info.count : m_cq_moderation_info.count - count; + + if (period_diff < (m_cq_moderation_info.period / 20) && (count_diff < m_cq_moderation_info.count / 20)) + return; + + m_cq_moderation_info.period = period; + m_cq_moderation_info.count = count; + + m_p_ring_stat->simple.n_rx_cq_moderation_period = period; + m_p_ring_stat->simple.n_rx_cq_moderation_count = count; + + //todo all cqs or just active? what about HA? + priv_ibv_modify_cq_moderation(m_p_cq_mgr_rx->get_ibv_cq_hndl(), period, count); +} + +void ring_simple::adapt_cq_moderation() +{ + if (m_lock_ring_rx.trylock()) { + ++m_cq_moderation_info.missed_rounds; + return; //todo try again sooner? + } + + uint32_t missed_rounds = m_cq_moderation_info.missed_rounds; + + //todo collect bytes and packets from all rings ?? 
+ int64_t interval_bytes = m_cq_moderation_info.bytes - m_cq_moderation_info.prev_bytes; + int64_t interval_packets = m_cq_moderation_info.packets - m_cq_moderation_info.prev_packets; + + m_cq_moderation_info.prev_bytes = m_cq_moderation_info.bytes; + m_cq_moderation_info.prev_packets = m_cq_moderation_info.packets; + m_cq_moderation_info.missed_rounds = 0; + + BULLSEYE_EXCLUDE_BLOCK_START + if (interval_bytes < 0 || interval_packets < 0) { + //rare wrap-around of 64 bit, just ignore + m_lock_ring_rx.unlock(); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + if (interval_packets == 0) { + // todo if no traffic, set moderation to default? + modify_cq_moderation(safe_mce_sys().cq_moderation_period_usec, safe_mce_sys().cq_moderation_count); + m_lock_ring_rx.unlock(); + return; + } + + uint32_t avg_packet_size = interval_bytes / interval_packets; + uint32_t avg_packet_rate = (interval_packets * 1000) / (safe_mce_sys().cq_aim_interval_msec * (1 + missed_rounds)); + + uint32_t ir_rate = safe_mce_sys().cq_aim_interrupts_rate_per_sec; + + int count = MIN(avg_packet_rate / ir_rate, safe_mce_sys().cq_aim_max_count); + int period = MIN(safe_mce_sys().cq_aim_max_period_usec, ((1000000 / ir_rate) - (1000000 / MAX(avg_packet_rate, ir_rate)))); + + if (avg_packet_size < 1024 && avg_packet_rate < 450000) { + modify_cq_moderation(0, 0); //latency mode + //todo latency for big messages is not good + // the rate is affected by the moderation and the moderation by the rate.. + // so each cycle change from 0 to max, and max to 0, .. 
+ } else { + modify_cq_moderation(period, count); //throughput mode + } + + m_lock_ring_rx.unlock(); +} + +void ring_simple::start_active_qp_mgr() { + m_lock_ring_rx.lock(); + m_lock_ring_tx.lock(); + if (!m_up) { + /* TODO: consider avoid using sleep */ + /* coverity[sleep] */ + m_p_qp_mgr->up(); + m_b_qp_tx_first_flushed_completion_handled = false; + m_up = true; + } + m_lock_ring_tx.unlock(); + m_lock_ring_rx.unlock(); +} + +void ring_simple::stop_active_qp_mgr() { + m_lock_ring_rx.lock(); + m_lock_ring_tx.lock(); + if (m_up) { + m_up = false; + /* TODO: consider avoid using sleep */ + /* coverity[sleep] */ + m_p_qp_mgr->down(); + } + m_lock_ring_tx.unlock(); + m_lock_ring_rx.unlock(); +} + +bool ring_simple::is_up() { + return m_up; +} + +int ring_simple::modify_ratelimit(struct vma_rate_limit_t &rate_limit) +{ + if (!m_p_ib_ctx->is_packet_pacing_supported(rate_limit.rate)) { + ring_logwarn("Packet pacing is not supported for this device"); + return -1; + } + + if ((rate_limit.max_burst_sz || rate_limit.typical_pkt_sz) && !m_p_ib_ctx->get_burst_capability()) { + ring_logwarn("Burst is not supported for this device"); + return -1; + } + + uint32_t rl_changes = m_p_qp_mgr->is_ratelimit_change(rate_limit); + + if (m_up && rl_changes) + return m_p_qp_mgr->modify_qp_ratelimit(rate_limit, rl_changes); + + return 0; +} + +int ring_simple::get_ring_descriptors(vma_mlx_hw_device_data &d) +{ + d.dev_data.vendor_id = m_p_ib_ctx->get_ibv_device_attr()->vendor_id; + d.dev_data.vendor_part_id = m_p_ib_ctx->get_ibv_device_attr()->vendor_part_id; + if (m_p_ib_ctx->is_packet_pacing_supported()) { + d.dev_data.device_cap |= VMA_HW_PP_EN; + } + if (m_p_ib_ctx->get_burst_capability()) { + d.dev_data.device_cap |= VMA_HW_PP_BURST_EN; + } + if (vma_is_umr_supported(m_p_ib_ctx->get_ibv_device_attr())) { + d.dev_data.device_cap |= VMA_HW_UMR_EN; + } + if (vma_is_mp_rq_supported(m_p_ib_ctx->get_ibv_device_attr())) { + d.dev_data.device_cap |= VMA_HW_MP_RQ_EN; + } + d.valid_mask = 
DATA_VALID_DEV; + + ring_logdbg("found device with Vendor-ID %u, ID %u, Device cap %u", d.dev_data.vendor_part_id, + d.dev_data.vendor_id, d.dev_data.device_cap); + if (!m_p_qp_mgr->fill_hw_descriptors(d)) { + return -1; + } + if (m_p_cq_mgr_rx->fill_cq_hw_descriptors(d.rq_data.wq_data.cq_data)) { + d.valid_mask |= DATA_VALID_RQ; + } + + if (m_p_cq_mgr_tx->fill_cq_hw_descriptors(d.sq_data.wq_data.cq_data)) { + d.valid_mask |= DATA_VALID_SQ; + } + VALGRIND_MAKE_MEM_DEFINED(&d, sizeof(d)); + return 0; +} + +uint32_t ring_simple::get_max_inline_data() +{ + return m_p_qp_mgr->get_max_inline_data(); +} + +#ifdef DEFINED_TSO +uint32_t ring_simple::get_max_send_sge(void) +{ + return m_p_qp_mgr->get_max_send_sge(); +} + +uint32_t ring_simple::get_max_payload_sz(void) +{ + return m_tso.max_payload_sz; +} + +uint16_t ring_simple::get_max_header_sz(void) +{ + return m_tso.max_header_sz; +} + +bool ring_simple::is_tso(void) +{ + return (m_tso.max_payload_sz && m_tso.max_header_sz); +} +#endif /* DEFINED_TSO */ diff --git a/src/vma/dev/ring_simple.h b/src/vma/dev/ring_simple.h new file mode 100644 index 0000000..f7cac45 --- /dev/null +++ b/src/vma/dev/ring_simple.h @@ -0,0 +1,265 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RING_SIMPLE_H +#define RING_SIMPLE_H + +#include "ring_slave.h" + +#include "vma/dev/gro_mgr.h" +#include "vma/dev/net_device_table_mgr.h" + +struct cq_moderation_info { + uint32_t period; + uint32_t count; + uint64_t packets; + uint64_t bytes; + uint64_t prev_packets; + uint64_t prev_bytes; + uint32_t missed_rounds; +}; + +/** + * @class ring simple + * + * Object to manages the QP and CQ operation + * This object is used for Rx & Tx at the same time + * + */ +class ring_simple : public ring_slave +{ +public: + ring_simple(int if_index, ring* parent, ring_type_t type); + virtual ~ring_simple(); + + virtual int request_notification(cq_type_t cq_type, uint64_t poll_sn); + virtual int poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL); + virtual void adapt_cq_moderation(); + virtual bool reclaim_recv_buffers(descq_t *rx_reuse); + virtual bool reclaim_recv_buffers(mem_buf_desc_t* rx_reuse_lst); + bool reclaim_recv_buffers_no_lock(mem_buf_desc_t* rx_reuse_lst); // No locks + virtual int reclaim_recv_single_buffer(mem_buf_desc_t* rx_reuse); // No locks + virtual int socketxtreme_poll(struct vma_completion_t *vma_completions, unsigned int ncompletions, int flags); + virtual int drain_and_proccess(); + virtual int 
wait_for_notification_and_process_element(int cq_channel_fd, uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL); + // Tx completion handling at the qp_mgr level is just re listing the desc+data buffer in the free lists + void mem_buf_desc_completion_with_error_tx(mem_buf_desc_t* p_tx_wc_buf_desc); // Assume locked... + void mem_buf_desc_completion_with_error_rx(mem_buf_desc_t* p_rx_wc_buf_desc); // Assume locked... + void mem_buf_desc_return_to_owner_tx(mem_buf_desc_t* p_mem_buf_desc); + void mem_buf_desc_return_to_owner_rx(mem_buf_desc_t* p_mem_buf_desc, void* pv_fd_ready_array = NULL); + inline int send_buffer(vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr); + virtual bool is_up(); + void start_active_qp_mgr(); + void stop_active_qp_mgr(); + virtual mem_buf_desc_t* mem_buf_tx_get(ring_user_id_t id, bool b_block, int n_num_mem_bufs = 1); + virtual int mem_buf_tx_release(mem_buf_desc_t* p_mem_buf_desc_list, bool b_accounting, bool trylock = false); + virtual void send_ring_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr); + virtual void send_lwip_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr); + virtual void mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t* p_mem_buf_desc); + virtual bool get_hw_dummy_send_support(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe); + inline void convert_hw_time_to_system_time(uint64_t hwtime, struct timespec* systime) { m_p_ib_ctx->convert_hw_time_to_system_time(hwtime, systime); } + inline uint32_t get_qpn() const { return (m_p_l2_addr ? ((IPoIB_addr *)m_p_l2_addr)->get_qpn() : 0); } + virtual uint32_t get_underly_qpn() { return m_p_qp_mgr->get_underly_qpn(); } + virtual int modify_ratelimit(struct vma_rate_limit_t &rate_limit); + virtual int get_tx_channel_fd() const { return m_p_tx_comp_event_channel ? 
m_p_tx_comp_event_channel->fd : -1; }; + virtual uint32_t get_max_inline_data(); +#ifdef DEFINED_TSO + virtual uint32_t get_max_send_sge(void); + virtual uint32_t get_max_payload_sz(void); + virtual uint16_t get_max_header_sz(void); + virtual uint32_t get_tx_lkey(ring_user_id_t id) { NOT_IN_USE(id); return m_tx_lkey; } + virtual bool is_tso(void); +#endif /* DEFINED_TSO */ + + struct ibv_comp_channel* get_tx_comp_event_channel() { return m_p_tx_comp_event_channel; } + int get_ring_descriptors(vma_mlx_hw_device_data &data); + void modify_cq_moderation(uint32_t period, uint32_t count); + int ack_and_arm_cq(cq_type_t cq_type); + friend class cq_mgr; + friend class cq_mgr_mlx5; + friend class qp_mgr; + friend class qp_mgr_eth_mlx5; + friend class rfs; + friend class rfs_uc; + friend class rfs_uc_tcp_gro; + friend class rfs_mc; + friend class ring_bond; + +protected: + virtual qp_mgr* create_qp_mgr(const ib_ctx_handler* ib_ctx, uint8_t port_num, struct ibv_comp_channel* p_rx_comp_event_channel) = 0; + void create_resources(); + virtual void init_tx_buffers(uint32_t count); + virtual void inc_cq_moderation_stats(size_t sz_data); +#ifdef DEFINED_TSO + void set_tx_num_wr(int32_t num_wr) { m_tx_num_wr = m_tx_num_wr_free = num_wr; } +#endif /* DEFINED_TSO */ + uint32_t get_tx_num_wr() { return m_tx_num_wr; } + uint32_t get_mtu() { return m_mtu; } + + ib_ctx_handler* m_p_ib_ctx; + qp_mgr* m_p_qp_mgr; + struct cq_moderation_info m_cq_moderation_info; + cq_mgr* m_p_cq_mgr_rx; + cq_mgr* m_p_cq_mgr_tx; +private: + bool is_socketxtreme(void) {return m_socketxtreme.active;} + + void put_ec(struct ring_ec *ec) + { + m_socketxtreme.lock_ec_list.lock(); + list_add_tail(&ec->list, &m_socketxtreme.ec_list); + m_socketxtreme.lock_ec_list.unlock(); + } + + void del_ec(struct ring_ec *ec) + { + m_socketxtreme.lock_ec_list.lock(); + list_del_init(&ec->list); + ec->clear(); + m_socketxtreme.lock_ec_list.unlock(); + } + + inline ring_ec* get_ec(void) + { + struct ring_ec *ec = NULL; + + 
m_socketxtreme.lock_ec_list.lock(); + if (!list_empty(&m_socketxtreme.ec_list)) { + ec = list_entry(m_socketxtreme.ec_list.next, struct ring_ec, list); + list_del_init(&ec->list); + } + m_socketxtreme.lock_ec_list.unlock(); + return ec; + } + + struct vma_completion_t *get_comp(void) + { + return m_socketxtreme.completion; + } + + struct { + /* queue of event completion elements + * this queue is stored events related different sockinfo (sockets) + * In current implementation every sockinfo (socket) can have single event + * in this queue + */ + struct list_head ec_list; + + /* Thread-safety lock for get/put operations under the queue */ + lock_spin lock_ec_list; + + /* This completion is introduced to process events directly w/o + * storing them in the queue of event completion elements + */ + struct vma_completion_t* completion; + + /* This flag is enabled in case socketxtreme_poll() call is done */ + bool active; + } m_socketxtreme; + + inline void send_status_handler(int ret, vma_ibv_send_wr* p_send_wqe); + inline mem_buf_desc_t* get_tx_buffers(uint32_t n_num_mem_bufs); + inline int put_tx_buffers(mem_buf_desc_t* buff_list); + inline int put_tx_single_buffer(mem_buf_desc_t* buff); + inline void return_to_global_pool(); + bool is_available_qp_wr(bool b_block); + void save_l2_address(const L2_address* p_l2_addr) { delete_l2_address(); m_p_l2_addr = p_l2_addr->clone(); }; + void delete_l2_address() { if (m_p_l2_addr) delete m_p_l2_addr; m_p_l2_addr = NULL; }; + + lock_mutex m_lock_ring_tx_buf_wait; + uint32_t m_tx_num_bufs; + uint32_t m_tx_num_wr; + int32_t m_tx_num_wr_free; + bool m_b_qp_tx_first_flushed_completion_handled; + uint32_t m_missing_buf_ref_count; + uint32_t m_tx_lkey; // this is the registered memory lkey for a given specific device for the buffer pool use + gro_mgr m_gro_mgr; + bool m_up; + struct ibv_comp_channel* m_p_rx_comp_event_channel; + struct ibv_comp_channel* m_p_tx_comp_event_channel; + L2_address* m_p_l2_addr; + uint32_t m_mtu; + +#ifdef 
DEFINED_TSO + struct { + /* Maximum length of TCP payload for TSO */ + uint32_t max_payload_sz; + + /* Maximum length of header for TSO */ + uint16_t max_header_sz; + } m_tso; +#endif /* DEFINED_TSO */ +}; + +class ring_eth : public ring_simple +{ +public: + ring_eth(int if_index, + ring* parent = NULL, ring_type_t type = RING_ETH, bool call_create_res = true): + ring_simple(if_index, parent, type) { + net_device_val_eth* p_ndev = + dynamic_cast(g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index())); + if (p_ndev) { + m_partition = p_ndev->get_vlan(); + + /* Do resource initialization for + * ring_eth_direct, ring_eth_cb inside related + * constructors because + * they use own create_qp_mgr() methods + */ + if (call_create_res) { + create_resources(); + } + } + } +protected: + virtual qp_mgr* create_qp_mgr(const ib_ctx_handler* ib_ctx, uint8_t port_num, struct ibv_comp_channel* p_rx_comp_event_channel); +}; + +class ring_ib : public ring_simple +{ +public: + ring_ib(int if_index, + ring* parent = NULL): + ring_simple(if_index, parent, RING_IB) { + net_device_val_ib* p_ndev = + dynamic_cast(g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index())); + if (p_ndev) { + m_partition = p_ndev->get_pkey(); + create_resources(); + } + } +protected: + virtual qp_mgr* create_qp_mgr(const ib_ctx_handler* ib_ctx, uint8_t port_num, struct ibv_comp_channel* p_rx_comp_event_channel); +}; + +#endif //RING_SIMPLE_H diff --git a/src/vma/dev/ring_slave.cpp b/src/vma/dev/ring_slave.cpp new file mode 100644 index 0000000..f44674f --- /dev/null +++ b/src/vma/dev/ring_slave.cpp @@ -0,0 +1,952 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ring_slave.h" + +#include "vma/proto/ip_frag.h" +#include "vma/proto/igmp_mgr.h" +#include "vma/dev/rfs_mc.h" +#include "vma/dev/rfs_uc_tcp_gro.h" +#include "vma/sock/fd_collection.h" +#include "vma/sock/sockinfo.h" + +#undef MODULE_NAME +#define MODULE_NAME "ring_slave" +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + +#ifndef IGMP_V3_MEMBERSHIP_REPORT +#define IGMP_V3_MEMBERSHIP_REPORT 0x22 /* V3 version of 0x11 */ /* ALEXR: taken from */ +#endif + +// String formating helper function for IGMP +const char* priv_igmp_type_tostr(uint8_t igmptype) +{ + switch (igmptype) { + case IGMP_HOST_MEMBERSHIP_QUERY: return "IGMP_QUERY"; + case IGMP_HOST_MEMBERSHIP_REPORT: return "IGMPV1_REPORT"; + case IGMP_V2_MEMBERSHIP_REPORT: return "IGMPV2_REPORT"; + case IGMP_V3_MEMBERSHIP_REPORT: return "IGMPV3_REPORT"; + case IGMP_HOST_LEAVE_MESSAGE: return "IGMP_LEAVE_MESSAGE"; + default: return "IGMP type UNKNOWN"; + } +} + +ring_slave::ring_slave(int if_index, ring* parent, ring_type_t type): + ring(), + m_lock_ring_rx("ring_slave:lock_rx"), + m_lock_ring_tx("ring_slave:lock_tx"), + m_partition(0), + m_flow_tag_enabled(false), + m_b_sysvar_eth_mc_l2_only_rules(safe_mce_sys().eth_mc_l2_only_rules), + m_b_sysvar_mc_force_flowtag(safe_mce_sys().mc_force_flowtag), + m_type(type) +{ + net_device_val* p_ndev = NULL; + const slave_data_t * p_slave = NULL; + + /* Configure ring() fields */ + set_parent(parent); + set_if_index(if_index); + + /* Sanity check */ + p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + if (NULL == p_ndev) { + ring_logpanic("Invalid if_index = %d", if_index); + } + + p_slave = p_ndev->get_slave(get_if_index()); + + /* Configure ring_slave() fields */ + m_transport_type = p_ndev->get_transport_type(); + m_local_if = p_ndev->get_local_addr(); + + /* Set the same ring active status as related slave has for all ring types + * excluding ring with type RING_TAP that does not have related slave device. 
+ * So it is marked as active just in case related netvsc device is absent. + */ + m_active = p_slave ? + p_slave->active : + p_ndev->get_slave_array().empty(); + + // use local copy of stats by default + m_p_ring_stat = &m_ring_stat; + memset(m_p_ring_stat, 0, sizeof(*m_p_ring_stat)); + m_p_ring_stat->n_type = m_type; + if (m_parent != this) { + m_ring_stat.p_ring_master = m_parent; + } + + m_tx_pool.set_id("ring_slave (%p) : m_tx_pool", this); + + vma_stats_instance_create_ring_block(m_p_ring_stat); + + print_val(); +} + +ring_slave::~ring_slave() +{ + print_val(); + + if (m_p_ring_stat) { + vma_stats_instance_remove_ring_block(m_p_ring_stat); + } + + /* Release TX buffer poll */ + g_buffer_pool_tx->put_buffers_thread_safe(&m_tx_pool, m_tx_pool.size()); +} + +void ring_slave::print_val() +{ + ring_logdbg("%d: 0x%X: parent 0x%X type %s", + m_if_index, this, + ((uintptr_t)this == (uintptr_t)m_parent ? 0 : m_parent), + ring_type_str[m_type]); +} + +void ring_slave::restart() +{ + ring_logpanic("Can't restart a slave ring"); +} + +bool ring_slave::is_active_member(ring_slave* rng, ring_user_id_t) +{ + return (this == rng); +} + +bool ring_slave::is_member(ring_slave* rng) +{ + return (this == rng); +} + +ring_user_id_t ring_slave::generate_id() +{ + return 0; +} + +ring_user_id_t ring_slave::generate_id(const address_t, const address_t, + uint16_t, uint16_t, uint32_t, uint32_t, uint16_t, uint16_t) +{ + return 0; +} + +void ring_slave::inc_tx_retransmissions_stats(ring_user_id_t) { + m_p_ring_stat->n_tx_retransmits++; +} + +bool ring_slave::attach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink *sink) +{ + rfs* p_rfs; + rfs* p_tmp_rfs = NULL; + sockinfo* si = static_cast (sink); + + if (si == NULL) + return false; + + uint32_t flow_tag_id = si->get_flow_tag_val(); // spec will not be attached to rule + if (!m_flow_tag_enabled) { + flow_tag_id = 0; + } + ring_logdbg("flow: %s, with sink (%p), flow tag id %d " + "m_flow_tag_enabled: %d", flow_spec_5t.to_str(), si, + 
flow_tag_id, m_flow_tag_enabled); + + /* + * //auto_unlocker lock(m_lock_ring_rx); + * todo instead of locking the whole function which have many "new" calls, + * we'll only lock the parts that touch the ring members. + * if some of the constructors need the ring locked, we need to modify + * and add separate functions for that, which will be called after ctor with ring locked. + * currently we assume the ctors does not require the ring to be locked. + */ + m_lock_ring_rx.lock(); + + /* Get the appropriate hash map (tcp, uc or mc) from the 5t details */ + if (flow_spec_5t.is_udp_uc()) { + flow_spec_udp_key_t key_udp_uc(flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port()); + if (flow_tag_id && si->flow_in_reuse()) { + flow_tag_id = FLOW_TAG_MASK; + ring_logdbg("UC flow tag for socketinfo=%p is disabled: SO_REUSEADDR or SO_REUSEPORT were enabled", si); + } + p_rfs = m_flow_udp_uc_map.get(key_udp_uc, NULL); + if (p_rfs == NULL) { + // No rfs object exists so a new one must be created and inserted in the flow map + m_lock_ring_rx.unlock(); + try { + p_tmp_rfs = new rfs_uc(&flow_spec_5t, this, NULL, flow_tag_id); + } catch(vma_exception& e) { + ring_logerr("%s", e.message); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_START + if (p_tmp_rfs == NULL) { + ring_logerr("Failed to allocate rfs!"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + m_lock_ring_rx.lock(); + p_rfs = m_flow_udp_uc_map.get(key_udp_uc, NULL); + if (p_rfs) { + delete p_tmp_rfs; + } else { + p_rfs = p_tmp_rfs; + m_flow_udp_uc_map.set(key_udp_uc, p_rfs); + } + } + } else if (flow_spec_5t.is_udp_mc()) { + flow_spec_udp_key_t key_udp_mc(flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port()); + + if (flow_tag_id) { + if (m_b_sysvar_mc_force_flowtag || !si->flow_in_reuse()) { + ring_logdbg("MC flow tag ID=%d for socketinfo=%p is enabled: force_flowtag=%d, SO_REUSEADDR | SO_REUSEPORT=%d", + flow_tag_id, si, m_b_sysvar_mc_force_flowtag, si->flow_in_reuse()); + } else { + flow_tag_id = FLOW_TAG_MASK; + 
ring_logdbg("MC flow tag for socketinfo=%p is disabled: force_flowtag=0, SO_REUSEADDR or SO_REUSEPORT were enabled", si); + } + } + // Note for CX3: + // For IB MC flow, the port is zeroed in the ibv_flow_spec when calling to ibv_flow_spec(). + // It means that for every MC group, even if we have sockets with different ports - only one rule in the HW. + // So the hash map below keeps track of the number of sockets per rule so we know when to call ibv_attach and ibv_detach + rfs_rule_filter* l2_mc_ip_filter = NULL; + if ((m_transport_type == VMA_TRANSPORT_IB && 0 == get_underly_qpn()) || m_b_sysvar_eth_mc_l2_only_rules) { + rule_filter_map_t::iterator l2_mc_iter = m_l2_mc_ip_attach_map.find(key_udp_mc.dst_ip); + if (l2_mc_iter == m_l2_mc_ip_attach_map.end()) { // It means that this is the first time attach called with this MC ip + m_l2_mc_ip_attach_map[key_udp_mc.dst_ip].counter = 1; + } else { + m_l2_mc_ip_attach_map[key_udp_mc.dst_ip].counter = ((l2_mc_iter->second.counter) + 1); + } + } + p_rfs = m_flow_udp_mc_map.get(key_udp_mc, NULL); + if (p_rfs == NULL) { // It means that no rfs object exists so I need to create a new one and insert it to the flow map + m_lock_ring_rx.unlock(); + if ((m_transport_type == VMA_TRANSPORT_IB && 0 == get_underly_qpn()) || m_b_sysvar_eth_mc_l2_only_rules) { + l2_mc_ip_filter = new rfs_rule_filter(m_l2_mc_ip_attach_map, key_udp_mc.dst_ip, flow_spec_5t); + } + try { + p_tmp_rfs = new rfs_mc(&flow_spec_5t, this, l2_mc_ip_filter, flow_tag_id); + } catch(vma_exception& e) { + ring_logerr("%s", e.message); + return false; + } catch(const std::bad_alloc &e) { + NOT_IN_USE(e); + ring_logerr("Failed to allocate rfs!"); + return false; + } + m_lock_ring_rx.lock(); + p_rfs = m_flow_udp_mc_map.get(key_udp_mc, NULL); + if (p_rfs) { + delete p_tmp_rfs; + } else { + p_rfs = p_tmp_rfs; + m_flow_udp_mc_map.set(key_udp_mc, p_rfs); + } + } + } else if (flow_spec_5t.is_tcp()) { + flow_spec_tcp_key_t key_tcp(flow_spec_5t.get_dst_ip(), 
flow_spec_5t.get_src_ip(), + flow_spec_5t.get_dst_port(), flow_spec_5t.get_src_port()); + rule_key_t rule_key(flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port()); + rfs_rule_filter* tcp_dst_port_filter = NULL; + if (safe_mce_sys().tcp_3t_rules) { + rule_filter_map_t::iterator tcp_dst_port_iter = m_tcp_dst_port_attach_map.find(rule_key.key); + if (tcp_dst_port_iter == m_tcp_dst_port_attach_map.end()) { + m_tcp_dst_port_attach_map[rule_key.key].counter = 1; + } else { + m_tcp_dst_port_attach_map[rule_key.key].counter = ((tcp_dst_port_iter->second.counter) + 1); + } + } + + p_rfs = m_flow_tcp_map.get(key_tcp, NULL); + if (p_rfs == NULL) { // It means that no rfs object exists so I need to create a new one and insert it to the flow map + m_lock_ring_rx.unlock(); + if (safe_mce_sys().tcp_3t_rules) { + flow_tuple tcp_3t_only(flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port(), 0, 0, flow_spec_5t.get_protocol()); + tcp_dst_port_filter = new rfs_rule_filter(m_tcp_dst_port_attach_map, rule_key.key, tcp_3t_only); + } + if(safe_mce_sys().gro_streams_max && flow_spec_5t.is_5_tuple() && is_simple()) { + // When the gro mechanism is being used, packets must be processed in the rfs + // layer. This must not be bypassed by using flow tag. 
+ if (flow_tag_id) { + flow_tag_id = FLOW_TAG_MASK; + ring_logdbg("flow_tag_id = %d is disabled to enable TCP GRO socket to be processed on RFS!", flow_tag_id); + } + p_tmp_rfs = new (std::nothrow)rfs_uc_tcp_gro(&flow_spec_5t, this, tcp_dst_port_filter, flow_tag_id); + } else { + try { + p_tmp_rfs = new (std::nothrow)rfs_uc(&flow_spec_5t, this, tcp_dst_port_filter, flow_tag_id); + } catch(vma_exception& e) { + ring_logerr("%s", e.message); + return false; + } + } + BULLSEYE_EXCLUDE_BLOCK_START + if (p_tmp_rfs == NULL) { + ring_logerr("Failed to allocate rfs!"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_ring_rx.lock(); + p_rfs = m_flow_tcp_map.get(key_tcp, NULL); + if (p_rfs) { + delete p_tmp_rfs; + } else { + p_rfs = p_tmp_rfs; + m_flow_tcp_map.set(key_tcp, p_rfs); + } + } + BULLSEYE_EXCLUDE_BLOCK_START + } else { + m_lock_ring_rx.unlock(); + ring_logerr("Could not find map (TCP, UC or MC) for requested flow"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + bool ret = p_rfs->attach_flow(sink); + if (ret) { + if (flow_tag_id && (flow_tag_id != FLOW_TAG_MASK)) { + // A flow with FlowTag was attached succesfully, check stored rfs for fast path be tag_id + si->set_flow_tag(flow_tag_id); + ring_logdbg("flow_tag: %d registration is done!", flow_tag_id); + } + if (flow_spec_5t.is_tcp() && !flow_spec_5t.is_3_tuple()) { + // save the single 5tuple TCP connected socket for improved fast path + si->set_tcp_flow_is_5t(); + ring_logdbg("single 5T TCP update m_tcp_flow_is_5t m_flow_tag_enabled: %d", m_flow_tag_enabled); + } + } else { + ring_logerr("attach_flow=%d failed!", ret); + } + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_ring_rx.unlock(); + return ret; +} + +bool ring_slave::detach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink) +{ + rfs* p_rfs = NULL; + + ring_logdbg("flow: %s, with sink (%p)", flow_spec_5t.to_str(), sink); + + auto_unlocker lock(m_lock_ring_rx); + + /* Get the 
appropriate hash map (tcp, uc or mc) from the 5t details */ + if (flow_spec_5t.is_udp_uc()) { + flow_spec_udp_key_t key_udp_uc(flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port()); + p_rfs = m_flow_udp_uc_map.get(key_udp_uc, NULL); + BULLSEYE_EXCLUDE_BLOCK_START + if (p_rfs == NULL) { + ring_logdbg("Could not find rfs object to detach!"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + p_rfs->detach_flow(sink); + if (p_rfs->get_num_of_sinks() == 0) { + BULLSEYE_EXCLUDE_BLOCK_START + if (!(m_flow_udp_uc_map.del(key_udp_uc))) { + ring_logdbg("Could not find rfs object to delete in ring udp uc hash map!"); + } + BULLSEYE_EXCLUDE_BLOCK_END + delete p_rfs; + } + } else if (flow_spec_5t.is_udp_mc()) { + int keep_in_map = 1; + flow_spec_udp_key_t key_udp_mc(flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port()); + if (m_transport_type == VMA_TRANSPORT_IB || m_b_sysvar_eth_mc_l2_only_rules) { + rule_filter_map_t::iterator l2_mc_iter = m_l2_mc_ip_attach_map.find(key_udp_mc.dst_ip); + BULLSEYE_EXCLUDE_BLOCK_START + if (l2_mc_iter == m_l2_mc_ip_attach_map.end()) { + ring_logdbg("Could not find matching counter for the MC group!"); + BULLSEYE_EXCLUDE_BLOCK_END + } else { + keep_in_map = m_l2_mc_ip_attach_map[key_udp_mc.dst_ip].counter = MAX(0 , ((l2_mc_iter->second.counter) - 1)); + } + } + p_rfs = m_flow_udp_mc_map.get(key_udp_mc, NULL); + BULLSEYE_EXCLUDE_BLOCK_START + if (p_rfs == NULL) { + ring_logdbg("Could not find rfs object to detach!"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + p_rfs->detach_flow(sink); + if(!keep_in_map){ + m_l2_mc_ip_attach_map.erase(m_l2_mc_ip_attach_map.find(key_udp_mc.dst_ip)); + } + if (p_rfs->get_num_of_sinks() == 0) { + BULLSEYE_EXCLUDE_BLOCK_START + if (!(m_flow_udp_mc_map.del(key_udp_mc))) { + ring_logdbg("Could not find rfs object to delete in ring udp mc hash map!"); + } + BULLSEYE_EXCLUDE_BLOCK_END + delete p_rfs; + } + } else if (flow_spec_5t.is_tcp()) { + int keep_in_map = 1; + flow_spec_tcp_key_t 
key_tcp(flow_spec_5t.get_dst_ip(), flow_spec_5t.get_src_ip(), + flow_spec_5t.get_dst_port(), flow_spec_5t.get_src_port()); + rule_key_t rule_key(flow_spec_5t.get_dst_ip(), flow_spec_5t.get_dst_port()); + if (safe_mce_sys().tcp_3t_rules) { + rule_filter_map_t::iterator tcp_dst_port_iter = m_tcp_dst_port_attach_map.find(rule_key.key); + BULLSEYE_EXCLUDE_BLOCK_START + if (tcp_dst_port_iter == m_tcp_dst_port_attach_map.end()) { + ring_logdbg("Could not find matching counter for TCP src port!"); + BULLSEYE_EXCLUDE_BLOCK_END + } else { + keep_in_map = m_tcp_dst_port_attach_map[rule_key.key].counter = MAX(0 , ((tcp_dst_port_iter->second.counter) - 1)); + } + } + p_rfs = m_flow_tcp_map.get(key_tcp, NULL); + BULLSEYE_EXCLUDE_BLOCK_START + if (p_rfs == NULL) { + ring_logdbg("Could not find rfs object to detach!"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + p_rfs->detach_flow(sink); + if(!keep_in_map){ + m_tcp_dst_port_attach_map.erase(m_tcp_dst_port_attach_map.find(rule_key.key)); + } + if (p_rfs->get_num_of_sinks() == 0) { + BULLSEYE_EXCLUDE_BLOCK_START + if (!(m_flow_tcp_map.del(key_tcp))) { + ring_logdbg("Could not find rfs object to delete in ring tcp hash map!"); + } + BULLSEYE_EXCLUDE_BLOCK_END + delete p_rfs; + } + BULLSEYE_EXCLUDE_BLOCK_START + } else { + ring_logerr("Could not find map (TCP, UC or MC) for requested flow"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + return true; +} + +// calling sockinfo callback with RFS bypass +static inline bool check_rx_packet(sockinfo *si, mem_buf_desc_t* p_rx_wc_buf_desc, void *fd_ready_array) +{ + // Dispatching: Notify new packet to the FIRST registered receiver ONLY +#ifdef RDTSC_MEASURE_RX_DISPATCH_PACKET + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_DISPATCH_PACKET]); +#endif //RDTSC_MEASURE_RX_DISPATCH_PACKET + + p_rx_wc_buf_desc->reset_ref_count(); + p_rx_wc_buf_desc->inc_ref_count(); + + si->rx_input_cb(p_rx_wc_buf_desc,fd_ready_array); + +#ifdef RDTSC_MEASURE_RX_DISPATCH_PACKET + 
RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_DISPATCH_PACKET]); +#endif //RDTSC_MEASURE_RX_DISPATCH_PACKET + + // Check packet ref_count to see the last receiver is interested in this packet + if (p_rx_wc_buf_desc->dec_ref_count() > 1) { + // The sink will be responsible to return the buffer to CQ for reuse + return true; + } + // Reuse this data buffer & mem_buf_desc + return false; +} + +// All CQ wce come here for some basic sanity checks and then are distributed to the correct ring handler +// Return values: false = Reuse this data buffer & mem_buf_desc +bool ring_slave::rx_process_buffer(mem_buf_desc_t* p_rx_wc_buf_desc, void* pv_fd_ready_array) +{ + size_t sz_data = 0; + size_t transport_header_len; + uint16_t ip_hdr_len = 0; + uint16_t ip_tot_len = 0; + uint16_t ip_frag_off = 0; + uint16_t n_frag_offset = 0; + struct ethhdr* p_eth_h = (struct ethhdr*)(p_rx_wc_buf_desc->p_buffer); + struct iphdr* p_ip_h = NULL; + struct udphdr* p_udp_h = NULL; + + // Validate buffer size + sz_data = p_rx_wc_buf_desc->sz_data; + if (unlikely(sz_data > p_rx_wc_buf_desc->sz_buffer)) { + if (sz_data == IP_FRAG_FREED) { + ring_logfuncall("Rx buffer dropped - old fragment part"); + } else { + ring_logwarn("Rx buffer dropped - buffer too small (%d, %d)", sz_data, p_rx_wc_buf_desc->sz_buffer); + } + return false; + } + + inc_cq_moderation_stats(sz_data); + + m_p_ring_stat->n_rx_byte_count += sz_data; + ++m_p_ring_stat->n_rx_pkt_count; + + // This is an internal function (within ring and 'friends'). No need for lock mechanism. 
+ if (likely(m_flow_tag_enabled && p_rx_wc_buf_desc->rx.flow_tag_id && + p_rx_wc_buf_desc->rx.flow_tag_id != FLOW_TAG_MASK && + !p_rx_wc_buf_desc->rx.is_sw_csum_need)) { + sockinfo* si = NULL; + // trying to get sockinfo per flow_tag_id-1 as it was incremented at attach + // to allow mapping sockfd=0 + si = static_cast (g_p_fd_collection->get_sockfd(p_rx_wc_buf_desc->rx.flow_tag_id-1)); + + if (likely((si != NULL) && si->flow_tag_enabled())) { + // will process packets with set flow_tag_id and enabled for the socket + if (p_eth_h->h_proto == htons(ETH_P_8021Q)) { + // Handle VLAN header as next protocol + transport_header_len = ETH_VLAN_HDR_LEN; + } else { + transport_header_len = ETH_HDR_LEN; + } + p_ip_h = (struct iphdr*)(p_rx_wc_buf_desc->p_buffer + transport_header_len); + ip_hdr_len = 20; //(int)(p_ip_h->ihl)*4; + ip_tot_len = ntohs(p_ip_h->tot_len); + + ring_logfunc("FAST PATH Rx packet info: transport_header_len: %d, IP_header_len: %d L3 proto: %d tcp_5t: %d", + transport_header_len, p_ip_h->ihl, p_ip_h->protocol, si->tcp_flow_is_5t()); + + if (likely(si->tcp_flow_is_5t())) { + // we have a single 5tuple TCP connected socket, use simpler fast path + struct tcphdr* p_tcp_h = (struct tcphdr*)((uint8_t*)p_ip_h + ip_hdr_len); + + // Update the L3 and L4 info + p_rx_wc_buf_desc->rx.src.sin_family = AF_INET; + p_rx_wc_buf_desc->rx.src.sin_port = p_tcp_h->source; + p_rx_wc_buf_desc->rx.src.sin_addr.s_addr = p_ip_h->saddr; + + p_rx_wc_buf_desc->rx.dst.sin_family = AF_INET; + p_rx_wc_buf_desc->rx.dst.sin_port = p_tcp_h->dest; + p_rx_wc_buf_desc->rx.dst.sin_addr.s_addr = p_ip_h->daddr; + // Update packet descriptor with datagram base address and length + p_rx_wc_buf_desc->rx.frag.iov_base = (uint8_t*)p_tcp_h + sizeof(struct tcphdr); + p_rx_wc_buf_desc->rx.frag.iov_len = ip_tot_len - ip_hdr_len - sizeof(struct tcphdr); + p_rx_wc_buf_desc->rx.sz_payload = ip_tot_len - ip_hdr_len - p_tcp_h->doff*4; + + p_rx_wc_buf_desc->rx.tcp.p_ip_h = p_ip_h; + 
p_rx_wc_buf_desc->rx.tcp.p_tcp_h = p_tcp_h; + p_rx_wc_buf_desc->rx.tcp.n_transport_header_len = transport_header_len; + p_rx_wc_buf_desc->rx.n_frags = 1; + + ring_logfunc("FAST PATH Rx TCP segment info: src_port=%d, dst_port=%d, flags='%s%s%s%s%s%s' seq=%u, ack=%u, win=%u, payload_sz=%u", + ntohs(p_tcp_h->source), ntohs(p_tcp_h->dest), + p_tcp_h->urg?"U":"", p_tcp_h->ack?"A":"", p_tcp_h->psh?"P":"", + p_tcp_h->rst?"R":"", p_tcp_h->syn?"S":"", p_tcp_h->fin?"F":"", + ntohl(p_tcp_h->seq), ntohl(p_tcp_h->ack_seq), ntohs(p_tcp_h->window), + p_rx_wc_buf_desc->rx.sz_payload); + + return check_rx_packet(si, p_rx_wc_buf_desc, pv_fd_ready_array); + + } else if (likely(p_ip_h->protocol==IPPROTO_UDP)) { + // Get the udp header pointer + udp payload size + p_udp_h = (struct udphdr*)((uint8_t*)p_ip_h + ip_hdr_len); + + // Update the L3 and L4 info + p_rx_wc_buf_desc->rx.src.sin_family = AF_INET; + p_rx_wc_buf_desc->rx.src.sin_port = p_udp_h->source; + p_rx_wc_buf_desc->rx.src.sin_addr.s_addr = p_ip_h->saddr; + + p_rx_wc_buf_desc->rx.dst.sin_family = AF_INET; + p_rx_wc_buf_desc->rx.dst.sin_port = p_udp_h->dest; + p_rx_wc_buf_desc->rx.dst.sin_addr.s_addr = p_ip_h->daddr; + // Update packet descriptor with datagram base address and length + p_rx_wc_buf_desc->rx.frag.iov_base = (uint8_t*)p_udp_h + sizeof(struct udphdr); + p_rx_wc_buf_desc->rx.frag.iov_len = ip_tot_len - ip_hdr_len - sizeof(struct udphdr); + p_rx_wc_buf_desc->rx.sz_payload = ntohs(p_udp_h->len) - sizeof(struct udphdr); + + p_rx_wc_buf_desc->rx.udp.local_if = m_local_if; + p_rx_wc_buf_desc->rx.n_frags = 1; + + ring_logfunc("FAST PATH Rx UDP datagram info: src_port=%d, dst_port=%d, payload_sz=%d, csum=%#x", + ntohs(p_udp_h->source), ntohs(p_udp_h->dest), p_rx_wc_buf_desc->rx.sz_payload, p_udp_h->check); + + return check_rx_packet(si, p_rx_wc_buf_desc, pv_fd_ready_array); + } + } + } + + // Validate transport type headers + switch (m_transport_type) { + case VMA_TRANSPORT_IB: + { + // Get the data buffer start pointer 
to the ipoib header pointer + struct ipoibhdr* p_ipoib_h = (struct ipoibhdr*)(p_rx_wc_buf_desc->p_buffer + GRH_HDR_LEN); + + transport_header_len = GRH_HDR_LEN + IPOIB_HDR_LEN; + + // Validate IPoIB header + if (unlikely(p_ipoib_h->ipoib_header != htonl(IPOIB_HEADER))) { + ring_logwarn("Rx buffer dropped - Invalid IPOIB Header Type (%#x : %#x)", p_ipoib_h->ipoib_header, htonl(IPOIB_HEADER)); + return false; + } + } + break; + case VMA_TRANSPORT_ETH: + { +// printf("\nring_slave::rx_process_buffer\n"); +// { +// struct ethhdr* p_eth_h = (struct ethhdr*)(p_rx_wc_buf_desc->p_buffer); +// +// int i = 0; +// printf("p_eth_h->h_dest [0]=%d, [1]=%d, [2]=%d, [3]=%d, [4]=%d, [5]=%d\n", +// (uint8_t)p_eth_h->h_dest[0], (uint8_t)p_eth_h->h_dest[1], (uint8_t)p_eth_h->h_dest[2], (uint8_t)p_eth_h->h_dest[3], (uint8_t)p_eth_h->h_dest[4], (uint8_t)p_eth_h->h_dest[5]); +// printf("p_eth_h->h_source [0]=%d, [1]=%d, [2]=%d, [3]=%d, [4]=%d, [5]=%d\n", +// (uint8_t)p_eth_h->h_source[0], (uint8_t)p_eth_h->h_source[1], (uint8_t)p_eth_h->h_source[2], (uint8_t)p_eth_h->h_source[3], (uint8_t)p_eth_h->h_source[4], (uint8_t)p_eth_h->h_source[5]); +// +// while(i++<62){ +// printf("%d, ", (uint8_t)p_rx_wc_buf_desc->p_buffer[i]); +// } +// printf("\n"); +// } + + uint16_t h_proto = p_eth_h->h_proto; + + ring_logfunc("Rx buffer Ethernet dst=" ETH_HW_ADDR_PRINT_FMT " <- src=" ETH_HW_ADDR_PRINT_FMT " type=%#x", + ETH_HW_ADDR_PRINT_ADDR(p_eth_h->h_dest), + ETH_HW_ADDR_PRINT_ADDR(p_eth_h->h_source), + htons(h_proto)); + + // Handle VLAN header as next protocol + struct vlanhdr* p_vlan_hdr = NULL; + uint16_t packet_vlan = 0; + if (h_proto == htons(ETH_P_8021Q)) { + p_vlan_hdr = (struct vlanhdr*)((uint8_t*)p_eth_h + ETH_HDR_LEN); + transport_header_len = ETH_VLAN_HDR_LEN; + h_proto = p_vlan_hdr->h_vlan_encapsulated_proto; + packet_vlan = (htons(p_vlan_hdr->h_vlan_TCI) & VLAN_VID_MASK); + } else { + transport_header_len = ETH_HDR_LEN; + } + + //TODO: Remove this code when handling vlan in flow steering 
will be available. Change this code if vlan stripping is performed. + if((m_partition & VLAN_VID_MASK) != packet_vlan) { + ring_logfunc("Rx buffer dropped- Mismatched vlan. Packet vlan = %d, Local vlan = %d", packet_vlan, m_partition & VLAN_VID_MASK); + return false; + } + + // Validate IP header as next protocol + if (unlikely(h_proto != htons(ETH_P_IP))) { + ring_logwarn("Rx buffer dropped - Invalid Ethr Type (%#x : %#x)", p_eth_h->h_proto, htons(ETH_P_IP)); + return false; + } + } + break; + default: + ring_logwarn("Rx buffer dropped - Unknown transport type %d", m_transport_type); + return false; + } + + // Jump to IP header - Skip IB (GRH and IPoIB) or Ethernet (MAC) header sizes + sz_data -= transport_header_len; + + // Validate size for IPv4 header + if (unlikely(sz_data < sizeof(struct iphdr))) { + ring_logwarn("Rx buffer dropped - buffer too small for IPv4 header (%d, %d)", sz_data, sizeof(struct iphdr)); + return false; + } + + // Get the ip header pointer + p_ip_h = (struct iphdr*)(p_rx_wc_buf_desc->p_buffer + transport_header_len); + + // Drop all non IPv4 packets + if (unlikely(p_ip_h->version != IPV4_VERSION)) { + ring_logwarn("Rx packet dropped - not IPV4 packet (got version: %#x)", p_ip_h->version); + return false; + } + + // Check that received buffer size is not smaller then the ip datagram total size + ip_tot_len = ntohs(p_ip_h->tot_len); + if (unlikely(sz_data < ip_tot_len)) { + ring_logwarn("Rx packet dropped - buffer too small for received datagram (RxBuf:%d IP:%d)", sz_data, ip_tot_len); + ring_loginfo("Rx packet info (buf->%p, bufsize=%d), id=%d", p_rx_wc_buf_desc->p_buffer, p_rx_wc_buf_desc->sz_data, ntohs(p_ip_h->id)); + vlog_print_buffer(VLOG_INFO, "rx packet data: ", "\n", (const char*)p_rx_wc_buf_desc->p_buffer, min(112, (int)p_rx_wc_buf_desc->sz_data)); + return false; + } else if (sz_data > ip_tot_len) { + p_rx_wc_buf_desc->sz_data -= (sz_data - ip_tot_len); + } + + // Read fragmentation parameters + ip_frag_off = 
ntohs(p_ip_h->frag_off); + n_frag_offset = (ip_frag_off & FRAGMENT_OFFSET) * 8; + + ring_logfunc("Rx ip packet info: dst=%d.%d.%d.%d, src=%d.%d.%d.%d, packet_sz=%d, offset=%d, id=%d, proto=%s[%d] (local if: %d.%d.%d.%d)", + NIPQUAD(p_ip_h->daddr), NIPQUAD(p_ip_h->saddr), + (sz_data > ip_tot_len ? ip_tot_len : sz_data), n_frag_offset, ntohs(p_ip_h->id), + iphdr_protocol_type_to_str(p_ip_h->protocol), p_ip_h->protocol, + NIPQUAD(p_rx_wc_buf_desc->rx.dst.sin_addr.s_addr)); + + // Check that the ip datagram has at least the udp header size for the first ip fragment (besides the ip header) + ip_hdr_len = (int)(p_ip_h->ihl)*4; + if (unlikely((n_frag_offset == 0) && (ip_tot_len < (ip_hdr_len + sizeof(struct udphdr))))) { + ring_logwarn("Rx packet dropped - ip packet too small (%d bytes)- udp header cut!", ip_tot_len); + return false; + } + + // Handle fragmentation + p_rx_wc_buf_desc->rx.n_frags = 1; + if (unlikely((ip_frag_off & MORE_FRAGMENTS_FLAG) || n_frag_offset)) { // Currently we don't expect to receive fragments + //for disabled fragments handling: + /*ring_logwarn("Rx packet dropped - VMA doesn't support fragmentation in receive flow!"); + ring_logwarn("packet info: dst=%d.%d.%d.%d, src=%d.%d.%d.%d, packet_sz=%d, frag_offset=%d, id=%d, proto=%s[%d], transport type=%s, (local if: %d.%d.%d.%d)", + NIPQUAD(p_ip_h->daddr), NIPQUAD(p_ip_h->saddr), + (sz_data > ip_tot_len ? ip_tot_len : sz_data), n_frag_offset, ntohs(p_ip_h->id), + iphdr_protocol_type_to_str(p_ip_h->protocol), p_ip_h->protocol, (m_transport_type ? 
"ETH" : "IB"), + NIPQUAD(local_addr)); + return false;*/ +#if 1 //handle fragments + // Update fragments descriptor with datagram base address and length + p_rx_wc_buf_desc->rx.frag.iov_base = (uint8_t*)p_ip_h + ip_hdr_len; + p_rx_wc_buf_desc->rx.frag.iov_len = ip_tot_len - ip_hdr_len; + + // Add ip fragment packet to out fragment manager + mem_buf_desc_t* new_buf = NULL; + int ret = -1; + if (g_p_ip_frag_manager) + ret = g_p_ip_frag_manager->add_frag(p_ip_h, p_rx_wc_buf_desc, &new_buf); + if (ret < 0) // Finished with error + return false; + if (!new_buf) // This is fragment + return true; + + // Re-calc all ip related values for new ip packet of head fragmentation list + p_rx_wc_buf_desc = new_buf; + p_ip_h = (struct iphdr*)(p_rx_wc_buf_desc->p_buffer + transport_header_len); + ip_hdr_len = (int)(p_ip_h->ihl)*4; + ip_tot_len = ntohs(p_ip_h->tot_len); + + mem_buf_desc_t *tmp; + for (tmp = p_rx_wc_buf_desc; tmp; tmp = tmp->p_next_desc) { + ++p_rx_wc_buf_desc->rx.n_frags; + } +#endif + } + + if (p_rx_wc_buf_desc->rx.is_sw_csum_need && compute_ip_checksum((unsigned short*)p_ip_h, p_ip_h->ihl * 2)) { + return false; // false ip checksum + } + +//We want to enable loopback between processes for IB +#if 0 + //AlexV: We don't support Tx MC Loopback today! 
+ if (p_ip_h->saddr == m_local_if) { + ring_logfunc("Rx udp datagram discarded - mc loop disabled"); + return false; + } +#endif + rfs* p_rfs = NULL; + + // Update the L3 info + p_rx_wc_buf_desc->rx.src.sin_family = AF_INET; + p_rx_wc_buf_desc->rx.src.sin_addr.s_addr = p_ip_h->saddr; + p_rx_wc_buf_desc->rx.dst.sin_family = AF_INET; + p_rx_wc_buf_desc->rx.dst.sin_addr.s_addr = p_ip_h->daddr; + + switch (p_ip_h->protocol) { + case IPPROTO_UDP: + { + // Get the udp header pointer + udp payload size + p_udp_h = (struct udphdr*)((uint8_t*)p_ip_h + ip_hdr_len); + + // Update packet descriptor with datagram base address and length + p_rx_wc_buf_desc->rx.frag.iov_base = (uint8_t*)p_udp_h + sizeof(struct udphdr); + p_rx_wc_buf_desc->rx.frag.iov_len = ip_tot_len - ip_hdr_len - sizeof(struct udphdr); + + if (p_rx_wc_buf_desc->rx.is_sw_csum_need && p_udp_h->check && compute_udp_checksum_rx(p_ip_h, p_udp_h, p_rx_wc_buf_desc)) { + return false; // false udp checksum + } + + size_t sz_payload = ntohs(p_udp_h->len) - sizeof(struct udphdr); + ring_logfunc("Rx udp datagram info: src_port=%d, dst_port=%d, payload_sz=%d, csum=%#x", + ntohs(p_udp_h->source), ntohs(p_udp_h->dest), sz_payload, p_udp_h->check); + + // Update the L3 info + p_rx_wc_buf_desc->rx.udp.local_if = m_local_if; + + // Update the L4 info + p_rx_wc_buf_desc->rx.src.sin_port = p_udp_h->source; + p_rx_wc_buf_desc->rx.dst.sin_port = p_udp_h->dest; + p_rx_wc_buf_desc->rx.sz_payload = sz_payload; + + // Find the relevant hash map and pass the packet to the rfs for dispatching + if (!(IN_MULTICAST_N(p_rx_wc_buf_desc->rx.dst.sin_addr.s_addr))) { // This is UDP UC packet + p_rfs = m_flow_udp_uc_map.get(flow_spec_udp_key_t(p_rx_wc_buf_desc->rx.dst.sin_addr.s_addr, + p_rx_wc_buf_desc->rx.dst.sin_port), NULL); + } else { // This is UDP MC packet + p_rfs = m_flow_udp_mc_map.get(flow_spec_udp_key_t(p_rx_wc_buf_desc->rx.dst.sin_addr.s_addr, + p_rx_wc_buf_desc->rx.dst.sin_port), NULL); + } + } + break; + + case IPPROTO_TCP: + { + 
// Get the tcp header pointer + tcp payload size + struct tcphdr* p_tcp_h = (struct tcphdr*)((uint8_t*)p_ip_h + ip_hdr_len); + + if (p_rx_wc_buf_desc->rx.is_sw_csum_need && compute_tcp_checksum(p_ip_h, (unsigned short*) p_tcp_h)) { + return false; // false tcp checksum + } + + size_t sz_payload = ip_tot_len - ip_hdr_len - p_tcp_h->doff*4; + ring_logfunc("Rx TCP segment info: src_port=%d, dst_port=%d, flags='%s%s%s%s%s%s' seq=%u, ack=%u, win=%u, payload_sz=%u", + ntohs(p_tcp_h->source), ntohs(p_tcp_h->dest), + p_tcp_h->urg?"U":"", p_tcp_h->ack?"A":"", p_tcp_h->psh?"P":"", + p_tcp_h->rst?"R":"", p_tcp_h->syn?"S":"", p_tcp_h->fin?"F":"", + ntohl(p_tcp_h->seq), ntohl(p_tcp_h->ack_seq), ntohs(p_tcp_h->window), + sz_payload); + + // Update packet descriptor with datagram base address and length + p_rx_wc_buf_desc->rx.frag.iov_base = (uint8_t*)p_tcp_h + sizeof(struct tcphdr); + p_rx_wc_buf_desc->rx.frag.iov_len = ip_tot_len - ip_hdr_len - sizeof(struct tcphdr); + + // Update the L4 info + p_rx_wc_buf_desc->rx.src.sin_port = p_tcp_h->source; + p_rx_wc_buf_desc->rx.dst.sin_port = p_tcp_h->dest; + p_rx_wc_buf_desc->rx.sz_payload = sz_payload; + + p_rx_wc_buf_desc->rx.tcp.p_ip_h = p_ip_h; + p_rx_wc_buf_desc->rx.tcp.p_tcp_h = p_tcp_h; + + // Find the relevant hash map and pass the packet to the rfs for dispatching + p_rfs = m_flow_tcp_map.get(flow_spec_tcp_key_t(p_rx_wc_buf_desc->rx.dst.sin_addr.s_addr, + p_rx_wc_buf_desc->rx.src.sin_addr.s_addr, p_rx_wc_buf_desc->rx.dst.sin_port, + p_rx_wc_buf_desc->rx.src.sin_port), NULL); + + p_rx_wc_buf_desc->rx.tcp.n_transport_header_len = transport_header_len; + + if (unlikely(p_rfs == NULL)) { // If we didn't find a match for TCP 5T, look for a match with TCP 3T + p_rfs = m_flow_tcp_map.get(flow_spec_tcp_key_t(p_rx_wc_buf_desc->rx.dst.sin_addr.s_addr, 0, + p_rx_wc_buf_desc->rx.dst.sin_port, 0), NULL); + } + } + break; + + case IPPROTO_IGMP: + { + struct igmp* p_igmp_h= (struct igmp*)((uint8_t*)p_ip_h + ip_hdr_len); + 
NOT_IN_USE(p_igmp_h); /* to supress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + ring_logdbg("Rx IGMP packet info: type=%s (%d), group=%d.%d.%d.%d, code=%d", + priv_igmp_type_tostr(p_igmp_h->igmp_type), p_igmp_h->igmp_type, + NIPQUAD(p_igmp_h->igmp_group.s_addr), p_igmp_h->igmp_code); + if (m_transport_type == VMA_TRANSPORT_IB || m_b_sysvar_eth_mc_l2_only_rules) { + ring_logdbg("Transport type is IB (or eth_mc_l2_only_rules), passing igmp packet to igmp_manager to process"); + if(g_p_igmp_mgr) { + (g_p_igmp_mgr->process_igmp_packet(p_ip_h, m_local_if)); + return false; // we return false in order to free the buffer, although we handled the packet + } + ring_logdbg("IGMP packet drop. IGMP manager does not exist."); + return false; + } + ring_logerr("Transport type is ETH, dropping the packet"); + return false; + } + break; + + default: + ring_logwarn("Rx packet dropped - undefined protocol = %d", p_ip_h->protocol); + return false; + } + + if (unlikely(p_rfs == NULL)) { + ring_logdbg("Rx packet dropped - rfs object not found: dst:%d.%d.%d.%d:%d, src%d.%d.%d.%d:%d, proto=%s[%d]", + NIPQUAD(p_rx_wc_buf_desc->rx.dst.sin_addr.s_addr), ntohs(p_rx_wc_buf_desc->rx.dst.sin_port), + NIPQUAD(p_rx_wc_buf_desc->rx.src.sin_addr.s_addr), ntohs(p_rx_wc_buf_desc->rx.src.sin_port), + iphdr_protocol_type_to_str(p_ip_h->protocol), p_ip_h->protocol); + + return false; + } + return p_rfs->rx_dispatch_packet(p_rx_wc_buf_desc, pv_fd_ready_array); +} + +void ring_slave::flow_udp_del_all() +{ + flow_spec_udp_key_t map_key_udp; + flow_spec_udp_map_t::iterator itr_udp; + + itr_udp = m_flow_udp_uc_map.begin(); + while (itr_udp != m_flow_udp_uc_map.end()) { + rfs *p_rfs = itr_udp->second; + map_key_udp = itr_udp->first; + if (p_rfs) { + delete p_rfs; + } + if (!(m_flow_udp_uc_map.del(map_key_udp))) { + ring_logdbg("Could not find rfs object to delete in ring udp uc hash map!"); + } + itr_udp = m_flow_udp_uc_map.begin(); + } + + itr_udp = m_flow_udp_mc_map.begin(); + while (itr_udp != 
m_flow_udp_mc_map.end()) { + rfs *p_rfs = itr_udp->second; + map_key_udp = itr_udp->first; + if (p_rfs) { + delete p_rfs; + } + if (!(m_flow_udp_mc_map.del(map_key_udp))) { + ring_logdbg("Could not find rfs object to delete in ring udp mc hash map!"); + } + itr_udp = m_flow_udp_mc_map.begin(); + } +} + +void ring_slave::flow_tcp_del_all() +{ + flow_spec_tcp_key_t map_key_tcp; + flow_spec_tcp_map_t::iterator itr_tcp; + + while ((itr_tcp = m_flow_tcp_map.begin()) != m_flow_tcp_map.end()) { + rfs *p_rfs = itr_tcp->second; + map_key_tcp = itr_tcp->first; + if (p_rfs) { + delete p_rfs; + } + if (!(m_flow_tcp_map.del(map_key_tcp))) { + ring_logdbg("Could not find rfs object to delete in ring tcp hash map!"); + } + } +} + +bool ring_slave::request_more_tx_buffers(uint32_t count, uint32_t lkey) +{ + ring_logfuncall("Allocating additional %d buffers for internal use", count); + + bool res = g_buffer_pool_tx->get_buffers_thread_safe(m_tx_pool, this, count, lkey); + if (!res) { + ring_logfunc("Out of mem_buf_desc from TX free pool for internal object pool"); + return false; + } + + return true; +} diff --git a/src/vma/dev/ring_slave.h b/src/vma/dev/ring_slave.h new file mode 100644 index 0000000..19bacd2 --- /dev/null +++ b/src/vma/dev/ring_slave.h @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef RING_SLAVE_H_ +#define RING_SLAVE_H_ + +#include "ring.h" + +#include "vma/dev/net_device_table_mgr.h" + +class rfs; + +typedef struct __attribute__((packed)) flow_spec_udp_key_t { + in_addr_t dst_ip; + in_port_t dst_port; + + flow_spec_udp_key_t () { + flow_spec_udp_key_helper(INADDR_ANY, INPORT_ANY); + } //Default constructor + flow_spec_udp_key_t (in_addr_t d_ip, in_addr_t d_port) { + flow_spec_udp_key_helper(d_ip, d_port); + }//Constructor + void flow_spec_udp_key_helper(in_addr_t d_ip, in_addr_t d_port) { + memset(this, 0, sizeof(*this));// Silencing coverity + dst_ip = d_ip; + dst_port = d_port; + }; +} flow_spec_udp_key_t; + +typedef struct __attribute__((packed)) flow_spec_tcp_key_t { + in_addr_t dst_ip; + in_addr_t src_ip; + in_port_t dst_port; + in_port_t src_port; + + flow_spec_tcp_key_t () { + flow_spec_tcp_key_helper (INADDR_ANY, INADDR_ANY, INPORT_ANY, INPORT_ANY); + } //Default constructor + flow_spec_tcp_key_t (in_addr_t d_ip, in_addr_t s_ip, in_addr_t d_port, in_addr_t s_port) { + flow_spec_tcp_key_helper (d_ip, s_ip, d_port, s_port); + }//Constructor + void flow_spec_tcp_key_helper(in_addr_t d_ip, in_addr_t s_ip, in_addr_t d_port, in_addr_t s_port) { + memset(this, 0, sizeof(*this));// Silencing coverity + dst_ip = d_ip; + 
src_ip = s_ip; + dst_port = d_port; + src_port = s_port; + }; +} flow_spec_tcp_key_t; + + +/* UDP flow to rfs object hash map */ +inline bool +operator==(flow_spec_udp_key_t const& key1, flow_spec_udp_key_t const& key2) +{ + return (key1.dst_port == key2.dst_port) && + (key1.dst_ip == key2.dst_ip); +} + +typedef hash_map flow_spec_udp_map_t; + + +/* TCP flow to rfs object hash map */ +inline bool +operator==(flow_spec_tcp_key_t const& key1, flow_spec_tcp_key_t const& key2) +{ + return (key1.src_port == key2.src_port) && + (key1.src_ip == key2.src_ip) && + (key1.dst_port == key2.dst_port) && + (key1.dst_ip == key2.dst_ip); +} + +typedef hash_map flow_spec_tcp_map_t; + +struct counter_and_ibv_flows { + int counter; + std::vector ibv_flows; +}; + +// rule key based on ip and port +struct rule_key_t { + uint64_t key; + + rule_key_t(in_addr_t addr, in_port_t port) { + key = (uint64_t) addr << 32 | port; + } +}; + +typedef std::tr1::unordered_map rule_filter_map_t; + + +class ring_slave : public ring +{ +public: + ring_slave(int if_index, ring* parent, ring_type_t type); + virtual ~ring_slave(); + + virtual void print_val(); + virtual void restart(); + virtual int get_num_resources() const { return 1; }; + virtual bool is_member(ring_slave* rng); + virtual bool is_active_member(ring_slave* rng, ring_user_id_t id); + virtual ring_user_id_t generate_id(); + virtual ring_user_id_t generate_id(const address_t src_mac, const address_t dst_mac, uint16_t eth_proto, uint16_t encap_proto, uint32_t src_ip, uint32_t dst_ip, uint16_t src_port, uint16_t dst_port); + virtual bool is_up() = 0; + virtual void inc_tx_retransmissions_stats(ring_user_id_t id); + virtual bool rx_process_buffer(mem_buf_desc_t* p_rx_wc_buf_desc, void* pv_fd_ready_array); + virtual int reclaim_recv_single_buffer(mem_buf_desc_t* rx_reuse) = 0; + virtual void inc_cq_moderation_stats(size_t sz_data) = 0; + virtual uint32_t get_underly_qpn() = 0; + virtual bool attach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* 
sink); + virtual bool detach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink); + + inline bool is_simple() const { return m_type != RING_TAP; } + inline bool is_mp_ring() const { return m_type == RING_ETH_CB; } + transport_type_t get_transport_type() const { return m_transport_type; } + inline ring_type_t get_type() const { return m_type; } + + bool m_active; /* State indicator */ + +protected: + + bool request_more_tx_buffers(uint32_t count, uint32_t lkey); + void flow_udp_del_all(); + void flow_tcp_del_all(); + + flow_spec_tcp_map_t m_flow_tcp_map; + flow_spec_udp_map_t m_flow_udp_mc_map; + flow_spec_udp_map_t m_flow_udp_uc_map; + + // For IB MC flow, the port is zeroed in the ibv_flow_spec when calling to ibv_flow_spec(). + // It means that for every MC group, even if we have sockets with different ports - only one rule in the HW. + // So the hash map below keeps track of the number of sockets per rule so we know when to call ibv_attach and ibv_detach + rule_filter_map_t m_l2_mc_ip_attach_map; + rule_filter_map_t m_tcp_dst_port_attach_map; + + descq_t m_tx_pool; + transport_type_t m_transport_type; /* transport ETH/IB */ + ring_stats_t* m_p_ring_stat; + lock_spin_recursive m_lock_ring_rx; + lock_spin_recursive m_lock_ring_tx; + in_addr_t m_local_if; + uint16_t m_partition; + bool m_flow_tag_enabled; + const bool m_b_sysvar_eth_mc_l2_only_rules; + const bool m_b_sysvar_mc_force_flowtag; + +private: + ring_type_t m_type; /* ring type */ + ring_stats_t m_ring_stat; +}; + + +#endif /* RING_SLAVE_H_ */ diff --git a/src/vma/dev/ring_tap.cpp b/src/vma/dev/ring_tap.cpp new file mode 100644 index 0000000..89685cb --- /dev/null +++ b/src/vma/dev/ring_tap.cpp @@ -0,0 +1,611 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "ring_tap.h" + +#include +#include "vma/util/sg_array.h" +#include "vma/sock/fd_collection.h" +#include "vma/dev/net_device_table_mgr.h" + +#undef MODULE_NAME +#define MODULE_NAME "ring_tap" +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + +ring_tap::ring_tap(int if_index, ring* parent): + ring_slave(if_index, parent, RING_TAP), + m_tap_fd(-1), + m_vf_ring(NULL), + m_sysvar_qp_compensation_level(safe_mce_sys().qp_compensation_level), + m_tap_data_available(false) +{ + int rc = 0; + struct vma_msg_flow data; + char tap_if_name[IFNAMSIZ] = {0}; + net_device_val* p_ndev = g_p_net_device_table_mgr->get_net_device_val(m_parent->get_if_index()); + + /* Create TAP device and update ring class with new if_index */ + tap_create(p_ndev); + + /* Register tap ring to the internal thread */ + m_p_n_rx_channel_fds = new int[1]; + m_p_n_rx_channel_fds[0] = m_tap_fd; + + if (m_tap_fd >= 0) { + g_p_fd_collection->addtapfd(m_tap_fd, this); + g_p_event_handler_manager->update_epfd(m_tap_fd, + EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI | EPOLLONESHOT); + } + + /* Initialize RX buffer poll */ + request_more_rx_buffers(); + m_rx_pool.set_id("ring_tap (%p) : m_rx_pool", this); + + /* Initialize TX buffer poll */ + request_more_tx_buffers(m_sysvar_qp_compensation_level, 0); + + /* Update ring statistics */ + m_p_ring_stat->tap.n_tap_fd = m_tap_fd; + if_indextoname(get_if_index(), tap_if_name); + memcpy(m_p_ring_stat->tap.s_tap_name, tap_if_name, IFNAMSIZ); + + /* create egress rule (redirect traffic from tap device to physical interface) */ + rc = prepare_flow_message(data, VMA_MSG_FLOW_ADD); + if (rc != 0) { + ring_logwarn("Add TC rule failed with error=%d", rc); + } +} + +ring_tap::~ring_tap() +{ + m_lock_ring_rx.lock(); + flow_udp_del_all(); + flow_tcp_del_all(); + m_lock_ring_rx.unlock(); + + g_p_event_handler_manager->update_epfd(m_tap_fd, + EPOLL_CTL_DEL, EPOLLIN | EPOLLPRI | EPOLLONESHOT); + + if (g_p_fd_collection) { + g_p_fd_collection->del_tapfd(m_tap_fd); 
+ } + + /* Release RX buffer poll */ + g_buffer_pool_rx->put_buffers_thread_safe(&m_rx_pool, m_rx_pool.size()); + + delete[] m_p_n_rx_channel_fds; + + /* TAP device release */ + tap_destroy(); +} + + +void ring_tap::tap_create(net_device_val* p_ndev) +{ + #define TAP_NAME_FORMAT "t%x%x" // t + #define TAP_STR_LENGTH 512 + #define TAP_DISABLE_IPV6 "sysctl -w net.ipv6.conf.%s.disable_ipv6=1" + + int rc = 0, tap_if_index = -1, ioctl_sock = -1; + struct ifreq ifr; + char command_str[TAP_STR_LENGTH], return_str[TAP_STR_LENGTH], tap_name[IFNAMSIZ]; + unsigned char hw_addr[ETH_ALEN]; + + /* Open TAP device */ + if( (m_tap_fd = orig_os_api.open("/dev/net/tun", O_RDWR)) < 0 ) { + ring_logerr("FAILED to open tap %m"); + rc = -errno; + goto error; + } + + /* Tap name */ + rc = snprintf(tap_name, sizeof(tap_name), TAP_NAME_FORMAT, getpid() & 0xFFFFFFF, m_tap_fd & 0xFFFFFFF); + if (unlikely(((int)sizeof(tap_name) < rc) || (rc < 0))) { + ring_logerr("FAILED to create tap name %m"); + rc = -errno; + goto error; + } + + /* Init ifr */ + memset(&ifr, 0, sizeof(ifr)); + rc = snprintf(ifr.ifr_name, IFNAMSIZ, "%s", tap_name); + if (unlikely((IFNAMSIZ < rc) || (rc < 0))) { + ring_logerr("FAILED to create tap name %m"); + rc = -errno; + goto error; + } + + /* Setting TAP attributes */ + ifr.ifr_flags = IFF_TAP | IFF_NO_PI | IFF_ONE_QUEUE; + if ((rc = orig_os_api.ioctl(m_tap_fd, TUNSETIFF, (void *) &ifr)) < 0) { + ring_logerr("ioctl failed fd = %d, %d %m", m_tap_fd, rc); + rc = -errno; + goto error; + } + + /* Set TAP fd nonblocking */ + if ((rc = orig_os_api.fcntl(m_tap_fd, F_SETFL, O_NONBLOCK)) < 0) { + ring_logerr("ioctl failed fd = %d, %d %m", m_tap_fd, rc); + rc = -errno; + goto error; + } + + /* Disable Ipv6 for TAP interface */ + snprintf(command_str, TAP_STR_LENGTH, TAP_DISABLE_IPV6, tap_name); + if (run_and_retreive_system_command(command_str, return_str, TAP_STR_LENGTH) < 0) { + ring_logerr("sysctl ipv6 failed fd = %d, %m", m_tap_fd); + rc = -errno; + goto error; + } + + /* 
Create socket */ + if ((ioctl_sock = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0)) < 0 ) { + ring_logerr("FAILED to open socket"); + rc = -errno; + goto error; + } + + /* Set MAC address */ + ifr.ifr_hwaddr.sa_family = AF_LOCAL; + get_local_ll_addr(p_ndev->get_ifname_link(), hw_addr, ETH_ALEN, false); + memcpy(ifr.ifr_hwaddr.sa_data, hw_addr, ETH_ALEN); + if ((rc = orig_os_api.ioctl(ioctl_sock, SIOCSIFHWADDR, &ifr)) < 0) { + ring_logerr("ioctl SIOCSIFHWADDR failed %d %m, %s", rc, tap_name); + rc = -errno; + goto error; + } + + /* Set link UP */ + ifr.ifr_flags |= (IFF_UP | IFF_SLAVE); + if ((rc = orig_os_api.ioctl(ioctl_sock, SIOCSIFFLAGS, &ifr)) < 0) { + ring_logerr("ioctl SIOCGIFFLAGS failed %d %m, %s", rc, tap_name); + rc = -errno; + goto error; + } + + /* Get TAP interface index */ + tap_if_index = if_nametoindex(tap_name); + if (!tap_if_index) { + ring_logerr("if_nametoindex failed to get tap index [%s]", tap_name); + rc = -errno; + goto error; + } + + /* Update if_index on ring class */ + set_if_index(tap_if_index); + + orig_os_api.close(ioctl_sock); + + ring_logdbg("Tap device %d: %s [fd=%d] was created successfully", + tap_if_index, ifr.ifr_name, m_tap_fd); + + return; + +error: + ring_logerr("Tap device creation failed %d, %m", rc); + + if (ioctl_sock >= 0) { + orig_os_api.close(ioctl_sock); + } + + if (m_tap_fd >= 0) { + orig_os_api.close(m_tap_fd); + } + + m_tap_fd = -1; +} + + +void ring_tap::tap_destroy() +{ + if (m_tap_fd >= 0) { + orig_os_api.close(m_tap_fd); + + m_tap_fd = -1; + } +} + +bool ring_tap::attach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink *sink) +{ + auto_unlocker lock(m_lock_ring_rx); + bool ret = ring_slave::attach_flow(flow_spec_5t, sink); + + if (ret && (flow_spec_5t.is_tcp() || flow_spec_5t.is_udp_uc())) { + int rc = 0; + struct vma_msg_flow data; + rc = prepare_flow_message(data, VMA_MSG_FLOW_ADD, flow_spec_5t); + if (rc != 0) { + if (!g_b_exit) { + ring_logwarn("Add TC rule failed with error=%d", rc); + } + 
ring_slave::detach_flow(flow_spec_5t, sink); + ret = false; + } + } + + return ret; +} + +bool ring_tap::detach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink) +{ + auto_unlocker lock(m_lock_ring_rx); + bool ret = ring_slave::detach_flow(flow_spec_5t, sink); + + if (flow_spec_5t.is_tcp() || flow_spec_5t.is_udp_uc()) { + int rc = 0; + struct vma_msg_flow data; + rc = prepare_flow_message(data, VMA_MSG_FLOW_DEL, flow_spec_5t); + if (rc != 0) { + if (!g_b_exit) { + ring_logwarn("Del TC rule failed with error=%d", rc); + } + ret = false; + } + } + + return ret; +} + +int ring_tap::poll_and_process_element_rx(uint64_t*, void* pv_fd_ready_array) +{ + return process_element_rx(pv_fd_ready_array); +} + +int ring_tap::wait_for_notification_and_process_element(int, uint64_t*, void* pv_fd_ready_array) +{ + return process_element_rx(pv_fd_ready_array); +} + +int ring_tap::drain_and_proccess() +{ + return process_element_rx(NULL); +} + +bool ring_tap::reclaim_recv_buffers(descq_t *rx_reuse) +{ + while (!rx_reuse->empty()) { + mem_buf_desc_t* buff = rx_reuse->get_and_pop_front(); + reclaim_recv_buffers(buff); + } + + if (m_rx_pool.size() >= m_sysvar_qp_compensation_level * 2) { + int buff_to_rel = m_rx_pool.size() - m_sysvar_qp_compensation_level; + + g_buffer_pool_rx->put_buffers_thread_safe(&m_rx_pool, buff_to_rel); + m_p_ring_stat->tap.n_rx_buffers = m_rx_pool.size(); + } + + return true; +} + +bool ring_tap::reclaim_recv_buffers(mem_buf_desc_t *buff) +{ + if (buff && (buff->dec_ref_count() <= 1)) { + mem_buf_desc_t* temp = NULL; + while (buff) { + if(buff->lwip_pbuf_dec_ref_count() <= 0) { + temp = buff; + buff = temp->p_next_desc; + temp->p_next_desc = NULL; + temp->p_prev_desc = NULL; + temp->reset_ref_count(); + temp->rx.tcp.gro = 0; + temp->rx.is_vma_thr = false; + temp->rx.socketxtreme_polled = false; + temp->rx.flow_tag_id = 0; + temp->rx.tcp.p_ip_h = NULL; + temp->rx.tcp.p_tcp_h = NULL; + temp->rx.timestamps.sw.tv_nsec = 0; + temp->rx.timestamps.sw.tv_sec = 0; + 
temp->rx.timestamps.hw.tv_nsec = 0; + temp->rx.timestamps.hw.tv_sec = 0; + temp->rx.hw_raw_timestamp = 0; + free_lwip_pbuf(&temp->lwip_pbuf); + m_rx_pool.push_back(temp); + } + else { + buff->reset_ref_count(); + buff = buff->p_next_desc; + } + } + m_p_ring_stat->tap.n_rx_buffers = m_rx_pool.size(); + return true; + } + return false; +} + +void ring_tap::send_ring_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) +{ + NOT_IN_USE(id); + compute_tx_checksum((mem_buf_desc_t*)(p_send_wqe->wr_id), attr & VMA_TX_PACKET_L3_CSUM, attr & VMA_TX_PACKET_L4_CSUM); + + auto_unlocker lock(m_lock_ring_tx); + int ret = send_buffer(p_send_wqe, attr); + send_status_handler(ret, p_send_wqe); +} + +void ring_tap::send_lwip_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) +{ + NOT_IN_USE(id); + compute_tx_checksum((mem_buf_desc_t*)(p_send_wqe->wr_id), attr & VMA_TX_PACKET_L3_CSUM, attr & VMA_TX_PACKET_L4_CSUM); + + auto_unlocker lock(m_lock_ring_tx); + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(p_send_wqe->wr_id); + p_mem_buf_desc->lwip_pbuf.pbuf.ref++; + int ret = send_buffer(p_send_wqe, attr); + send_status_handler(ret, p_send_wqe); +} + +int ring_tap::prepare_flow_message(vma_msg_flow& data, msg_flow_t flow_action, + flow_tuple& flow_spec_5t) +{ + int rc = 0; + + memset(&data, 0, sizeof(data)); + data.hdr.code = VMA_MSG_FLOW; + data.hdr.ver = VMA_AGENT_VER; + data.hdr.pid = getpid(); + + data.action = flow_action; + data.if_id = get_parent()->get_if_index(); + data.tap_id = get_if_index(); + + data.flow.dst_ip = flow_spec_5t.get_dst_ip(); + data.flow.dst_port = flow_spec_5t.get_dst_port(); + + if (flow_spec_5t.is_3_tuple()) { + data.type = flow_spec_5t.is_tcp() ? VMA_MSG_FLOW_TCP_3T : VMA_MSG_FLOW_UDP_3T; + } else { + data.type = flow_spec_5t.is_tcp() ? 
VMA_MSG_FLOW_TCP_5T : VMA_MSG_FLOW_UDP_5T; + data.flow.t5.src_ip = flow_spec_5t.get_src_ip(); + data.flow.t5.src_port = flow_spec_5t.get_src_port(); + } + + rc = g_p_agent->send_msg_flow(&data); + + return rc; +} + +int ring_tap::prepare_flow_message(vma_msg_flow& data, msg_flow_t flow_action) +{ + int rc = 0; + + memset(&data, 0, sizeof(data)); + data.hdr.code = VMA_MSG_FLOW; + data.hdr.ver = VMA_AGENT_VER; + data.hdr.pid = getpid(); + data.action = flow_action; + data.if_id = get_parent()->get_if_index(); + data.tap_id = get_if_index(); + data.type = VMA_MSG_FLOW_EGRESS; + + rc = g_p_agent->send_msg_flow(&data); + + return rc; +} + +int ring_tap::process_element_rx(void* pv_fd_ready_array) +{ + int ret = 0; + + if(m_tap_data_available) { + auto_unlocker lock(m_lock_ring_rx); + if (m_rx_pool.size() || request_more_rx_buffers()) { + mem_buf_desc_t *buff = m_rx_pool.get_and_pop_front(); + ret = orig_os_api.read(m_tap_fd, buff->p_buffer, buff->sz_buffer); + if (ret > 0) { + /* Data was read and processed successfully */ + buff->sz_data = ret; + buff->rx.is_sw_csum_need = 1; + if ((ret = rx_process_buffer(buff, pv_fd_ready_array))) { + m_p_ring_stat->tap.n_rx_buffers--; + } + } + if (ret <= 0){ + /* Unable to read data, return buffer to pool */ + ret = 0; + m_rx_pool.push_front(buff); + } + + m_tap_data_available = false; + g_p_event_handler_manager->update_epfd(m_tap_fd, + EPOLL_CTL_MOD, EPOLLIN | EPOLLPRI | EPOLLONESHOT); + } + } + + return ret; +} + +bool ring_tap::request_more_rx_buffers() +{ + ring_logfuncall("Allocating additional %d buffers for internal use", + m_sysvar_qp_compensation_level); + + bool res = g_buffer_pool_rx->get_buffers_thread_safe(m_rx_pool, + this, m_sysvar_qp_compensation_level, 0); + if (!res) { + ring_logfunc("Out of mem_buf_desc from RX free pool for internal object pool"); + return false; + } + + m_p_ring_stat->tap.n_rx_buffers = m_rx_pool.size(); + + return true; +} + +mem_buf_desc_t* ring_tap::mem_buf_tx_get(ring_user_id_t id, bool 
b_block, int n_num_mem_bufs) +{ + mem_buf_desc_t* head = NULL; + + NOT_IN_USE(id); + NOT_IN_USE(b_block); + + ring_logfuncall("n_num_mem_bufs=%d", n_num_mem_bufs); + + m_lock_ring_tx.lock(); + + if (unlikely((int)m_tx_pool.size() < n_num_mem_bufs)) { + request_more_tx_buffers(m_sysvar_qp_compensation_level, 0); + + if (unlikely((int)m_tx_pool.size() < n_num_mem_bufs)) { + m_lock_ring_tx.unlock(); + return head; + } + } + + head = m_tx_pool.get_and_pop_back(); + head->lwip_pbuf.pbuf.ref = 1; + n_num_mem_bufs--; + + mem_buf_desc_t* next = head; + while (n_num_mem_bufs) { + next->p_next_desc = m_tx_pool.get_and_pop_back(); + next = next->p_next_desc; + next->lwip_pbuf.pbuf.ref = 1; + n_num_mem_bufs--; + } + + m_lock_ring_tx.unlock(); + + return head; +} + +inline void ring_tap::return_to_global_pool() +{ + if (m_tx_pool.size() >= m_sysvar_qp_compensation_level * 2) { + int return_bufs = m_tx_pool.size() - m_sysvar_qp_compensation_level; + g_buffer_pool_tx->put_buffers_thread_safe(&m_tx_pool, return_bufs); + } +} + +void ring_tap::mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t* p_mem_buf_desc) +{ + auto_unlocker lock(m_lock_ring_tx); + + int count = 0; + + if (likely(p_mem_buf_desc)) { + //potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & sockinfo_tcp by tcp lock + if (likely(p_mem_buf_desc->lwip_pbuf.pbuf.ref)) + p_mem_buf_desc->lwip_pbuf.pbuf.ref--; + else + ring_logerr("ref count of %p is already zero, double free??", p_mem_buf_desc); + + if (p_mem_buf_desc->lwip_pbuf.pbuf.ref == 0) { + p_mem_buf_desc->p_next_desc = NULL; + free_lwip_pbuf(&p_mem_buf_desc->lwip_pbuf); + m_tx_pool.push_back(p_mem_buf_desc); + count++; + } + } + + return_to_global_pool(); +} + +int ring_tap::mem_buf_tx_release(mem_buf_desc_t* buff_list, bool b_accounting, bool trylock) +{ + int count = 0, freed=0; + mem_buf_desc_t *next; + + NOT_IN_USE(b_accounting); + + if (!trylock) { + m_lock_ring_tx.lock(); + } else if (m_lock_ring_tx.trylock()) { + return 0; + } 
+ + while (buff_list) { + next = buff_list->p_next_desc; + buff_list->p_next_desc = NULL; + + //potential race, ref is protected here by ring_tx lock, and in dst_entry_tcp & sockinfo_tcp by tcp lock + if (likely(buff_list->lwip_pbuf.pbuf.ref)) { + buff_list->lwip_pbuf.pbuf.ref--; + } else { + ring_logerr("ref count of %p is already zero, double free??", buff_list); + } + + if (buff_list->lwip_pbuf.pbuf.ref == 0) { + free_lwip_pbuf(&buff_list->lwip_pbuf); + m_tx_pool.push_back(buff_list); + freed++; + } + count++; + buff_list = next; + } + ring_logfunc("buf_list: %p count: %d freed: %d\n", buff_list, count, freed); + + return_to_global_pool(); + + m_lock_ring_tx.unlock(); + + return count; +} + +int ring_tap::send_buffer(vma_ibv_send_wr* wr, vma_wr_tx_packet_attr attr) +{ + int ret = 0; + iovec iovec[wr->num_sge]; + NOT_IN_USE(attr); + + for (int i = 0; i < wr->num_sge; i++) { + iovec[i].iov_base = (void *) wr->sg_list[i].addr; + iovec[i].iov_len = wr->sg_list[i].length; + } + + ret = orig_os_api.writev(m_tap_fd, iovec , wr->num_sge); + if (ret < 0) { + ring_logdbg("writev: tap_fd %d, errno: %d\n", m_tap_fd, errno); + } + + return ret; +} + +void ring_tap::send_status_handler(int ret, vma_ibv_send_wr* p_send_wqe) +{ + // Pay attention that there is a difference in return values in ring_simple and ring_tap + // Non positive value of ret means that we are on error flow (unlike for ring_simple). + if (p_send_wqe) { + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(p_send_wqe->wr_id); + + if (likely(ret > 0)) { + // Update TX statistics + sg_array sga(p_send_wqe->sg_list, p_send_wqe->num_sge); + m_p_ring_stat->n_tx_byte_count += sga.length(); + ++m_p_ring_stat->n_tx_pkt_count; + } + + mem_buf_tx_release(p_mem_buf_desc, true); + } +} diff --git a/src/vma/dev/ring_tap.h b/src/vma/dev/ring_tap.h new file mode 100644 index 0000000..cfb2934 --- /dev/null +++ b/src/vma/dev/ring_tap.h @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */

#ifndef RING_TAP_H_
#define RING_TAP_H_

#include "ring_slave.h"
#include "vma/util/agent.h"

/**
 * TAP-device backed ring used in NETVSC mode: TX is pushed through a tap
 * file descriptor instead of an IB QP, and most HW-offload queries are
 * stubbed to "not supported".
 */
class ring_tap : public ring_slave
{
public:
	ring_tap(int if_index, ring* parent);
	virtual ~ring_tap();

	/* Usable while a VF ring is plugged in or the tap itself is active. */
	virtual bool is_up() { return (m_vf_ring || m_active); }
	virtual bool attach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink);
	virtual bool detach_flow(flow_tuple& flow_spec_5t, pkt_rcvr_sink* sink);
	virtual int poll_and_process_element_rx(uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL);
	virtual int wait_for_notification_and_process_element(int cq_channel_fd, uint64_t* p_cq_poll_sn, void* pv_fd_ready_array = NULL);
	virtual int drain_and_proccess();
	virtual bool reclaim_recv_buffers(descq_t *rx_reuse);
	virtual bool reclaim_recv_buffers(mem_buf_desc_t *buff);
	/* Single-buffer reclaim is not supported on the tap path. */
	virtual int reclaim_recv_single_buffer(mem_buf_desc_t* rx_reuse) { NOT_IN_USE(rx_reuse); return -1; }
	virtual void send_ring_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr);
	virtual void send_lwip_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr);
	virtual void mem_buf_desc_return_single_to_owner_tx(mem_buf_desc_t* p_mem_buf_desc);
	virtual mem_buf_desc_t* mem_buf_tx_get(ring_user_id_t id, bool b_block, int n_num_mem_bufs = 1);
	virtual int mem_buf_tx_release(mem_buf_desc_t* p_mem_buf_desc_list, bool b_accounting, bool trylock = false);
	/* No HW dummy-send on a tap device. */
	virtual bool get_hw_dummy_send_support(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe) { NOT_IN_USE(id); NOT_IN_USE(p_send_wqe); return false; }
	/* No CQ on the tap path: notification requests are no-ops. */
	virtual int request_notification(cq_type_t cq_type, uint64_t poll_sn) { NOT_IN_USE(cq_type); NOT_IN_USE(poll_sn); return 0; }
	virtual void adapt_cq_moderation() {}

	/* socketxtreme polling is not supported on the tap path. */
	virtual int socketxtreme_poll(struct vma_completion_t *vma_completions, unsigned int ncompletions, int flags) {
		NOT_IN_USE(vma_completions);
		NOT_IN_USE(ncompletions);
		NOT_IN_USE(flags);
		return 0;
	}

	/* Rate limiting is silently accepted but not applied. */
	virtual int modify_ratelimit(struct vma_rate_limit_t &rate_limit) { NOT_IN_USE(rate_limit); return 0; }
	void inc_cq_moderation_stats(size_t sz_data) { NOT_IN_USE(sz_data); }
	/* No underlying QP; -1 wraps to UINT32_MAX as the "none" sentinel. */
	virtual uint32_t get_underly_qpn() { return -1; }
	virtual uint32_t get_max_inline_data() { return 0; }
#ifdef DEFINED_TSO
	/* TSO capability stubs: tap has no segmentation offload. */
	virtual uint32_t get_max_send_sge(void) { return 1; }
	virtual uint32_t get_max_payload_sz(void) { return 0; }
	virtual uint16_t get_max_header_sz(void) { return 0; }
	virtual uint32_t get_tx_lkey(ring_user_id_t id) { NOT_IN_USE(id); return 0; }
	virtual bool is_tso(void) { return false; }
#endif /* DEFINED_TSO */

	inline void set_tap_data_available() { m_tap_data_available = true; }
	inline void set_vf_ring(ring_slave *p_ring) { m_vf_ring = p_ring; }
	inline void inc_vf_plugouts() { m_p_ring_stat->tap.n_vf_plugouts++; }

private:
	/* Return surplus TX buffers to the global pool (see ring_tap.cpp). */
	inline void return_to_global_pool();
	/* Build a TC flow add/remove message for the VMA agent daemon. */
	int prepare_flow_message(vma_msg_flow& data, msg_flow_t flow_action, flow_tuple& flow_spec_5t);
	int prepare_flow_message(vma_msg_flow& data, msg_flow_t flow_action);
	int process_element_rx(void* pv_fd_ready_array);
	bool request_more_rx_buffers();
	int send_buffer(vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr);
	void send_status_handler(int ret, vma_ibv_send_wr* p_send_wqe);
	void tap_create(net_device_val* p_ndev);
	void tap_destroy();

	bool is_socketxtreme(void) { return false; }
	void put_ec(struct ring_ec *ec) { NOT_IN_USE(ec); }
	void del_ec(struct ring_ec *ec) { NOT_IN_USE(ec); }
	struct vma_completion_t *get_comp(void) { return NULL; }

	/* These fields are NETVSC mode specific */
	int m_tap_fd; /* file descriptor of tap device */
	ring_slave* m_vf_ring;                          /* plugged-in VF ring, NULL when unplugged */
	const uint32_t m_sysvar_qp_compensation_level;  /* buffer pool low/high watermark */
	descq_t m_rx_pool;                              /* local RX buffer pool */
	bool m_tap_data_available;                      /* tap fd readable flag set by poller */
};

#endif /* RING_TAP_H_ */
(c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "time_converter.h" + +#include +#include "vlogger/vlogger.h" +#include "utils/rdtsc.h" + +#include "vma/util/sys_vars.h" +#include "vma/util/instrumentation.h" +#include "vma/event/event_handler_manager.h" +#include "vma/ib/base/verbs_extra.h" +#include "vma/dev/net_device_table_mgr.h" + +#define MODULE_NAME "time_converter" + +#define ibchtc_logerr __log_err +#define ibchtc_logwarn __log_warn +#define ibchtc_loginfo __log_info +#define ibchtc_logdbg __log_dbg + + +#define IB_CTX_TC_DEVIATION_THRESHOLD 10 + +#define VMA_QUERY_DEVICE_SUPPORTED (1 << 0) +#define VMA_QUERY_VALUES_SUPPORTED (1 << 1) + +uint32_t time_converter::get_single_converter_status(struct ibv_context* ctx) { + uint32_t dev_status = 0; +#ifdef DEFINED_IBV_CQ_TIMESTAMP + int rval; + + // Checking if ibv_exp_query_device() is valid + vma_ibv_device_attr_ex device_attr; + memset(&device_attr, 0, sizeof(device_attr)); + device_attr.comp_mask = VMA_IBV_DEVICE_ATTR_HCA_CORE_CLOCK; + + if ((rval = vma_ibv_query_device(ctx ,&device_attr)) || !device_attr.hca_core_clock) { + ibchtc_logdbg("time_converter::get_single_converter_status :Error in querying hca core clock " + "(vma_ibv_query_device() return value=%d ) (ibv context %p) (errno=%d %m)\n", rval, ctx, errno); + } else { + dev_status |= VMA_QUERY_DEVICE_SUPPORTED; + } + + // Checking if ibv_exp_query_values() is valid + vma_ts_values queried_values; + memset(&queried_values, 0, sizeof(queried_values)); + queried_values.comp_mask = VMA_IBV_VALUES_MASK_RAW_CLOCK; + if ((rval = vma_ibv_query_values(ctx, &queried_values)) || !vma_get_ts_val(queried_values)) { + ibchtc_logdbg("time_converter::get_single_converter_status :Error in querying hw clock, can't convert" + " hw time to system time (vma_ibv_query_values() return value=%d ) (ibv context %p) (errno=%d %m)\n", rval, ctx, errno); + } else { + dev_status |= VMA_QUERY_VALUES_SUPPORTED; + } +#else + NOT_IN_USE(ctx); +#endif + + return dev_status; +} + +ts_conversion_mode_t 
time_converter::update_device_converters_status(net_device_map_t& net_devices) +{ + ibchtc_logdbg("Checking RX HW time stamp status for all devices [%d]", net_devices.size()); + ts_conversion_mode_t ts_conversion_mode = TS_CONVERSION_MODE_DISABLE; + + if (net_devices.empty()) { + ibchtc_logdbg("No supported devices was found, return"); + return ts_conversion_mode; + } + + +#ifdef DEFINED_IBV_CQ_TIMESTAMP + + if (safe_mce_sys().hw_ts_conversion_mode != TS_CONVERSION_MODE_DISABLE) { + uint32_t devs_status = VMA_QUERY_DEVICE_SUPPORTED | VMA_QUERY_VALUES_SUPPORTED; + + /* Get common time conversion mode for all devices */ + for (net_device_map_index_t::iterator dev_iter = net_devices.begin(); dev_iter != net_devices.end(); dev_iter++) { + if (dev_iter->second->get_state() == net_device_val::RUNNING) { + slave_data_vector_t slaves = dev_iter->second->get_slave_array(); + for (slave_data_vector_t::iterator slaves_iter = slaves.begin(); slaves_iter != slaves.end(); slaves_iter++) { + devs_status &= get_single_converter_status((*slaves_iter)->p_ib_ctx->get_ibv_context()); + } + } + } + + switch (safe_mce_sys().hw_ts_conversion_mode) { + case TS_CONVERSION_MODE_RAW: + ts_conversion_mode = devs_status & VMA_QUERY_DEVICE_SUPPORTED ? TS_CONVERSION_MODE_RAW : TS_CONVERSION_MODE_DISABLE; + break; + case TS_CONVERSION_MODE_BEST_POSSIBLE: + if (devs_status == (VMA_QUERY_DEVICE_SUPPORTED | VMA_QUERY_VALUES_SUPPORTED)) { + ts_conversion_mode = TS_CONVERSION_MODE_SYNC; + } else { + ts_conversion_mode = devs_status & VMA_QUERY_DEVICE_SUPPORTED ? TS_CONVERSION_MODE_RAW : TS_CONVERSION_MODE_DISABLE; + } + break; + case TS_CONVERSION_MODE_SYNC: + ts_conversion_mode = devs_status == (VMA_QUERY_DEVICE_SUPPORTED | VMA_QUERY_VALUES_SUPPORTED) ? TS_CONVERSION_MODE_SYNC : TS_CONVERSION_MODE_DISABLE; + break; + case TS_CONVERSION_MODE_PTP: + ts_conversion_mode = devs_status == (VMA_QUERY_DEVICE_SUPPORTED | VMA_QUERY_VALUES_SUPPORTED) ? 
TS_CONVERSION_MODE_PTP : TS_CONVERSION_MODE_DISABLE; + break; + default: + ts_conversion_mode = TS_CONVERSION_MODE_DISABLE; + break; + } + } + +#endif + + ibchtc_logdbg("Conversion status was set to %d", ts_conversion_mode); + + for (net_device_map_index_t::iterator dev_iter = net_devices.begin(); dev_iter != net_devices.end(); dev_iter++) { + slave_data_vector_t slaves = dev_iter->second->get_slave_array(); + for (slave_data_vector_t::iterator slaves_iter = slaves.begin(); slaves_iter != slaves.end(); slaves_iter++) { + ts_conversion_mode_t dev_ts_conversion_mode = dev_iter->second->get_state() == net_device_val::RUNNING ? ts_conversion_mode : TS_CONVERSION_MODE_DISABLE; + (*slaves_iter)->p_ib_ctx->set_ctx_time_converter_status(dev_ts_conversion_mode); + } + } + + return ts_conversion_mode; +} + +void time_converter::clean_obj() +{ + if (is_cleaned()) { + return ; + } + + set_cleaned(); + m_timer_handle = NULL; + if (g_p_event_handler_manager->is_running()) { + g_p_event_handler_manager->unregister_timers_event_and_delete(this); + } else { + cleanable_obj::clean_obj(); + } +} diff --git a/src/vma/dev/time_converter.h b/src/vma/dev/time_converter.h new file mode 100644 index 0000000..b8512eb --- /dev/null +++ b/src/vma/dev/time_converter.h @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
 */


#ifndef TIME_CONVERTER_H
#define TIME_CONVERTER_H

/* NOTE(review): the two '#include <...>' targets here were stripped during
 * extraction; reconstructed from the std::tr1::unordered_map and ibv usage
 * below — confirm against the original tree. */
#include <tr1/unordered_map>
#include <infiniband/verbs.h>

#include "vma/util/sys_vars.h"
#include "vma/sock/cleanable_obj.h"
#include "vma/event/timer_handler.h"

class net_device_val;
/* NOTE(review): template arguments were stripped during extraction;
 * reconstructed as if_index -> device mapping — confirm. */
typedef std::tr1::unordered_map<int, net_device_val*> net_device_map_t;

/* One snapshot of the HW-clock/system-clock relationship used for
 * converting raw HW timestamps to wall-clock time. */
class ctx_timestamping_params_t {
public:

	uint64_t hca_core_clock;        // HW clock frequency (ticks per second)
	uint64_t sync_hw_clock;         // HW clock value at the sync point
	struct timespec sync_systime;   // system time at the same sync point

	ctx_timestamping_params_t() : hca_core_clock(0), sync_hw_clock(0) {
		sync_systime.tv_sec = 0;
		sync_systime.tv_nsec = 0;
	}
};

/**
 * Abstract base for HW-timestamp-to-system-time converters.  Concrete
 * subclasses (ib_ctx, ptp) implement the conversion and refresh their
 * clock snapshots from a periodic timer.
 */
class time_converter : public timer_handler, public cleanable_obj
{
public:
	time_converter(): m_timer_handle(NULL), m_converter_status(TS_CONVERSION_MODE_DISABLE) {};
	virtual ~time_converter() = 0;

	/* Translate a raw HW timestamp into wall-clock time. */
	virtual void convert_hw_time_to_system_time(uint64_t hwtime, struct timespec* systime) = 0;
	virtual void handle_timer_expired(void* user_data) = 0;
	/* Deferred-deletion teardown; see time_converter.cpp. */
	virtual void clean_obj();
	ts_conversion_mode_t get_converter_status() { return m_converter_status; };

	/* Compute and apply the common conversion mode across all devices. */
	static ts_conversion_mode_t update_device_converters_status(net_device_map_t& net_devices);

protected:
	void* m_timer_handle;                    // periodic-timer registration handle
	ts_conversion_mode_t m_converter_status; // mode actually achieved by this converter

	static uint32_t get_single_converter_status(struct ibv_context* ctx);
};

// pure virtual destructor implementation
inline time_converter::~time_converter() { }

#endif //TIME_CONVERTER_H
+ */ + + +#include +#include +#include "vma/event/event_handler_manager.h" +#include +#include "time_converter_ib_ctx.h" +#include "vma/ib/base/verbs_extra.h" + +#define MODULE_NAME "time_converter_ib_ctx" + +#define ibchtc_logerr __log_err +#define ibchtc_logwarn __log_warn +#define ibchtc_loginfo __log_info +#define ibchtc_logdbg __log_dbg + + +#define UPDATE_HW_TIMER_PERIOD_MS 1000 +#define UPDATE_HW_TIMER_FIRST_ONESHOT_MS 100 +#define UPDATE_HW_TIMER_SECOND_ONESHOT_MS 200 + +#define IB_CTX_TC_DEVIATION_THRESHOLD 10 + +time_converter_ib_ctx::time_converter_ib_ctx(struct ibv_context* ctx, ts_conversion_mode_t ctx_time_converter_mode, uint64_t hca_core_clock) : + m_p_ibv_context(ctx), m_ctx_parmeters_id(0) +{ +#ifdef DEFINED_IBV_CQ_TIMESTAMP + if (ctx_time_converter_mode != TS_CONVERSION_MODE_DISABLE) { + ctx_timestamping_params_t* current_parameters_set = &m_ctx_convert_parmeters[m_ctx_parmeters_id]; + + m_converter_status = TS_CONVERSION_MODE_RAW; + current_parameters_set->hca_core_clock = hca_core_clock * USEC_PER_SEC; + + if (ctx_time_converter_mode != TS_CONVERSION_MODE_RAW) { + if (sync_clocks(¤t_parameters_set->sync_systime, ¤t_parameters_set->sync_hw_clock)) { + m_converter_status = TS_CONVERSION_MODE_SYNC; + + g_p_event_handler_manager->register_timer_event(UPDATE_HW_TIMER_FIRST_ONESHOT_MS, this, ONE_SHOT_TIMER, 0); + g_p_event_handler_manager->register_timer_event(UPDATE_HW_TIMER_SECOND_ONESHOT_MS, this, ONE_SHOT_TIMER, 0); + m_timer_handle = g_p_event_handler_manager->register_timer_event(UPDATE_HW_TIMER_PERIOD_MS, this, PERIODIC_TIMER, 0); + } + } + } +#else + NOT_IN_USE(hca_core_clock); +#endif + if (ctx_time_converter_mode != m_converter_status) { + ibchtc_logwarn("converter status different then expected (ibv context %p, value = %d , expected = %d)" + , m_p_ibv_context, m_converter_status, ctx_time_converter_mode); + } +} + +void time_converter_ib_ctx::handle_timer_expired(void* user_data) { + NOT_IN_USE(user_data); + + if (is_cleaned()) { + return; 
+ } + + fix_hw_clock_deviation(); +} + +uint64_t time_converter_ib_ctx::get_hca_core_clock(){ + return m_ctx_convert_parmeters[m_ctx_parmeters_id].hca_core_clock; +} + + +#ifdef DEFINED_IBV_CQ_TIMESTAMP +bool time_converter_ib_ctx::sync_clocks(struct timespec* st, uint64_t* hw_clock){ + struct timespec st1, st2, diff, st_min = TIMESPEC_INITIALIZER; + vma_ts_values queried_values; + int64_t interval, best_interval = 0; + uint64_t hw_clock_min = 0; + + memset(&queried_values, 0, sizeof(queried_values)); + queried_values.comp_mask = VMA_IBV_VALUES_MASK_RAW_CLOCK; + for (int i = 0 ; i < 10 ; i++) { + clock_gettime(CLOCK_REALTIME, &st1); + if (vma_ibv_query_values(m_p_ibv_context, &queried_values) || !vma_get_ts_val(queried_values)) { + return false; + } + + clock_gettime(CLOCK_REALTIME, &st2); + interval = (st2.tv_sec - st1.tv_sec) * NSEC_PER_SEC + (st2.tv_nsec - st1.tv_nsec); + + if (!best_interval || interval < best_interval) { + best_interval = interval; + hw_clock_min = vma_get_ts_val(queried_values); + + interval /= 2; + diff.tv_sec = interval / NSEC_PER_SEC; + diff.tv_nsec = interval - (diff.tv_sec * NSEC_PER_SEC); + ts_add(&st1, &diff, &st_min); + } + } + *st = st_min; + *hw_clock = hw_clock_min; + return true; +} + +void time_converter_ib_ctx::fix_hw_clock_deviation(){ + ctx_timestamping_params_t* current_parameters_set = &m_ctx_convert_parmeters[m_ctx_parmeters_id]; + + if (!current_parameters_set->hca_core_clock) { + return; + } + + struct timespec current_time, diff_systime; + uint64_t diff_hw_time, diff_systime_nano, estimated_hw_time, hw_clock; + int next_id = (m_ctx_parmeters_id + 1) % 2; + ctx_timestamping_params_t* next_parameters_set = &m_ctx_convert_parmeters[next_id]; + int64_t deviation_hw; + + if (!sync_clocks(¤t_time, &hw_clock)) { + return; + } + + ts_sub(¤t_time, ¤t_parameters_set->sync_systime, &diff_systime); + diff_hw_time = hw_clock - current_parameters_set->sync_hw_clock; + diff_systime_nano = diff_systime.tv_sec * NSEC_PER_SEC + 
diff_systime.tv_nsec; + + estimated_hw_time = (diff_systime.tv_sec * current_parameters_set->hca_core_clock) + (diff_systime.tv_nsec * current_parameters_set->hca_core_clock / NSEC_PER_SEC); + deviation_hw = estimated_hw_time - diff_hw_time; + + ibchtc_logdbg("ibv device '%s' [%p] : fix_hw_clock_deviation parameters status : %ld.%09ld since last deviation fix, \nUPDATE_HW_TIMER_PERIOD_MS = %d, current_parameters_set = %p, " + "estimated_hw_time = %ld, diff_hw_time = %ld, diff = %ld ,m_hca_core_clock = %ld", m_p_ibv_context->device->name, m_p_ibv_context->device, diff_systime.tv_sec, diff_systime.tv_nsec, + UPDATE_HW_TIMER_PERIOD_MS, current_parameters_set, estimated_hw_time, diff_hw_time, deviation_hw, current_parameters_set->hca_core_clock); + + if (abs(deviation_hw) < IB_CTX_TC_DEVIATION_THRESHOLD) { + return; + } + + next_parameters_set->hca_core_clock = (diff_hw_time * NSEC_PER_SEC) / diff_systime_nano; + next_parameters_set->sync_hw_clock = hw_clock; + next_parameters_set->sync_systime = current_time; + + m_ctx_parmeters_id = next_id; +} + +#else + +void time_converter_ib_ctx::fix_hw_clock_deviation(){} +bool time_converter_ib_ctx::sync_clocks(struct timespec* ts, uint64_t* hw_clock){ NOT_IN_USE(ts); NOT_IN_USE(hw_clock); return false;} + +#endif + +inline void time_converter_ib_ctx::calculate_delta(struct timespec& hw_to_timespec, uint64_t hca_core_clock, uint64_t hw_time_diff) { + hw_to_timespec.tv_sec = hw_time_diff / hca_core_clock; + hw_time_diff -= hw_to_timespec.tv_sec * hca_core_clock; + hw_to_timespec.tv_nsec = (hw_time_diff * NSEC_PER_SEC) / hca_core_clock; +} + +void time_converter_ib_ctx::convert_hw_time_to_system_time(uint64_t hwtime, struct timespec* systime) { + + ctx_timestamping_params_t* current_parameters_set = &m_ctx_convert_parmeters[m_ctx_parmeters_id]; + if (current_parameters_set->hca_core_clock && hwtime) { + + struct timespec hw_to_timespec, sync_systime; + uint64_t hca_core_clock, sync_hw_clock; + + // sync_hw_clock should be zero 
when m_conversion_mode is CONVERSION_MODE_RAW_OR_FAIL or CONVERSION_MODE_DISABLE + hca_core_clock = current_parameters_set->hca_core_clock; + sync_hw_clock = current_parameters_set->sync_hw_clock; + sync_systime = current_parameters_set->sync_systime; + + // Handle case in which the reference point occurred after the packet has been arrived. + if (hwtime > sync_hw_clock) { + calculate_delta(hw_to_timespec, hca_core_clock, hwtime - sync_hw_clock); + ts_add(&sync_systime, &hw_to_timespec, systime); + } else { + calculate_delta(hw_to_timespec, hca_core_clock, sync_hw_clock - hwtime); + ts_sub(&sync_systime, &hw_to_timespec, systime); + } + } +} diff --git a/src/vma/dev/time_converter_ib_ctx.h b/src/vma/dev/time_converter_ib_ctx.h new file mode 100644 index 0000000..48278dc --- /dev/null +++ b/src/vma/dev/time_converter_ib_ctx.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef TIME_CONVERTER_IB_CTX_H +#define TIME_CONVERTER_IB_CTX_H + +#include +#include +#include "time_converter.h" + + +class time_converter_ib_ctx : public time_converter +{ +public: + time_converter_ib_ctx(struct ibv_context* ctx, ts_conversion_mode_t ctx_time_converter_mode, uint64_t hca_core_clock); + + virtual ~time_converter_ib_ctx() {}; + + void convert_hw_time_to_system_time(uint64_t hwtime, struct timespec* systime); + void handle_timer_expired(void* user_data); + uint64_t get_hca_core_clock(); + +private: + struct ibv_context* m_p_ibv_context; + ctx_timestamping_params_t m_ctx_convert_parmeters[2]; + int m_ctx_parmeters_id; + + void fix_hw_clock_deviation(); + inline void calculate_delta(struct timespec& hw_to_timespec, uint64_t hca_core_clock, uint64_t hw_time_diff); + bool sync_clocks(struct timespec* st, uint64_t* hw_clock); +}; + +#endif // TIME_CONVERTER_IB_CTX_H diff --git a/src/vma/dev/time_converter_ptp.cpp b/src/vma/dev/time_converter_ptp.cpp new file mode 100644 index 0000000..2bceee2 --- /dev/null +++ b/src/vma/dev/time_converter_ptp.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */

/* NOTE(review): two '#include <...>' targets on this line were stripped
 * during extraction; reconstructed from the ibv/memset usage — confirm. */
#include <string.h>
#include <infiniband/verbs.h>
#include "vma/event/event_handler_manager.h"
#include "utils/rdtsc.h"
#include "vma/util/instrumentation.h"
#include "vma/util/utils.h"
#include "vma/dev/time_converter_ptp.h"
#include "vma/ib/base/verbs_extra.h"


#ifdef DEFINED_IBV_CLOCK_INFO

#define MODULE_NAME "tc_ptp"

#define ibchtc_logerr  __log_err
#define ibchtc_logwarn __log_warn
#define ibchtc_loginfo __log_info
#define ibchtc_logdbg  __log_info_dbg
#define ibchtc_logfunc __log_info_func

#define UPDATE_HW_TIMER_PTP_PERIOD_MS 100


/* PTP-based converter: snapshot the device clock_info into both slots and
 * refresh one slot every 100ms from a periodic timer.
 * NOTE(review): the status is set to PTP even when the initial
 * vma_ibv_query_clock_info calls fail (only an error is logged) — presumably
 * intentional best-effort, verify against callers. */
time_converter_ptp::time_converter_ptp(struct ibv_context* ctx) :
	m_p_ibv_context(ctx), m_clock_values_id(0)
{
	for (size_t i=0; i < ARRAY_SIZE(m_clock_values); i++) {
		memset(&m_clock_values[i], 0, sizeof(m_clock_values[i]));
		if (vma_ibv_query_clock_info(m_p_ibv_context, &m_clock_values[i])) {
			ibchtc_logerr("vma_ibv_query_clock_info failure for clock_info, (ibv context %p)", m_p_ibv_context);
		}
	}

	m_timer_handle = g_p_event_handler_manager->register_timer_event(UPDATE_HW_TIMER_PTP_PERIOD_MS, this, PERIODIC_TIMER, 0);
	m_converter_status = TS_CONVERSION_MODE_PTP;
}

/* Timer callback: refresh the inactive clock_info slot, then flip the
 * active-slot index so readers pick up the fresh values. */
void time_converter_ptp::handle_timer_expired(void* user_data) {

	NOT_IN_USE(user_data);

	if (is_cleaned()) {
		return;
	}

	int ret = 0;
	ret = vma_ibv_query_clock_info(m_p_ibv_context, &m_clock_values[1 - m_clock_values_id]);
	if (ret)
		ibchtc_logerr("vma_ibv_query_clock_info failure for clock_info, (ibv context %p) (return value=%d)", m_p_ibv_context, ret);

	m_clock_values_id = 1 - m_clock_values_id;
}

/* Convert a raw HW timestamp to wall-clock time via the driver-provided
 * clock_info of the currently active slot. */
void time_converter_ptp::convert_hw_time_to_system_time(uint64_t hwtime, struct timespec* systime) {
	uint64_t sync_hw_clock = vma_ibv_convert_ts_to_ns(&m_clock_values[m_clock_values_id], hwtime);
	systime->tv_sec = sync_hw_clock / NSEC_PER_SEC;
	systime->tv_nsec = sync_hw_clock % NSEC_PER_SEC;

	ibchtc_logfunc("hwtime: %09ld", hwtime);
	ibchtc_logfunc("systime: %lld.%.9ld", systime->tv_sec, systime->tv_nsec);
}
#endif //DEFINED_IBV_CLOCK_INFO
+ */ + + +#ifndef TIME_CONVERTER_PTP_H +#define TIME_CONVERTER_PTP_H + +#include +#include "vma/event/timer_handler.h" +#include +#include "time_converter.h" + +#ifdef DEFINED_IBV_CLOCK_INFO + +class time_converter_ptp : public time_converter +{ +public: + time_converter_ptp(struct ibv_context* ctx); + virtual ~time_converter_ptp() {}; + + inline void convert_hw_time_to_system_time(uint64_t hwtime, struct timespec* systime); + virtual void handle_timer_expired(void* user_data); + +private: + struct ibv_context* m_p_ibv_context; + + vma_ibv_clock_info m_clock_values[2]; + int m_clock_values_id; +}; + +#endif // DEFINED_IBV_CLOCK_INFO +#endif // TIME_CONVERTER_PTP_H diff --git a/src/vma/dev/wqe_send_handler.cpp b/src/vma/dev/wqe_send_handler.cpp new file mode 100644 index 0000000..2ff172b --- /dev/null +++ b/src/vma/dev/wqe_send_handler.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "wqe_send_handler.h" + +wqe_send_handler::wqe_send_handler() +{ +} + +wqe_send_handler::~wqe_send_handler() +{ +} + +void wqe_send_handler::init_inline_wqe(vma_ibv_send_wr &wqe_to_init, struct ibv_sge* sge_list, uint32_t num_sge) +{ + init_not_inline_wqe(wqe_to_init, sge_list, num_sge); + enable_inline(wqe_to_init); +} + +void wqe_send_handler::init_not_inline_wqe(vma_ibv_send_wr &wqe_to_init, struct ibv_sge* sge_list, uint32_t num_sge) +{ + init_wqe(wqe_to_init, sge_list, num_sge); + enable_hw_csum(wqe_to_init); +} + +void wqe_send_handler::init_wqe(vma_ibv_send_wr &wqe_to_init, struct ibv_sge* sge_list, uint32_t num_sge) +{ + memset(&wqe_to_init, 0, sizeof(wqe_to_init)); + + wqe_to_init.num_sge = num_sge; + vma_send_wr_opcode(wqe_to_init) = VMA_IBV_WR_SEND; + wqe_to_init.next = NULL; + wqe_to_init.sg_list = sge_list; + wqe_to_init.wr_id = 0; +} diff --git a/src/vma/dev/wqe_send_handler.h b/src/vma/dev/wqe_send_handler.h new file mode 100644 index 0000000..57108ce --- /dev/null +++ b/src/vma/dev/wqe_send_handler.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vma/ib/base/verbs_extra.h" +#include "vma/util/to_str.h" + +#ifndef IB_WQE_TEMPLATE_H +#define IB_WQE_TEMPLATE_H + +class wqe_send_handler: public tostr +{ +public: + wqe_send_handler(); + virtual ~wqe_send_handler(); + + void init_wqe(vma_ibv_send_wr &wqe_to_init, struct ibv_sge* sge_list, uint32_t num_sge); + void init_inline_wqe(vma_ibv_send_wr &wqe_to_init, struct ibv_sge* sge_list, uint32_t num_sge); + void init_not_inline_wqe(vma_ibv_send_wr &wqe_to_init, struct ibv_sge* sge_list, uint32_t num_sge); + + inline vma_ibv_wr_opcode set_opcode(vma_ibv_send_wr &wqe, vma_ibv_wr_opcode opcode) { + vma_ibv_wr_opcode last_opcode = vma_send_wr_opcode(wqe); + vma_send_wr_opcode(wqe) = opcode; + return last_opcode; + } + +#ifndef DEFINED_SW_CSUM + inline void enable_hw_csum (vma_ibv_send_wr &send_wqe) { vma_send_wr_send_flags(send_wqe) |= VMA_IBV_SEND_IP_CSUM; } + inline void disable_hw_csum (vma_ibv_send_wr &send_wqe) { vma_send_wr_send_flags(send_wqe) &= ~VMA_IBV_SEND_IP_CSUM; } +#else + inline void enable_hw_csum (vma_ibv_send_wr &) {} + inline void disable_hw_csum (vma_ibv_send_wr &) {} +#endif + +#ifdef DEFINED_TSO + inline void enable_tso(vma_ibv_send_wr &wr, void *hdr, uint16_t hdr_sz, uint16_t mss) { + vma_send_wr_opcode(wr) = VMA_IBV_WR_TSO; + wr.tso.hdr = hdr; + wr.tso.hdr_sz = hdr_sz; + wr.tso.mss = mss; + } +#endif /* DEFINED_TSO */ + + inline void enable_inline (vma_ibv_send_wr &send_wqe) { vma_send_wr_send_flags(send_wqe) |= VMA_IBV_SEND_INLINE; } +}; + +#endif /* IB_WQE_TEMPLATE_H */ diff --git a/src/vma/dev/wqe_send_ib_handler.cpp b/src/vma/dev/wqe_send_ib_handler.cpp new file mode 100644 index 0000000..1b2457e --- /dev/null +++ b/src/vma/dev/wqe_send_ib_handler.cpp @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "wqe_send_ib_handler.h" + +wqe_send_ib_handler::wqe_send_ib_handler() +{ +} + +wqe_send_ib_handler::~wqe_send_ib_handler() +{ +} + +void wqe_send_ib_handler::init_path_record(vma_ibv_send_wr &wqe_to_init, struct ibv_ah *ah, uint32_t rem_qkey, uint32_t rem_qpn) +{ + wqe_to_init.wr.ud.ah = ah; + wqe_to_init.wr.ud.remote_qkey = rem_qkey; + wqe_to_init.wr.ud.remote_qpn = rem_qpn; +} + +void wqe_send_ib_handler::init_ib_wqe(vma_ibv_send_wr &wqe_to_init, struct ibv_sge* sge_list, uint32_t num_sge, + struct ibv_ah *ah, uint32_t rem_qpn, uint32_t rem_qkey) +{ + wqe_send_handler::init_wqe(wqe_to_init, sge_list, num_sge); + init_path_record(wqe_to_init, ah, rem_qkey, rem_qpn); +} + +void wqe_send_ib_handler::init_inline_ib_wqe(vma_ibv_send_wr &wqe_to_init, struct ibv_sge* sge_list, uint32_t num_sge, + struct ibv_ah *ah, uint32_t rem_qpn, uint32_t rem_qkey) +{ + wqe_send_handler::init_inline_wqe(wqe_to_init, sge_list, num_sge); + init_path_record(wqe_to_init, ah, rem_qkey, rem_qpn); +} + +void wqe_send_ib_handler::init_not_inline_ib_wqe(vma_ibv_send_wr &wqe_to_init, struct ibv_sge* sge_list, uint32_t num_sge, + struct ibv_ah *ah, uint32_t rem_qpn, uint32_t rem_qkey) +{ + wqe_send_handler::init_not_inline_wqe(wqe_to_init, sge_list, num_sge); + init_path_record(wqe_to_init, ah, rem_qkey, rem_qpn); +} diff --git a/src/vma/dev/wqe_send_ib_handler.h b/src/vma/dev/wqe_send_ib_handler.h new file mode 100644 index 0000000..b0e46f9 --- /dev/null +++ b/src/vma/dev/wqe_send_ib_handler.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "wqe_send_handler.h" +#include "vma/util/vtypes.h" + +#ifndef WQE_TEMPLATE_SEND_IB_H_ +#define WQE_TEMPLATE_SEND_IB_H_ + +class wqe_send_ib_handler: public wqe_send_handler +{ +public: + wqe_send_ib_handler(); + virtual ~wqe_send_ib_handler(); + + void init_ib_wqe(vma_ibv_send_wr &wqe_to_init, struct ibv_sge* sge_list, uint32_t num_sge, struct ibv_ah *ah, uint32_t rem_qpn, uint32_t rem_qkey); + void init_inline_ib_wqe(vma_ibv_send_wr & wqe_to_init, struct ibv_sge *sge_list, uint32_t num_sge, struct ibv_ah *ah, uint32_t rem_qpn, uint32_t rem_qkey); + void init_not_inline_ib_wqe(vma_ibv_send_wr & wqe_to_init, struct ibv_sge *sge_list, uint32_t num_sge, struct ibv_ah *ah, uint32_t rem_qpn, uint32_t rem_qkey); + +private: + void init_path_record(vma_ibv_send_wr & wqe_to_init, struct ibv_ah *ah, uint32_t rem_qkey, uint32_t rem_qpn); +}; + +#endif /* WQE_TEMPLATE_SEND_IB_H_ */ diff --git a/src/vma/event/command.h b/src/vma/event/command.h new file mode 100644 index 0000000..9f50d9f --- /dev/null +++ b/src/vma/event/command.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * command.h + * + */ + +#ifndef COMMAND_H_ +#define COMMAND_H_ + +#include "vma/netlink/netlink_wrapper.h" +#include "vma/util/to_str.h" +#include "vma/event/timer_handler.h" + +class command : public tostr +{ +public: + command(){}; + virtual ~command(){}; + virtual void execute() = 0; +private: + //block copy ctor + command(const command &command); +}; + +class command_netlink: public command , public timer_handler +{ +public: + command_netlink(netlink_wrapper *executer): m_ntl_executer(executer) {}; + + virtual void execute() { + if (m_ntl_executer) { + m_ntl_executer->handle_events(); + } + } + + const std::string to_str() const + { + return(string("command_netlink")); + } + + virtual void handle_timer_expired(void* a) { + NOT_IN_USE(a); + m_ntl_executer->neigh_timer_expired(); + } + + +private: + netlink_wrapper *m_ntl_executer; + +}; + +#endif /* COMMAND_H_ */ diff --git a/src/vma/event/delta_timer.cpp b/src/vma/event/delta_timer.cpp new file mode 100644 index 0000000..f0e9256 --- /dev/null +++ b/src/vma/event/delta_timer.cpp @@ -0,0 +1,362 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include "utils/bullseye.h" +#include "utils/clock.h" +#include "vlogger/vlogger.h" +#include "vma/util/sys_vars.h" +#include "vma/util/utils.h" +#include "delta_timer.h" +#include "timer_handler.h" + +#define MODULE_NAME "tmr:" + +#define tmr_logpanic __log_panic +#define tmr_logerr __log_err +#define tmr_logwarn __log_warn +#define tmr_loginfo __log_info +#define tmr_logdbg __log_dbg +#define tmr_logfunc __log_func +//#define tmr_logfuncall __log_funcall +#define tmr_logfuncall(fmt, ...) 
+ + +#define IS_NODE_INVALID(_node_) \ + (!_node_ || !_node_->handler || (_node_->req_type < 0 || _node_->req_type >= INVALID_TIMER)) + + +timer::timer() +{ + m_list_head = NULL; + gettime(&m_ts_last); +} + +timer::~timer() +{ + timer_node_t* iter = m_list_head; + timer_node_t* to_free = NULL; + tmr_logfunc(""); + m_list_head = NULL; + // free all the list + while (iter) { + to_free = iter; + iter = iter->next; + free(to_free); + } +} + +void timer::add_new_timer(unsigned int timeout_msec, timer_node_t* node, timer_handler* handler, void* user_data, timer_req_type_t req_type) +{ + node->handler = handler; + node->req_type = req_type; + node->user_data = user_data; + node->orig_time_msec = timeout_msec; + + BULLSEYE_EXCLUDE_BLOCK_START + if (IS_NODE_INVALID(node)) { + free(node); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + insert_to_list(node); + return; +} + +void timer::wakeup_timer(timer_node_t* node) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (IS_NODE_INVALID(node)) { + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + remove_from_list(node); + + unsigned int orig_time = node->orig_time_msec; + node->orig_time_msec = 0; + insert_to_list(node); + node->orig_time_msec = orig_time; + + return; +} + +void timer::remove_timer(timer_node_t* node, timer_handler *handler) +{ + // Look for handler in the list if node wasen't indicated + if (!node) { + node = m_list_head; + while (node) { + if (node->handler == handler) // node found + break; + node = node->next; + } + } + + // Here we MUST have a valid node pointer + BULLSEYE_EXCLUDE_BLOCK_START + if (IS_NODE_INVALID(node) || (node->handler != handler)) { + tmr_logfunc("bad combo for removale (%p,%p)", node, handler); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + // Invalidate node before freeing it + node->handler = NULL; + node->req_type = INVALID_TIMER; + + // Remove & Free node + remove_from_list(node); + free(node); + return; +} + +void timer::remove_all_timers(timer_handler *handler) +{ + timer_node_t* node = 
m_list_head; + timer_node_t* node_tmp = NULL; + + // Look for handler in the list if node wasen't indicated + while (node) { + if (node->handler == handler) {// node found + node_tmp = node; + node = node->next; + // Here we MUST have a valid node pointer + BULLSEYE_EXCLUDE_BLOCK_START + if (IS_NODE_INVALID(node_tmp) || (node_tmp->handler != handler)) { + tmr_logfunc("bad combo for removale (%p,%p)", node_tmp, handler); + continue; + } + BULLSEYE_EXCLUDE_BLOCK_END + // Invalidate node before freeing it + node_tmp->handler = NULL; + node_tmp->req_type = INVALID_TIMER; + remove_from_list(node_tmp); + // Remove & Free node + free(node_tmp); + node_tmp = NULL; + } else { + node = node->next; + } + } + + + return; +} + +int timer::update_timeout() +{ + int ret = 0, delta_msec = 0; + timer_node_t* list_tmp = NULL; + struct timespec ts_now, ts_delta; + + ret = gettime(&ts_now); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret) { + tmr_logpanic("gettime() returned with error (errno %d %m)", ret); + return INFINITE_TIMEOUT; + } + BULLSEYE_EXCLUDE_BLOCK_END + // Find difference (subtract) + ts_sub(&ts_now, &m_ts_last, &ts_delta); + delta_msec = ts_to_msec(&ts_delta); + + // Save 'now' as 'last' + if (delta_msec > 0) + m_ts_last = ts_now; + + // empty list -> unlimited timeout + if (!m_list_head) { + tmr_logfunc("elapsed time: %d msec", delta_msec); + ret = INFINITE_TIMEOUT; + goto out; + } + + // Check for timeout! 
+ list_tmp = m_list_head; + while (delta_msec > 0 && list_tmp) { + tmr_logfuncall("updating list node %p with elapsed time: %d msec", list_tmp, delta_msec); + if ((int) list_tmp->delta_time_msec > delta_msec) { + list_tmp->delta_time_msec -= delta_msec; + break; + } + else { + delta_msec -= list_tmp->delta_time_msec; + list_tmp->delta_time_msec = 0; + } + list_tmp = list_tmp->next; + } + + ret = m_list_head->delta_time_msec; + +out: + tmr_logfuncall("next timeout: %d msec", ret); + return ret; +} + +void timer::process_registered_timers() +{ + timer_node_t* iter = m_list_head; + timer_node_t* next_iter; + while (iter && (iter->delta_time_msec == 0)) { + tmr_logfuncall("timer expired on %p", iter->handler); + + /* Special check is need to protect + * from using destroyed object pointed by handler + * See unregister_timer_event() + * Object can be destoyed from another thread (lock protection) + * and from current thread (lock and lock count condition) + */ + if (iter->handler && + !iter->lock_timer.trylock() && + (1 == iter->lock_timer.is_locked_by_me())) { + iter->handler->handle_timer_expired(iter->user_data); + iter->lock_timer.unlock(); + } + next_iter = iter->next; + + switch (iter->req_type) { + case PERIODIC_TIMER: + // re-insert + remove_from_list(iter); + iter->prev = iter->next = NULL; + insert_to_list(iter); + break; + + case ONE_SHOT_TIMER: + remove_timer(iter, iter->handler); + break; + + BULLSEYE_EXCLUDE_BLOCK_START + case INVALID_TIMER: + default: + tmr_logwarn("invalid timer expired on %p", iter->handler); + break; + } + BULLSEYE_EXCLUDE_BLOCK_END + iter = next_iter; + } +} + +// insert allocated node to the list +void timer::insert_to_list(timer_node_t* new_node) +{ + unsigned int tmp_delta; + timer_node_t* iter; + timer_node_t* prev; + + if (!m_list_head) { // first node in the list + new_node->delta_time_msec = new_node->orig_time_msec; // time from now + new_node->next = NULL; + new_node->prev = NULL; + m_list_head = new_node; + 
tmr_logfuncall("insert first node to list (handler %p, timer %d, delta time %d)", new_node->handler, new_node->orig_time_msec, new_node->delta_time_msec); + return; + } + // else: need to find the correct place in the list + tmp_delta = new_node->orig_time_msec; + iter = m_list_head; + prev = NULL; + + while (iter && tmp_delta >= iter->delta_time_msec) { + tmp_delta = tmp_delta - iter->delta_time_msec; + prev = iter; + iter = iter->next; + } + + new_node->delta_time_msec = tmp_delta; + new_node->next = iter; + new_node->prev = prev; + if (prev) { + prev->next = new_node; + } + else { // first node in the list + m_list_head = new_node; + } + // update the delta time for the next element + if (new_node->next) { + new_node->next->delta_time_msec = new_node->next->delta_time_msec - new_node->delta_time_msec; + new_node->next->prev = new_node; + } + tmr_logfuncall("insert new node to list (handler %p, timer %d, delta time %d)", new_node->handler, new_node->orig_time_msec, new_node->delta_time_msec); +} + +// remove timer from list (without free) +// called after timer expired (as part of unregister timer, or while reregister periodic timer) +void timer::remove_from_list(timer_node_t* node) +{ + // remove from the list + if (node->prev) { // not the first element in list + node->prev->next = node->next; + } + else { + m_list_head = node->next; + } + if (node->next) { // not the last element in list + node->next->delta_time_msec = node->next->delta_time_msec + node->delta_time_msec; + node->next->prev = node->prev; + } + tmr_logfuncall("removed node from list (handler %p, timer %d, delta time %d)", node->handler, node->orig_time_msec, node->delta_time_msec); +} + + +const char* timer_req_type_str(timer_req_type_t type) +{ + switch (type) { + case PERIODIC_TIMER: return "PERIODIC"; + case ONE_SHOT_TIMER: return "ONE SHOT"; + BULLSEYE_EXCLUDE_BLOCK_START + case INVALID_TIMER: return "INVALID"; + default: return "Unknown timer type"; + BULLSEYE_EXCLUDE_BLOCK_END + } +} + + + 
+//code coverage +#if 0 +void timer::debug_print_list() +{ + timer_node_t* iter = m_list_head; + tmr_logdbg(""); + while (iter) { + tmr_logdbg("node %p timer %d",iter, iter->delta_time_msec); + iter = iter->next; + } +} +#endif + + diff --git a/src/vma/event/delta_timer.h b/src/vma/event/delta_timer.h new file mode 100644 index 0000000..8199f6b --- /dev/null +++ b/src/vma/event/delta_timer.h @@ -0,0 +1,115 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef DELTA_TIMER_H +#define DELTA_TIMER_H + +#include +#include "utils/lock_wrapper.h" + +#define INFINITE_TIMEOUT (-1) + +class timer_handler; +class timers_group; + +enum timer_req_type_t { + // reregister itself every after timer expires. (the client doesn't need to reregister) + // in order to stop the timer, the client needs to unregister + PERIODIC_TIMER, + + // after the timer expires the client doesn't need to unregister + ONE_SHOT_TIMER, + + INVALID_TIMER +}; + +struct timer_node_t { + /* delta time from the previous node (millisec) */ + unsigned int delta_time_msec; + /* the orig timer requested (saved in order to re-register periodic timers) */ + unsigned int orig_time_msec; + /* control thread-safe access to handler. Recursive because unregister_timer_event() + * can be called from handle_timer_expired() + * that is under trylock() inside process_registered_timers + */ + lock_spin_recursive lock_timer; + /* link to the context registered */ + timer_handler* handler; + void* user_data; + timers_group* group; + timer_req_type_t req_type; + struct timer_node_t* next; + struct timer_node_t* prev; +}; // used by the list + +class timer +{ +public: + timer(); + ~timer(); + + // add a new timer + void add_new_timer(unsigned int timeout, timer_node_t* node, timer_handler* handler, + void* user_data, timer_req_type_t req_type); + + // wakeup existing timer + void wakeup_timer(timer_node_t* node); + + // remove timer from list and free it. + // called for stopping (unregistering) a timer + void remove_timer(timer_node_t* node, timer_handler* handler); + + // remove all timers from list and free it. + // called for stopping (unregistering) all timers + void remove_all_timers(timer_handler* handler); + + // update the timeout in first element in the list + // return the timeout needed. 
(or INFINITE_TIMEOUT if there's no timeout) + int update_timeout(); + + // run "tick" func for all the registered timer handler that their timeout expiered + void process_registered_timers(); + + void debug_print_list(); + +private: + void insert_to_list(timer_node_t* node); + void remove_from_list(timer_node_t* node); + + timer_node_t* m_list_head; + timespec m_ts_last; +}; + +const char* timer_req_type_str(timer_req_type_t type); + +#endif //DELTA_TIMER_H diff --git a/src/vma/event/event.h b/src/vma/event/event.h new file mode 100644 index 0000000..c6c062b --- /dev/null +++ b/src/vma/event/event.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef EVENT_H +#define EVENT_H + +#include +#include +#include +#include "utils/bullseye.h" +#include "vma/util/to_str.h" + +class event : public tostr { + public: + enum type{ + UNKNOWN_EVENT, + SEND_EVENT, + DROP_EVENT + }; + + type m_type; + event(void* notifier=NULL) : m_type(UNKNOWN_EVENT), m_notifier(notifier) {} + virtual ~event() {}; + + virtual const std::string to_str() const + { + char outstr[1024]; + sprintf(outstr, "EVENT_TYPE=%s NOTIFIER_PTR=%llu", typeid(*this).name(), (long long unsigned int)m_notifier); + return std::string(outstr); + } + + private: + void* m_notifier; +}; + + +#endif /* EVENT_H */ diff --git a/src/vma/event/event_handler_ibverbs.h b/src/vma/event/event_handler_ibverbs.h new file mode 100644 index 0000000..cb4c19f --- /dev/null +++ b/src/vma/event/event_handler_ibverbs.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
/* Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
 * Dual licensed: GPL-2.0 or BSD-3-Clause (see COPYING).
 */

#ifndef EVENT_HANDLER_IBVERBS_H
#define EVENT_HANDLER_IBVERBS_H

/*
 * @class event_handler_ibverbs
 * Interface for objects that register with event_handler_manager to
 * receive event-notification callbacks for their HCA context. The
 * callback fires when an event arrives on the appropriate channel with
 * the appropriate id. Channels may be shared between several objects,
 * but the ids within each channel must be unique.
 */
class event_handler_ibverbs
{
public:
	virtual ~event_handler_ibverbs() {};
	virtual void handle_event_ibverbs_cb(void* ev_data, void* user_data) = 0;
};

#endif //EVENT_HANDLER_IBVERBS_H
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "event_handler_manager.h" + +#include +#include +#include +#include "vma/dev/ring_allocation_logic.h" +#include "vma/sock/fd_collection.h" +#include "vma/sock/sock-redirect.h" // calling orig_os_api.epoll() +#include "timer_handler.h" +#include "event_handler_ibverbs.h" +#include "event_handler_rdma_cm.h" + +#include "vma/util/instrumentation.h" + +#define MODULE_NAME "evh:" + +#define evh_logpanic __log_panic +#define evh_logerr __log_err +#define evh_logwarn __log_warn +#define evh_loginfo __log_info +#define evh_logdbg __log_dbg +#define evh_logfunc __log_func +#define evh_logfuncall __log_funcall + +#undef VLOG_PRINTF_ENTRY +#define VLOG_PRINTF_ENTRY(log_level, log_fmt, log_args...) 
vlog_printf(log_level, MODULE_NAME "%d:%s(" log_fmt ")\n", __LINE__, __FUNCTION__, ##log_args) + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_DEBUG) +#define evh_logdbg_entry(log_fmt, log_args...) ((void)0) +#else +#define evh_logdbg_entry(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_DEBUG) VLOG_PRINTF_ENTRY(VLOG_DEBUG, log_fmt, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINE) +#define evh_logfunc_entry(log_fmt, log_args...) ((void)0) +#else +#define evh_logfunc_entry(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_FUNC) VLOG_PRINTF_ENTRY(VLOG_FUNC, log_fmt, ##log_args); } while (0) +#endif /* VMA_MAX_DEFINED_LOG_LEVEL */ + + +#define INITIAL_EVENTS_NUM 64 + +event_handler_manager* g_p_event_handler_manager = NULL; + +pthread_t g_n_internal_thread_id = 0; + + +void* event_handler_manager::register_timer_event(int timeout_msec, timer_handler* handler, + timer_req_type_t req_type, void* user_data, + timers_group* group /* = NULL */) +{ + evh_logdbg("timer handler '%p' registered %s timer for %d msec (user data: %X)", + handler, timer_req_type_str(req_type), timeout_msec, user_data); + BULLSEYE_EXCLUDE_BLOCK_START + if (!handler || (req_type < 0 || req_type >= INVALID_TIMER)) { + evh_logwarn("bad timer type (%d) or handler (%p)", req_type, handler); + return NULL; + } + BULLSEYE_EXCLUDE_BLOCK_END + + // malloc here the timer list node in order to return it to the app + void* node = calloc(1, sizeof(struct timer_node_t)); + BULLSEYE_EXCLUDE_BLOCK_START + if (!node) { + evh_logdbg("malloc failure"); + throw_vma_exception("malloc failure"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + timer_node_t* timer_node = (timer_node_t*)node; + timer_node->lock_timer=lock_spin_recursive("timer"); + + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = REGISTER_TIMER; + reg_action.info.timer.handler = handler; + reg_action.info.timer.user_data = user_data; + reg_action.info.timer.group = group; + 
reg_action.info.timer.node = node; + reg_action.info.timer.timeout_msec = timeout_msec; + reg_action.info.timer.req_type = req_type; + post_new_reg_action(reg_action); + return node; +} + +void event_handler_manager::wakeup_timer_event(timer_handler* handler, void* node) +{ + evh_logdbg("timer handler '%p'", handler); + BULLSEYE_EXCLUDE_BLOCK_START + if (!handler) { + evh_logwarn("bad handler (%p)", handler); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = WAKEUP_TIMER; + reg_action.info.timer.handler = handler; + reg_action.info.timer.node = node; + post_new_reg_action(reg_action); + return; +} + +void event_handler_manager::unregister_timer_event(timer_handler* handler, void* node) +{ + evh_logdbg("timer handler '%p'", handler); + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = UNREGISTER_TIMER; + reg_action.info.timer.handler = handler; + reg_action.info.timer.node = node; + + /* Special protection is needed to avoid scenario when deregistration is done + * during timer_handler object destruction, timer node itself is not removed + * and time for this timer node is expired. In this case there is no guarantee + * to operate with timer_handler object. 
+ * See timer::process_registered_timers() + * Do just lock() to protect timer_handler inside process_registered_timers() + */ + if (node) { + timer_node_t* timer_node = (timer_node_t*)node; + timer_node->lock_timer.lock(); + } + + post_new_reg_action(reg_action); +} + +void event_handler_manager::unregister_timers_event_and_delete(timer_handler* handler) +{ + evh_logdbg("timer handler '%p'", handler); + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = UNREGISTER_TIMERS_AND_DELETE; + reg_action.info.timer.handler = handler; + post_new_reg_action(reg_action); +} + +void event_handler_manager::register_ibverbs_event(int fd, event_handler_ibverbs *handler, + void* channel, void* user_data) +{ + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = REGISTER_IBVERBS; + reg_action.info.ibverbs.fd = fd; + reg_action.info.ibverbs.handler = handler; + reg_action.info.ibverbs.channel = channel; + reg_action.info.ibverbs.user_data = user_data; + post_new_reg_action(reg_action); +} + +void event_handler_manager::unregister_ibverbs_event(int fd, event_handler_ibverbs* handler) +{ + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = UNREGISTER_IBVERBS; + reg_action.info.ibverbs.fd = fd; + reg_action.info.ibverbs.handler = handler; + post_new_reg_action(reg_action); +} + +void event_handler_manager::register_rdma_cm_event(int fd, void* id, void* cma_channel, event_handler_rdma_cm* handler) +{ + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = REGISTER_RDMA_CM; + reg_action.info.rdma_cm.fd = fd; + reg_action.info.rdma_cm.id = id; + reg_action.info.rdma_cm.handler = handler; + reg_action.info.rdma_cm.cma_channel = cma_channel; + post_new_reg_action(reg_action); +} + +void event_handler_manager::unregister_rdma_cm_event(int fd, void* id) +{ + reg_action_t reg_action; + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = 
UNREGISTER_RDMA_CM; + reg_action.info.rdma_cm.fd = fd; + reg_action.info.rdma_cm.id = id; + post_new_reg_action(reg_action); +} + +void event_handler_manager::register_command_event(int fd, command* cmd) +{ + reg_action_t reg_action; + + evh_logdbg("Register command %s event", cmd->to_str().c_str()); + + memset(®_action, 0, sizeof(reg_action)); + reg_action.type = REGISTER_COMMAND; + reg_action.info.cmd.fd = fd; + reg_action.info.cmd.cmd = cmd; + post_new_reg_action(reg_action); + +} + +event_handler_manager::event_handler_manager() : + m_reg_action_q_lock("reg_action_q_lock"), + m_b_sysvar_internal_thread_arm_cq_enabled(safe_mce_sys().internal_thread_arm_cq_enabled), + m_n_sysvar_vma_time_measure_num_samples(safe_mce_sys().vma_time_measure_num_samples), + m_n_sysvar_timer_resolution_msec(safe_mce_sys().timer_resolution_msec) +{ + evh_logfunc(""); + + m_cq_epfd = 0; + + m_epfd = orig_os_api.epoll_create(INITIAL_EVENTS_NUM); + BULLSEYE_EXCLUDE_BLOCK_START + if (m_epfd == -1) { + evh_logdbg("epoll_create failed on ibv device collection (errno=%d %m)", errno); + free_evh_resources(); + throw_vma_exception("epoll_create failed on ibv device collection"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_b_continue_running = true; + m_event_handler_tid = 0; + + wakeup_set_epoll_fd(m_epfd); + going_to_sleep(); + + return; +} + +event_handler_manager::~event_handler_manager() +{ + free_evh_resources(); +} + +void event_handler_manager::free_evh_resources() +{ + evh_logfunc(""); + + // Flag thread to stop on next loop + stop_thread(); + evh_logfunc("Thread stopped"); +} + +// event handler main thread startup +void* event_handler_thread(void *_p_tgtObject) +{ + event_handler_manager* p_tgtObject = (event_handler_manager*)_p_tgtObject; + g_n_internal_thread_id = pthread_self(); + evh_logdbg("Entering internal thread, id = %lu", g_n_internal_thread_id); + + if (strcmp(safe_mce_sys().internal_thread_cpuset, MCE_DEFAULT_INTERNAL_THREAD_CPUSET)) { + std::string 
tasks_file(safe_mce_sys().internal_thread_cpuset); + tasks_file += "/tasks"; + FILE *fp = fopen(tasks_file.c_str(), "w"); + BULLSEYE_EXCLUDE_BLOCK_START + if (fp == NULL) { + evh_logpanic("Failed to open %s for writing", tasks_file.c_str()); + } + if (fprintf(fp, "%d", gettid()) <= 0) { + fclose(fp); + evh_logpanic("Failed to add internal thread id to %s", tasks_file.c_str()); + } + BULLSEYE_EXCLUDE_BLOCK_END + fclose(fp); + evh_logdbg("VMA Internal thread added to cpuset %s.", safe_mce_sys().internal_thread_cpuset); + + // do set affinity now that we are on correct cpuset + cpu_set_t cpu_set = safe_mce_sys().internal_thread_affinity; + if ( strcmp(safe_mce_sys().internal_thread_affinity_str, "-1")) { + if (pthread_setaffinity_np(g_n_internal_thread_id, sizeof(cpu_set), &cpu_set)) { + evh_logdbg("VMA Internal thread affinity failed. Did you try to set affinity outside of cpuset?"); + } else { + evh_logdbg("VMA Internal thread affinity is set."); + } + } else { + evh_logdbg("VMA Internal thread affinity not set."); + } + /* cppcheck-suppress resourceLeak */ + } + + void* ret = p_tgtObject->thread_loop(); + evh_logdbg("Ending internal thread"); + return ret; +} + +int event_handler_manager::start_thread() +{ + cpu_set_t cpu_set; + pthread_attr_t tattr; + + if (!m_b_continue_running) + return -1; + + if (m_event_handler_tid != 0) + return 0; + + //m_reg_action_q.reserve(); //todo change to vector and reserve + + BULLSEYE_EXCLUDE_BLOCK_START + if (pthread_attr_init(&tattr)) { + evh_logpanic("Failed to initialize thread attributes"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + cpu_set = safe_mce_sys().internal_thread_affinity; + if ( strcmp(safe_mce_sys().internal_thread_affinity_str, "-1") && !strcmp(safe_mce_sys().internal_thread_cpuset, MCE_DEFAULT_INTERNAL_THREAD_CPUSET)) { // no affinity + BULLSEYE_EXCLUDE_BLOCK_START + if (pthread_attr_setaffinity_np(&tattr, sizeof(cpu_set), &cpu_set)) { + evh_logpanic("Failed to set CPU affinity"); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + 
else { + evh_logdbg("VMA Internal thread affinity not set."); + } + + + int ret = pthread_create(&m_event_handler_tid, &tattr, event_handler_thread, this); + if (ret) { + // maybe it's the cset issue? try without affinity + evh_logwarn("Failed to start event handler thread with thread affinity - trying without. [errno=%d %s]", + ret, strerror(ret)); + BULLSEYE_EXCLUDE_BLOCK_START + if (pthread_attr_init(&tattr)) { + evh_logpanic("Failed to initialize thread attributes"); + } + if (pthread_create(&m_event_handler_tid, &tattr, event_handler_thread, this)) { + evh_logpanic("Failed to start event handler thread"); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + pthread_attr_destroy(&tattr); + + evh_logdbg("Started event handler thread"); + return 0; +} + +void event_handler_manager::stop_thread() +{ + if (!m_b_continue_running) + return; + m_b_continue_running = false; + + if(!g_is_forked_child){ + + do_wakeup(); + + // Wait for thread exit + if (m_event_handler_tid) { + pthread_join(m_event_handler_tid, 0); + evh_logdbg("event handler thread stopped"); + } + else { + evh_logdbg("event handler thread not running"); + } + } + m_event_handler_tid = 0; + + // Close main epfd and signaling socket + orig_os_api.close(m_epfd); + m_epfd = -1; +} + +void event_handler_manager::update_epfd(int fd, int operation, int events) +{ + epoll_event ev = {0, {0}}; + + if (m_epfd < 0) { + return; + } + + ev.events = events; + ev.data.fd = fd; + BULLSEYE_EXCLUDE_BLOCK_START + if ((orig_os_api.epoll_ctl(m_epfd, operation, fd, &ev) < 0) && + (!(errno == ENOENT || errno == EBADF))) { + const char* operation_str[] = {"", "ADD", "DEL", "MOD"}; + evh_logerr("epoll_ctl(%d, %s, fd=%d) failed (errno=%d %m)", + m_epfd, operation_str[operation], fd, errno); + } + BULLSEYE_EXCLUDE_BLOCK_END +} + +const char* event_handler_manager::reg_action_str(event_action_type_e reg_action_type) +{ + switch (reg_action_type) { + case REGISTER_TIMER: return "REGISTER_TIMER"; + case UNREGISTER_TIMER: return 
"UNREGISTER_TIMER"; + case UNREGISTER_TIMERS_AND_DELETE: return "UNREGISTER_TIMERS_AND_DELETE"; + case REGISTER_IBVERBS: return "REGISTER_IBVERBS"; + case UNREGISTER_IBVERBS: return "UNREGISTER_IBVERBS"; + case REGISTER_RDMA_CM: return "REGISTER_RDMA_CM"; + case UNREGISTER_RDMA_CM: return "UNREGISTER_RDMA_CM"; + case REGISTER_COMMAND: return "REGISTER_COMMAND"; + case UNREGISTER_COMMAND: return "UNREGISTER_COMMAND"; + BULLSEYE_EXCLUDE_BLOCK_START + default: return "UNKNOWN"; + BULLSEYE_EXCLUDE_BLOCK_END + } +} + +//get new action of event (register / unregister), and post to the thread's pipe +void event_handler_manager::post_new_reg_action(reg_action_t& reg_action) +{ + if (!m_b_continue_running) + return; + + start_thread(); + + evh_logfunc("add event action %s (%d)", reg_action_str(reg_action.type), reg_action.type); + + m_reg_action_q_lock.lock(); + m_reg_action_q.push_back(reg_action); + m_reg_action_q_lock.unlock(); + do_wakeup(); +} + +void event_handler_manager::priv_register_timer_handler(timer_reg_info_t& info) +{ + if (info.group) { + info.group->add_new_timer((timer_node_t*)info.node, info.handler, info.user_data); + } else { + m_timer.add_new_timer(info.timeout_msec, (timer_node_t*)info.node, + info.handler, info.user_data, info.req_type); + } +} + +void event_handler_manager::priv_wakeup_timer_handler(timer_reg_info_t& info) +{ + timer_node_t* node = (timer_node_t*)info.node; + if (node && !node->group) { + m_timer.wakeup_timer(node); + } +} + +void event_handler_manager::priv_unregister_timer_handler(timer_reg_info_t& info) +{ + timer_node_t* node = (timer_node_t*)info.node; + if (node && node->group) { + node->group->remove_timer((timer_node_t*)info.node); + } else { + m_timer.remove_timer(node, info.handler); + } +} + +void event_handler_manager::priv_unregister_all_handler_timers(timer_reg_info_t& info) +{ + m_timer.remove_all_timers(info.handler); +} + +void 
event_handler_manager::priv_prepare_ibverbs_async_event_queue(event_handler_map_t::iterator& i) +{ + evh_logdbg_entry(""); + + int cnt = 0; + struct pollfd poll_fd = { /*.fd=*/ 0, /*.events=*/ POLLIN, /*.revents=*/ 0}; + + if (i == m_event_handler_map.end()) { + evh_logdbg("No event handler"); + return; + } + + poll_fd.fd = i->second.ibverbs_ev.fd; + + // change the blocking mode of the async event queue + set_fd_block_mode(poll_fd.fd, false); + + // empty the async event queue + while (orig_os_api.poll(&poll_fd, 1, 0) > 0) { + process_ibverbs_event(i); + cnt++; + } + evh_logdbg("Emptied %d Events", cnt); +} + +void event_handler_manager::priv_register_ibverbs_events(ibverbs_reg_info_t& info) +{ + event_handler_map_t::iterator i; + i = m_event_handler_map.find(info.fd); + if (i == m_event_handler_map.end()) { + event_data_t v; + + v.type = EV_IBVERBS; + v.ibverbs_ev.fd = info.fd; + v.ibverbs_ev.channel = info.channel; + + /* coverity[uninit_use_in_call] */ + /* cppcheck-suppress uninitStructMember */ + m_event_handler_map[info.fd] = v; + i = m_event_handler_map.find(info.fd); + + priv_prepare_ibverbs_async_event_queue(i); + + update_epfd(info.fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI); + evh_logdbg("%d added to event_handler_map_t!", info.fd); + } + BULLSEYE_EXCLUDE_BLOCK_START + if (i->second.type != EV_IBVERBS) { + evh_logerr("fd=%d: is already handling events of different type", info.fd); + return; + } + + ibverbs_event_map_t::iterator j; + j = i->second.ibverbs_ev.ev_map.find(info.handler); + if (j != i->second.ibverbs_ev.ev_map.end()) { + evh_logerr("Event for %d/%p already registered", info.fd, info.handler); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + ibverbs_event_t vv; + vv.handler = info.handler; + vv.user_data = info.user_data; + i->second.ibverbs_ev.ev_map[info.handler] = vv; + + return; +} + +void event_handler_manager::priv_unregister_ibverbs_events(ibverbs_reg_info_t& info) +{ + + event_handler_map_t::iterator i; + ibverbs_event_map_t::iterator j; + 
int n = 0; + + i = m_event_handler_map.find(info.fd); + + BULLSEYE_EXCLUDE_BLOCK_START + if (i == m_event_handler_map.end()) { + evh_logerr("Event for %d/%p already does not exist", info.fd, info.handler); + return; + } + + if (i->second.type != EV_IBVERBS) { + evh_logerr("fd=%d: is already handling events of different type", info.fd); + return; + } + + n = i->second.ibverbs_ev.ev_map.size(); + + if (n < 1) { + evh_logerr("Event for %d/%p already does not exist", info.fd, info.handler); + return; + + } + + j = i->second.ibverbs_ev.ev_map.find(info.handler); + if (j == i->second.ibverbs_ev.ev_map.end()) { + evh_logerr("event for %d/%p does not exist", info.fd, info.handler); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + i->second.ibverbs_ev.ev_map.erase(j); + if (n == 1) { + update_epfd(info.fd, EPOLL_CTL_DEL, EPOLLIN | EPOLLPRI); + m_event_handler_map.erase(i); + evh_logdbg("%d erased from event_handler_map_t!", info.fd); + } +} + +void event_handler_manager::priv_register_rdma_cm_events(rdma_cm_reg_info_t& info) +{ + evh_logfunc_entry("fd=%d, event_handler_id=%p", info.fd, info.id); + + // Handle the new registration + event_handler_map_t::iterator iter_fd = m_event_handler_map.find(info.fd); + if (iter_fd == m_event_handler_map.end()) { + evh_logdbg("Adding new channel (fd %d, id %#x, handler %p)", info.fd, info.id, info.handler); + event_data_t map_value; + + map_value.type = EV_RDMA_CM; + map_value.rdma_cm_ev.n_ref_count = 1; + map_value.rdma_cm_ev.map_rdma_cm_id[info.id] = info.handler; + map_value.rdma_cm_ev.cma_channel = info.cma_channel; + + /* coverity[uninit_use_in_call] */ + /* cppcheck-suppress uninitStructMember */ + m_event_handler_map[info.fd] = map_value; + + update_epfd(info.fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI); + } + else { + BULLSEYE_EXCLUDE_BLOCK_START + if (iter_fd->second.type != EV_RDMA_CM) { + evh_logerr("fd=%d: is already handling events of different type", info.fd); + return; + } + event_handler_rdma_cm_map_t::iterator iter_id = 
iter_fd->second.rdma_cm_ev.map_rdma_cm_id.find(info.id); + if (iter_id == iter_fd->second.rdma_cm_ev.map_rdma_cm_id.end()) { + evh_logdbg("Adding to exitsing channel fd %d (id %#x, handler %p)", info.fd, info.id, info.handler); + iter_fd->second.rdma_cm_ev.map_rdma_cm_id[info.id] = info.handler; + iter_fd->second.rdma_cm_ev.n_ref_count++; + if (iter_fd->second.rdma_cm_ev.cma_channel != info.cma_channel) { + evh_logerr("Trying to change the channel processing cb's on a registered fd %d (by id %#x)", info.fd, info.id); + } + } + else { + evh_logerr("Channel-id pair <%d, %#x> already registered (handler %p)", info.fd, info.id, info.handler); + } + BULLSEYE_EXCLUDE_BLOCK_END + } +} + +void event_handler_manager::priv_unregister_rdma_cm_events(rdma_cm_reg_info_t& info) +{ + evh_logfunc_entry("fd=%d, id=%p", info.fd, info.id); + + event_handler_map_t::iterator iter_fd = m_event_handler_map.find(info.fd); + if (iter_fd != m_event_handler_map.end()) { + BULLSEYE_EXCLUDE_BLOCK_START + if (iter_fd->second.type != EV_RDMA_CM) { + evh_logerr("fd=%d: is already handling events of different type", info.fd); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + event_handler_rdma_cm_map_t::iterator iter_id = iter_fd->second.rdma_cm_ev.map_rdma_cm_id.find(info.id); + BULLSEYE_EXCLUDE_BLOCK_START + if (iter_id != iter_fd->second.rdma_cm_ev.map_rdma_cm_id.end()) { + BULLSEYE_EXCLUDE_BLOCK_END + evh_logdbg("Removing from channel %d, id %p", info.fd, info.id); + iter_fd->second.rdma_cm_ev.map_rdma_cm_id.erase(iter_id); + iter_fd->second.rdma_cm_ev.n_ref_count--; + if (iter_fd->second.rdma_cm_ev.n_ref_count == 0) { + update_epfd(info.fd, EPOLL_CTL_DEL, EPOLLIN | EPOLLPRI); + m_event_handler_map.erase(iter_fd); + evh_logdbg("Removed channel <%d %p>", info.fd, info.id); + } + } + else { + evh_logerr("Channel-id pair <%d %p> not found", info.fd, info.id); + } + } + else { + evh_logdbg("Channel %d not found", info.fd); + } +} + +void 
event_handler_manager::priv_register_command_events(command_reg_info_t& info) +{ + // In case this is new registration need to add netlink fd to the epfd + event_handler_map_t::iterator iter_fd = m_event_handler_map.find(info.fd); + if (iter_fd == m_event_handler_map.end()) { + evh_logdbg("Adding new channel (fd %d)", info.fd); + event_data_t map_value; + + map_value.type = EV_COMMAND; + map_value.command_ev.cmd = info.cmd; + + /* coverity[uninit_use_in_call] */ + /* cppcheck-suppress uninitStructMember */ + m_event_handler_map[info.fd] = map_value; + update_epfd(info.fd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI); + } + +} + +void event_handler_manager::priv_unregister_command_events(command_reg_info_t& info) +{ + + event_handler_map_t::iterator iter_fd = m_event_handler_map.find(info.fd); + if (iter_fd == m_event_handler_map.end()) { + evh_logdbg(" channel wasn't found (fd %d)", info.fd); + + } + else if(iter_fd->first != EV_COMMAND){ + evh_logdbg(" This fd (%d) no longer COMMAND type fd", info.fd); + } + else { + update_epfd(info.fd, EPOLL_CTL_DEL, EPOLLIN | EPOLLPRI); + } +} + +void event_handler_manager::handle_registration_action(reg_action_t& reg_action) +{ + if (!m_b_continue_running) + return; + + evh_logfunc("event action %d", reg_action.type); + switch (reg_action.type) { + case REGISTER_TIMER: + priv_register_timer_handler(reg_action.info.timer); + break; + case WAKEUP_TIMER: + priv_wakeup_timer_handler(reg_action.info.timer); + break; + case UNREGISTER_TIMER: + priv_unregister_timer_handler(reg_action.info.timer); + break; + case REGISTER_IBVERBS: + priv_register_ibverbs_events(reg_action.info.ibverbs); + break; + case UNREGISTER_IBVERBS: + priv_unregister_ibverbs_events(reg_action.info.ibverbs); + break; + case REGISTER_RDMA_CM: + priv_register_rdma_cm_events(reg_action.info.rdma_cm); + break; + case UNREGISTER_RDMA_CM: + priv_unregister_rdma_cm_events(reg_action.info.rdma_cm); + break; + case REGISTER_COMMAND: + 
priv_register_command_events(reg_action.info.cmd); + break; + case UNREGISTER_COMMAND: + priv_unregister_command_events(reg_action.info.cmd); + break; + case UNREGISTER_TIMERS_AND_DELETE: + priv_unregister_all_handler_timers(reg_action.info.timer); + delete reg_action.info.timer.handler; + reg_action.info.timer.handler = NULL; + break; + BULLSEYE_EXCLUDE_BLOCK_START + default: + evh_logerr("illegal event action! (%d)", reg_action.type); + break; + BULLSEYE_EXCLUDE_BLOCK_END + } + return; +} + +void event_handler_manager::query_for_ibverbs_event(int async_fd) +{ + evh_logfunc_entry(""); + + struct pollfd poll_fd; + event_handler_map_t::iterator i; + + poll_fd.events = POLLIN | POLLPRI; + poll_fd.revents = 0; + poll_fd.fd = async_fd; + + // ibverbs events should be read only from the internal thread context + if (pthread_self() != m_event_handler_tid) { + return; + } + + // Check for ready events + if (orig_os_api.poll(&poll_fd, 1, 0) <= 0) { + return; + } + + // Verify handler exists in map + if ((i = m_event_handler_map.find(async_fd)) == m_event_handler_map.end()) { + return; + } + + process_ibverbs_event(i); +} + +void event_handler_manager::statistics_print(int fd, vlog_levels_t log_level) +{ + if (m_b_continue_running && g_p_fd_collection) { + g_p_fd_collection->statistics_print(fd, log_level); + } +} + +void event_handler_manager::process_ibverbs_event(event_handler_map_t::iterator &i) +{ + evh_logfunc_entry(""); + + // + // Pre handling + // + struct ibv_context *hca = (struct ibv_context*)i->second.ibverbs_ev.channel; + struct ibv_async_event ibv_event; + + IF_VERBS_FAILURE(ibv_get_async_event(hca, &ibv_event)) { + vlog_levels_t _level = (errno == EBADF) ? 
VLOG_DEBUG : VLOG_ERROR; // EBADF may returned during plugout + vlog_printf(_level, "[%d] Received HCA event but failed to get it (errno=%d %m)\n", hca->async_fd, errno); + return; + } ENDIF_VERBS_FAILURE; + evh_logdbg("[%d] Received ibverbs event %s (%d)", hca->async_fd, priv_ibv_event_desc_str(ibv_event.event_type), ibv_event.event_type); + + // + // Notify Event to handlers + // + for (ibverbs_event_map_t::iterator pos = i->second.ibverbs_ev.ev_map.begin(); + pos != i->second.ibverbs_ev.ev_map.end(); pos++) { + pos->second.handler->handle_event_ibverbs_cb(&ibv_event, pos->second.user_data); + } + + evh_logdbg("[%d] Completed ibverbs event %s (%d)", hca->async_fd, priv_ibv_event_desc_str(ibv_event.event_type), ibv_event.event_type); + + // + // Post handling + // + ibv_ack_async_event(&ibv_event); +} + +void event_handler_manager::process_rdma_cm_event(event_handler_map_t::iterator &iter_fd) +{ + // Read the notification event channel + struct rdma_event_channel* cma_channel = (struct rdma_event_channel*)iter_fd->second.rdma_cm_ev.cma_channel; + struct rdma_cm_event* p_tmp_cm_event = NULL; + struct rdma_cm_event cma_event; + + evh_logfunc_entry("cma_channel %p (fd = %d)", cma_channel, cma_channel->fd); + + BULLSEYE_EXCLUDE_BLOCK_START + // Get rdma_cm event + if (rdma_get_cm_event(cma_channel, &p_tmp_cm_event)) { + evh_logerr("rdma_get_cm_event failed on cma_channel %d (fd = %d) (errno=%d %m)", cma_channel, cma_channel->fd, errno); + return; + } + if (!p_tmp_cm_event) { + evh_logpanic("rdma_get_cm_event succeeded but the returned event is NULL on cma_channel %d (fd = %d) (errno=%d %m)", cma_channel, cma_channel->fd, errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + + // Duplicate rdma_cm event to local memory + memcpy(&cma_event, p_tmp_cm_event, sizeof(cma_event)); + + // Ack rdma_cm event (free) + rdma_ack_cm_event(p_tmp_cm_event); + evh_logdbg("[%d] Received rdma_cm event %s (%d)", cma_channel->fd, priv_rdma_cm_event_type_str(cma_event.event), cma_event.event); + + 
 void* cma_id = (void*)cma_event.id; + if (cma_event.listen_id) // we assume that cma_listen_id != NULL in case of connect request + cma_id = (void*)cma_event.listen_id; + + + // Find registered event handler + if (cma_id != NULL) { + event_handler_rdma_cm_map_t::iterator iter_id = iter_fd->second.rdma_cm_ev.map_rdma_cm_id.find(cma_id); + if (iter_id != iter_fd->second.rdma_cm_ev.map_rdma_cm_id.end()) { + event_handler_rdma_cm* handler = iter_id->second; + + // Call the registered event handler with event to be handled + if (handler) + handler->handle_event_rdma_cm_cb(&cma_event); + } + else { + evh_logdbg("Can't find event_handler for ready event_handler_id %d (fd=%d)", cma_id, iter_fd->first); + } + } + + evh_logdbg("[%d] Completed rdma_cm event %s (%d)", cma_channel->fd, priv_rdma_cm_event_type_str(cma_event.event), cma_event.event); +} + + +/* +The main loop actions: + 1) update timeout + handle registers that their timeout expired + 2) epoll_wait + 3) handle new registrations/unregistrations + 4) update timeout + handle registers that their timeout expired + 5) handle new events +*/ + +void* event_handler_manager::thread_loop() +{ + int timeout_msec; + int maxevents = INITIAL_EVENTS_NUM; + struct pollfd poll_fd; + struct epoll_event* p_events = (struct epoll_event*)malloc(sizeof(struct epoll_event)*maxevents); + BULLSEYE_EXCLUDE_BLOCK_START + if(!p_events){ + evh_logdbg("malloc failure"); + throw_vma_exception("malloc failure"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + poll_fd.events = POLLIN | POLLPRI; + poll_fd.revents = 0; + while (m_b_continue_running) { +#ifdef VMA_TIME_MEASURE + if (g_inst_cnt >= m_n_sysvar_vma_time_measure_num_samples) + finit_instrumentation(safe_mce_sys().vma_time_measure_filename); +#endif + + // update timer and get timeout + timeout_msec = m_timer.update_timeout(); + if (timeout_msec == 0) { + // at least one timer has expired! 
+ m_timer.process_registered_timers(); + continue; + } + + if( m_b_sysvar_internal_thread_arm_cq_enabled && m_cq_epfd == 0 && g_p_net_device_table_mgr) { + m_cq_epfd = g_p_net_device_table_mgr->global_ring_epfd_get(); + if( m_cq_epfd > 0 ) { + epoll_event evt = {0, {0}}; + evt.events = EPOLLIN | EPOLLPRI; + evt.data.fd = m_cq_epfd; + orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, m_cq_epfd, &evt); + } + } + + uint64_t poll_sn = 0; + if( m_b_sysvar_internal_thread_arm_cq_enabled && m_cq_epfd > 0 && g_p_net_device_table_mgr) { + g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn, NULL); + int ret = g_p_net_device_table_mgr->global_ring_request_notification(poll_sn); + if (ret > 0) { + g_p_net_device_table_mgr->global_ring_poll_and_process_element(&poll_sn, NULL); + } + } + + // Make sure we sleep for a minimum of X milli seconds + if (timeout_msec > 0) { + if ((int)m_n_sysvar_timer_resolution_msec > timeout_msec) { + timeout_msec = m_n_sysvar_timer_resolution_msec; + } + } + + evh_logfuncall("calling orig_os_api.epoll with %d msec timeout", timeout_msec); + int ret = orig_os_api.epoll_wait(m_epfd, p_events, maxevents, timeout_msec); + if (ret < 0) { + evh_logfunc("epoll returned with error, errno=%d %m)", errno); + continue; + } + evh_logfuncall("orig_os_api.epoll found %d ready fds", ret); + + // check pipe + for (int idx = 0; (idx < ret) && (m_b_continue_running); ++idx) { + if(m_b_sysvar_internal_thread_arm_cq_enabled && p_events[idx].data.fd == m_cq_epfd && g_p_net_device_table_mgr){ + g_p_net_device_table_mgr->global_ring_wait_for_notification_and_process_element(&poll_sn, NULL); + } + else if (is_wakeup_fd(p_events[idx].data.fd)) { + // a request for registration was sent + reg_action_t reg_action; + while (1) { + m_reg_action_q_lock.lock(); + if (m_reg_action_q.empty()) { + return_from_sleep(); + remove_wakeup_fd(); + going_to_sleep(); + m_reg_action_q_lock.unlock(); + break; + } + reg_action = m_reg_action_q.front(); + 
m_reg_action_q.pop_front(); + m_reg_action_q_lock.unlock(); + handle_registration_action(reg_action); + } + break; + } + } + + if ((m_timer.update_timeout() == 0) && (m_b_continue_running)) { + // at least one timer has expired! + m_timer.process_registered_timers(); + } + + + // process ready event channels + for (int idx = 0; (idx < ret) && (m_b_continue_running); ++idx) { + //if (p_events[idx].events & (EPOLLERR|EPOLLHUP)) + // evh_logdbg("error in fd %d",p_events[idx].data.fd ); + + int fd = p_events[idx].data.fd; + + if(m_b_sysvar_internal_thread_arm_cq_enabled && fd == m_cq_epfd) continue; + + evh_logfunc("Processing fd %d", fd); + + if (is_wakeup_fd(fd)) // the pipe was already handled + continue; + + event_handler_map_t::iterator i = m_event_handler_map.find(fd); + if (i == m_event_handler_map.end()) { + // No event handler - this is probably a poll_os event! + if (!g_p_fd_collection->set_immediate_os_sample(fd)) { + evh_logdbg("No event handler (fd=%d)", fd); + } + continue; + } + + switch (i->second.type) { + case EV_RDMA_CM: + int result; + poll_fd.fd = fd; + result = orig_os_api.poll(&poll_fd, 1, 0); + if (result == 0) { + evh_logdbg("error in fd %d", fd); + break; + } + process_rdma_cm_event(i); + break; + case EV_IBVERBS: + process_ibverbs_event(i); + break; + case EV_COMMAND: + i->second.command_ev.cmd->execute(); + break; + BULLSEYE_EXCLUDE_BLOCK_START + default: + evh_logerr("Unknow event on fd=%d", fd); + BULLSEYE_EXCLUDE_BLOCK_END + } + } // for idx + + if (ret == maxevents) { + struct epoll_event* p_events_new; + // increase the events array + maxevents *= 2; + p_events_new = ( struct epoll_event*)realloc( (void *)p_events, sizeof(struct epoll_event) * maxevents); + BULLSEYE_EXCLUDE_BLOCK_START + if( !p_events_new) { + evh_logpanic("realloc failure") ; + } + p_events = p_events_new; + BULLSEYE_EXCLUDE_BLOCK_END + } + + } // while (m_b_continue_running) + + free(p_events); + + return 0; +} diff --git a/src/vma/event/event_handler_manager.h 
b/src/vma/event/event_handler_manager.h new file mode 100644 index 0000000..5bdfe2e --- /dev/null +++ b/src/vma/event/event_handler_manager.h @@ -0,0 +1,229 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef EVENT_HANDLER_MANAGER_H +#define EVENT_HANDLER_MANAGER_H + +#include +#include +#include "vlogger/vlogger.h" +#include "utils/lock_wrapper.h" +#include "vma/util/wakeup_pipe.h" +#include "vma/netlink/netlink_wrapper.h" +#include "vma/infra/subject_observer.h" +#include "vma/event/command.h" +#include "vma/event/delta_timer.h" +#include "vma/event/timers_group.h" + +class timer_handler; +class event_handler_ibverbs; +class event_handler_rdma_cm; + +typedef std::map event_handler_rdma_cm_map_t; + +typedef enum { + REGISTER_TIMER, + WAKEUP_TIMER, /* NOT AVAILABLE FOR GROUPED TIMERS */ + UNREGISTER_TIMER, + UNREGISTER_TIMERS_AND_DELETE, + REGISTER_IBVERBS, + UNREGISTER_IBVERBS, + REGISTER_RDMA_CM, + UNREGISTER_RDMA_CM, + REGISTER_COMMAND, + UNREGISTER_COMMAND +} event_action_type_e; + + +struct ibverbs_event_t { + event_handler_ibverbs* handler; + void* user_data; +}; + +struct rdma_cm_ev_t { + int n_ref_count; // number of event_handler on this fd + event_handler_rdma_cm_map_t map_rdma_cm_id; // each event_handler class maps with it's own event_handler_id (referenced as void*) + void* cma_channel; // meaning here for the rdma_event_channel object +}; + +typedef std::map ibverbs_event_map_t; + +struct ibverbs_ev_t { + int fd; + void* channel; + ibverbs_event_map_t ev_map; +}; + +struct command_ev_t { + command* cmd; +}; + +struct timer_reg_info_t { + timer_handler* handler; + void* node; + unsigned int timeout_msec; + void* user_data; + timers_group* group; + timer_req_type_t req_type; +}; + +struct ibverbs_reg_info_t { + event_handler_ibverbs* handler; + int fd; + void* channel; + void* user_data; +}; + +struct rdma_cm_reg_info_t { + event_handler_rdma_cm* handler; + int fd; + void* id; + void* cma_channel; +}; + +struct command_reg_info_t { + int fd; + command* cmd; +}; + +struct reg_action_t { + event_action_type_e type; + union { + timer_reg_info_t timer; + ibverbs_reg_info_t ibverbs; + rdma_cm_reg_info_t rdma_cm; + command_reg_info_t cmd; + } 
info; +}; + +typedef std::deque reg_action_q_t; + +enum ev_type { + EV_IBVERBS, + EV_RDMA_CM, + EV_COMMAND, +}; + + +struct event_data_t { + ev_type type; + ibverbs_ev_t ibverbs_ev; + rdma_cm_ev_t rdma_cm_ev; + command_ev_t command_ev; +}; + +typedef std::map event_handler_map_t; +typedef std::map timer_list_t; + + +/* +** Class event_handler_manager +** The event manager object listens on the registered channels and distributes the incoming events +** to the appropriate registered event_handlers objects by their registered id's. +** All registered objects must implememtn the event_handler class which is the registered callback function. +*/ +class event_handler_manager : public wakeup_pipe +{ +public: + event_handler_manager(); + ~event_handler_manager(); + + void* register_timer_event(int timeout_msec, timer_handler* handler, timer_req_type_t req_type, void* user_data, timers_group* group = NULL); + void wakeup_timer_event(timer_handler* handler, void* node); + void unregister_timer_event(timer_handler* handler, void* node); + void unregister_timers_event_and_delete(timer_handler* handler); + + void register_ibverbs_event(int fd, event_handler_ibverbs* handler, void* channel, void* user_data); + void unregister_ibverbs_event(int fd, event_handler_ibverbs* handler); + + void register_rdma_cm_event(int fd, void* id, void* cma_channel, event_handler_rdma_cm* handler); + void unregister_rdma_cm_event(int fd, void* id); + + void register_command_event(int fd, command* cmd); + + void* thread_loop(); + void stop_thread(); + bool is_running() {return m_b_continue_running; }; + + void update_epfd(int fd, int operation, int events); + void query_for_ibverbs_event(int async_fd); + void statistics_print(int fd, vlog_levels_t log_level); + +private: + pthread_t m_event_handler_tid; + bool m_b_continue_running; + int m_cq_epfd; + int m_epfd; + + // pipe for the event registration handling + reg_action_q_t m_reg_action_q; + lock_spin m_reg_action_q_lock; + timer m_timer; + + 
const bool m_b_sysvar_internal_thread_arm_cq_enabled; + const uint32_t m_n_sysvar_vma_time_measure_num_samples; + const uint32_t m_n_sysvar_timer_resolution_msec; + + event_handler_map_t m_event_handler_map; + + void priv_register_timer_handler(timer_reg_info_t& info); + void priv_wakeup_timer_handler(timer_reg_info_t& info); + void priv_unregister_timer_handler(timer_reg_info_t& info); + void priv_unregister_all_handler_timers(timer_reg_info_t& info); + void priv_register_ibverbs_events(ibverbs_reg_info_t& info); + void priv_unregister_ibverbs_events(ibverbs_reg_info_t& info); + void priv_register_rdma_cm_events(rdma_cm_reg_info_t& info); + void priv_unregister_rdma_cm_events(rdma_cm_reg_info_t& info); + void priv_register_command_events(command_reg_info_t& info); + void priv_unregister_command_events(command_reg_info_t& info); + void priv_prepare_ibverbs_async_event_queue(event_handler_map_t::iterator& i); + + const char* reg_action_str(event_action_type_e reg_action_type); + void post_new_reg_action(reg_action_t& reg_action); + void handle_registration_action(reg_action_t& reg_action); + void process_ibverbs_event(event_handler_map_t::iterator &i); + void process_rdma_cm_event(event_handler_map_t::iterator &i); + int start_thread(); + + void event_channel_post_process_for_rdma_events(void* p_event); + void* event_channel_pre_process_for_rdma_events(void* p_event_channel_handle, void** p_event); + + void free_evh_resources(void); +}; + + +extern event_handler_manager* g_p_event_handler_manager; + +extern pthread_t g_n_internal_thread_id; + +#endif diff --git a/src/vma/event/event_handler_rdma_cm.h b/src/vma/event/event_handler_rdma_cm.h new file mode 100644 index 0000000..bf1a240 --- /dev/null +++ b/src/vma/event/event_handler_rdma_cm.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef EVENT_HANDLER_RDMA_CM_H +#define EVENT_HANDLER_RDMA_CM_H + +#include + +/* + * @class event_handler + * An object registers with event_handler_manager to get event notification callbacks for the registered rdma_cm id's. + * This callback function will be called when an event was received on the appropritae channel with the appropritae id. + * The channels can be shared between several objects, but the id's in each channel has to be unic. 
+ */ +class event_handler_rdma_cm +{ +public: + virtual ~event_handler_rdma_cm() {}; + virtual void handle_event_rdma_cm_cb(struct rdma_cm_event* p_event) = 0; +}; + +#endif //EVENT_HANDLER_RDMA_CM_H diff --git a/src/vma/event/netlink_event.cpp b/src/vma/event/netlink_event.cpp new file mode 100644 index 0000000..298d692 --- /dev/null +++ b/src/vma/event/netlink_event.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "netlink_event.h" +#include "vlogger/vlogger.h" +#include +#include +#include "stdio.h" +#include "vma/netlink/netlink_compatibility.h" + +#define TOSTR_MAX_SIZE 4096 + +netlink_event::netlink_event(struct nlmsghdr* hdr, void* notifier) : + event(notifier), nl_type(0), nl_pid(0), nl_seq(0) +{ + if (hdr) { + nl_type = hdr->nlmsg_type; + nl_pid = hdr->nlmsg_pid; + nl_seq = hdr->nlmsg_seq; + } + +} + +const std::string netlink_event::to_str() const +{ + char outstr[TOSTR_MAX_SIZE]; + sprintf(outstr, "%s. NETLINK: TYPE=%u, PID=%u SEQ=%u", + event::to_str().c_str(), nl_type, nl_pid, + nl_seq); + return std::string(outstr); +} + +const std::string neigh_nl_event::to_str() const +{ + char outstr[TOSTR_MAX_SIZE]; + sprintf(outstr, + "%s. NEIGH: DST_ADDR=%s LINK_ADDR=%s FLAGS=%u IFINDEX=%d STATE=%d TYPE=%d", + netlink_event::to_str().c_str(), + m_neigh_info->dst_addr_str.c_str(), + m_neigh_info->lladdr_str.c_str(), m_neigh_info->flags, + m_neigh_info->ifindex, m_neigh_info->state, + m_neigh_info->type); + return std::string(outstr); + +} + +const std::string route_nl_event::to_str() const +{ + char outstr[TOSTR_MAX_SIZE]; + route_val* p_route_val = m_route_info->get_route_val(); + if (p_route_val) { + sprintf(outstr, + "%s. 
ROUTE: TABBLE=%u SCOPE=%u PROTOCOL=%u DST_ADDR=%u DST_PREFIX=%u TYPE=%u PREF_SRC=%u IFF_NAME=%s", + netlink_event::to_str().c_str(), p_route_val->get_table_id(), + p_route_val->get_scope(), p_route_val->get_protocol(), + p_route_val->get_dst_addr(), p_route_val->get_dst_pref_len(), + p_route_val->get_type(), p_route_val->get_src_addr(), + p_route_val->get_if_name()); + } + else { + sprintf(outstr, "Error in parsing netlink event"); + } + return std::string(outstr); +} + +neigh_nl_event::neigh_nl_event(struct nlmsghdr* hdr, struct rtnl_neigh* neigh, + void* notifier) : + netlink_event(hdr, notifier), m_neigh_info(NULL) +{ + m_neigh_info = new netlink_neigh_info(neigh); + if ((!hdr) && (neigh)) { + nl_type = rtnl_neigh_get_type(neigh); + } +} + +neigh_nl_event::~neigh_nl_event() { + if (m_neigh_info) + delete m_neigh_info; +} + +route_nl_event::route_nl_event(struct nlmsghdr* hdr, struct rtnl_route* route, + void* notifier) : + netlink_event(hdr, notifier), m_route_info(NULL) +{ + m_route_info = new netlink_route_info(route); +} + +route_nl_event::~route_nl_event() +{ + if (m_route_info) + delete m_route_info; +} +link_nl_event::link_nl_event(struct nlmsghdr* hdr, struct rtnl_link* rt_link, + void* notifier) : + netlink_event(hdr, notifier) +{ + m_link_info = new netlink_link_info(rt_link); +} + +link_nl_event::~link_nl_event() { + if (m_link_info) + delete m_link_info; +} + +const std::string link_nl_event::to_str() const +{ + char outstr[TOSTR_MAX_SIZE]; + sprintf(outstr, + //"%s. LINK: ARPTYPE=%u BROADCAST=%s ADDR_FAMILY=%d FLAGS=%u IFINDEX=%d MODE=%u MASTER_IFINDEX=%d MTU=%u NAME=%s OPERSTATE=%u TXQLEN=%u", + "%s. 
LINK: BROADCAST=%s ADDR_FAMILY=%d FLAGS=%u IFINDEX=%d MASTER_IFINDEX=%d MTU=%u NAME=%s OPERSTATE=%s TXQLEN=%u", + netlink_event::to_str().c_str(),/* m_link_info->arptype,*/ + m_link_info->broadcast_str.c_str(), m_link_info->addr_family, + m_link_info->flags, m_link_info->ifindex, + /*m_link_info->mode,*/ m_link_info->master_ifindex, + m_link_info->mtu, m_link_info->name.c_str(), + m_link_info->get_operstate2str().c_str(), m_link_info->txqlen); + + return std::string(outstr); +} diff --git a/src/vma/event/netlink_event.h b/src/vma/event/netlink_event.h new file mode 100644 index 0000000..33c4982 --- /dev/null +++ b/src/vma/event/netlink_event.h @@ -0,0 +1,154 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef NETLINK_EVENT_H_ +#define NETLINK_EVENT_H_ + +#include +#include "config.h" +#include "vma/event/event.h" +#include "vma/netlink/neigh_info.h" +#include "vma/netlink/route_info.h" +#include "vma/netlink/link_info.h" +#include + +class netlink_link; +class netlink_neigh; + +class netlink_event: public event +{ +public: + netlink_event(struct nlmsghdr* hdr, void* notifier); + virtual ~netlink_event() { } + + + /* netlink route family message types: + RTM_DELLINK + RTM_GETLINK + RTM_SETLINK + RTM_NEWADDR + RTM_DELADDR + RTM_GETADDR + RTM_NEWROUTE + RTM_DELROUTE + RTM_GETROUTE + RTM_NEWNEIGH + RTM_DELNEIGH + RTM_GETNEIGH + RTM_NEWRULE + RTM_DELRULE + RTM_GETRULE + RTM_NEWQDISC + RTM_DELQDISC + RTM_GETQDISC + RTM_NEWTCLASS + RTM_DELTCLASS + RTM_GETTCLASS + RTM_NEWTFILTER + RTM_DELTFILTER + RTM_GETTFILTER + RTM_NEWACTION + RTM_DELACTION + RTM_GETACTION + RTM_NEWPREFIX + RTM_GETPREFIX + RTM_GETMULTICAS + RTM_GETANYCAST + RTM_NEWNEIGHTBL + RTM_GETNEIGHTBL + RTM_SETNEIGHTBL + RTM_NEWADDRLABEL + RTM_DELADDRLABEL + RTM_GETADDRLABEL + */ + uint16_t nl_type; + + uint32_t nl_pid; + uint32_t nl_seq; + + virtual const std::string to_str() const; + + +}; + +class neigh_nl_event: public netlink_event +{ +public: + neigh_nl_event(struct nlmsghdr* hdr, struct rtnl_neigh* neigh, + void* notifier); + + virtual ~neigh_nl_event(); + + virtual const std::string to_str() const; + + const netlink_neigh_info* get_neigh_info() const { return m_neigh_info; } + +private: + netlink_neigh_info* m_neigh_info; + +}; + +class route_nl_event: public netlink_event +{ +public: + route_nl_event(struct nlmsghdr* hdr, struct rtnl_route* route, + void* notifier); + + virtual ~route_nl_event(); + + virtual const 
std::string to_str() const; + + netlink_route_info* get_route_info() const { return m_route_info; } + +private: + netlink_route_info* m_route_info; + +}; + +class link_nl_event: public netlink_event +{ +public: + link_nl_event(struct nlmsghdr* hdr, struct rtnl_link* rt_link, + void* notifier); + + virtual ~link_nl_event(); + + virtual const std::string to_str() const; + + const netlink_link_info* get_link_info() const { return m_link_info; } + +private: + netlink_link_info* m_link_info; +}; + +#endif /* NETLINK_EVENT_H_ */ diff --git a/src/vma/event/timer_handler.h b/src/vma/event/timer_handler.h new file mode 100644 index 0000000..76a5e3c --- /dev/null +++ b/src/vma/event/timer_handler.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef TIMER_HANDLER_H +#define TIMER_HANDLER_H + +/** + * simple timer notification. + * Any class that inherit timer_handler should also inherit cleanable_obj, and use clean_obj instead of delete. + * It must implement the clean_obj method to delete the object from the internal thread. + */ +class timer_handler +{ +public: + virtual ~timer_handler() {}; + virtual void handle_timer_expired(void* user_data) = 0; +}; + +#endif diff --git a/src/vma/event/timers_group.h b/src/vma/event/timers_group.h new file mode 100644 index 0000000..0141ca6 --- /dev/null +++ b/src/vma/event/timers_group.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef TIMERS_GROUP_H +#define TIMERS_GROUP_H + +/* + * This is an API for batching timers into groups. + * Instead of registering each timer separately into the internal thread, the group is registered once, + * and the timers are registered to the group. + * The registration to the group is still done through the internal thread. + * The group must be deleted through the internal thread (must implement clean_obj interface). + * Registering to group must be used with register_timer_event() and unregister_timer_event() only. + */ +class timers_group : public timer_handler { +public: + virtual ~timers_group() {}; + // execute all the timers registered to the group + // according to the internal group logic. + virtual void handle_timer_expired(void* user_data) = 0; + +protected: + friend class event_handler_manager; + // add a new timer + virtual void add_new_timer(timer_node_t* node, timer_handler* handler, void* user_data) = 0; + + // remove timer from list and free it. + // called for stopping (unregistering) a timer + virtual void remove_timer(timer_node_t* node) = 0; +}; + +#endif diff --git a/src/vma/event/vlogger_timer_handler.cpp b/src/vma/event/vlogger_timer_handler.cpp new file mode 100644 index 0000000..1006459 --- /dev/null +++ b/src/vma/event/vlogger_timer_handler.cpp @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include + +#include "vlogger_timer_handler.h" +#include "timer_handler.h" +#include "event_handler_manager.h" + +vlogger_timer_handler* g_p_vlogger_timer_handler = NULL; + +vlogger_timer_handler::vlogger_timer_handler():m_timer_handle(NULL) +{ + if (g_p_event_handler_manager) { + /* failure in allocating m_timer_handle will result in throwing an exception by called methods */ + m_timer_handle = g_p_event_handler_manager->register_timer_event(UPDATE_VLOGGER_LEVELS_INTERVAL, this, PERIODIC_TIMER, 0); + } +} + +vlogger_timer_handler::~vlogger_timer_handler() +{ + if (m_timer_handle) { + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = NULL; + } +} + +void vlogger_timer_handler::handle_timer_expired(void* user_data) +{ + NOT_IN_USE(user_data); + if (g_p_vlogger_level) + g_vlogger_level = *g_p_vlogger_level; + if (g_p_vlogger_details) + g_vlogger_details = *g_p_vlogger_details; +} + + + + + + diff --git a/src/vma/event/vlogger_timer_handler.h b/src/vma/event/vlogger_timer_handler.h new file mode 100644 index 0000000..56fbd71 --- /dev/null +++ b/src/vma/event/vlogger_timer_handler.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef VLOGGER_TIMER_HANDLER_H +#define VLOGGER_TIMER_HANDLER_H + +#include "timer_handler.h" + +#define UPDATE_VLOGGER_LEVELS_INTERVAL 100 + +class vlogger_timer_handler : public timer_handler +{ +public: + vlogger_timer_handler(); + ~vlogger_timer_handler(); +private: + void handle_timer_expired(void* user_data); + + void* m_timer_handle; +}; + +extern vlogger_timer_handler* g_p_vlogger_timer_handler; + +#endif /*VLOGGER_TIMER_HANDLER_H*/ diff --git a/src/vma/ib/base/verbs_extra.cpp b/src/vma/ib/base/verbs_extra.cpp new file mode 100644 index 0000000..94c3587 --- /dev/null +++ b/src/vma/ib/base/verbs_extra.cpp @@ -0,0 +1,427 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "vma/ib/base/verbs_extra.h" + +#include +#include + +#include "vma_extra.h" +#include "vma/util/valgrind.h" + +#undef MODULE_NAME +#define MODULE_NAME "verbs_extra:" + +// See - IB Arch Spec - 11.6.2 COMPLETION RETURN STATUS +const char* priv_ibv_wc_status_str(enum ibv_wc_status status) +{ + BULLSEYE_EXCLUDE_BLOCK_START + switch (status) { + case IBV_WC_SUCCESS: return "IBV_WC_SUCCESS"; + case IBV_WC_LOC_LEN_ERR: return "IBV_WC_LOC_LEN_ERR"; + case IBV_WC_LOC_QP_OP_ERR: return "IBV_WC_LOC_QP_OP_ERR"; + case IBV_WC_LOC_EEC_OP_ERR: return "IBV_WC_LOC_EEC_OP_ERR"; + case IBV_WC_LOC_PROT_ERR: return "IBV_WC_LOC_PROT_ERR"; + case IBV_WC_WR_FLUSH_ERR: return "IBV_WC_WR_FLUSH_ERR"; + case IBV_WC_MW_BIND_ERR: return "IBV_WC_MW_BIND_ERR"; + case IBV_WC_BAD_RESP_ERR: return "IBV_WC_BAD_RESP_ERR"; + case IBV_WC_LOC_ACCESS_ERR: return "IBV_WC_LOC_ACCESS_ERR"; + case IBV_WC_REM_INV_REQ_ERR: return "IBV_WC_REM_INV_REQ_ERR"; + case IBV_WC_REM_ACCESS_ERR: return "IBV_WC_REM_ACCESS_ERR"; + case IBV_WC_REM_OP_ERR: return "IBV_WC_REM_OP_ERR"; + case IBV_WC_RETRY_EXC_ERR: return "IBV_WC_RETRY_EXC_ERR"; + case IBV_WC_RNR_RETRY_EXC_ERR: return "IBV_WC_RNR_RETRY_EXC_ERR"; + case IBV_WC_LOC_RDD_VIOL_ERR: return "IBV_WC_LOC_RDD_VIOL_ERR"; + case IBV_WC_REM_INV_RD_REQ_ERR: return "IBV_WC_REM_INV_RD_REQ_ERR"; + case IBV_WC_REM_ABORT_ERR: return "IBV_WC_REM_ABORT_ERR"; + case IBV_WC_INV_EECN_ERR: return "IBV_WC_INV_EECN_ERR"; + case IBV_WC_INV_EEC_STATE_ERR: return "IBV_WC_INV_EEC_STATE_ERR"; + case IBV_WC_FATAL_ERR: return "IBV_WC_FATAL_ERR"; + case IBV_WC_RESP_TIMEOUT_ERR: return "IBV_WC_RESP_TIMEOUT_ERR"; + case IBV_WC_GENERAL_ERR: return "IBV_WC_GENERAL_ERR"; + default: break; + } + return "IBV_WC_UNKNOWN"; + BULLSEYE_EXCLUDE_BLOCK_END +} + +// See - IB Arch Spec - 11.6.3 ASYNCHRONOUS EVENTS +const char* priv_ibv_event_desc_str(enum ibv_event_type type) +{ + BULLSEYE_EXCLUDE_BLOCK_START + switch (type) { + case IBV_EVENT_CQ_ERR: return "CQ_ERR"; + case 
IBV_EVENT_QP_FATAL: return "QP_FATAL"; + case IBV_EVENT_QP_REQ_ERR: return "QP_REQ_ERR"; + case IBV_EVENT_QP_ACCESS_ERR: return "QP_ACCESS_ERR"; + case IBV_EVENT_COMM_EST: return "COMM_EST"; + case IBV_EVENT_SQ_DRAINED: return "SQ_DRAINED"; + case IBV_EVENT_PATH_MIG: return "PATH_MIG"; + case IBV_EVENT_PATH_MIG_ERR: return "PATH_MIG_ERR"; + case IBV_EVENT_DEVICE_FATAL: return "DEVICE_FATAL"; + case IBV_EVENT_PORT_ACTIVE: return "PORT_ACTIVE"; + case IBV_EVENT_PORT_ERR: return "PORT_ERR"; + case IBV_EVENT_LID_CHANGE: return "LID_CHANGE"; + case IBV_EVENT_PKEY_CHANGE: return "PKEY_CHANGE"; + case IBV_EVENT_SM_CHANGE: return "SM_CHANGE"; + case IBV_EVENT_SRQ_ERR: return "SRQ_ERR"; + case IBV_EVENT_SRQ_LIMIT_REACHED: return "SRQ_LIMIT_REACHED"; + case IBV_EVENT_QP_LAST_WQE_REACHED: return "QP_LAST_WQE_REACHED"; + case IBV_EVENT_CLIENT_REREGISTER: return "CLIENT_REREGISTER"; + case IBV_EVENT_GID_CHANGE: return "GID_CHANGE"; + default: break; + } + return "UNKNOWN"; + BULLSEYE_EXCLUDE_BLOCK_END +} + +int priv_ibv_find_pkey_index(struct ibv_context *verbs, uint8_t port_num, uint16_t pkey, uint16_t *pkey_index) +{ + int ret, i; + uint16_t chk_pkey = 0; + for (i = 0, ret = 0; !ret; i++) { + ret = ibv_query_pkey(verbs, port_num, i, &chk_pkey); + if (!ret && pkey == chk_pkey) { + *pkey_index = (uint16_t)i; + return 0; + } + } + return -1; +} + +int priv_ibv_modify_qp_to_err(struct ibv_qp *qp) +{ + vma_ibv_qp_attr qp_attr; + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IBV_QPS_ERR; + BULLSEYE_EXCLUDE_BLOCK_START + IF_VERBS_FAILURE_EX(vma_ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE), EIO) { + return -1; + } ENDIF_VERBS_FAILURE; + BULLSEYE_EXCLUDE_BLOCK_END + + return 0; +} + +int priv_ibv_modify_qp_to_reset(struct ibv_qp *qp) +{ + vma_ibv_qp_attr qp_attr; + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IBV_QPS_RESET; + BULLSEYE_EXCLUDE_BLOCK_START + IF_VERBS_FAILURE(vma_ibv_modify_qp(qp, &qp_attr, IBV_QP_STATE)) { + return -1; + } ENDIF_VERBS_FAILURE; + 
BULLSEYE_EXCLUDE_BLOCK_END + return 0; +} + +int priv_ibv_modify_qp_from_err_to_init_raw(struct ibv_qp *qp, uint8_t port_num) +{ + vma_ibv_qp_attr qp_attr; + + if (qp->qp_type != IBV_QPT_RAW_PACKET) + return -1; + + if (priv_ibv_query_qp_state(qp) != IBV_QPS_RESET) { + if (priv_ibv_modify_qp_to_reset(qp)) { + return -2; + } + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IBV_QPS_INIT; + qp_attr.port_num = port_num; + BULLSEYE_EXCLUDE_BLOCK_START + IF_VERBS_FAILURE(vma_ibv_modify_qp(qp, &qp_attr, (ibv_qp_attr_mask)(IBV_QP_STATE | IBV_QP_PORT))) { + return -3; + } ENDIF_VERBS_FAILURE; + BULLSEYE_EXCLUDE_BLOCK_END + + return 0; +} + +int priv_ibv_modify_qp_from_err_to_init_ud(struct ibv_qp *qp, uint8_t port_num, uint16_t pkey_index, uint32_t underly_qpn) +{ + vma_ibv_qp_attr qp_attr; + ibv_qp_attr_mask qp_attr_mask = (ibv_qp_attr_mask)IBV_QP_STATE; + + if (qp->qp_type != IBV_QPT_UD) + return -1; + + if (priv_ibv_query_qp_state(qp) != IBV_QPS_RESET) { + if (priv_ibv_modify_qp_to_reset(qp)) { + return -2; + } + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IBV_QPS_INIT; + if (0 == underly_qpn) { + qp_attr_mask = (ibv_qp_attr_mask)(qp_attr_mask | IBV_QP_QKEY | IBV_QP_PKEY_INDEX | IBV_QP_PORT); + qp_attr.qkey = IPOIB_QKEY; + qp_attr.pkey_index = pkey_index; + qp_attr.port_num = port_num; + } + + BULLSEYE_EXCLUDE_BLOCK_START + IF_VERBS_FAILURE(vma_ibv_modify_qp(qp, &qp_attr, qp_attr_mask)) { + return -3; + } ENDIF_VERBS_FAILURE; + BULLSEYE_EXCLUDE_BLOCK_END + + return 0; +} + +int priv_ibv_modify_qp_from_init_to_rts(struct ibv_qp *qp, uint32_t underly_qpn) +{ + vma_ibv_qp_attr qp_attr; + ibv_qp_attr_mask qp_attr_mask = (ibv_qp_attr_mask)IBV_QP_STATE; + + if (priv_ibv_query_qp_state(qp) != IBV_QPS_INIT) { + return -1; + } + + memset(&qp_attr, 0, sizeof(qp_attr)); + qp_attr.qp_state = IBV_QPS_RTR; + BULLSEYE_EXCLUDE_BLOCK_START + IF_VERBS_FAILURE(vma_ibv_modify_qp(qp, &qp_attr, qp_attr_mask)) { + return -2; + } ENDIF_VERBS_FAILURE; + 
BULLSEYE_EXCLUDE_BLOCK_END
+
+ qp_attr.qp_state = IBV_QPS_RTS;
+
+ if ((qp->qp_type == IBV_QPT_UD) && (0 == underly_qpn)) {
+ qp_attr_mask = (ibv_qp_attr_mask)(qp_attr_mask | IBV_QP_SQ_PSN);
+ qp_attr.sq_psn = 0;
+ }
+
+ BULLSEYE_EXCLUDE_BLOCK_START
+ IF_VERBS_FAILURE(vma_ibv_modify_qp(qp, &qp_attr, qp_attr_mask)) {
+ return -3;
+ } ENDIF_VERBS_FAILURE;
+ BULLSEYE_EXCLUDE_BLOCK_END
+
+ return 0;
+}
+
+// Return 'ibv_qp_state' of the ibv_qp
+// (or -1 when ibv_query_qp() itself fails).
+int priv_ibv_query_qp_state(struct ibv_qp *qp)
+{
+ struct ibv_qp_attr qp_attr;
+ struct ibv_qp_init_attr qp_init_attr;
+ BULLSEYE_EXCLUDE_BLOCK_START
+ IF_VERBS_FAILURE(ibv_query_qp(qp, &qp_attr, IBV_QP_STATE, &qp_init_attr)) {
+ return -1;
+ } ENDIF_VERBS_FAILURE;
+ BULLSEYE_EXCLUDE_BLOCK_END
+ VALGRIND_MAKE_MEM_DEFINED(&qp_attr, sizeof(qp_attr));
+ return (ibv_qp_state)qp_attr.qp_state;
+}
+
+// Capability probe: returns 0 if packet-pacing burst attributes can actually
+// be applied to this QP. NOTE: this is destructive to the QP state -- it
+// drives the QP through INIT->RTS and sets a trial rate limit {1000, 100, 100}.
+int priv_ibv_query_burst_supported(struct ibv_qp *qp, uint8_t port_num)
+{
+#ifdef DEFINED_IBV_QP_SUPPORT_BURST
+ if (priv_ibv_modify_qp_from_err_to_init_raw(qp, port_num) == 0) {
+ if (priv_ibv_modify_qp_from_init_to_rts(qp, 0) == 0) {
+ struct vma_rate_limit_t rate = {1000, 100, 100};
+ if (priv_ibv_modify_qp_ratelimit(qp, rate, RL_RATE | RL_BURST_SIZE | RL_PKT_SIZE) == 0){
+ return 0;
+ }
+ }
+ }
+
+#else
+ NOT_IN_USE(qp);
+ NOT_IN_USE(port_num);
+#endif
+
+ return -1;
+}
+
+// Capability probe: returns 0 if a flow-steering rule carrying a flow-tag
+// action (L2+L3+L4 specs + action_tag) can be attached to the QP, -1 otherwise.
+// The trial flow is destroyed immediately after creation succeeds.
+int priv_ibv_query_flow_tag_supported(struct ibv_qp *qp, uint8_t port_num)
+{
+ NOT_IN_USE(qp);
+ NOT_IN_USE(port_num);
+ int res = -1;
+
+#ifdef DEFINED_IBV_FLOW_TAG
+
+ // Create
+ struct {
+ vma_ibv_flow_attr attr;
+ vma_ibv_flow_spec_eth eth;
+ vma_ibv_flow_spec_ipv4 ipv4;
+ vma_ibv_flow_spec_tcp_udp tcp_udp;
+ vma_ibv_flow_spec_action_tag flow_tag;
+ } ft_attr;
+
+ // Initialize
+ memset(&ft_attr, 0, sizeof(ft_attr));
+ ft_attr.attr.size = sizeof(ft_attr);
+ ft_attr.attr.num_of_specs = 4;
+ ft_attr.attr.type = VMA_IBV_FLOW_ATTR_NORMAL;
+ ft_attr.attr.priority = 1; // almost highest priority, 0 is used for 5-tuple later
+ ft_attr.attr.port = port_num;
+
+ // 
Set filters
+ uint8_t mac_0[ETH_ALEN] = { 0x00, 0x00, 0x00, 0x00, 0x00, 0x00 };
+ uint8_t mac_f[ETH_ALEN] = { 0xFF, 0xFF, 0xFF, 0xFF, 0xFF, 0xFF };
+
+ ibv_flow_spec_eth_set(&ft_attr.eth, mac_0 , 0); // L2 filter
+ memcpy(ft_attr.eth.val.src_mac, mac_f, ETH_ALEN);
+ memset(ft_attr.eth.mask.src_mac, FS_MASK_ON_8, ETH_ALEN);
+
+ ibv_flow_spec_ipv4_set(&ft_attr.ipv4, INADDR_LOOPBACK, INADDR_LOOPBACK); // L3 filter
+ ibv_flow_spec_tcp_udp_set(&ft_attr.tcp_udp, true, 0, 0); // L4 filter
+ ibv_flow_spec_flow_tag_set(&ft_attr.flow_tag, FLOW_TAG_MASK-1); // enable flow tag
+
+ // Create flow
+ vma_ibv_flow *ibv_flow = vma_ibv_create_flow(qp, &ft_attr.attr);
+ if (ibv_flow) {
+ res = 0;
+ vma_ibv_destroy_flow(ibv_flow);
+ }
+#endif // DEFINED_IBV_FLOW_TAG
+
+ return res;
+}
+
+// Capability probe: returns 0 if a plain L3+L4 flow-steering rule can be
+// attached to the QP (no flow tag), -1 otherwise. The trial flow targets the
+// loopback address and is destroyed immediately after creation succeeds.
+int priv_ibv_create_flow_supported(struct ibv_qp *qp, uint8_t port_num)
+{
+ int res = -1;
+
+ struct {
+ vma_ibv_flow_attr attr;
+ vma_ibv_flow_spec_ipv4 ipv4;
+ vma_ibv_flow_spec_tcp_udp tcp_udp;
+ } cf_attr;
+
+ // Initialize
+ memset(&cf_attr, 0, sizeof(cf_attr));
+ cf_attr.attr.size = sizeof(cf_attr);
+ cf_attr.attr.num_of_specs = 2;
+ cf_attr.attr.type = VMA_IBV_FLOW_ATTR_NORMAL;
+ cf_attr.attr.priority = 1; // almost highest priority, 0 is used for 5-tuple later
+ cf_attr.attr.port = port_num;
+
+ ibv_flow_spec_ipv4_set(&cf_attr.ipv4, INADDR_LOOPBACK, INADDR_LOOPBACK); // L3 filter
+ ibv_flow_spec_tcp_udp_set(&cf_attr.tcp_udp, true, 0, 0); // L4 filter
+
+ // Create flow
+ vma_ibv_flow *ibv_flow = vma_ibv_create_flow(qp, &cf_attr.attr);
+ if (ibv_flow) {
+ res = 0;
+ vma_ibv_destroy_flow(ibv_flow);
+ }
+
+ return res;
+}
+
+// Best-effort wrapper around librdmacm's rdma_lib_reset() (needed after fork);
+// compiled to a no-op returning 0 when the symbol is not available.
+int vma_rdma_lib_reset() {
+#ifdef HAVE_RDMA_LIB_RESET
+ vlog_printf(VLOG_DEBUG, "rdma_lib_reset called\n");
+ return rdma_lib_reset();
+#else
+ vlog_printf(VLOG_DEBUG, "rdma_lib_reset doesn't exist returning 0\n");
+ return 0;
+#endif
+}
+
+// be advised that this method will change packet pacing value and also change state to RTS
+// Requires the QP to already be in RTS; returns 0 on success, -1 when the QP
+// is not in RTS, -2 when the rate-limit modify call fails. Which attributes
+// are applied is selected by the rl_changes bitmask (RL_RATE/RL_BURST_SIZE/RL_PKT_SIZE).
+int priv_ibv_modify_qp_ratelimit(struct 
ibv_qp *qp, struct vma_rate_limit_t &rate_limit, uint32_t rl_changes)
+{
+#ifdef DEFINED_IBV_PACKET_PACING_CAPS
+ vma_ibv_rate_limit_attr qp_attr;
+ uint64_t attr_mask = IBV_QP_STATE;
+
+ if (priv_ibv_query_qp_state(qp) != IBV_QPS_RTS) {
+ vlog_printf(VLOG_DEBUG, "failed querying QP\n");
+ return -1;
+ }
+ memset(&qp_attr, 0, sizeof(qp_attr));
+ vma_ibv_init_qps_attr(qp_attr);
+
+ // Apply the rate only when requested AND non-zero (0 would disable pacing).
+ if (rate_limit.rate && (rl_changes & RL_RATE)) {
+ qp_attr.rate_limit = rate_limit.rate;
+ attr_mask |= VMA_IBV_QP_RATE_LIMIT;
+ }
+#ifdef DEFINED_IBV_QP_SUPPORT_BURST
+ // Burst attributes require both max_burst_sz and typical_pkt_sz to be set.
+ if (rate_limit.max_burst_sz && rate_limit.typical_pkt_sz && (rl_changes & (RL_BURST_SIZE | RL_PKT_SIZE))) {
+ vma_ibv_init_burst_attr(qp_attr, rate_limit);
+ }
+#endif
+ BULLSEYE_EXCLUDE_BLOCK_START
+ IF_VERBS_FAILURE(vma_ibv_modify_qp_rate_limit(qp, &qp_attr, attr_mask)) {
+ vlog_printf(VLOG_DEBUG, "failed setting rate limit\n");
+ return -2;
+ } ENDIF_VERBS_FAILURE;
+ BULLSEYE_EXCLUDE_BLOCK_END
+#ifdef DEFINED_IBV_QP_SUPPORT_BURST
+ vlog_printf(VLOG_DEBUG, "qp was set to rate limit %d, burst size %d, packet size %d\n",
+ rate_limit.rate, rate_limit.max_burst_sz, rate_limit.typical_pkt_sz);
+#else
+ vlog_printf(VLOG_DEBUG, "qp was set to rate limit %d\n", rate_limit.rate);
+#endif
+ return 0;
+#else
+ vlog_printf(VLOG_DEBUG, "rate limit not supported\n");
+ NOT_IN_USE(qp);
+ NOT_IN_USE(rate_limit);
+ NOT_IN_USE(rl_changes);
+ return 0;
+#endif // DEFINED_IBV_PACKET_PACING_CAPS
+}
+
+// Set CQ interrupt moderation: fire a completion event only after 'count'
+// completions or 'period' (usec per verbs convention -- TODO confirm units
+// against the installed rdma-core) has elapsed. Compiled to a no-op when the
+// installed verbs lacks CQ moderation support; failures are logged, not returned.
+void priv_ibv_modify_cq_moderation(struct ibv_cq* cq, uint32_t period, uint32_t count)
+{
+#ifdef DEFINED_IBV_CQ_ATTR_MODERATE
+ vma_ibv_cq_attr cq_attr;
+ memset(&cq_attr, 0, sizeof(cq_attr));
+ vma_cq_attr_mask(cq_attr) = VMA_IBV_CQ_MODERATION;
+ vma_cq_attr_moderation(cq_attr).cq_count = count;
+ vma_cq_attr_moderation(cq_attr).cq_period = period;
+
+ vlog_printf(VLOG_FUNC, "modify cq moderation, period=%d, count=%d\n", period, count);
+
+ IF_VERBS_FAILURE_EX(vma_ibv_modify_cq(cq, &cq_attr, VMA_IBV_CQ_MODERATION), EIO) {
+ 
vlog_printf(VLOG_DEBUG, "Failure modifying cq moderation (errno=%d %m)\n", errno); + } ENDIF_VERBS_FAILURE; +#else + NOT_IN_USE(cq); + NOT_IN_USE(count); + NOT_IN_USE(period); +#endif +} + diff --git a/src/vma/ib/base/verbs_extra.h b/src/vma/ib/base/verbs_extra.h new file mode 100644 index 0000000..5cbe9d7 --- /dev/null +++ b/src/vma/ib/base/verbs_extra.h @@ -0,0 +1,626 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef VERBS_EXTRA_H +#define VERBS_EXTRA_H + +#include +#include +#include +#include "vma/util/vtypes.h" +#if defined(DEFINED_VERBS_VERSION) && (DEFINED_VERBS_VERSION == 2) +#include +#endif +#include +#include +#include +#if defined(DEFINED_DIRECT_VERBS) +#include "vma/ib/mlx5/ib_mlx5.h" +#endif /* DEFINED_DIRECT_VERBS */ + +#ifndef DEFINED_IBV_WC_WITH_VLAN +//#warning probaly you are trying to compile on OFED which doesnt support VLAN for RAW QP. +//#error when you see this then you need to manually open the below comment and to comment the current and the previous lines. +//#define IBV_WC_WITH_VLAN 1 << 3 +#endif + +// Wrapper for all IBVERBS & RDMA_CM API to normalize the return code and errno value +// With these marco all ibverbs & rdma_cm failures are caugth and errno is updated +// Without this marco ibverbs & rdma_cm returns sometimes with -1 and sometimes with -errno +inline int _errnocheck(int rc) { + if (rc < -1) { + errno = -rc; + } + return rc; +} + +#define IF_VERBS_FAILURE_EX(__func__, __err__) { if (_errnocheck(__func__) && (errno != __err__)) +#define IF_VERBS_FAILURE(__func__) { if (_errnocheck(__func__)) +#define ENDIF_VERBS_FAILURE } + + +#define IF_RDMACM_FAILURE(__func__) IF_VERBS_FAILURE(__func__) +#define ENDIF_RDMACM_FAILURE ENDIF_VERBS_FAILURE +#define IPOIB_QKEY 0x0b1b + +// See - IB Arch Spec - 11.6.2 COMPLETION RETURN STATUS +const char* priv_ibv_wc_status_str(enum ibv_wc_status status); + +// See - IB Arch Spec - 11.6.3 ASYNCHRONOUS EVENTS +const char* priv_ibv_event_desc_str(enum ibv_event_type type); + +#define priv_rdma_cm_event_type_str(__rdma_cm_ev_t__) \ + rdma_event_str(__rdma_cm_ev_t__) + +// Find pkey_index from the ibv_context + port_num + pkey +int priv_ibv_find_pkey_index(struct ibv_context *verbs, uint8_t port_num, uint16_t pkey, uint16_t *pkey_index); + +int priv_ibv_modify_qp_to_err(struct ibv_qp *qp); +int priv_ibv_modify_qp_from_err_to_init_raw(struct ibv_qp *qp, uint8_t port_num); +int 
priv_ibv_modify_qp_from_err_to_init_ud(struct ibv_qp *qp, uint8_t port_num, uint16_t pkey_index, uint32_t underly_qpn); +int priv_ibv_modify_qp_from_init_to_rts(struct ibv_qp *qp, uint32_t underly_qpn = 0); + +// Return 'ibv_qp_state' of the ibv_qp +int priv_ibv_query_qp_state(struct ibv_qp *qp); + +// change ib rate limit +int priv_ibv_modify_qp_ratelimit(struct ibv_qp *qp, struct vma_rate_limit_t &rate_limit, uint32_t rl_changes); + +// Modify cq moderation +void priv_ibv_modify_cq_moderation(struct ibv_cq* cq, uint32_t period, uint32_t count); + +#ifndef VLAN_VID_MASK +#define VLAN_VID_MASK 0xFFF /* define vlan range: 1-4095. taken from */ +#endif + +#define FS_MASK_ON_8 (0xff) +#define FS_MASK_ON_16 (0xffff) +#define FS_MASK_ON_32 (0xffffffff) + +#define FLOW_TAG_MASK ((1 << 20) -1) +int priv_ibv_query_flow_tag_supported(struct ibv_qp *qp, uint8_t port_num); +int priv_ibv_create_flow_supported(struct ibv_qp *qp, uint8_t port_num); +int priv_ibv_query_burst_supported(struct ibv_qp *qp, uint8_t port_num); + +/* DEFINED_VERBS_VERSION: + * 1 - Legacy Verbs API + * 2 - Experimental Verbs API + * 3 - Upstream Verbs API + */ +#if defined(DEFINED_VERBS_VERSION) && (DEFINED_VERBS_VERSION == 1 || DEFINED_VERBS_VERSION == 3) +//ibv_create_qp +#ifdef DEFINED_IBV_QP_INIT_SOURCE_QPN +#define vma_ibv_create_qp(pd, attr) ibv_create_qp_ex((pd)->context, attr) +typedef struct ibv_qp_init_attr_ex vma_ibv_qp_init_attr; +#define vma_ibv_qp_create_flags(attr) (attr).create_flags +#define vma_ibv_qp_source_qpn(attr) (attr).source_qpn +#define VMA_IBV_QP_INIT_QPN_CREATE_FLAGS IBV_QP_CREATE_SOURCE_QPN +#define VMA_IBV_QP_INIT_QPN_MASK IBV_QP_INIT_ATTR_CREATE_FLAGS +#define vma_ibv_qp_init_attr_comp_mask(_pd, _attr) { (_attr).pd = _pd; (_attr).comp_mask |= IBV_QP_INIT_ATTR_PD; } +#else +#define vma_ibv_create_qp(pd, attr) ibv_create_qp(pd, attr) +typedef struct ibv_qp_init_attr vma_ibv_qp_init_attr; +#define vma_ibv_qp_init_attr_comp_mask(_pd, _attr) { NOT_IN_USE(_pd); 
NOT_IN_USE(_attr); } +#endif + +//ibv_query_device +#define vma_ibv_device_attr_comp_mask(attr) NOT_IN_USE(attr) +typedef struct ibv_device_attr vma_ibv_device_attr; + +#ifdef DEFINED_IBV_DEVICE_ATTR_EX +#define vma_ibv_query_device(context, attr) ibv_query_device_ex(context, NULL, attr) +typedef struct ibv_device_attr_ex vma_ibv_device_attr_ex; +#define vma_get_device_orig_attr(device_attr) &device_attr->orig_attr +#else +#define vma_ibv_query_device(context, attr) ibv_query_device(context, attr) +typedef vma_ibv_device_attr vma_ibv_device_attr_ex; +#define vma_get_device_orig_attr(device_attr) device_attr +#endif + +//ibv_modify_qp +#define vma_ibv_modify_qp(qp, attr, mask) ibv_modify_qp(qp, attr, mask) +typedef struct ibv_qp_attr vma_ibv_qp_attr; +//ibv_poll_cq +#define vma_ibv_poll_cq(cq, num, wc) ibv_poll_cq(cq, num, wc) +typedef struct ibv_wc vma_ibv_wc; +#define vma_wc_flags(wc) (wc).wc_flags +#define vma_wc_opcode(wc) (wc).opcode +#define VMA_IBV_WC_RECV IBV_WC_RECV +//csum offload +#ifdef DEFINED_IBV_DEVICE_RAW_IP_CSUM +#define vma_is_rx_hw_csum_supported(attr) ((attr)->device_cap_flags & (IBV_DEVICE_RAW_IP_CSUM | IBV_DEVICE_UD_IP_CSUM)) +#define vma_wc_rx_hw_csum_ok(wc) (vma_wc_flags(wc) & IBV_WC_IP_CSUM_OK) +#else +#define vma_is_rx_hw_csum_supported(attr) 0 +#define vma_wc_rx_hw_csum_ok(wc) (1) +#endif + +typedef int vma_ibv_cq_init_attr; +#define vma_ibv_create_cq(context, cqe, cq_context, channel, comp_vector, attr) ibv_create_cq(context, cqe, cq_context, channel, comp_vector) + +//rx hw timestamp +#define VMA_IBV_WC_WITH_TIMESTAMP 0 +#define vma_wc_timestamp(wc) 0 +#define vma_ibv_cq_init_ts_attr(attr) { NOT_IN_USE(attr); } + +#ifdef DEFINED_IBV_CQ_TIMESTAMP +#define VMA_IBV_DEVICE_ATTR_HCA_CORE_CLOCK 0 +#define VMA_IBV_VALUES_MASK_RAW_CLOCK IBV_VALUES_MASK_RAW_CLOCK +#define vma_ibv_query_values(ctx, values) ibv_query_rt_values_ex(ctx, values) +#define vma_get_ts_val(values) values.raw_clock.tv_nsec +typedef struct ibv_values_ex vma_ts_values; 
+#endif + +//ibv_post_send +#define VMA_IBV_SEND_SIGNALED IBV_SEND_SIGNALED +#define VMA_IBV_SEND_INLINE IBV_SEND_INLINE +#ifdef DEFINED_IBV_SEND_IP_CSUM + #define VMA_IBV_SEND_IP_CSUM (IBV_SEND_IP_CSUM) +#else + #define DEFINED_SW_CSUM +#endif +#define vma_ibv_send_flags ibv_send_flags +#define vma_send_wr_send_flags(wr) (wr).send_flags +#define VMA_IBV_WR_SEND IBV_WR_SEND +#define vma_ibv_wr_opcode ibv_wr_opcode +#define vma_send_wr_opcode(wr) (wr).opcode + +#ifdef DEFINED_TSO + #define VMA_IBV_WR_TSO (vma_ibv_wr_opcode)IBV_WR_TSO + #define vma_check_dev_attr_tso(_attr) 1 + #define vma_get_tso_caps(_attr) (((vma_ibv_device_attr_ex *)(_attr))->tso_caps) + #define vma_ibv_qp_init_attr_tso(_attr, _max_tso_header) \ + do { \ + _attr.comp_mask |= IBV_QP_INIT_ATTR_MAX_TSO_HEADER; \ + _attr.max_tso_header = _max_tso_header; \ + } while (0) + typedef struct ibv_tso_caps vma_ibv_tso_caps; +#else + #define VMA_IBV_WR_TSO (vma_ibv_wr_opcode)VMA_IBV_WR_SEND + #define vma_check_dev_attr_tso(_attr) 0 + #define vma_ibv_qp_init_attr_tso(_attr, _max_tso_header) ((void)0) +#endif /* DEFINED_TSO */ + +// Dummy send +#ifdef DEFINED_IBV_WR_NOP +#define vma_is_nop_supported(device_attr) 1 +#define VMA_IBV_WR_NOP (vma_ibv_wr_opcode)MLX5_OPCODE_NOP +#else +#define vma_is_nop_supported(device_attr) 0 +#define VMA_IBV_WR_NOP (vma_ibv_wr_opcode)(0) // Use 0 as "default" opcode when NOP is not defined. 
+#endif + +#define vma_ibv_post_send(qp, wr, bad_wr) ibv_post_send(qp, wr, bad_wr) +typedef struct ibv_send_wr vma_ibv_send_wr; +//ibv_reg_mr +#define VMA_IBV_ACCESS_LOCAL_WRITE IBV_ACCESS_LOCAL_WRITE +#ifdef DEFINED_IBV_ACCESS_ALLOCATE_MR +#define VMA_IBV_ACCESS_ALLOCATE_MR IBV_ACCESS_ALLOCATE_MR +#endif +//flow steering +#define VMA_IBV_FLOW_ATTR_NORMAL IBV_FLOW_ATTR_NORMAL +#define VMA_IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK +#ifdef DEFINED_IBV_FLOW_SPEC_IB +#define VMA_IBV_FLOW_SPEC_IB IBV_FLOW_SPEC_IB +#endif +#define VMA_IBV_FLOW_SPEC_ETH IBV_FLOW_SPEC_ETH +#define VMA_IBV_FLOW_SPEC_IPV4 IBV_FLOW_SPEC_IPV4 +#define VMA_IBV_FLOW_SPEC_TCP IBV_FLOW_SPEC_TCP +#define VMA_IBV_FLOW_SPEC_UDP IBV_FLOW_SPEC_UDP +#define vma_ibv_create_flow(qp, flow) ibv_create_flow(qp, flow) +#define vma_ibv_destroy_flow(flow_id) ibv_destroy_flow(flow_id) +typedef struct ibv_flow vma_ibv_flow; +typedef struct ibv_flow_attr vma_ibv_flow_attr; +typedef struct ibv_flow_spec_ib vma_ibv_flow_spec_ib; +typedef struct ibv_flow_spec_eth vma_ibv_flow_spec_eth; +typedef struct ibv_flow_spec_ipv4 vma_ibv_flow_spec_ipv4; +typedef struct ibv_flow_spec_tcp_udp vma_ibv_flow_spec_tcp_udp; + +// Flow tag +#ifdef DEFINED_IBV_FLOW_TAG +#define VMA_IBV_FLOW_SPEC_ACTION_TAG IBV_FLOW_SPEC_ACTION_TAG +typedef struct ibv_flow_spec_action_tag vma_ibv_flow_spec_action_tag; +#define vma_get_flow_tag(cqe) ntohl((uint32_t)(cqe->sop_drop_qpn)) +#else +typedef struct ibv_flow_spec_action_tag_dummy {} vma_ibv_flow_spec_action_tag; +#define vma_get_flow_tag(cqe) 0 +#endif // DEFINED_IBV_FLOW_TAG + +#ifdef DEFINED_IBV_CQ_ATTR_MODERATE +typedef struct ibv_modify_cq_attr vma_ibv_cq_attr; +#define vma_ibv_modify_cq(cq, cq_attr, mask) ibv_modify_cq(cq, cq_attr) +#define vma_cq_attr_mask(cq_attr) (cq_attr).attr_mask +#define vma_cq_attr_moderation(cq_attr) (cq_attr).moderate +#define VMA_IBV_CQ_MODERATION IBV_CQ_ATTR_MODERATE +#endif + +// Clock info +#ifdef DEFINED_IBV_CLOCK_INFO +typedef 
struct mlx5dv_clock_info vma_ibv_clock_info; +#define vma_ibv_query_clock_info(ctx, clock_info) mlx5dv_get_clock_info(ctx, clock_info) +#define vma_ibv_convert_ts_to_ns(clock_info, hw_ts) mlx5dv_ts_to_ns(clock_info, hw_ts) +#endif //DEFINED_IBV_CLOCK_INFO + +// ibv_dm +#ifdef DEFINED_IBV_DM +#define vma_ibv_alloc_dm(ctx, attr) ibv_alloc_dm(ctx, attr) +#define vma_ibv_free_dm(dm) ibv_free_dm(dm) +#define vma_ibv_reg_dm_mr(mr) ibv_reg_dm_mr((mr)->pd, (mr)->dm, 0, (mr)->length, IBV_ACCESS_LOCAL_WRITE | IBV_ACCESS_ZERO_BASED) +#define vma_ibv_memcpy_dm(dm, attr) ibv_memcpy_to_dm(dm, (attr)->dm_offset, (attr)->host_addr, (attr)->length) +#define vma_ibv_init_memcpy_dm(attr, src, head, size) { attr.host_addr = src; attr.dm_offset = head; attr.length = size; } +#define vma_ibv_init_dm_mr(in_mr, ctx_pd, size, allocated_dm) { in_mr.pd = ctx_pd; in_mr.length = size; in_mr.dm = allocated_dm; } +typedef struct ibv_alloc_dm_attr vma_ibv_alloc_dm_attr; +typedef struct ibv_dm vma_ibv_dm; +typedef struct { + void * host_addr; + uint64_t dm_offset; + size_t length; +} vma_ibv_memcpy_dm_attr; +typedef struct { + struct ibv_pd *pd; + size_t length; + vma_ibv_dm *dm; +} vma_ibv_reg_mr_in; +#endif + +#ifdef DEFINED_IBV_PACKET_PACING_CAPS +#define VMA_IBV_QP_RATE_LIMIT IBV_QP_RATE_LIMIT +#define vma_is_pacing_caps_supported(attr) (attr->packet_pacing_caps.qp_rate_limit_min) + +#ifdef DEFINED_IBV_QP_SUPPORT_BURST +#define vma_ibv_init_burst_attr(qp_attr, rate_limit) { qp_attr.max_burst_sz = rate_limit.max_burst_sz; qp_attr.typical_pkt_sz = rate_limit.typical_pkt_sz; } +typedef struct ibv_qp_rate_limit_attr vma_ibv_rate_limit_attr; +#define vma_ibv_modify_qp_rate_limit(qp, attr, mask) ibv_modify_qp_rate_limit(qp, attr) +#define vma_ibv_init_qps_attr(qp_attr) { NOT_IN_USE(qp_attr); } +#else +typedef vma_ibv_qp_attr vma_ibv_rate_limit_attr; +#define vma_ibv_modify_qp_rate_limit(qp, attr, mask) vma_ibv_modify_qp(qp, attr, mask) +#define vma_ibv_init_qps_attr(qp_attr) { qp_attr.qp_state = 
IBV_QPS_RTS; } +#endif // DEFINED_IBV_QP_SUPPORT_BURST + +#endif // DEFINED_IBV_PACKET_PACING_CAPS + +#else /* DEFINED_VERBS_VERSION */ + +//ibv_create_qp +#define vma_ibv_create_qp(pd, attr) ibv_exp_create_qp((pd)->context, attr) +typedef struct ibv_exp_qp_init_attr vma_ibv_qp_init_attr; +#define vma_ibv_qp_init_attr_comp_mask(_pd, _attr) { (_attr).pd = _pd; (_attr).comp_mask |= IBV_EXP_QP_INIT_ATTR_PD; } + +#ifdef DEFINED_IBV_QP_INIT_SOURCE_QPN +#define vma_ibv_qp_create_flags(attr) (attr).exp_create_flags +#define vma_ibv_qp_source_qpn(attr) (attr).associated_qpn +#define VMA_IBV_QP_INIT_QPN_CREATE_FLAGS 0 +#define VMA_IBV_QP_INIT_QPN_MASK IBV_EXP_QP_INIT_ATTR_ASSOCIATED_QPN +#endif + +//ibv_query_device +#define vma_ibv_query_device(context, attr) ibv_exp_query_device(context, attr) +typedef struct ibv_exp_device_attr vma_ibv_device_attr; +typedef vma_ibv_device_attr vma_ibv_device_attr_ex; +#define vma_get_device_orig_attr(device_attr) device_attr +#define vma_ibv_device_attr_comp_mask(attr) { (attr)->comp_mask = IBV_EXP_DEVICE_ATTR_RESERVED - 1; } + +#ifdef DEFINED_IBV_EXP_DEVICE_RX_CSUM_L4_PKT +#define vma_is_rx_hw_csum_supported(attr) (((attr)->exp_device_cap_flags & IBV_EXP_DEVICE_RX_CSUM_L3_PKT) \ + && ((attr)->exp_device_cap_flags & IBV_EXP_DEVICE_RX_CSUM_L4_PKT)) +#else +#ifdef DEFINED_IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT +#define vma_is_rx_hw_csum_supported(attr) (((attr)->exp_device_cap_flags & IBV_EXP_DEVICE_RX_CSUM_IP_PKT) \ + && ((attr)->exp_device_cap_flags & IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT)) +#else +#define vma_is_rx_hw_csum_supported(attr) 0 +#endif +#endif +//ibv_modify_qp +#define vma_ibv_modify_qp(qp, attr, mask) ibv_exp_modify_qp(qp, attr, mask) +typedef struct ibv_exp_qp_attr vma_ibv_qp_attr; + +//ibv_exp_poll_cq +#ifdef DEFINED_IBV_EXP_CQ +#define vma_ibv_poll_cq(cq, num, wc) ibv_exp_poll_cq(cq, num, wc, sizeof(struct ibv_exp_wc)) +typedef struct ibv_exp_wc vma_ibv_wc; +#define vma_wc_flags(wc) (wc).exp_wc_flags +#define vma_wc_opcode(wc) 
(wc).exp_opcode +#define VMA_IBV_WC_RECV IBV_EXP_WC_RECV + +//experimental cq +typedef struct ibv_exp_cq_init_attr vma_ibv_cq_init_attr; +#define vma_ibv_create_cq(context, cqe, cq_context, channel, comp_vector, attr) ibv_exp_create_cq(context, cqe, cq_context, channel, comp_vector, attr) +#else +//ibv_poll_cq +#define vma_ibv_poll_cq(cq, num, wc) ibv_poll_cq(cq, num, wc) +typedef struct ibv_wc vma_ibv_wc; +#define vma_wc_flags(wc) (wc).wc_flags +#define vma_wc_opcode(wc) (wc).opcode +#define VMA_IBV_WC_RECV IBV_WC_RECV + +//verbs cq +typedef int vma_ibv_cq_init_attr; +#define vma_ibv_create_cq(context, cqe, cq_context, channel, comp_vector, attr) ibv_create_cq(context, cqe, cq_context, channel, comp_vector) +#endif + +#ifdef DEFINED_IBV_EXP_DEVICE_RX_CSUM_L4_PKT +#define vma_wc_rx_hw_csum_ok(wc) ((vma_wc_flags(wc) & IBV_EXP_L3_RX_CSUM_OK) && (vma_wc_flags(wc) & IBV_EXP_L4_RX_CSUM_OK)) +#else +#ifdef DEFINED_IBV_EXP_DEVICE_RX_CSUM_TCP_UDP_PKT +#define vma_wc_rx_hw_csum_ok(wc) ((vma_wc_flags(wc) & IBV_EXP_WC_RX_IP_CSUM_OK) && (vma_wc_flags(wc) & IBV_EXP_WC_RX_TCP_UDP_CSUM_OK)) +#else +#define vma_wc_rx_hw_csum_ok(wc) (1) +#endif +#endif + +//rx hw timestamp +#ifdef DEFINED_IBV_CQ_TIMESTAMP +#define VMA_IBV_WC_WITH_TIMESTAMP IBV_EXP_WC_WITH_TIMESTAMP +#define vma_wc_timestamp(wc) (wc).timestamp +#define VMA_IBV_DEVICE_ATTR_HCA_CORE_CLOCK IBV_EXP_DEVICE_ATTR_WITH_HCA_CORE_CLOCK +#define VMA_IBV_VALUES_MASK_RAW_CLOCK 0 +#define vma_ibv_query_values(ctx, values) ibv_exp_query_values(ctx, IBV_EXP_VALUES_HW_CLOCK, values) +#define vma_get_ts_val(values) values.hwclock +typedef struct ibv_exp_values vma_ts_values; +#define vma_ibv_cq_init_ts_attr(attr) { (attr)->flags |= IBV_EXP_CQ_TIMESTAMP; (attr)->comp_mask |= IBV_EXP_CQ_INIT_ATTR_FLAGS; } +#else +#define VMA_IBV_WC_WITH_TIMESTAMP 0 +#define vma_wc_timestamp(wc) 0 +#define vma_ibv_cq_init_ts_attr(attr) { NOT_IN_USE(attr); } +#endif + +#ifdef DEFINED_IBV_CQ_ATTR_MODERATE +typedef struct ibv_exp_cq_attr vma_ibv_cq_attr; 
+#define vma_ibv_modify_cq(cq, cq_attr, mask) ibv_exp_modify_cq(cq, cq_attr,mask) +#define vma_cq_attr_mask(cq_attr) (cq_attr).comp_mask +#define vma_cq_attr_moderation(cq_attr) (cq_attr).moderation +#define VMA_IBV_CQ_MODERATION IBV_EXP_CQ_ATTR_MODERATION +#endif + +//ibv_post_send +#define VMA_IBV_SEND_SIGNALED IBV_EXP_SEND_SIGNALED +#define VMA_IBV_SEND_INLINE IBV_EXP_SEND_INLINE +#ifdef DEFINED_IBV_EXP_SEND_IP_CSUM + #define VMA_IBV_SEND_IP_CSUM (IBV_EXP_SEND_IP_CSUM) +#else + #define DEFINED_SW_CSUM +#endif +#define vma_ibv_send_flags ibv_exp_send_flags +#define vma_send_wr_send_flags(wr) (wr).exp_send_flags +#define VMA_IBV_WR_SEND IBV_EXP_WR_SEND +#define vma_ibv_wr_opcode ibv_exp_wr_opcode +#define vma_send_wr_opcode(wr) (wr).exp_opcode + +#ifdef DEFINED_TSO + #define VMA_IBV_WR_TSO (vma_ibv_wr_opcode)IBV_EXP_WR_TSO + #define vma_check_dev_attr_tso(_attr) ((_attr)->comp_mask & IBV_EXP_DEVICE_ATTR_TSO_CAPS) + #define vma_get_tso_caps(_attr) (((vma_ibv_device_attr_ex *)(_attr))->tso_caps) + #define vma_ibv_qp_init_attr_tso(_attr, _max_tso_header) \ + do { \ + _attr.comp_mask |= IBV_EXP_QP_INIT_ATTR_MAX_TSO_HEADER; \ + _attr.max_tso_header = _max_tso_header; \ + } while (0) + typedef struct ibv_exp_tso_caps vma_ibv_tso_caps; +#else + #define VMA_IBV_WR_TSO (vma_ibv_wr_opcode)VMA_IBV_WR_SEND + #define vma_check_dev_attr_tso(_attr) 0 + #define vma_ibv_qp_init_attr_tso(_attr, _max_tso_header) ((void)0) +#endif /* DEFINED_TSO */ + +// Dummy send +#ifdef DEFINED_IBV_WR_NOP +#define vma_is_nop_supported(device_attr) ((device_attr)->exp_device_cap_flags & IBV_EXP_DEVICE_NOP) +#define VMA_IBV_WR_NOP IBV_EXP_WR_NOP +#else +#define vma_is_nop_supported(device_attr) 0 +#define VMA_IBV_WR_NOP (vma_ibv_wr_opcode)(0) // Use 0 as "default" opcode when NOP is not defined. 
+#endif + +#define vma_ibv_post_send(qp, wr, bad_wr) ibv_exp_post_send(qp, wr, bad_wr) +typedef struct ibv_exp_send_wr vma_ibv_send_wr; +//ibv_reg_mr +#define VMA_IBV_ACCESS_LOCAL_WRITE IBV_EXP_ACCESS_LOCAL_WRITE +#ifdef DEFINED_IBV_EXP_ACCESS_ALLOCATE_MR +#define VMA_IBV_ACCESS_ALLOCATE_MR IBV_EXP_ACCESS_ALLOCATE_MR +#endif +//flow steering +#define VMA_IBV_FLOW_ATTR_NORMAL IBV_EXP_FLOW_ATTR_NORMAL +#define VMA_IBV_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK IBV_EXP_FLOW_ATTR_FLAGS_ALLOW_LOOP_BACK +#ifdef DEFINED_IBV_FLOW_SPEC_IB +#define VMA_IBV_FLOW_SPEC_IB IBV_EXP_FLOW_SPEC_IB +#endif +#define VMA_IBV_FLOW_SPEC_ETH IBV_EXP_FLOW_SPEC_ETH +#define VMA_IBV_FLOW_SPEC_IPV4 IBV_EXP_FLOW_SPEC_IPV4 +#define VMA_IBV_FLOW_SPEC_TCP IBV_EXP_FLOW_SPEC_TCP +#define VMA_IBV_FLOW_SPEC_UDP IBV_EXP_FLOW_SPEC_UDP +#define vma_ibv_create_flow(qp, flow) ibv_exp_create_flow(qp, flow) +#define vma_ibv_destroy_flow(flow_id) ibv_exp_destroy_flow(flow_id) +typedef struct ibv_exp_flow vma_ibv_flow; +typedef struct ibv_exp_flow_attr vma_ibv_flow_attr; +typedef struct ibv_exp_flow_spec_ib vma_ibv_flow_spec_ib; +typedef struct ibv_exp_flow_spec_eth vma_ibv_flow_spec_eth; +typedef struct ibv_exp_flow_spec_ipv4 vma_ibv_flow_spec_ipv4; +typedef struct ibv_exp_flow_spec_tcp_udp vma_ibv_flow_spec_tcp_udp; + +//Flow tag +#ifdef DEFINED_IBV_FLOW_TAG +#define VMA_IBV_FLOW_SPEC_ACTION_TAG IBV_EXP_FLOW_SPEC_ACTION_TAG +#define vma_get_flow_tag(cqe) ntohl((uint32_t)(cqe->sop_drop_qpn)) +typedef struct ibv_exp_flow_spec_action_tag vma_ibv_flow_spec_action_tag; +#else +#define vma_get_flow_tag(cqe) 0 +typedef struct ibv_exp_flow_spec_action_tag_dummy {} vma_ibv_flow_spec_action_tag; +#endif //DEFINED_IBV_FLOW_TAG + +// Clock info +#ifdef DEFINED_IBV_CLOCK_INFO +typedef struct ibv_exp_values vma_ibv_clock_info; +#define vma_ibv_convert_ts_to_ns(info, hw_ts) ibv_exp_cqe_ts_to_ns(&((info)->clock_info), hw_ts) +#define vma_ibv_query_clock_info(ctx, clock_info) ibv_exp_query_values(ctx, IBV_EXP_VALUES_CLOCK_INFO, 
clock_info) +#endif //DEFINED_IBV_CLOCK_INFO + +// ibv_dm +#ifdef DEFINED_IBV_DM +#define vma_ibv_alloc_dm(ctx, attr) ibv_exp_alloc_dm(ctx, attr) +#define vma_ibv_free_dm(dm) ibv_exp_free_dm(dm) +#define vma_ibv_reg_dm_mr(mr) ibv_exp_reg_mr(mr) +#define vma_ibv_memcpy_dm(dm, attr) ibv_exp_memcpy_dm(dm, attr) +#define vma_ibv_init_memcpy_dm(attr, src, head, size) { attr.memcpy_dir = IBV_EXP_DM_CPY_TO_DEVICE; attr.host_addr = src; attr.dm_offset = head; attr.length = size; } +#define vma_ibv_init_dm_mr(in_mr, ctx_pd, size, allocated_dm) { in_mr.pd = ctx_pd; in_mr.comp_mask = IBV_EXP_REG_MR_DM; in_mr.length = size; in_mr.dm = allocated_dm; } +typedef struct ibv_exp_alloc_dm_attr vma_ibv_alloc_dm_attr; +typedef struct ibv_exp_memcpy_dm_attr vma_ibv_memcpy_dm_attr; +typedef struct ibv_exp_dm vma_ibv_dm; +typedef struct ibv_exp_reg_mr_in vma_ibv_reg_mr_in; +#endif + +#ifdef DEFINED_IBV_PACKET_PACING_CAPS +#define VMA_IBV_QP_RATE_LIMIT IBV_EXP_QP_RATE_LIMIT +#define vma_is_pacing_caps_supported(attr) ((attr)->comp_mask & IBV_EXP_DEVICE_ATTR_PACKET_PACING_CAPS) +typedef vma_ibv_qp_attr vma_ibv_rate_limit_attr; +#define vma_ibv_modify_qp_rate_limit(qp, attr, mask) vma_ibv_modify_qp(qp, attr, mask) +#define vma_ibv_init_qps_attr(qp_attr) { qp_attr.qp_state = IBV_QPS_RTS; } +#endif // DEFINED_IBV_PACKET_PACING_CAPS + +#ifdef DEFINED_IBV_QP_SUPPORT_BURST +#define vma_ibv_init_burst_attr(qp_attr, rate_limit) { qp_attr.burst_info.max_burst_sz = rate_limit.max_burst_sz; qp_attr.burst_info.typical_pkt_sz = rate_limit.typical_pkt_sz; qp_attr.comp_mask |= IBV_EXP_QP_ATTR_BURST_INFO; } +#endif // DEFINED_IBV_QP_SUPPORT_BURST + +#endif /* DEFINED_VERBS_VERSION */ + +// ibv_dm +#ifdef DEFINED_IBV_DM +#define vma_ibv_dm_size(attr) ((attr)->max_dm_size) +#else +#define vma_ibv_dm_size(attr) (0) +#endif + +#ifdef HAVE_MP_RQ +#define vma_is_umr_supported(attr) ((attr)->umr_caps.max_klm_list_size) +#define vma_is_mp_rq_supported(attr) ((attr)->comp_mask & IBV_EXP_DEVICE_ATTR_MP_RQ) +#else 
+#define vma_is_umr_supported(attr) (0) +#define vma_is_mp_rq_supported(attr) (0) +#endif + +#if defined(HAVE_IBV_EXP_GET_DEVICE_LIST) +#define vma_ibv_get_device_list(num) ibv_exp_get_device_list(num) +#else +#define vma_ibv_get_device_list(num) ibv_get_device_list(num) +#endif + +typedef enum { + RL_RATE = 1<<0, + RL_BURST_SIZE = 1<<1, + RL_PKT_SIZE = 1<<2, +} vma_rl_changed; + +int vma_rdma_lib_reset(); + +#ifdef DEFINED_IBV_FLOW_SPEC_IB +static inline void ibv_flow_spec_ib_set_by_dst_gid(vma_ibv_flow_spec_ib* ib, uint8_t* dst_gid) +{ + ib->type = VMA_IBV_FLOW_SPEC_IB; + ib->size = sizeof(vma_ibv_flow_spec_ib); + if (dst_gid) + { + memcpy(ib->val.dst_gid, dst_gid, 16); + memset(ib->mask.dst_gid, FS_MASK_ON_8, 16); + } +} + +static inline void ibv_flow_spec_ib_set_by_dst_qpn(vma_ibv_flow_spec_ib* ib, uint32_t dst_qpn) +{ + ib->type = VMA_IBV_FLOW_SPEC_IB; + ib->size = sizeof(vma_ibv_flow_spec_ib); + ib->val.qpn = dst_qpn; + ib->mask.qpn = FS_MASK_ON_32; +} +#endif + +static inline void ibv_flow_spec_eth_set(vma_ibv_flow_spec_eth* eth, uint8_t* dst_mac, uint16_t vlan_tag) +{ + eth->type = VMA_IBV_FLOW_SPEC_ETH; + eth->size = sizeof(vma_ibv_flow_spec_eth); + eth->val.ether_type = ntohs(ETH_P_IP); + eth->mask.ether_type = FS_MASK_ON_16; + memcpy(eth->val.dst_mac, dst_mac, ETH_ALEN); + memset(eth->mask.dst_mac, FS_MASK_ON_8, ETH_ALEN); + eth->val.vlan_tag = vlan_tag & htons(VLAN_VID_MASK); + eth->mask.vlan_tag = eth->val.vlan_tag ? 
htons(VLAN_VID_MASK) : 0; //we do not support vlan options +} + +static inline void ibv_flow_spec_ipv4_set(vma_ibv_flow_spec_ipv4* ipv4, uint32_t dst_ip, uint32_t src_ip) +{ + ipv4->type = VMA_IBV_FLOW_SPEC_IPV4; + ipv4->size = sizeof(vma_ibv_flow_spec_ipv4); + ipv4->val.src_ip = src_ip; + if (ipv4->val.src_ip) ipv4->mask.src_ip = FS_MASK_ON_32; + ipv4->val.dst_ip = dst_ip; + if (ipv4->val.dst_ip) ipv4->mask.dst_ip = FS_MASK_ON_32; +} + +static inline void ibv_flow_spec_tcp_udp_set(vma_ibv_flow_spec_tcp_udp* tcp_udp, bool is_tcp, uint16_t dst_port, uint16_t src_port) +{ + tcp_udp->type = is_tcp ? VMA_IBV_FLOW_SPEC_TCP : VMA_IBV_FLOW_SPEC_UDP; + tcp_udp->size = sizeof(vma_ibv_flow_spec_tcp_udp); + tcp_udp->val.src_port = src_port; + if(tcp_udp->val.src_port) tcp_udp->mask.src_port = FS_MASK_ON_16; + tcp_udp->val.dst_port = dst_port; + if(tcp_udp->val.dst_port) tcp_udp->mask.dst_port = FS_MASK_ON_16; +} + +static inline void ibv_flow_spec_flow_tag_set(vma_ibv_flow_spec_action_tag* flow_tag, uint32_t tag_id) +{ + NOT_IN_USE(tag_id); + if (flow_tag == NULL) + return; +#ifdef DEFINED_IBV_FLOW_TAG + flow_tag->type = VMA_IBV_FLOW_SPEC_ACTION_TAG; + flow_tag->size = sizeof(vma_ibv_flow_spec_action_tag); + flow_tag->tag_id = tag_id; +#endif //DEFINED_IBV_FLOW_TAG +} + + +static inline void ibv_source_qpn_set(vma_ibv_qp_init_attr& qp_init_attr, uint32_t source_qpn) +{ + NOT_IN_USE(qp_init_attr); + NOT_IN_USE(source_qpn); + +#ifdef DEFINED_IBV_QP_INIT_SOURCE_QPN + if (source_qpn) { + qp_init_attr.comp_mask |= VMA_IBV_QP_INIT_QPN_MASK; + vma_ibv_qp_create_flags(qp_init_attr) |= VMA_IBV_QP_INIT_QPN_CREATE_FLAGS; + vma_ibv_qp_source_qpn(qp_init_attr) = source_qpn; + } +#endif /* DEFINED_IBV_QP_INIT_SOURCE_QPN */ +} + +#endif diff --git a/src/vma/ib/mlx5/ib_mlx5.cpp b/src/vma/ib/mlx5/ib_mlx5.cpp new file mode 100644 index 0000000..4135887 --- /dev/null +++ b/src/vma/ib/mlx5/ib_mlx5.cpp @@ -0,0 +1,215 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include "util/valgrind.h" +#if defined(DEFINED_DIRECT_VERBS) + +#include "vma/util/valgrind.h" +#include "vma/util/utils.h" +#include "vma/ib/mlx5/ib_mlx5.h" + + +int vma_ib_mlx5_get_qp(struct ibv_qp *qp, vma_ib_mlx5_qp_t *mlx5_qp, uint32_t flags) +{ + int ret = 0; + struct mlx5dv_obj obj; + struct mlx5dv_qp dqp; + enum ibv_qp_attr_mask attr_mask = IBV_QP_CAP; + struct ibv_qp_attr tmp_ibv_qp_attr; + struct ibv_qp_init_attr tmp_ibv_qp_init_attr; + + memset(&obj, 0, sizeof(obj)); + memset(&dqp, 0, sizeof(dqp)); + + obj.qp.in = qp; + obj.qp.out = &dqp; + ret = vma_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_QP); + if (ret != 0) { + goto out; + } + VALGRIND_MAKE_MEM_DEFINED(&dqp, sizeof(dqp)); + mlx5_qp->qp = qp; + mlx5_qp->qpn = qp->qp_num; + mlx5_qp->flags = flags; + mlx5_qp->sq.dbrec = &dqp.dbrec[MLX5_SND_DBR]; + mlx5_qp->sq.buf = dqp.sq.buf; + mlx5_qp->sq.wqe_cnt = dqp.sq.wqe_cnt; + mlx5_qp->sq.stride = dqp.sq.stride; + mlx5_qp->rq.dbrec = &dqp.dbrec[MLX5_RCV_DBR]; + mlx5_qp->rq.buf = dqp.rq.buf; + mlx5_qp->rq.wqe_cnt = dqp.rq.wqe_cnt; + mlx5_qp->rq.stride = dqp.rq.stride; + mlx5_qp->rq.wqe_shift = ilog_2(dqp.rq.stride); + mlx5_qp->rq.head = 0; + mlx5_qp->rq.tail = 0; + mlx5_qp->bf.reg = dqp.bf.reg; + mlx5_qp->bf.size = dqp.bf.size; + mlx5_qp->bf.offset = 0; + + ret = ibv_query_qp(qp, &tmp_ibv_qp_attr, attr_mask, &tmp_ibv_qp_init_attr); + if (ret != 0) { + goto out; + } + + VALGRIND_MAKE_MEM_DEFINED(&tmp_ibv_qp_attr, sizeof(tmp_ibv_qp_attr)); + mlx5_qp->cap.max_send_wr = tmp_ibv_qp_attr.cap.max_send_wr; + mlx5_qp->cap.max_recv_wr = tmp_ibv_qp_attr.cap.max_recv_wr; + mlx5_qp->cap.max_send_sge = tmp_ibv_qp_attr.cap.max_send_sge; + mlx5_qp->cap.max_recv_sge = tmp_ibv_qp_attr.cap.max_recv_sge; + mlx5_qp->cap.max_inline_data = tmp_ibv_qp_attr.cap.max_inline_data; + +out: + return ret; +} + +int vma_ib_mlx5_get_cq(struct ibv_cq *cq, vma_ib_mlx5_cq_t *mlx5_cq) +{ + int ret = 0; + struct mlx5dv_obj obj; + struct mlx5dv_cq 
dcq; + + /* Initialization of cq can be done once to protect + * internal data from corruption. + * cq field is used to detect one time initialization + * For example: this function can be called when QP is moved + * from ERROR state to RESET so cq_ci or cq_sn should not be + * updated + */ + if (mlx5_cq == NULL || mlx5_cq->cq == cq) { + return 0; + } + + memset(&obj, 0, sizeof(obj)); + memset(&dcq, 0, sizeof(dcq)); + + obj.cq.in = cq; + obj.cq.out = &dcq; + ret = vma_ib_mlx5dv_init_obj(&obj, MLX5DV_OBJ_CQ); + if (ret != 0) { + return ret; + } + VALGRIND_MAKE_MEM_DEFINED(&dcq, sizeof(dcq)); + mlx5_cq->cq = cq; + mlx5_cq->cq_num = dcq.cqn; + mlx5_cq->cq_ci = 0; + mlx5_cq->cq_sn = 0; + mlx5_cq->cqe_count = dcq.cqe_cnt; + mlx5_cq->cqe_size = dcq.cqe_size; + mlx5_cq->cqe_size_log = ilog_2(dcq.cqe_size); + mlx5_cq->dbrec = dcq.dbrec; + mlx5_cq->uar = dcq.cq_uar; + + /* Move buffer forward for 128b CQE, so we would get pointer to the 2nd + * 64b when polling. + */ + mlx5_cq->cq_buf = (uint8_t *)dcq.buf + dcq.cqe_size - sizeof(struct mlx5_cqe64); + + return 0; +} + +int vma_ib_mlx5_post_recv(vma_ib_mlx5_qp_t *mlx5_qp, + struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr) +{ + struct mlx5_wqe_data_seg *scat; + int err = 0; + int nreq; + int ind; + int i, j; + + ind = mlx5_qp->rq.head & (mlx5_qp->rq.wqe_cnt - 1); + *bad_wr = NULL; + + for (nreq = 0; wr; ++nreq, wr = wr->next) { + if (unlikely((int)mlx5_qp->rq.head - (int)mlx5_qp->rq.tail + nreq >= (int)mlx5_qp->cap.max_recv_wr)) { + errno = ENOMEM; + err = -errno; + *bad_wr = wr; + goto out; + } + + if (unlikely(wr->num_sge > (int)mlx5_qp->cap.max_recv_sge)) { + errno = EINVAL; + err = -errno; + *bad_wr = wr; + goto out; + } + + scat = (struct mlx5_wqe_data_seg *)((uint8_t *)mlx5_qp->rq.buf + (ind << mlx5_qp->rq.wqe_shift)); + + for (i = 0, j = 0; i < wr->num_sge; ++i) { + if (unlikely(!wr->sg_list[i].length)) continue; + + scat[j].byte_count = htonl(wr->sg_list[i].length); + scat[j].lkey = htonl(wr->sg_list[i].lkey); + 
scat[j].addr = htonll(wr->sg_list[i].addr); + j++; + } + + if (j < (int)mlx5_qp->cap.max_recv_sge) { + scat[j].byte_count = 0; + scat[j].lkey = htonl(MLX5_INVALID_LKEY); + scat[j].addr = 0; + } + + ind = (ind + 1) & (mlx5_qp->rq.wqe_cnt - 1); + } + +out: + if (likely(nreq)) { + mlx5_qp->rq.head += nreq; + + /* + * Make sure that descriptors are written before + * doorbell record. + */ + wmb(); + + /* + * For Raw Packet QP, avoid updating the doorbell record + * as long as the QP isn't in RTR state, to avoid receiving + * packets in illegal states. + * This is only for Raw Packet QPs since they are represented + * differently in the hardware. + */ + if (likely(!((mlx5_qp->qp->qp_type == IBV_QPT_RAW_PACKET || + mlx5_qp->flags & VMA_IB_MLX5_QP_FLAGS_USE_UNDERLAY) && + mlx5_qp->qp->state < IBV_QPS_RTR))) + *mlx5_qp->rq.dbrec = htonl(mlx5_qp->rq.head & 0xffff); + } + + return err; +} + +#endif /* DEFINED_DIRECT_VERBS */ diff --git a/src/vma/ib/mlx5/ib_mlx5.h b/src/vma/ib/mlx5/ib_mlx5.h new file mode 100644 index 0000000..2aea374 --- /dev/null +++ b/src/vma/ib/mlx5/ib_mlx5.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRC_VMA_IB_MLX5_H_ +#define SRC_VMA_IB_MLX5_H_ + +#if defined(DEFINED_DIRECT_VERBS) + +#include + +#if (DEFINED_DIRECT_VERBS == 2) +#include +#include "vma/ib/mlx5/ib_mlx5_hw.h" +#elif (DEFINED_DIRECT_VERBS == 3) +extern "C" { +#include +} +#include "vma/ib/mlx5/ib_mlx5_dv.h" +#else +#error "Unsupported Direct VERBS parameter" +#endif + +#include +#include + +/* ib/mlx5 layer is used by other VMA code that needs + * direct access to mlx5 resources. + * It hides differences in rdma-core(Upstream OFED) and mlx5(Mellanox OFED) + * mlx5 provider implementations. + * rdma-core(Upstream OFED) structures/macro/enum etc are taken as basis + * inside this layer + */ + + +/** + * Get internal verbs information. 
+ */ +int vma_ib_mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t type); + +enum { + VMA_IB_MLX5_QP_FLAGS_USE_UNDERLAY = 0x01 +}; + +enum { + VMA_IB_MLX5_CQ_SET_CI = 0, + VMA_IB_MLX5_CQ_ARM_DB = 1 +}; + +/* Queue pair */ +typedef struct vma_ib_mlx5_qp { + struct ibv_qp *qp; + uint32_t qpn; + uint32_t flags; + struct ibv_qp_cap cap; + struct { + volatile uint32_t *dbrec; + void *buf; + uint32_t wqe_cnt; + uint32_t stride; + } sq; + struct { + volatile uint32_t *dbrec; + void *buf; + uint32_t wqe_cnt; + uint32_t stride; + uint32_t wqe_shift; + unsigned head; + unsigned tail; + } rq; + struct { + void *reg; + uint32_t size; + uint32_t offset; + } bf; +} vma_ib_mlx5_qp_t; + +/* Completion queue */ +typedef struct vma_ib_mlx5_cq { + struct ibv_cq *cq; + void *cq_buf; + unsigned cq_num; + unsigned cq_ci; + unsigned cq_sn; + unsigned cqe_count; + unsigned cqe_size; + unsigned cqe_size_log; + volatile uint32_t *dbrec; + void *uar; +} vma_ib_mlx5_cq_t; + +int vma_ib_mlx5_get_qp(struct ibv_qp *qp, vma_ib_mlx5_qp_t *mlx5_qp, uint32_t flags = 0); +int vma_ib_mlx5_post_recv(vma_ib_mlx5_qp_t *mlx5_qp, struct ibv_recv_wr *wr, struct ibv_recv_wr **bad_wr); + +int vma_ib_mlx5_get_cq(struct ibv_cq *cq, vma_ib_mlx5_cq_t *mlx5_cq); +int vma_ib_mlx5_req_notify_cq(vma_ib_mlx5_cq_t *mlx5_cq, int solicited); +void vma_ib_mlx5_get_cq_event(vma_ib_mlx5_cq_t *mlx5_cq, int count); + +#endif /* DEFINED_DIRECT_VERBS */ + +#endif /* SRC_VMA_IB_MLX5_H_ */ diff --git a/src/vma/ib/mlx5/ib_mlx5_dv.cpp b/src/vma/ib/mlx5/ib_mlx5_dv.cpp new file mode 100644 index 0000000..3b45406 --- /dev/null +++ b/src/vma/ib/mlx5/ib_mlx5_dv.cpp @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#if defined(DEFINED_DIRECT_VERBS) && (DEFINED_DIRECT_VERBS == 3) + +#include "vma/ib/mlx5/ib_mlx5.h" + +int vma_ib_mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t type) +{ + int ret = 0; + + ret = mlx5dv_init_obj(obj, type); + + return ret; +} + +int vma_ib_mlx5_req_notify_cq(vma_ib_mlx5_cq_t *mlx5_cq, int solicited) +{ + uint64_t doorbell; + uint32_t sn; + uint32_t ci; + uint32_t cmd; + + sn = mlx5_cq->cq_sn & 3; + ci = mlx5_cq->cq_ci & 0xffffff; + cmd = solicited ? 
MLX5_CQ_DB_REQ_NOT_SOL : MLX5_CQ_DB_REQ_NOT; + + doorbell = sn << 28 | cmd | ci; + doorbell <<= 32; + doorbell |= mlx5_cq->cq_num; + + mlx5_cq->dbrec[VMA_IB_MLX5_CQ_ARM_DB] = htonl(sn << 28 | cmd | ci); + + /* + * Make sure that the doorbell record in host memory is + * written before ringing the doorbell via PCI WC MMIO. + */ + wmb(); + + *(uint64_t *)((uint8_t *)mlx5_cq->uar + MLX5_CQ_DOORBELL) = htonll(doorbell); + + wc_wmb(); + + return 0; +} + +void vma_ib_mlx5_get_cq_event(vma_ib_mlx5_cq_t *mlx5_cq, int count) +{ + mlx5_cq->cq_sn += count; +} + +#endif /* (DEFINED_DIRECT_VERBS == 3) */ diff --git a/src/vma/ib/mlx5/ib_mlx5_dv.h b/src/vma/ib/mlx5/ib_mlx5_dv.h new file mode 100644 index 0000000..d5dd889 --- /dev/null +++ b/src/vma/ib/mlx5/ib_mlx5_dv.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRC_VMA_IB_MLX5_DV_H_ +#define SRC_VMA_IB_MLX5_DV_H_ + +#ifndef SRC_VMA_IB_MLX5_H_ +#error "Use <vma/ib/mlx5/ib_mlx5.h> instead." +#endif + +#if defined(DEFINED_DIRECT_VERBS) && (DEFINED_DIRECT_VERBS == 3) + +#endif /* (DEFINED_DIRECT_VERBS == 3) */ + +#endif /* SRC_VMA_IB_MLX5_DV_H_ */ diff --git a/src/vma/ib/mlx5/ib_mlx5_hw.cpp b/src/vma/ib/mlx5/ib_mlx5_hw.cpp new file mode 100644 index 0000000..323fa31 --- /dev/null +++ b/src/vma/ib/mlx5/ib_mlx5_hw.cpp @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif +#include "util/valgrind.h" +#if defined(DEFINED_DIRECT_VERBS) && (DEFINED_DIRECT_VERBS == 2) + +#include "vma/ib/mlx5/ib_mlx5.h" + +static int vma_ib_mlx5dv_get_qp(struct ibv_qp *qp, struct mlx5dv_qp *mlx5_qp); +static int vma_ib_mlx5dv_get_cq(struct ibv_cq *cq, struct mlx5dv_cq *mlx5_cq); + + +int vma_ib_mlx5dv_init_obj(struct mlx5dv_obj *obj, uint64_t obj_type) +{ + int ret = 0; + + if (obj_type & MLX5DV_OBJ_QP) { + ret = vma_ib_mlx5dv_get_qp(obj->qp.in, obj->qp.out); + } + + if (!ret && (obj_type & MLX5DV_OBJ_CQ)) { + ret = vma_ib_mlx5dv_get_cq(obj->cq.in, obj->cq.out); + } + + return ret; +} + +static int vma_ib_mlx5dv_get_qp(struct ibv_qp *qp, struct mlx5dv_qp *mlx5_qp) +{ + int ret = 0; + struct ibv_mlx5_qp_info ibv_qp_info; + struct mlx5_qp *mqp = to_mqp(qp); + + ret = ibv_mlx5_exp_get_qp_info(qp, &ibv_qp_info); + if (ret != 0) { + return ret; + } + VALGRIND_MAKE_MEM_DEFINED(&ibv_qp_info, sizeof(ibv_qp_info)); + mlx5_qp->dbrec = ibv_qp_info.dbrec; + mlx5_qp->sq.buf = (mqp->sq_buf_size ? 
+ (void *)((uintptr_t)mqp->sq_buf.buf) : /* IBV_QPT_RAW_PACKET or Underly QP */ + (void *)((uintptr_t)mqp->buf.buf + mqp->sq.offset)); + mlx5_qp->sq.wqe_cnt = ibv_qp_info.sq.wqe_cnt; + mlx5_qp->sq.stride = ibv_qp_info.sq.stride; + mlx5_qp->rq.buf = ibv_qp_info.rq.buf; + mlx5_qp->rq.wqe_cnt = ibv_qp_info.rq.wqe_cnt; + mlx5_qp->rq.stride = ibv_qp_info.rq.stride; + mlx5_qp->bf.reg = ibv_qp_info.bf.reg; + mlx5_qp->bf.size = ibv_qp_info.bf.size; + + return ret; +} + +static int vma_ib_mlx5dv_get_cq(struct ibv_cq *cq, struct mlx5dv_cq *mlx5_cq) +{ + int ret = 0; + struct ibv_mlx5_cq_info ibv_cq_info; + + ret = ibv_mlx5_exp_get_cq_info(cq, &ibv_cq_info); + if (ret != 0) { + return ret; + } + VALGRIND_MAKE_MEM_DEFINED(&ibv_cq_info, sizeof(ibv_cq_info)); + mlx5_cq->buf = ibv_cq_info.buf; + mlx5_cq->dbrec = ibv_cq_info.dbrec; + mlx5_cq->cqe_cnt = ibv_cq_info.cqe_cnt; + mlx5_cq->cqe_size = ibv_cq_info.cqe_size; + mlx5_cq->cq_uar = NULL; + mlx5_cq->cqn = ibv_cq_info.cqn; + + return ret; +} + +int vma_ib_mlx5_req_notify_cq(vma_ib_mlx5_cq_t *mlx5_cq, int solicited) +{ + struct mlx5_cq *mcq = to_mcq(mlx5_cq->cq); + mcq->cons_index = mlx5_cq->cq_ci; + return ibv_req_notify_cq(mlx5_cq->cq, solicited); +} + +void vma_ib_mlx5_get_cq_event(vma_ib_mlx5_cq_t *, int) +{ + // no need in operation with cq_sn as far as it is managed by driver code for now +} + +#endif /* (DEFINED_DIRECT_VERBS == 2) */ diff --git a/src/vma/ib/mlx5/ib_mlx5_hw.h b/src/vma/ib/mlx5/ib_mlx5_hw.h new file mode 100644 index 0000000..cfff2d9 --- /dev/null +++ b/src/vma/ib/mlx5/ib_mlx5_hw.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRC_VMA_IB_MLX5_HW_H_ +#define SRC_VMA_IB_MLX5_HW_H_ + +#ifndef SRC_VMA_IB_MLX5_H_ +#error "Use <vma/ib/mlx5/ib_mlx5.h> instead." 
+#endif + +#if defined(DEFINED_DIRECT_VERBS) && (DEFINED_DIRECT_VERBS == 2) + +#include + +/* This structures duplicate mlx5dv.h (rdma-core upstream) + * to use upstream specific approach as a basis + */ +struct mlx5dv_qp { + volatile uint32_t *dbrec; + struct { + void *buf; + uint32_t wqe_cnt; + uint32_t stride; + } sq; + struct { + void *buf; + uint32_t wqe_cnt; + uint32_t stride; + } rq; + struct { + void *reg; + uint32_t size; + } bf; + uint64_t comp_mask; +}; + +struct mlx5dv_cq { + void *buf; + volatile uint32_t *dbrec; + uint32_t cqe_cnt; + uint32_t cqe_size; + void *cq_uar; + uint32_t cqn; + uint64_t comp_mask; +}; + +struct mlx5dv_obj { + struct { + struct ibv_qp *in; + struct mlx5dv_qp *out; + } qp; + struct { + struct ibv_cq *in; + struct mlx5dv_cq *out; + } cq; +}; + +enum mlx5dv_obj_type { + MLX5DV_OBJ_QP = 1 << 0, + MLX5DV_OBJ_CQ = 1 << 1, +}; + +#endif /* (DEFINED_DIRECT_VERBS == 2) */ + +#endif /* SRC_VMA_IB_MLX5_HW_H_ */ diff --git a/src/vma/infra/DemoCollMgr.cpp b/src/vma/infra/DemoCollMgr.cpp new file mode 100644 index 0000000..f95dbd1 --- /dev/null +++ b/src/vma/infra/DemoCollMgr.cpp @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "DemoCollMgr.h" + +Demo_Coll_Mgr1::Demo_Coll_Mgr1() : cache_collection_mgr<key_class<demo_subject_1_key_t>, demo_subject_1_value_t>("lock: Demo_Coll_Mgr1") +{ + printf("created collection mgr: char --> int\n"); + +} + +Demo_Subject1* Demo_Coll_Mgr1::create_new_entry(key_class<demo_subject_1_key_t> key, const observer* obs) +{ + NOT_IN_USE(obs); + return new Demo_Subject1(key.get_actual_key()); +} + +Demo_Coll_Mgr1::~Demo_Coll_Mgr1() +{ + +} + +Demo_Coll_Mgr2::Demo_Coll_Mgr2() : cache_collection_mgr<key_class<demo_subject_2_key_t>, demo_subject_2_value_t>("lock: Demo_Coll_Mgr2") +{ + printf("created collection mgr: int --> uint \n"); + +} + +Demo_Subject2* Demo_Coll_Mgr2::create_new_entry(key_class<demo_subject_2_key_t> key, const observer* obs) +{ + NOT_IN_USE(obs); + return new Demo_Subject2(key.get_actual_key()); +} + + +Demo_Coll_Mgr2::~Demo_Coll_Mgr2() +{ + +} + + diff --git a/src/vma/infra/DemoCollMgr.h b/src/vma/infra/DemoCollMgr.h new file mode 100644 index 0000000..76c0bb9 --- /dev/null +++ b/src/vma/infra/DemoCollMgr.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef DEMOCOLLMGR_H_ +#define DEMOCOLLMGR_H_ + +#include "cache_subject_observer.h" +#include "DemoSubject.h" + +class Demo_Coll_Mgr1 : public cache_table_mgr<key_class<demo_subject_1_key_t>, demo_subject_1_value_t> +{ +public: + Demo_Coll_Mgr1(); + virtual ~Demo_Coll_Mgr1(); + virtual Demo_Subject1* create_new_entry(key_class<demo_subject_1_key_t>, const observer*); +}; + +class Demo_Coll_Mgr2 : public cache_table_mgr<key_class<demo_subject_2_key_t>, demo_subject_2_value_t> +{ +public: + Demo_Coll_Mgr2(); + virtual ~Demo_Coll_Mgr2(); + virtual Demo_Subject2* create_new_entry(key_class<demo_subject_2_key_t>, const observer*); +}; + +#endif /* DEMOCOLLMGR_H_ */ diff --git a/src/vma/infra/DemoObserver.cpp b/src/vma/infra/DemoObserver.cpp new file mode 100644 index 0000000..904bd40 --- /dev/null +++ b/src/vma/infra/DemoObserver.cpp @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * + * DemoObserver.cpp + * + */ + +#include "DemoObserver.h" + +Demo_Observer::Demo_Observer() : cache_observer() +{ + printf("created observer: id = %p\n", this); +} + +void Demo_Observer::notify_cb() +{ + set_state(false); + printf("observer %p was notified\n", this); +}; + +void Demo_Observer::register_to_subjects(Demo_Coll_Mgr1 *coll_for_subjects_1, Demo_Coll_Mgr2 *coll_for_subjects_2) +{ + Demo_Subject1* s1 = NULL; + Demo_Subject2* s2 = NULL; + key_class c('a'); + key_class i(1); + char ch='a'; + int in=1; + + // ######################### create collections of subjects type 1+2 ######################### // + for(; ch < 'f' && in < 6; ch++, in++) + { + c.set_actual_key(ch); + coll_for_subjects_1->register_observer(c, this, (cache_entry_subject,demo_subject_1_value_t> **)&s1); // registered for subject1 with key 'a' + m_subjects_1_list.insert(pair(c.get_actual_key(), s1)); + i.set_actual_key(in); + coll_for_subjects_2->register_observer(i, this, (cache_entry_subject,demo_subject_2_value_t> **)&s2); // registered for subject2 with key 1 + m_subjects_2_list.insert(pair(i.get_actual_key(), s2)); + } + +} + +void Demo_Observer::update_subject_1(demo_subject_1_key_t key, demo_subject_1_value_t value) +{ + Demo_Subject1 *s1 = m_subjects_1_list.find(key)->second; //find subject corresponding to key in the list + if (s1) + { + s1->update_val(value); // expected output: notification msg + s1->notify_observers(); + } + else + printf("subject corresponding to key wasn't found\n"); +} + +void Demo_Observer::get_subject_1(demo_subject_1_key_t key) +{ + demo_subject_1_value_t val_s1; + Demo_Subject1 *s1 = m_subjects_1_list.find(key)->second; //find subject corresponding to key in the list + if (s1) + 
{ + s1->get_val(val_s1); + printf("subject1: key = %c, val = %d\n", key, val_s1); + } + else + printf("subject corresponding to key wasn't found\n"); +} + +void Demo_Observer::update_subject_2(demo_subject_2_key_t key, demo_subject_2_value_t value) +{ + Demo_Subject2 *s2 = m_subjects_2_list.find(key)->second; //find subject corresponding to key in the list + if (s2) + { + s2->update_val(value); // expected output: notification msg + s2->notify_observers(); + } + else + printf("subject corresponding to key wasn't found\n"); +} + +void Demo_Observer::get_subject_2(demo_subject_2_key_t key) +{ + demo_subject_2_value_t val_s2; + Demo_Subject2 *s2 = m_subjects_2_list.find(key)->second; //find subject corresponding to key in the list + if (s2) + { + s2->get_val(val_s2); + printf("subject2: key = %d, val = %d\n", key, val_s2); + } + else + printf("subject corresponding to key wasn't found\n"); +} + +bool Demo_Observer::start_test(Demo_Coll_Mgr1 *coll_for_subjects_1, Demo_Coll_Mgr2 *coll_for_subjects_2) +{ + update_subject_1('a', 12); + + update_subject_1('b', 13); + + update_subject_1('c', 14); + + get_subject_1('a'); // expected output: val = 12 + + update_subject_2(1, 1000); + + update_subject_2(2, 2000); + + get_subject_2(1); // expected output: val = 2000 + + coll_for_subjects_1->unregister_observer('a', this); + //m_subjects_1_list.erase('a'); // supposed to remove from m_subjects_1_list, left for testing + + update_subject_1('a', 15); // only other observer is notified + + coll_for_subjects_2->unregister_observer(2, this); + + return true; + +} + +Demo_Observer::~Demo_Observer() +{ + +} + diff --git a/src/vma/infra/DemoObserver.h b/src/vma/infra/DemoObserver.h new file mode 100644 index 0000000..31eb7f8 --- /dev/null +++ b/src/vma/infra/DemoObserver.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * + * DemoObserver.h + * + */ + +#ifndef DEMOOBSERVER_H_ +#define DEMOOBSERVER_H_ + +#include +#include +#include +#include "cache_subject_observer.h" +#include "DemoSubject.h" +#include "DemoCollMgr.h" +using namespace std; + +class Demo_Observer : public cache_observer +{ +public: + Demo_Observer(); + virtual ~Demo_Observer(); + + void register_to_subjects(Demo_Coll_Mgr1 *coll_for_subjects_1, Demo_Coll_Mgr2 *coll_for_subjects_2); + bool start_test(Demo_Coll_Mgr1 *coll_for_subjects_1, Demo_Coll_Mgr2 *coll_for_subjects_2); + + void notify_cb(); //hide cache_observer function for testing + +private: + + void update_subject_1(demo_subject_1_key_t key, demo_subject_1_value_t value); //sets value of subject type-1 corresponding to key + void update_subject_2(demo_subject_2_key_t key, demo_subject_2_value_t value); //sets value of subject type-2 corresponding to key + + void get_subject_1(demo_subject_1_key_t key); //prints value of subject type-1 corresponding to key + void get_subject_2(demo_subject_2_key_t key); //prints value of subject type-2 corresponding to key + + //Demo_Coll_Mgr1* m_coll_for_subjects_1; //collection mgr for subjects type-1 + //Demo_Coll_Mgr2* m_coll_for_subjects_2; //collection mgr for subjects type-2 + + map m_subjects_1_list; //list of observed subjects type-1 + map m_subjects_2_list; //list of observed subjects type-2 + +}; + +#endif /* DEMOOBSERVER_H_ */ diff --git a/src/vma/infra/DemoSubject.cpp b/src/vma/infra/DemoSubject.cpp new file mode 100644 index 0000000..b6ed294 --- /dev/null +++ b/src/vma/infra/DemoSubject.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ *
 + *
 + * DemoSubject.cpp
 + *
 + */
 +
 +
 +#include "DemoSubject.h"
 +
 +Demo_Subject1::Demo_Subject1(demo_subject_1_key_t key_1)
 +	: cache_entry_subject, demo_subject_1_value_t>(key_class(key_1), "lock: Demo_Subject1")
 +{
 +
 +	printf("new subject of type 1: \n");
 +
 +	printf("\t key = %c, no value \n", key_1);
 +
 +}
 +
 +Demo_Subject1::Demo_Subject1(demo_subject_1_key_t key_1, demo_subject_1_value_t val_1)
 +	: cache_entry_subject, demo_subject_1_value_t>(key_class(key_1))
 +{
 +
 +	set_val(val_1);
 +
 +	printf("new subject of type 1: \n");
 +
 +	printf("\t key = %c, value = %d\n", key_1, val_1);
 +
 +}
 +
 +Demo_Subject1::~Demo_Subject1()
 +{
 +
 +}
 +
 +Demo_Subject2::Demo_Subject2(demo_subject_2_key_t key_2)
 +	: cache_entry_subject, demo_subject_2_value_t>(key_class(key_2), "lock: Demo_Subject2")
 +{
 +
 +	printf("new subject of type 2: \n");
 +
 +	printf("\t key = %d, no value \n", key_2);
 +
 +}
 +
 +Demo_Subject2::Demo_Subject2(demo_subject_2_key_t key_2, demo_subject_2_value_t val_2)
 +	: cache_entry_subject, demo_subject_2_value_t>(key_class(key_2))
 +{
 +
 +	set_val(val_2);
 +
 +	printf("new subject of type 2: \n");
 +
 +	printf("\t key = %d, value = %d\n", key_2, val_2);
 +
 +}
 +
 +Demo_Subject2::~Demo_Subject2()
 +{
 +
 +}
diff --git a/src/vma/infra/DemoSubject.h b/src/vma/infra/DemoSubject.h
new file mode 100644
index 0000000..5c5f4ad
--- /dev/null
+++ b/src/vma/infra/DemoSubject.h
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * + * DemoSubject.h + * + */ + +#ifndef DEMOSUBJECT_H_ +#define DEMOSUBJECT_H_ + +#include "cache_subject_observer.h" +#include + +template +class key_class : public tostr +{ +public: + key_class(Key key) { m_key = (uint32_t)key; }; + key_class(){}; + + const std::string to_str() const + { + char s[20]; + /* cppcheck-suppress wrongPrintfScanfArgNum */ + snprintf(s, sizeof(s), "%d.%d.%d.%d", NIPQUAD(m_key)); + return(std::string(s)); + } + + void set_actual_key(Key key) { m_key = (uint32_t)key; }; + + uint32_t get_actual_key() { return m_key; }; +private: + uint32_t m_key; +}; + +typedef char demo_subject_1_key_t; +typedef int demo_subject_1_value_t; +typedef cache_entry_subject, demo_subject_1_value_t> Demo_Subject1_t; + +class Demo_Subject1 : public Demo_Subject1_t +{ +public: + Demo_Subject1(demo_subject_1_key_t key_1); + Demo_Subject1(demo_subject_1_key_t key_1, demo_subject_1_value_t val_1); + + virtual inline bool get_val(INOUT demo_subject_1_value_t & val) { val = m_val; return true; }; + + inline void update_val(IN demo_subject_1_value_t & val) { this->set_val(val); }; + + virtual ~Demo_Subject1(); +}; + +typedef int demo_subject_2_key_t; +typedef uint demo_subject_2_value_t; + +class Demo_Subject2 : public cache_entry_subject, demo_subject_2_value_t> +{ +public: + Demo_Subject2(demo_subject_2_key_t key_2); + Demo_Subject2(demo_subject_2_key_t key_2, demo_subject_2_value_t val_2); + + virtual inline bool get_val(INOUT demo_subject_2_value_t & val) { val = m_val; return true; }; + + inline void update_val(IN demo_subject_2_value_t & val) { this->set_val(val); }; + + virtual ~Demo_Subject2(); +}; + +#endif /* DEMOSUBJECT_H_ */ diff --git a/src/vma/infra/Makefile.am b/src/vma/infra/Makefile.am new file mode 100755 index 0000000..5e1c8da --- /dev/null +++ b/src/vma/infra/Makefile.am @@ -0,0 +1,52 @@ +AM_CFLAGS = -Wall -g #-O3 + +AM_CPPFLAGS := \ + -I$(top_srcdir)/src + +noinst_HEADERS = \ + sender.h \ + subject_observer.h \ + cache_subject_observer.h \ + 
DemoSubject.h \ + DemoCollMgr.h \ + DemoObserver.h + +EXTRA_DIST = \ + cache_subject_observer.h \ + main.cpp \ + DemoSubject.cpp \ + DemoCollMgr.cpp \ + DemoObserver.cpp \ + DemoCollMgr.h \ + DemoObserver.h \ + DemoSubject.h + +noinst_LTLIBRARIES = libinfra.la +libinfra_la_LDFLAGS = -static +libinfra_la_SOURCES = \ + sender.cpp \ + subject_observer.cpp + +# This section is disabled +# (just keep one for future use) +#noinst_PROGRAMS = cache_test + +#cache_test_LDADD = \ +# libinfra.la + +#cache_test_SOURCES = \ +# main.cpp \ +# cache_subject_observer.h \ +# DemoSubject.cpp \ +# DemoCollMgr.cpp \ +# DemoObserver.cpp \ +# DemoCollMgr.h \ +# DemoObserver.h \ +# DemoSubject.h + +#cache_test_CXXFLAGS = -g + +#cache_test_DEPENDENCIES = \ +# libinfra.la \ +# $(top_builddir)/src/vlogger/libvlogger.la + diff --git a/src/vma/infra/cache_subject_observer.h b/src/vma/infra/cache_subject_observer.h new file mode 100644 index 0000000..702dae0 --- /dev/null +++ b/src/vma/infra/cache_subject_observer.h @@ -0,0 +1,319 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef CACHE_SUBJECT_OBSERVER_H +#define CACHE_SUBJECT_OBSERVER_H + +#include +#include +#include "vlogger/vlogger.h" +#include "utils/lock_wrapper.h" +#include "vma/util/vtypes.h" +#include "vma/infra/subject_observer.h" +#include "vma/sock/cleanable_obj.h" +#include "vma/event/timer_handler.h" +#include "vma/event/event_handler_manager.h" + +#ifndef MODULE_NAME +#define MODULE_NAME "cache_subject_observer:" +#endif + +typedef uint64_t ticks_t; + +class cache_observer : public observer +{ +public: + cache_observer() : m_last_access_time(0), m_is_valid(false) {}; + virtual ~cache_observer() {}; + + inline bool is_valid() { return m_is_valid; }; + inline void notify_cb(event * ev) { NOT_IN_USE(ev); set_state(false); }; + +protected: + inline void set_state(IN bool state) { m_is_valid = state; }; + +private: + ticks_t m_last_access_time; + bool m_is_valid; // is my entry valid + + cache_observer(const cache_observer &); // block copy constructor +}; + + +template +class cache_entry_subject : public subject, public tostr, public cleanable_obj +{ +public: + cache_entry_subject(Key, const char* lock_name = "lock(cache_entry_subject)"); + virtual ~cache_entry_subject() {}; + + // We want to return copy of the Val and not the pointer to it + virtual bool get_val(INOUT Val & val) = 0; + +protected: + // cache_collection now can access cash_entry private and protected members + //typename cannot shadow the class's typename + template friend class cache_table_mgr; + + // 
coverity[member_decl] + Val m_val; + + inline Key get_key() const { return m_key; }; + + inline void set_val(IN Val & val) + { + auto_unlocker lock(m_lock); + m_val = val; + }; + + /* This function should return true if cache_entry can be deleted */ + virtual bool is_deletable() { return true; }; + + int get_observers_count(); + +private: + const Key m_key; + + cache_entry_subject(const cache_entry_subject &); // block copy constructor +}; + +template +class cache_table_mgr : public tostr, public timer_handler +{ +public: + cache_table_mgr(const char* lock_name = "lock(cache_table_mgr)") : m_lock(lock_name), m_timer_handle(NULL) {}; + virtual ~cache_table_mgr(); + + /* Returns pointer to the subject */ + bool register_observer(IN Key, IN const cache_observer *, OUT cache_entry_subject **); + bool unregister_observer(IN Key, IN const cache_observer *); + void print_tbl(); + cache_entry_subject* get_entry(IN Key); + int get_cache_tbl_size() { return m_cache_tbl.size(); }; + +protected: + // stats - Need to define structure for statistics + + std::tr1::unordered_map *> m_cache_tbl; + + lock_mutex_recursive m_lock; + + virtual cache_entry_subject* create_new_entry(Key, const observer* ) = 0; + + // This function removes cache entries that are obsolete or number of observers is 0 + entry is deletable + virtual void run_garbage_collector(); + virtual void start_garbage_collector(int); + virtual void stop_garbage_collector(); + virtual void handle_timer_expired(void *); + +private: + cache_table_mgr(const cache_table_mgr & ); // block copy constructor + + void try_to_remove_cache_entry(IN typename std::tr1::unordered_map *>::iterator &); + void * m_timer_handle; +}; + +// ########################################################################################## // +// ##################################### implementation ##################################### // +// ########################################################################################## // + 
+/********************************* cache_entry_subject ********************************/ + +template +cache_entry_subject::cache_entry_subject(Key key, const char* lock_name /*="lock(cache_entry_subject)"*/) : subject(lock_name), m_key(key) +{ + +} + +template +int cache_entry_subject::get_observers_count() +{ + auto_unlocker lock(m_lock); + return (m_observers.size()); +} + +/*********************************cache_collection_mgr ********************************/ +//template +//cache_entry_subject* cache_collection_mgr ::create_new_entry(Key key) +//{ +// return(new cache_entry_subject(key)); +//} + + +template +cache_table_mgr ::~cache_table_mgr() +{ + print_tbl(); +} + +//This function should be called under lock +template +void cache_table_mgr ::try_to_remove_cache_entry(IN typename std::tr1::unordered_map *>::iterator & itr) +{ + cache_entry_subject * cache_entry = itr->second; + Key key = itr->first; + if (!cache_entry->get_observers_count() && cache_entry->is_deletable()){ + __log_dbg("Deleting cache_entry %s", cache_entry->to_str().c_str()); + m_cache_tbl.erase(key); + cache_entry->clean_obj(); + } + else { + __log_dbg("Cache_entry %s is not deletable", itr->second->to_str().c_str()); + } +} + +template +void cache_table_mgr::run_garbage_collector() +{ + __log_dbg(""); + typename std::tr1::unordered_map *>::iterator cache_itr, cache_itr_tmp; + auto_unlocker lock(m_lock); + for (cache_itr = m_cache_tbl.begin(); cache_itr != m_cache_tbl.end(); ) { + cache_itr_tmp = cache_itr; + cache_itr_tmp++; + try_to_remove_cache_entry(cache_itr); + cache_itr = cache_itr_tmp; + } +} + +template +void cache_table_mgr::start_garbage_collector(int timeout_msec) +{ + stop_garbage_collector(); + + m_timer_handle = g_p_event_handler_manager->register_timer_event(timeout_msec, this, PERIODIC_TIMER, NULL); + if(m_timer_handle == NULL) { + __log_warn("Failed to start garbage_collector"); + } + +} + +template +void cache_table_mgr::stop_garbage_collector() +{ + if (m_timer_handle) { 
+ g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = NULL; + } +} + +template +void cache_table_mgr::handle_timer_expired(void *user_data) +{ + NOT_IN_USE(user_data); + run_garbage_collector(); +} + +template +bool cache_table_mgr::register_observer(IN Key key, IN const cache_observer* new_observer, OUT cache_entry_subject** cache_entry) +{ + if (new_observer == NULL) { + __log_dbg("new_observer == NULL"); + return false; + } + + cache_entry_subject* my_cache_entry; + + auto_unlocker lock(m_lock); + if (!m_cache_tbl.count(key)) { + // Create new entry and insert it to the table + my_cache_entry = create_new_entry(key, new_observer); + if (!my_cache_entry) { + __log_dbg("Failed to allocate new cache_entry_subject with Key = %s", key.to_str().c_str()); + return false; + } + m_cache_tbl[key] = my_cache_entry; + __log_dbg("Created new cache_entry Key = %s", key.to_str().c_str()); + } + else { + my_cache_entry = m_cache_tbl[key]; + } + + my_cache_entry->register_observer(new_observer); + *cache_entry = my_cache_entry; + return true; +} + +template +bool cache_table_mgr ::unregister_observer(IN Key key, IN const cache_observer* old_observer) +{ + __log_dbg(""); + if (old_observer == NULL) { + __log_dbg("old_observer == NULL"); + return false; + } + + auto_unlocker lock(m_lock); + + typename std::tr1::unordered_map *>::iterator cache_itr = m_cache_tbl.find(key); + if (cache_itr == m_cache_tbl.end()) { + __log_dbg("Couldn't unregister observer, the cache_entry (Key = %s) doesn't exist", key.to_str().c_str()); + return false; + } + + cache_itr->second->unregister_observer(old_observer); + + // If number of observers == 0 and cache_entry is deletable need to delete this entry + try_to_remove_cache_entry(cache_itr); + return true; +} + +template +cache_entry_subject* cache_table_mgr ::get_entry(Key key) +{ + cache_entry_subject* ret_entry = NULL; + + if (m_cache_tbl.count(key)) + ret_entry = m_cache_tbl.find(key)->second; + return 
ret_entry;
+}
+
+template
+void cache_table_mgr ::print_tbl()
+{
+	auto_unlocker lock(m_lock);
+	typename std::tr1::unordered_map *>::iterator cache_itr = m_cache_tbl.begin();
+	if (cache_itr != m_cache_tbl.end()) {
+		__log_dbg("%s contains:", to_str().c_str());
+		for (; cache_itr != m_cache_tbl.end(); cache_itr++)
+			__log_dbg(" %s", cache_itr->second->to_str().c_str());
+	}
+	else {
+		__log_dbg("%s empty", to_str().c_str());
+	}
+}
+
+
+#undef MODULE_NAME
+
+#endif /* CACHE_SUBJECT_OBSERVER_H */
diff --git a/src/vma/infra/main.cpp b/src/vma/infra/main.cpp
new file mode 100644
index 0000000..159d613
--- /dev/null
+++ b/src/vma/infra/main.cpp
@@ -0,0 +1,59 @@
+/*
+ * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "DemoObserver.h" + +int main() +{ + + Demo_Observer *o1 = new Demo_Observer(); + Demo_Observer *o2 = new Demo_Observer(); + + //collection mgr, subjects type-1 + Demo_Coll_Mgr1 *coll_for_subjects_1 = new Demo_Coll_Mgr1(); + //collection mgr, subjects type-2 + Demo_Coll_Mgr2 *coll_for_subjects_2 = new Demo_Coll_Mgr2(); + + o1->register_to_subjects(coll_for_subjects_1, coll_for_subjects_2); + o2->register_to_subjects(coll_for_subjects_1, coll_for_subjects_2); + + o1->start_test(coll_for_subjects_1, coll_for_subjects_2); + + delete o1; + delete o2; + delete coll_for_subjects_1; + delete coll_for_subjects_2; + + return 0; +} + diff --git a/src/vma/infra/sender.cpp b/src/vma/infra/sender.cpp new file mode 100644 index 0000000..1af8e9b --- /dev/null +++ b/src/vma/infra/sender.cpp @@ -0,0 +1,47 @@ +/* + * sender.cpp + * + * Created on: Feb 28, 2013 + * Author: olgas + */ + +#include "vma/infra/sender.h" + +send_data::send_data(const send_info *si) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if(si == NULL) { + m_iov.iov_base = NULL; + m_iov.iov_len = 0; + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + uint8_t* buff = NULL; + size_t total_len = 0; + + for(uint32_t i = 0;i < si->m_sz_iov;i++){ + total_len += si->m_p_iov[i].iov_len; + } + + buff = new uint8_t[total_len]; + BULLSEYE_EXCLUDE_BLOCK_START + if (NULL == buff) { + m_iov.iov_base = NULL; + m_iov.iov_len = 0; + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + memcpy_fromiovec(buff, si->m_p_iov, si->m_sz_iov, 0, total_len); + m_iov.iov_base = buff; + m_iov.iov_len = total_len; +} + +send_data::~send_data() +{ + if(m_iov.iov_base) { + delete[]((uint8_t *)m_iov.iov_base); + } +} + diff --git a/src/vma/infra/sender.h b/src/vma/infra/sender.h new 
file mode 100644 index 0000000..61e5272 --- /dev/null +++ b/src/vma/infra/sender.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef SEND_INFO +#define SEND_INFO + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "vma/util/to_str.h" +#include "vma/util/utils.h" +#include "vma/event/event.h" +#include "vma/proto/header.h" + +class event; + + +class send_info : tostr +{ +public: + send_info(iovec *iov, size_t sz): + m_p_iov(iov), + m_sz_iov(sz){}; + virtual ~send_info(){}; + + iovec *m_p_iov; + size_t m_sz_iov; +}; + +class neigh_send_info : public send_info +{ +public: + neigh_send_info(iovec *iov, size_t sz, header *hdr, uint8_t proto, + uint32_t mtu, uint8_t tos): + send_info(iov, sz), m_p_header(hdr),m_mtu(mtu), m_tos(tos), m_protocol(proto){}; + header *m_p_header; + uint32_t m_mtu; + uint8_t m_tos; + uint8_t m_protocol; +}; + +class send_data +{ +public: + send_data(const send_info *si); + virtual ~send_data(); + iovec m_iov; +}; + +class neigh_send_data : public send_data +{ +public: + neigh_send_data(const neigh_send_info *nsi): send_data((const send_info*)nsi), + m_header(new header(*(nsi->m_p_header))), + m_mtu(nsi->m_mtu), + m_tos(nsi->m_tos), + m_protocol(nsi->m_protocol) + { + }; + + virtual ~neigh_send_data() + { + if(m_header) { + delete m_header; + } + }; + + header *m_header; + uint32_t m_mtu; + uint8_t m_tos; + uint8_t m_protocol; +}; + +class send_event : public event +{ +public: + send_event(send_info s_info): m_send_info(s_info) { m_type = SEND_EVENT; }; + + send_info m_send_info; + +}; + +#endif /* SEND_INFO */ diff --git a/src/vma/infra/sender_info_dst.cpp b/src/vma/infra/sender_info_dst.cpp new file mode 100644 index 0000000..3cf6435 --- /dev/null +++ b/src/vma/infra/sender_info_dst.cpp @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "sender_info_dst.h" + +sender::send_info_dst(ibv_send_wr *send_wqe, dst_entry *dst_entry): sender::send_info(send_wqe) +{ + +} diff --git a/src/vma/infra/sender_info_dst.h b/src/vma/infra/sender_info_dst.h new file mode 100644 index 0000000..d3518a9 --- /dev/null +++ b/src/vma/infra/sender_info_dst.h @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#ifndef SENDER_INFO_DST_H
+#define SENDER_INFO_DST_H
+
+#include "sender.h"
+#include "proto/dst_entry.h"
+
+class sender::send_info_dst: public sender::send_info
+{
+public:
+	sender::send_info_dst(ibv_send_wr *send_wqe, dst_entry *dst_entry): m_p_send_wqe(send_wqe) {};
+	sender::send_info_dst(): m_p_send_wqe(NULL) {};
+	virtual ~send_info_dst() {};
+
+	ibv_send_wr *m_p_send_wqe;
+
+};
+
+
+
+#endif /* SENDER_INFO_DST_H */
diff --git a/src/vma/infra/subject_observer.cpp b/src/vma/infra/subject_observer.cpp
new file mode 100644
index 0000000..a996e5a
--- /dev/null
+++ b/src/vma/infra/subject_observer.cpp
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "vlogger/vlogger.h" +#include "vma/infra/subject_observer.h" + +#define MODULE_NAME "subject_observer" + +#define sub_obs_logerr __log_info_err +#define sub_obs_logwarn __log_info_warn +#define sub_obs_loginfo __log_info_info +#define sub_obs_logdbg __log_info_dbg +#define sub_obs_logfunc __log_info_func +#define sub_obs_logfuncall __log_info_funcall + + +bool subject::register_observer(IN const observer* const new_observer) +{ + if (new_observer == NULL) { +// sub_obs_logdbg("[%s] observer (NULL)", to_str()); + return false; + } + + auto_unlocker lock(m_lock); + if (m_observers.count((observer *)new_observer) > 0) { +// sub_obs_logdbg("[%s] Observer is already registered (%p)", to_str(), new_observer); + return false; + } + m_observers.insert((observer *)new_observer); +// sub_obs_logdbg("[%s] Successfully registered new_observer %s", to_str(), new_observer->to_str()); + return true; +} + +bool subject::unregister_observer(IN const observer * const old_observer) +{ + if (old_observer == NULL) { +// sub_obs_logdbg("[%s] observer (NULL)", to_str()); + return false; + } + + auto_unlocker lock(m_lock); + m_observers.erase((observer *)old_observer); +// sub_obs_logdbg("[%s] Successfully unregistered old_observer %s",to_str(), old_observer->to_str()); + return true; +} + +void subject::notify_observers(event* ev /*=NULL*/) +{ +// sub_obs_logdbg("[%s]", to_str()); + + auto_unlocker lock(m_lock); + for (observers_t::iterator iter = m_observers.begin(); iter != m_observers.end(); iter++) { + if (ev) + (*iter)->notify_cb(ev); + else + (*iter)->notify_cb(); + } +} diff --git a/src/vma/infra/subject_observer.h b/src/vma/infra/subject_observer.h new file mode 100644 index 0000000..f425cfb --- /dev/null +++ b/src/vma/infra/subject_observer.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
 */


/* This class implements subject observer design pattern */

#ifndef SUBJECT_OBSERVER_H
#define SUBJECT_OBSERVER_H

/* NOTE(review): the angle-bracket include target was lost during extraction;
 * it is presumably <tr1/unordered_set> (observers_t below lives in std::tr1)
 * — confirm against the original tree. */
#include <tr1/unordered_set>
#include "utils/lock_wrapper.h"
#include "vma/util/vtypes.h"
#include "vma/util/to_str.h"
#include "vma/event/event.h"

/* Base class for objects that want notifications from a subject.
 * Subclasses override one (or both) of the notify_cb() overloads; the
 * event-taking overload defaults to discarding the event and forwarding
 * to the no-argument overload. */
class observer
{
public:
	virtual ~observer() {};
	virtual void notify_cb() { return; };
	virtual void notify_cb(event * ev) { NOT_IN_USE(ev); notify_cb(); };
};

/* NOTE(review): the template argument was lost during extraction; presumably
 * a set of observer pointers, observer* — the matching .cpp casts registered
 * observers to (observer *) before insert/erase. Confirm against the original. */
typedef std::tr1::unordered_set<observer *> observers_t;

/* Subject side of the pattern: keeps a unique set of registered observers
 * and notifies all of them on demand. register_observer(),
 * unregister_observer() and notify_observers() are serialized by m_lock
 * (a recursive mutex, so notification callbacks may re-enter the subject). */
class subject
{
public:
	subject(const char* lock_name = "lock(subject)") : m_lock(lock_name) {};
	virtual ~subject() {};
	bool register_observer(IN const observer* const new_observer);
	bool unregister_observer(IN const observer* const old_observer);
	void notify_observers(event * ev = NULL);

protected:
	lock_mutex_recursive m_lock;
	observers_t m_observers; // list of pointers of all observers (using stl::set for uniqueness - preventing duplicates)
};

#endif /* SUBJECT_OBSERVER_H */
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include + +#define MODULE_NAME "epfd_info:" + +#define SUPPORTED_EPOLL_EVENTS (EPOLLIN|EPOLLOUT|EPOLLERR|EPOLLHUP|EPOLLRDHUP|EPOLLONESHOT|EPOLLET) + +#define NUM_LOG_INVALID_EVENTS 10 +#define EPFD_MAX_OFFLOADED_STR 150 + +#define CQ_FD_MARK 0xabcd + +int epfd_info::remove_fd_from_epoll_os(int fd) +{ + int ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, fd, NULL); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + __log_dbg("failed to remove fd=%d from os epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + return ret; +} + +epfd_info::epfd_info(int epfd, int size) : + lock_mutex_recursive("epfd_info"), m_epfd(epfd), m_size(size), m_ring_map_lock("epfd_ring_map_lock"), + m_lock_poll_os("epfd_lock_poll_os"), m_sysvar_thread_mode(safe_mce_sys().thread_mode), + m_b_os_data_available(false) +{ + __log_funcall(""); + int max_sys_fd = get_sys_max_fd_num(); + if (m_size<=max_sys_fd) + { + m_size=max_sys_fd; + __log_dbg("using open files max limit of %d file descriptors", m_size); + } + + m_ready_fds.set_id("epfd_info (%p) : m_ready_fds", this); + + m_p_offloaded_fds = new int[m_size]; + m_n_offloaded_fds = 0; + + memset(&(m_local_stats.stats), 0, sizeof(m_local_stats.stats)); + + /* This initialization is not 
needed (because it is also done in shmem) but for proper code + * we do it in any case + */ + m_local_stats.enabled = true; + m_local_stats.epfd = m_epfd; + + m_stats = &m_local_stats; + + m_log_invalid_events = NUM_LOG_INVALID_EVENTS; + + vma_stats_instance_create_epoll_block(m_epfd, &(m_stats->stats)); + + // Register this socket to read nonoffloaded data + g_p_event_handler_manager->update_epfd(m_epfd, EPOLL_CTL_ADD, EPOLLIN | EPOLLPRI | EPOLLONESHOT); + + wakeup_set_epoll_fd(m_epfd); +} + +epfd_info::~epfd_info() +{ + __log_funcall(""); + socket_fd_api* sock_fd; + + // Meny: going over all handled fds and removing epoll context. + + lock(); + + while(!m_ready_fds.empty()) + { + sock_fd = m_ready_fds.get_and_pop_front(); + sock_fd->m_epoll_event_flags = 0; + } + + while(!m_fd_offloaded_list.empty()) + { + sock_fd = m_fd_offloaded_list.get_and_pop_front(); + sock_fd->m_fd_rec.reset(); + } + + for (int i = 0; i < m_n_offloaded_fds; i++) { + sock_fd = fd_collection_get_sockfd(m_p_offloaded_fds[i]); + BULLSEYE_EXCLUDE_BLOCK_START + if (sock_fd) { + unlock(); + m_ring_map_lock.lock(); + sock_fd->remove_epoll_context(this); + m_ring_map_lock.unlock(); + lock(); + } else { + __log_err("Invalid temp_sock_fd_api==NULL. Deleted fds should have been removed from epfd."); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + g_p_event_handler_manager->update_epfd(m_epfd, EPOLL_CTL_DEL, EPOLLIN | EPOLLPRI | EPOLLONESHOT); + + unlock(); + + vma_stats_instance_remove_epoll_block(&m_stats->stats); + delete [] m_p_offloaded_fds; +} + +int epfd_info::ctl(int op, int fd, epoll_event *event) +{ + int ret; + epoll_event event_dummy; + if (event == NULL) { + memset(&event_dummy, 0, sizeof(event_dummy)); + event = &event_dummy; + } + + // YossiE TODO make "event table" - and add index in that table instead + // of real event (in orig_os_api.epoll_ctl). must have this because fd's can + // be added after the cq. 
+ lock(); + + switch (op) { + case EPOLL_CTL_ADD: + ret = add_fd(fd, event); + break; + case EPOLL_CTL_DEL: + ret = del_fd(fd); + break; + case EPOLL_CTL_MOD: + ret = mod_fd(fd, event); + break; + default: + errno = EINVAL; + ret = -1; + break; + } + + unlock(); + return ret; +} + +void epfd_info::get_offloaded_fds_arr_and_size(int **p_p_num_offloaded_fds, + int **p_p_offloadded_fds) +{ + *p_p_num_offloaded_fds = &m_n_offloaded_fds; + *p_p_offloadded_fds = m_p_offloaded_fds; +} + +bool epfd_info::is_cq_fd(uint64_t data) +{ + if ((data >> 32) != CQ_FD_MARK) + return false; + + lock(); + //todo consider making m_ready_cq_fd_q a set instead of queue + m_ready_cq_fd_q.push_back((int)(data & 0xffff)); + unlock(); + + return true; +} + +int epfd_info::add_fd(int fd, epoll_event *event) +{ + int ret; + epoll_fd_rec fd_rec; + epoll_event evt = {0, {0}}; + + bool is_offloaded = false; + + __log_funcall("fd=%d", fd); + + socket_fd_api* temp_sock_fd_api = fd_collection_get_sockfd(fd); + if (temp_sock_fd_api && temp_sock_fd_api->get_type()== FD_TYPE_SOCKET) { + is_offloaded = true; + } + + // Make sure that offloaded fd has a correct event mask + if (is_offloaded) { + if (m_log_invalid_events && (event->events & ~SUPPORTED_EPOLL_EVENTS)) { + __log_dbg("invalid event mask 0x%x for offloaded fd=%d", event->events, fd); + __log_dbg("(event->events & ~%s)=0x%x", TO_STR(SUPPORTED_EPOLL_EVENTS), + event->events & ~SUPPORTED_EPOLL_EVENTS); + m_log_invalid_events--; + } + } + + if (temp_sock_fd_api && temp_sock_fd_api->skip_os_select()) { + __log_dbg("fd=%d must be skipped from os epoll()", fd); + // Checking for duplicate fds + if (get_fd_rec(fd)) { + errno = EEXIST; + __log_dbg("epoll_ctl: fd=%d is already registered with this epoll instance %d (errno=%d %m)", fd, m_epfd, errno); + return -1; + } + } + else { + // Add an event which indirectly point to our event + evt.events = event->events; + evt.data.u64 = 0; //zero all data + evt.data.fd = fd; + ret = 
orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, fd, &evt); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + __log_dbg("failed to add fd=%d to epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); + return ret; + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + fd_rec.events = event->events; + fd_rec.epdata = event->data; + + if (is_offloaded) { // TODO: do we need to handle offloading only for one of read/write? + if (m_n_offloaded_fds >= m_size) { + __log_dbg("Reached max fds for epoll (%d)", m_size); + errno = ENOMEM; + return -1; + } + + //NOTE: when supporting epoll on epfd, need to add epfd ring list + //NOTE: when having rings in pipes, need to overload add_epoll_context + unlock(); + m_ring_map_lock.lock(); + ret = temp_sock_fd_api->add_epoll_context(this); + m_ring_map_lock.unlock(); + lock(); + + if (ret < 0) { + switch (errno) { + case EEXIST: + __log_dbg("epoll_ctl: fd=%d is already registered with this epoll instance %d (errno=%d %m)", fd, m_epfd, errno); + break; + case ENOMEM: + __log_dbg("epoll_ctl: fd=%d is already registered with another epoll instance %d, cannot register to epoll %d (errno=%d %m)", fd, temp_sock_fd_api->get_epoll_context_fd(), m_epfd, errno); + break; + default: + __log_dbg("epoll_ctl: failed to add fd=%d to epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); + break; + } + return ret; + } + + m_p_offloaded_fds[m_n_offloaded_fds] = fd; + ++m_n_offloaded_fds; + + m_fd_offloaded_list.push_back(temp_sock_fd_api); + fd_rec.offloaded_index = m_n_offloaded_fds; + temp_sock_fd_api->m_fd_rec = fd_rec; + + // if the socket is ready, add it to ready events + uint32_t events = 0; + if ((event->events & EPOLLIN) && temp_sock_fd_api->is_readable(NULL, NULL)) { + events |= EPOLLIN; + } + if ((event->events & EPOLLOUT) && temp_sock_fd_api->is_writeable()) { + // MNY: udp_socket is always ready to write. Both VMA and the OS will notify it. + // Can't remove notification in VMA in case user decides to skip the OS using VMA params. 
+ // Meaning: user will get 2 ready WRITE events on startup of socket + events |= EPOLLOUT; + } + if (events != 0) { + insert_epoll_event(temp_sock_fd_api, events); + } + else{ + do_wakeup(); + } + } else { + fd_rec.offloaded_index = -1; + m_fd_non_offloaded_map[fd] = fd_rec; + } + + __log_func("fd %d added in epfd %d with events=%#x and data=%#x", + fd, m_epfd, event->events, event->data); + return 0; +} + +void epfd_info::increase_ring_ref_count(ring* ring) +{ + m_ring_map_lock.lock(); + ring_map_t::iterator iter = m_ring_map.find(ring); + if (iter != m_ring_map.end()) { + //increase ref count + iter->second++; + } else { + m_ring_map[ring] = 1; + + // add cq channel fd to the epfd + int num_ring_rx_fds = ring->get_num_resources(); + int *ring_rx_fds_array = ring->get_rx_channel_fds(); + for (int i = 0; i < num_ring_rx_fds; i++) { + epoll_event evt = {0, {0}}; + evt.events = EPOLLIN | EPOLLPRI; + int fd = ring_rx_fds_array[i]; + evt.data.u64 = (((uint64_t)CQ_FD_MARK << 32) | fd); + int ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, fd, &evt); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + __log_dbg("failed to add cq fd=%d to epoll epfd=%d (errno=%d %m)", + fd, m_epfd, errno); + } else { + __log_dbg("add cq fd=%d to epfd=%d", fd, m_epfd); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + } + m_ring_map_lock.unlock(); +} + +void epfd_info::decrease_ring_ref_count(ring* ring) +{ + m_ring_map_lock.lock(); + ring_map_t::iterator iter = m_ring_map.find(ring); + BULLSEYE_EXCLUDE_BLOCK_START + if (iter == m_ring_map.end()) { + __log_err("expected to find ring %p here!", ring); + m_ring_map_lock.unlock(); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + //decrease ref count + iter->second--; + + if (iter->second == 0) { + m_ring_map.erase(iter); + + // remove cq channel fd from the epfd + int num_ring_rx_fds = ring->get_num_resources(); + int *ring_rx_fds_array = ring->get_rx_channel_fds(); + for (int i = 0; i < num_ring_rx_fds; i++) { + // delete cq fd from epfd + int ret = 
orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, ring_rx_fds_array[i], NULL); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + __log_dbg("failed to remove cq fd=%d from epfd=%d (errno=%d %m)", + ring_rx_fds_array[i], m_epfd, errno); + } else { + __log_dbg("remove cq fd=%d from epfd=%d", ring_rx_fds_array[i], m_epfd); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + } + m_ring_map_lock.unlock(); +} + +/* + * del_fd have two modes: + * 1. not passthrough (default) - remove the fd from the epfd, both from OS epfd and VMA epfd + * 2. passthrough - remove the fd as offloaded fd, and keep it only on OS epfd if it was there. + * this is a 1 way direction from both offloaded/not-offloaded to not-offloaded only. + */ +int epfd_info::del_fd(int fd, bool passthrough) +{ + __log_funcall("fd=%d", fd); + + epoll_fd_rec* fi; + socket_fd_api* temp_sock_fd_api = fd_collection_get_sockfd(fd); + if (temp_sock_fd_api && temp_sock_fd_api->skip_os_select()) { + __log_dbg("fd=%d must be skipped from os epoll()", fd); + } + else if (!passthrough) { + remove_fd_from_epoll_os(fd); + } + + fi = get_fd_rec(fd); + if (!fi) { + errno = ENOENT; + return -1; + } + + if (temp_sock_fd_api && temp_sock_fd_api->get_epoll_context_fd() == m_epfd) { + m_fd_offloaded_list.erase(temp_sock_fd_api); + if (passthrough) { + // In case the socket is not offloaded we must copy it to the non offloaded sockets map. + // This can happen after bind(), listen() or accept() calls. 
+ m_fd_non_offloaded_map[fd] = *fi; + m_fd_non_offloaded_map[fd].offloaded_index = -1; + } + } else { + fd_info_map_t::iterator fd_iter = m_fd_non_offloaded_map.find(fd); + if (fd_iter != m_fd_non_offloaded_map.end()) { + m_fd_non_offloaded_map.erase(fd_iter); + } + } + + if (temp_sock_fd_api && temp_sock_fd_api->ep_ready_fd_node.is_list_member()) { + temp_sock_fd_api->m_epoll_event_flags = 0; + m_ready_fds.erase(temp_sock_fd_api); + } + + // handle offloaded fds + if (fi->offloaded_index > 0) { + + //check if the index of fd, which is being removed, is the last one. + //if does, it is enough to decrease the val of m_n_offloaded_fds in order + //to shrink the offloaded fds array. + if (fi->offloaded_index < m_n_offloaded_fds) { + // remove fd and replace by last fd + m_p_offloaded_fds[fi->offloaded_index - 1] = + m_p_offloaded_fds[m_n_offloaded_fds - 1]; + + socket_fd_api* last_socket = fd_collection_get_sockfd(m_p_offloaded_fds[m_n_offloaded_fds - 1]); + if (last_socket && last_socket->get_epoll_context_fd() == m_epfd) { + last_socket->m_fd_rec.offloaded_index = fi->offloaded_index; + } else { + __log_warn("Failed to update the index of offloaded fd: %d last_socket %p\n", + m_p_offloaded_fds[m_n_offloaded_fds - 1], last_socket); + } + } + + --m_n_offloaded_fds; + } + + if (temp_sock_fd_api) { + temp_sock_fd_api->m_fd_rec.reset(); + unlock(); + m_ring_map_lock.lock(); + temp_sock_fd_api->remove_epoll_context(this); + m_ring_map_lock.unlock(); + lock(); + } + + __log_func("fd %d removed from epfd %d", fd, m_epfd); + return 0; +} + +int epfd_info::mod_fd(int fd, epoll_event *event) +{ + epoll_event evt; + epoll_fd_rec* fd_rec; + int ret; + + __log_funcall("fd=%d", fd); + // find the fd in local table + fd_rec = get_fd_rec(fd); + if (!fd_rec) { + errno = ENOENT; + return -1; + } + + socket_fd_api* temp_sock_fd_api = fd_collection_get_sockfd(fd); + // check if fd is offloaded that new event mask is OK + if (temp_sock_fd_api && temp_sock_fd_api->m_fd_rec.offloaded_index 
> 0) { + if (m_log_invalid_events && (event->events & ~SUPPORTED_EPOLL_EVENTS)) { + __log_dbg("invalid event mask 0x%x for offloaded fd=%d", event->events, fd); + __log_dbg("(event->events & ~%s)=0x%x", TO_STR(SUPPORTED_EPOLL_EVENTS), + event->events & ~SUPPORTED_EPOLL_EVENTS); + m_log_invalid_events--; + } + } + + if (temp_sock_fd_api && temp_sock_fd_api->skip_os_select()) { + __log_dbg("fd=%d must be skipped from os epoll()", fd); + } + else { + // modify fd + evt.events = event->events; + evt.data.u64 = 0; //zero all data + evt.data.fd = fd; + ret = orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_MOD, fd, &evt); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + __log_err("failed to modify fd=%d in epoll epfd=%d (errno=%d %m)", fd, m_epfd, errno); + return ret; + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + // modify fd data in local table + fd_rec->epdata = event->data; + fd_rec->events = event->events; + + bool is_offloaded = temp_sock_fd_api && temp_sock_fd_api->get_type()== FD_TYPE_SOCKET; + + uint32_t events = 0; + if (is_offloaded) { + // if the socket is ready, add it to ready events + if ((event->events & EPOLLIN) && temp_sock_fd_api->is_readable(NULL, NULL)) { + events |= EPOLLIN; + } + if ((event->events & EPOLLOUT) && temp_sock_fd_api->is_writeable()) { + // MNY: udp_socket is always ready to write. Both VMA and the OS will notify it. + // Can't remove notification in VMA in case user decides to skip the OS using VMA params. 
+ // Meaning: user will get 2 ready WRITE events on startup of socket + events |= EPOLLOUT; + } + if (events != 0) { + insert_epoll_event(temp_sock_fd_api, events); + } + } + + if (event->events == 0 || events == 0) { + if (temp_sock_fd_api && temp_sock_fd_api->ep_ready_fd_node.is_list_member()) { + temp_sock_fd_api->m_epoll_event_flags = 0; + m_ready_fds.erase(temp_sock_fd_api); + } + } + + __log_func("fd %d modified in epfd %d with events=%#x and data=%#x", + fd, m_epfd, event->events, event->data); + return 0; +} + +epoll_fd_rec* epfd_info::get_fd_rec(int fd) +{ + epoll_fd_rec* fd_rec = NULL; + socket_fd_api* temp_sock_fd_api = fd_collection_get_sockfd(fd); + lock(); + + if (temp_sock_fd_api && temp_sock_fd_api->get_epoll_context_fd() == m_epfd) { + fd_rec = &temp_sock_fd_api->m_fd_rec; + } else { + fd_info_map_t::iterator iter = m_fd_non_offloaded_map.find(fd); + if (iter != m_fd_non_offloaded_map.end()) { + fd_rec = &iter->second; + } + } + + unlock(); + return fd_rec; +} + +void epfd_info::fd_closed(int fd, bool passthrough) +{ + lock(); + if (get_fd_rec(fd)) { + del_fd(fd, passthrough); + } + unlock(); +} + +void epfd_info::insert_epoll_event_cb(socket_fd_api* sock_fd, uint32_t event_flags) +{ + lock(); + //EPOLLHUP | EPOLLERR are reported without user request + if (event_flags & (sock_fd->m_fd_rec.events | EPOLLHUP | EPOLLERR)) { + insert_epoll_event(sock_fd, event_flags); + } + unlock(); +} + +void epfd_info::insert_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags) +{ + // assumed lock + if (sock_fd->ep_ready_fd_node.is_list_member()) { + sock_fd->m_epoll_event_flags |= event_flags; + } + else { + sock_fd->m_epoll_event_flags = event_flags; + m_ready_fds.push_back(sock_fd); + } + + do_wakeup(); +} + +void epfd_info::remove_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags) +{ + sock_fd->m_epoll_event_flags &= ~event_flags; + if (sock_fd->m_epoll_event_flags == 0) { + m_ready_fds.erase(sock_fd); + } +} + +epoll_stats_t *epfd_info::stats() +{ 
+ return m_stats; +} + +int epfd_info::ring_poll_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array/* = NULL*/) +{ + __log_func(""); + + int ret_total = 0; + + if (m_ring_map.empty()) { + return ret_total; + } + + m_ring_map_lock.lock(); + + for (ring_map_t::iterator iter = m_ring_map.begin(); iter != m_ring_map.end(); iter++) { + int ret = iter->first->poll_and_process_element_rx(p_poll_sn, pv_fd_ready_array); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0 && errno != EAGAIN) { + __log_err("Error in ring->poll_and_process_element() of %p (errno=%d %m)", iter->first, errno); + m_ring_map_lock.unlock(); + return ret; + } + BULLSEYE_EXCLUDE_BLOCK_END + if (ret > 0) + __log_func("ring[%p] Returned with: %d (sn=%d)", iter->first, ret, *p_poll_sn); + ret_total += ret; + } + + m_ring_map_lock.unlock(); + + if (m_sysvar_thread_mode == THREAD_MODE_PLENTY && ret_total == 0 && errno == EAGAIN) pthread_yield(); + + if (ret_total) { + __log_func("ret_total=%d", ret_total); + } else { + __log_funcall("ret_total=%d", ret_total); + } + return ret_total; +} + +int epfd_info::ring_request_notification(uint64_t poll_sn) +{ + __log_func(""); + int ret_total = 0; + + if (m_ring_map.empty()) { + return ret_total; + } + + m_ring_map_lock.lock(); + + for (ring_map_t::iterator iter = m_ring_map.begin(); iter != m_ring_map.end(); iter++) { + int ret = iter->first->request_notification(CQT_RX, poll_sn); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + __log_err("Error ring[%p]->request_notification() (errno=%d %m)", iter->first, errno); + m_ring_map_lock.unlock(); + return ret; + } + BULLSEYE_EXCLUDE_BLOCK_END + __log_func("ring[%p] Returned with: %d (sn=%d)", iter->first, ret, poll_sn); + ret_total += ret; + } + + m_ring_map_lock.unlock(); + + return ret_total; +} + +int epfd_info::ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array /* = NULL*/) +{ + __log_func(""); + int ret_total = 0; + + while (!m_ready_cq_fd_q.empty()) { + + lock(); 
+ if (m_ready_cq_fd_q.empty()) { + unlock(); + break; + } + int fd = m_ready_cq_fd_q.back(); + m_ready_cq_fd_q.pop_back(); + unlock(); + + cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(fd); + if (p_cq_ch_info) { + ring* p_ready_ring = p_cq_ch_info->get_ring(); + // Handle the CQ notification channel + int ret = p_ready_ring->wait_for_notification_and_process_element(fd, p_poll_sn, pv_fd_ready_array); + if (ret < 0) { + if (errno == EAGAIN) { + __log_dbg("Error in ring->wait_for_notification_and_process_element() of %p (errno=%d %m)", p_ready_ring, errno); + } + else { + __log_err("Error in ring->wait_for_notification_and_process_element() of %p (errno=%d %m)", p_ready_ring, errno); + } + continue; + } + if (ret > 0) { + __log_func("ring[%p] Returned with: %d (sn=%d)", p_ready_ring, ret, *p_poll_sn); + } + ret_total += ret; + } + else { + __log_dbg("failed to find channel fd. removing cq fd=%d from epfd=%d", fd, m_epfd); + BULLSEYE_EXCLUDE_BLOCK_START + if ((orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, + fd, NULL)) && (!(errno == ENOENT || errno == EBADF))) { + __log_err("failed to del cq channel fd=%d from os epfd=%d (errno=%d %m)", fd, m_epfd, errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + } + + if (ret_total) { + __log_func("ret_total=%d", ret_total); + } else { + __log_funcall("ret_total=%d", ret_total); + } + return ret_total; +} + +void epfd_info::clean_obj() +{ + if (g_p_fd_collection) + g_p_fd_collection->remove_epfd_from_list(this); + cleanable_obj::clean_obj(); +} + +void epfd_info::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) +{ + size_t num_rings, num_ready_fds, num_ready_cq_fd; + int offloaded_str_place, i = 0; + char offloaded_str[VLOGGER_STR_SIZE]; + + // Prepare data + num_rings = m_ring_map.size(); + iomux_func_stats_t temp_iomux_stats = m_stats->stats; + num_ready_fds = m_ready_fds.size(); + num_ready_cq_fd = m_ready_cq_fd_q.size(); + + // Epoll data + vlog_printf(log_level, "Fd number : %d\n", m_epfd); + 
vlog_printf(log_level, "Size : %d\n", m_size); + + vlog_printf(log_level, "Offloaded Fds : %d\n", m_n_offloaded_fds); + + while (i < m_n_offloaded_fds) { + memset(offloaded_str, 0, sizeof(offloaded_str)); + for (offloaded_str_place = 0; offloaded_str_place < EPFD_MAX_OFFLOADED_STR && i < m_n_offloaded_fds; i++) { + int n = snprintf(&offloaded_str[offloaded_str_place], sizeof(offloaded_str) - offloaded_str_place - 1, " %d", m_p_offloaded_fds[i]); + if (!likely((0 < n) && (n < (int)(sizeof(offloaded_str) - offloaded_str_place - 1)))) { + break; + } + offloaded_str_place += n; + } + + offloaded_str[offloaded_str_place] = '\0'; + vlog_printf(log_level, "Offloaded Fds list: %s\n", offloaded_str); + } + + vlog_printf(log_level, "Number of rings : %u\n", num_rings); + vlog_printf(log_level, "Number of ready Fds : %u\n", num_ready_fds); + vlog_printf(log_level, "Number of ready CQ Fds : %u\n", num_ready_cq_fd); + + if (temp_iomux_stats.n_iomux_os_rx_ready || temp_iomux_stats.n_iomux_rx_ready || temp_iomux_stats.n_iomux_timeouts || temp_iomux_stats.n_iomux_errors || + temp_iomux_stats.n_iomux_poll_miss || temp_iomux_stats.n_iomux_poll_hit) { + + vlog_printf(log_level, "Polling CPU : %d%%\n", temp_iomux_stats.n_iomux_polling_time); + + if (temp_iomux_stats.threadid_last != 0) + vlog_printf(log_level, "Thread Id : %5u\n", temp_iomux_stats.threadid_last); + + if (temp_iomux_stats.n_iomux_os_rx_ready || temp_iomux_stats.n_iomux_rx_ready) + vlog_printf(log_level, "Rx fds ready : %u / %u [os/offload]\n", temp_iomux_stats.n_iomux_os_rx_ready, temp_iomux_stats.n_iomux_rx_ready); + + if (temp_iomux_stats.n_iomux_poll_miss + temp_iomux_stats.n_iomux_poll_hit) { + double iomux_poll_hit = (double)temp_iomux_stats.n_iomux_poll_hit; + double iomux_poll_hit_percentage = (iomux_poll_hit / (iomux_poll_hit + (double)temp_iomux_stats.n_iomux_poll_miss)) * 100; + vlog_printf(log_level, "Polls [miss/hit] : %u / %u (%2.2f%%)\n", temp_iomux_stats.n_iomux_poll_miss, 
temp_iomux_stats.n_iomux_poll_hit, iomux_poll_hit_percentage); + + if (temp_iomux_stats.n_iomux_timeouts) + vlog_printf(log_level, "Timeouts : %u\n", temp_iomux_stats.n_iomux_timeouts); + + if (temp_iomux_stats.n_iomux_errors) + vlog_printf(log_level, "Errors : %u\n", temp_iomux_stats.n_iomux_errors); + } + } +} + +void epfd_info::set_os_data_available() +{ + auto_unlocker locker(m_lock_poll_os); + m_b_os_data_available = true; +} + +void epfd_info::register_to_internal_thread() +{ + auto_unlocker locker(m_lock_poll_os); + m_b_os_data_available = false; + + // Reassign EPOLLIN event + g_p_event_handler_manager->update_epfd(m_epfd, EPOLL_CTL_MOD, EPOLLIN | EPOLLPRI | EPOLLONESHOT); +} + +bool epfd_info::get_and_unset_os_data_available() +{ + auto_unlocker locker(m_lock_poll_os); + bool ret = m_b_os_data_available; + m_b_os_data_available = false; + return ret; +} diff --git a/src/vma/iomux/epfd_info.h b/src/vma/iomux/epfd_info.h new file mode 100644 index 0000000..3b3e434 --- /dev/null +++ b/src/vma/iomux/epfd_info.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef VMA_EPOLL_H +#define VMA_EPOLL_H + +#include +#include +#include + +typedef vma_list_t ep_ready_fd_list_t; +typedef vma_list_t fd_info_list_t; +typedef std::tr1::unordered_map fd_info_map_t; +typedef std::tr1::unordered_map ring_map_t; +typedef std::deque ready_cq_fd_q_t; + +class epfd_info : public lock_mutex_recursive, public cleanable_obj, public wakeup_pipe +{ +public: + epfd_info(int epfd, int size); + ~epfd_info(); + + /** + * Lock and perform epoll_ctl. + * Arguments the same as for epoll_ctl() + */ + int ctl(int op, int fd, epoll_event *event); + + /** + * Get the offloaded fds array and its length. + * @param adress of the pointer to number of offloaded fds. + * @param adress of the offloaded fds array. + */ + void get_offloaded_fds_arr_and_size(int **p_p_num_offloaded_fds, + int **p_p_offloadded_fds); + + /** + * check if fd is cq fd according to the data. + * if it is, save the fd in ready cq fds queue. + * @param data field from event data + * @return true if fd is cq fd + */ + bool is_cq_fd(uint64_t data); + + /** + * Get the original user data posted with this fd. + * @param fd File descriptor. + * @return Pointer to user data if the data for this fd was found. + */ + epoll_fd_rec* get_fd_rec(int fd); + + /** + * Called when fd is closed, to remove it from this set. + * @param fd Closed file descriptor. 
+ */ + void fd_closed(int fd, bool passthrough = false); + + ep_ready_fd_list_t m_ready_fds; + + /** + * @return Pointer to statistics block for this group + */ + epoll_stats_t *stats(); + + int ring_poll_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array = NULL); + + int ring_request_notification(uint64_t poll_sn); + + int ring_wait_for_notification_and_process_element(uint64_t *p_poll_sn, void* pv_fd_ready_array = NULL); + + virtual void clean_obj(); + + void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); + + // Called from the internal thread to mark that non offloaded data is available. + void set_os_data_available(); + + // Register this epfd to the internal thread, Called after non offloaded data has been received. + void register_to_internal_thread(); + + // Thread safe function which returns true if non offloaded data is available. + // Will also set m_b_os_data_available to false. + bool get_and_unset_os_data_available(); + + // Returns true if non offloaded data is available. 
+ inline bool get_os_data_available() {return m_b_os_data_available;} + + static inline size_t epfd_info_node_offset(void) {return NODE_OFFSET(epfd_info, epfd_info_node);} + list_node epfd_info_node; + +private: + + const int m_epfd; + int m_size; + int *m_p_offloaded_fds; + int m_n_offloaded_fds; + fd_info_map_t m_fd_non_offloaded_map; + fd_info_list_t m_fd_offloaded_list; + ring_map_t m_ring_map; + lock_mutex_recursive m_ring_map_lock; + lock_spin m_lock_poll_os; + const thread_mode_t m_sysvar_thread_mode; + ready_cq_fd_q_t m_ready_cq_fd_q; + epoll_stats_t m_local_stats; + epoll_stats_t *m_stats; + int m_log_invalid_events; + bool m_b_os_data_available; // true when non offloaded data is available + + int add_fd(int fd, epoll_event *event); + int del_fd(int fd, bool passthrough = false); + int mod_fd(int fd, epoll_event *event); + +public: + int get_epoll_fd() {return m_epfd;}; + int remove_fd_from_epoll_os(int fd); + inline size_t get_fd_non_offloaded_size() {return m_fd_non_offloaded_map.size();} + inline size_t get_fd_offloaded_size() {return m_fd_offloaded_list.size();} + void insert_epoll_event_cb(socket_fd_api* sock_fd, uint32_t event_flags); + void insert_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags); + void remove_epoll_event(socket_fd_api *sock_fd, uint32_t event_flags); + void increase_ring_ref_count(ring* ring); + void decrease_ring_ref_count(ring* ring); +}; + +#endif + diff --git a/src/vma/iomux/epoll_wait_call.cpp b/src/vma/iomux/epoll_wait_call.cpp new file mode 100644 index 0000000..07530e1 --- /dev/null +++ b/src/vma/iomux/epoll_wait_call.cpp @@ -0,0 +1,397 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "epoll_wait_call.h" + +#include + +#include +#include +#include +#include + +#include "epfd_info.h" + +#define MODULE_NAME "epoll_wait_call:" + +epoll_wait_call::epoll_wait_call(epoll_event *extra_events_buffer, offloaded_mode_t *off_modes_buffer, + int epfd, epoll_event *events, int maxevents, + int timeout, const sigset_t *sigmask /* = NULL */) : + io_mux_call(NULL, off_modes_buffer, 0, sigmask), // TODO: rethink on these arguments + m_epfd(epfd), m_events(events), m_maxevents(maxevents), m_timeout(timeout), + m_p_ready_events(extra_events_buffer) +{ + // get epfd_info + m_epfd_info = fd_collection_get_epfd(epfd); + if (!m_epfd_info || maxevents <= 0) { + __log_dbg("error, epfd %d not found or maxevents <= 0 (=%d)", epfd, maxevents); + errno = maxevents <= 0 ? EINVAL : EBADF; + vma_throw_object(io_mux_call::io_error); + } + + // create stats + m_p_stats = &m_epfd_info->stats()->stats; +} + +void epoll_wait_call::init_offloaded_fds() +{ + // copy offloaded_fds pointer and count + m_epfd_info->get_offloaded_fds_arr_and_size(&m_p_num_all_offloaded_fds, &m_p_all_offloaded_fds); + m_num_all_offloaded_fds = *m_p_num_all_offloaded_fds; // TODO: fix orig ugly code, and then remove this + + __log_func("building: epfd=%d, m_epfd_info->get_fd_offloaded_size()=%zu, m_epfd_info->get_fd_non_offloaded_size()=%zu, *m_p_num_all_offloaded_fds=%d", + m_epfd, m_epfd_info->get_fd_offloaded_size(), m_epfd_info->get_fd_non_offloaded_size(), *m_p_num_all_offloaded_fds); +} + +int epoll_wait_call::get_current_events() +{ + if (m_epfd_info->m_ready_fds.empty()) { + return m_n_all_ready_fds; + } + + vma_list_t socket_fd_list; + lock(); + int i, ready_rfds = 0, ready_wfds = 0; + i = m_n_all_ready_fds; + socket_fd_api *p_socket_object; + ep_ready_fd_list_t::iterator iter = m_epfd_info->m_ready_fds.begin(); + while (iter != m_epfd_info->m_ready_fds.end() && i < m_maxevents) { + p_socket_object = *iter; + ++iter; + + m_events[i].events = 0; //initialize + + bool got_event = 
false; + + //epoll_wait will always wait for EPOLLERR and EPOLLHUP; it is not necessary to set it in events. + uint32_t mutual_events = p_socket_object->m_epoll_event_flags & (p_socket_object->m_fd_rec.events | EPOLLERR | EPOLLHUP); + + //EPOLLHUP & EPOLLOUT are mutually exclusive. see poll man pages. epoll adapt poll behavior. + if ((mutual_events & EPOLLHUP) && (mutual_events & EPOLLOUT)) { + mutual_events &= ~EPOLLOUT; + } + + if (mutual_events & EPOLLIN) { + if (handle_epoll_event(p_socket_object->is_readable(NULL), EPOLLIN, p_socket_object, i)) { + ready_rfds++; + got_event = true; + } + mutual_events &= ~EPOLLIN; + } + + if (mutual_events & EPOLLOUT) { + if (handle_epoll_event(p_socket_object->is_writeable(), EPOLLOUT, p_socket_object, i)) { + ready_wfds++; + got_event = true; + } + mutual_events &= ~EPOLLOUT; + } + + if (mutual_events) { + if (handle_epoll_event(true, mutual_events, p_socket_object, i)) { + got_event = true; + } + } + + if (got_event) { + socket_fd_list.push_back(p_socket_object); + ++i; + } + } + + m_n_ready_rfds += ready_rfds; + m_n_ready_wfds += ready_wfds; + m_p_stats->n_iomux_rx_ready += ready_rfds; + + unlock(); + + /* + * for checking ring migration we need a socket context. + * in epoll we separate the rings from the sockets, so only here we access the sockets. + * therefore, it is most convenient to check it here. + * we need to move the ring migration to the epfd, going over the registered sockets, + * when polling the rings was not fruitful. + * this will be more similar to the behavior of select/poll. 
+ * see RM task 212058 + */ + while (!socket_fd_list.empty()) { + socket_fd_api* sockfd = socket_fd_list.get_and_pop_front(); + sockfd->consider_rings_migration(); + } + + return (i); +} + +epoll_wait_call::~epoll_wait_call() +{ +} + +void epoll_wait_call::prepare_to_block() +{ + // Empty +} + +bool epoll_wait_call::_wait(int timeout) +{ + int i, ready_fds, fd; + bool cq_ready = false; + epoll_fd_rec* fd_rec; + + __log_func("calling os epoll: %d", m_epfd); + + if (timeout) { + lock(); + if (m_epfd_info->m_ready_fds.empty()) { + m_epfd_info->going_to_sleep(); + } else { + timeout = 0; + } + unlock(); + } + + if (m_sigmask) { + ready_fds = orig_os_api.epoll_pwait(m_epfd, m_p_ready_events, m_maxevents, timeout, m_sigmask); + } else { + ready_fds = orig_os_api.epoll_wait(m_epfd, m_p_ready_events, m_maxevents, timeout); + } + + if (timeout) { + lock(); + m_epfd_info->return_from_sleep(); + unlock(); + } + + if (ready_fds < 0) { + vma_throw_object(io_mux_call::io_error); + } + + // convert the returned events to user events and mark offloaded fds + m_n_all_ready_fds = 0; + for (i = 0; i < ready_fds; ++i) { + fd = m_p_ready_events[i].data.fd; + + // wakeup event + if(m_epfd_info->is_wakeup_fd(fd)) + { + lock(); + m_epfd_info->remove_wakeup_fd(); + unlock(); + continue; + } + + // If it's CQ + if (m_epfd_info->is_cq_fd(m_p_ready_events[i].data.u64)) { + cq_ready = true; + continue; + } + + if (m_p_ready_events[i].events & EPOLLIN) { + socket_fd_api* temp_sock_fd_api = fd_collection_get_sockfd(fd); + if (temp_sock_fd_api) { + // Instructing the socket to sample the OS immediately to prevent hitting EAGAIN on recvfrom(), + // after iomux returned a shadow fd as ready (only for non-blocking sockets) + temp_sock_fd_api->set_immediate_os_sample(); + } + } + + // Copy event bits and data + m_events[m_n_all_ready_fds].events = m_p_ready_events[i].events; + fd_rec = m_epfd_info->get_fd_rec(fd); + if (fd_rec) { + m_events[m_n_all_ready_fds].data = fd_rec->epdata; + 
++m_n_all_ready_fds; + } else { + __log_dbg("error - could not found fd %d in m_fd_info of epfd %d", fd, m_epfd); + } + } + + return cq_ready; +} + +bool epoll_wait_call::wait_os(bool zero_timeout) +{ + return _wait(zero_timeout ? 0 : m_timeout); +} + +bool epoll_wait_call::wait(const timeval &elapsed) +{ + int timeout; + + if (m_timeout < 0) { + timeout = m_timeout; + } else { + timeout = m_timeout - tv_to_msec(&elapsed); + if (timeout < 0) { + // Already reached timeout + return false; + } + } + + return _wait(timeout); +} + +bool epoll_wait_call::is_timeout(const timeval &elapsed) +{ + return m_timeout >= 0 && m_timeout <= tv_to_msec(&elapsed); +} + +void epoll_wait_call::set_offloaded_rfd_ready(int fd_index) +{ + // Empty - event inserted via event callback + NOT_IN_USE(fd_index); +} + +void epoll_wait_call::set_offloaded_wfd_ready(int fd_index) +{ + // Empty + NOT_IN_USE(fd_index); +} + +void epoll_wait_call::set_rfd_ready(int fd) +{ + // Empty + NOT_IN_USE(fd); +} + +void epoll_wait_call::set_wfd_ready(int fd) +{ + // Empty + NOT_IN_USE(fd); +} + +void epoll_wait_call::set_efd_ready(int fd, int errors) +{ + // Empty + NOT_IN_USE(fd); + NOT_IN_USE(errors); +} + +void epoll_wait_call::lock() +{ + m_epfd_info->lock(); +} + +void epoll_wait_call::unlock() +{ + m_epfd_info->unlock(); +} + +bool epoll_wait_call::check_all_offloaded_sockets() +{ + // check cq for acks + ring_poll_and_process_element(); + m_n_all_ready_fds = get_current_events(); + + __log_func("m_n_all_ready_fds=%d, m_n_ready_rfds=%d, m_n_ready_wfds=%d", m_n_all_ready_fds, m_n_ready_rfds, m_n_ready_wfds); + return m_n_all_ready_fds; +} + +bool epoll_wait_call::immidiate_return(int &poll_os_countdown) +{ + NOT_IN_USE(poll_os_countdown); + return false; +} + +bool epoll_wait_call::handle_epoll_event(bool is_ready, uint32_t events, socket_fd_api *socket_object, int index) +{ + if (is_ready) { + epoll_fd_rec& fd_rec = socket_object->m_fd_rec; + m_events[index].data = fd_rec.epdata; + 
m_events[index].events |= events; + + if (fd_rec.events & EPOLLONESHOT) { + // Clear events for this fd + fd_rec.events &= ~events; + } + if (fd_rec.events & EPOLLET) { + m_epfd_info->remove_epoll_event(socket_object, events); + } + return true; + } + else { + // not readable, need to erase from our ready list (LT support) + m_epfd_info->remove_epoll_event(socket_object, events); + return false; + } + +} + +bool epoll_wait_call::handle_os_countdown(int &poll_os_countdown) +{ + NOT_IN_USE(poll_os_countdown); + + if (!m_epfd_info->get_os_data_available() || !m_epfd_info->get_and_unset_os_data_available()) { + return false; + } + + /* + * Poll OS when the internal thread found non offloaded data. + */ + bool cq_ready = wait_os(true); + + m_epfd_info->register_to_internal_thread(); + + if (cq_ready) { + // This will empty the cqepfd + // (most likely in case of a wakeup and probably only under epoll_wait (Not select/poll)) + ring_wait_for_notification_and_process_element(NULL); + } + /* Before we exit with ready OS fd's we'll check the CQs once more and exit + * below after calling check_all_offloaded_sockets(); + * IMPORTANT : We cannot do an opposite with current code, + * means we cannot poll cq and then poll os (for epoll) - because poll os + * will delete ready offloaded fds. 
+ */ + if (m_n_all_ready_fds) { + m_p_stats->n_iomux_os_rx_ready += m_n_all_ready_fds; // TODO: fix it - we only know all counter, not read counter + check_all_offloaded_sockets(); + return true; + } + + return false; +} + +int epoll_wait_call::ring_poll_and_process_element() +{ + return m_epfd_info->ring_poll_and_process_element(&m_poll_sn, NULL); +} + +int epoll_wait_call::ring_request_notification() +{ + return m_epfd_info->ring_request_notification(m_poll_sn); +} + +int epoll_wait_call::ring_wait_for_notification_and_process_element(void* pv_fd_ready_array) +{ + return m_epfd_info->ring_wait_for_notification_and_process_element(&m_poll_sn, pv_fd_ready_array); +} diff --git a/src/vma/iomux/epoll_wait_call.h b/src/vma/iomux/epoll_wait_call.h new file mode 100644 index 0000000..3f3703e --- /dev/null +++ b/src/vma/iomux/epoll_wait_call.h @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef _EPOLL_WAIT_CALL_H +#define _EPOLL_WAIT_CALL_H + +#include +#include + +#include "io_mux_call.h" + +class epfd_info; + +/** + * @class poll_call + * Functor for poll() + */ +class epoll_wait_call : public io_mux_call +{ +public: + /** + * Create an epoll_wait call. + * @param extra_events_buffer Array of at least maxevents size. + * @param ready_event_map_buffer Array of at least maxevents size. + * + * Rest of the arguments are the same as for poll() library function. + * @throws io_mux_call::io_error + */ + epoll_wait_call(epoll_event *extra_events_buffer, offloaded_mode_t *off_modes_buffer, + int epfd, epoll_event *events, int maxevents, int timeout, const sigset_t *sigmask = NULL); + virtual ~epoll_wait_call(); + + /// @override + virtual void set_offloaded_rfd_ready(int fd_index); + virtual void set_offloaded_wfd_ready(int fd_index); + + /// @override + virtual void prepare_to_block(); + + /// @override + virtual bool wait_os(bool zero_timeout); + + /// @override + virtual bool wait(const timeval &elapsed); + + /// @override + virtual bool is_timeout(const timeval &elapsed); + + /// @override + virtual void set_rfd_ready(int fd); + + /// @override + virtual void set_wfd_ready(int fd); + + /// @override + virtual void set_efd_ready(int fd, int errors); + + /// @override + virtual void lock(); + + /// @override + virtual void unlock(); + + /// @override + virtual bool immidiate_return(int &poll_os_countdown); + + /// @override + virtual bool check_all_offloaded_sockets(); + + void init_offloaded_fds(); + + int get_current_events(); + + bool handle_epoll_event(bool is_ready, uint32_t events, socket_fd_api *socket_object, int index); + +protected: + virtual int 
ring_poll_and_process_element(); + + virtual int ring_request_notification(); + + virtual int ring_wait_for_notification_and_process_element(void* pv_fd_ready_array); + + virtual bool handle_os_countdown(int &poll_os_countdown); + +private: + bool _wait(int timeout); + + /// Parameters for the call + const int m_epfd; + epoll_event * const m_events; + const int m_maxevents; + const int m_timeout; + + epoll_event *m_p_ready_events; + epfd_info *m_epfd_info; +}; + +#endif diff --git a/src/vma/iomux/io_mux_call.cpp b/src/vma/iomux/io_mux_call.cpp new file mode 100644 index 0000000..6a16153 --- /dev/null +++ b/src/vma/iomux/io_mux_call.cpp @@ -0,0 +1,590 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "io_mux_call.h" + +#include "utils/clock.h" +#include "vlogger/vlogger.h" +#include +#include +#include +#include "vma/util/instrumentation.h" + +//#define IOMUX_DEBUG +#ifdef IOMUX_DEBUG +#define __if_dbg(_log_args_...) __log_dbg(_log_args_) +#else +#define __if_dbg(_log_args_...) +#endif + +uint64_t g_polling_time_usec=0; //polling time in the last second in usec +timeval g_last_zero_polling_time; //the last time g_polling_time_usec was zeroed +int g_n_last_checked_index = 0; //save the last fd index we checked in check_offloaded_rsockets() + +#define MODULE_NAME "io_mux_call:" + +int io_mux_call::m_n_skip_os_count = 0; + +inline void io_mux_call::timer_update() +{ + if (!tv_isset(&m_start)) { + // after first loop - set + gettime(&m_start); + __log_func("start timer"); + } + else { + timeval current; + gettime(¤t); + tv_sub(¤t, &m_start, &m_elapsed); + __log_funcall("update timer (elapsed time: %d sec, %d usec)", m_elapsed.tv_sec, m_elapsed.tv_usec); + } +} + +inline void io_mux_call::check_rfd_ready_array(fd_array_t *fd_ready_array) +{ + int fd_index; + + for (fd_index=0; fd_index < fd_ready_array->fd_count; ++fd_index) { + set_rfd_ready(fd_ready_array->fd_list[fd_index]); + } + if (m_n_ready_rfds) { + m_p_stats->n_iomux_rx_ready += m_n_ready_rfds; + __log_func("found ready_fds=%d", m_n_ready_rfds); + //return true; + } + //return false; +} + +inline void io_mux_call::check_offloaded_wsockets() +{ + for (int offloaded_index = 0; offloaded_index < *m_p_num_all_offloaded_fds; ++offloaded_index) { + +// int fd = m_p_offloaded_wfds[offloaded_index]; + + if (m_p_offloaded_modes[offloaded_index] & OFF_WRITE) { + int fd = m_p_all_offloaded_fds[offloaded_index]; + socket_fd_api 
*p_socket_object = fd_collection_get_sockfd(fd); + if (!p_socket_object) { + // If we can't find this previously mapped offloaded socket + // then it was probably closed. We need to get out with error code + errno = EBADF; + vma_throw_object(io_mux_call::io_error); + } + + // Poll the socket object + if (p_socket_object->is_writeable()) { + set_wfd_ready(fd); + } + } + } +} + +inline void io_mux_call::check_offloaded_esockets() +{ + for (int offloaded_index = 0; offloaded_index < *m_p_num_all_offloaded_fds; ++offloaded_index) { + if (m_p_offloaded_modes[offloaded_index] & OFF_RDWR) { + int fd = m_p_all_offloaded_fds[offloaded_index]; + socket_fd_api *p_socket_object = fd_collection_get_sockfd(fd); + if (!p_socket_object) { + // If we can't find this previously mapped offloaded socket + // then it was probably closed. We need to get out with error code + errno = EBADF; + vma_throw_object(io_mux_call::io_error); + } + + // Poll the socket object + int errors = 0; + if (p_socket_object->is_errorable(&errors)) { + set_efd_ready(fd, errors); + } + } + } +} + +inline bool io_mux_call::check_all_offloaded_sockets() +{ + check_offloaded_rsockets(); + + if (!m_n_ready_rfds) + { + // check cq for acks + ring_poll_and_process_element(); + check_offloaded_wsockets(); + check_offloaded_esockets(); + } + + __log_func("m_n_all_ready_fds=%d, m_n_ready_rfds=%d, m_n_ready_wfds=%d, m_n_ready_efds=%d", m_n_all_ready_fds, m_n_ready_rfds, m_n_ready_wfds, m_n_ready_efds); + return m_n_all_ready_fds; +} + +inline void io_mux_call::zero_polling_cpu(timeval current) +{ + timeval delta; + int delta_time; // in usec + + // check if it's time to zero g_polling_time_usec + tv_sub(¤t, &g_last_zero_polling_time, &delta); + delta_time=tv_to_usec(&delta); + + if (delta_time>=USEC_PER_SEC) { + m_p_stats->n_iomux_polling_time = (g_polling_time_usec*100)/delta_time; + + __log_funcall("zero polling time: accumulated: %d usec delta=%d (%d%))", g_polling_time_usec, delta_time, 
m_p_stats->n_iomux_polling_time); + g_polling_time_usec=0; + g_last_zero_polling_time = current; + } +} + +io_mux_call::io_mux_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer, int num_fds, const sigset_t *sigmask) : + m_check_sig_pending_ratio(0), + m_n_sysvar_select_skip_os_fd_check(safe_mce_sys().select_skip_os_fd_check), + m_n_sysvar_select_poll_os_ratio(safe_mce_sys().select_poll_os_ratio), + m_n_sysvar_select_poll_num(safe_mce_sys().select_poll_num), + m_b_sysvar_select_poll_os_force(safe_mce_sys().select_poll_os_force), + m_b_sysvar_select_handle_cpu_usage_stats(safe_mce_sys().select_handle_cpu_usage_stats), + m_p_all_offloaded_fds(off_fds_buffer), + m_p_offloaded_modes(off_modes_buffer), + m_num_all_offloaded_fds(0), + m_cqepfd(-1), + m_poll_sn(0), + m_p_stats(NULL), + m_n_all_ready_fds(0), + m_n_ready_rfds(0), + m_n_ready_wfds(0), + m_n_ready_efds(0), + m_sigmask(sigmask) +{ + m_p_num_all_offloaded_fds = &m_num_all_offloaded_fds; + tv_clear(&m_start); + tv_clear(&m_elapsed); + + if (m_p_all_offloaded_fds) memset(m_p_all_offloaded_fds, 0, num_fds*sizeof(m_p_all_offloaded_fds[0])); + if (m_p_offloaded_modes) memset(m_p_offloaded_modes , 0, num_fds*sizeof(m_p_offloaded_modes[0])); + + m_fd_ready_array.fd_max = FD_ARRAY_MAX; + m_fd_ready_array.fd_count = 0; +} + +void io_mux_call::check_offloaded_rsockets() +{ + int fd, offloaded_index, num_all_offloaded_fds; + fd_array_t fd_ready_array; + socket_fd_api *p_socket_object; + + fd_ready_array.fd_max = FD_ARRAY_MAX; + + offloaded_index = g_n_last_checked_index; + num_all_offloaded_fds = *m_p_num_all_offloaded_fds; + + for (int i = 0; i < num_all_offloaded_fds; ++i) { + + ++offloaded_index %= num_all_offloaded_fds; + + if (m_p_offloaded_modes[offloaded_index] & OFF_READ) { + fd = m_p_all_offloaded_fds[offloaded_index]; + p_socket_object = fd_collection_get_sockfd(fd); + if (!p_socket_object) { + // If we can't find this previously mapped offloaded socket + // then it was probably closed. 
We need to get out with error code + errno = EBADF; + g_n_last_checked_index = offloaded_index; + vma_throw_object(io_mux_call::io_error); + } + + fd_ready_array.fd_count = 0; + + // Poll the socket object + if (p_socket_object->is_readable(&m_poll_sn, &fd_ready_array)) { + set_offloaded_rfd_ready(offloaded_index); + // We have offloaded traffic. Don't sample the OS immediately + p_socket_object->unset_immediate_os_sample(); + } + + check_rfd_ready_array(&fd_ready_array); + + + //TODO: consider - m_n_all_ready_fds + if (m_n_ready_rfds){ + g_n_last_checked_index = offloaded_index; + return ; + } + + } + } + g_n_last_checked_index = offloaded_index; + //return false; +} + +bool io_mux_call::handle_os_countdown(int &poll_os_countdown) +{ + /* + * Poll OS when count down reaches zero. This honors CQ-OS ratio. + * This also handles the 0 ratio case - do not poll OS at all. + */ + if (poll_os_countdown-- == 0 && m_n_sysvar_select_poll_os_ratio > 0) { + if (wait_os(true)) { + // This will empty the cqepfd + // (most likely in case of a wakeup and probably only under epoll_wait (Not select/poll)) + ring_wait_for_notification_and_process_element(NULL); + } + /* Before we exit with ready OS fd's we'll check the CQs once more and exit + * below after calling check_all_offloaded_sockets(); + * IMPORTANT : We cannot do an opposite with current code, + * means we cannot poll cq and then poll os (for epoll) - because poll os + * will delete ready offloaded fds. 
+ */ + if (m_n_all_ready_fds) { + m_p_stats->n_iomux_os_rx_ready += m_n_all_ready_fds; // TODO: fix it - we only know all counter, not read counter + check_all_offloaded_sockets(); + return true; + } + poll_os_countdown = m_n_sysvar_select_poll_os_ratio - 1; + } + + return false; +} + +void io_mux_call::polling_loops() +{ + int poll_counter; + int check_timer_countdown = 1; // Poll once before checking the time + int poll_os_countdown = 0; + bool multiple_polling_loops, finite_polling; + timeval before_polling_timer = TIMEVAL_INITIALIZER, after_polling_timer = TIMEVAL_INITIALIZER, delta; + + if(immidiate_return(poll_os_countdown)) { + return; + } + +#ifdef VMA_TIME_MEASURE + TAKE_T_POLL_START; + ZERO_POLL_COUNT; +#endif + + poll_counter = 0; + finite_polling = m_n_sysvar_select_poll_num != -1; + multiple_polling_loops = m_n_sysvar_select_poll_num != 0; + + timeval poll_duration; + tv_clear(&poll_duration); + poll_duration.tv_usec = m_n_sysvar_select_poll_num; + + __if_dbg("2nd scenario start"); + + if (m_b_sysvar_select_handle_cpu_usage_stats) { + // handle polling cpu statistics + if (!tv_isset(&g_last_zero_polling_time)) { + // after first loop - set + gettime(&g_last_zero_polling_time); + } + + gettime(&before_polling_timer); + zero_polling_cpu(before_polling_timer); + } + + do { +#ifdef VMA_TIME_MEASURE + INC_POLL_COUNT; +#endif + __log_funcall("2nd scenario loop %d", poll_counter); + __log_funcall("poll_os_countdown=%d, select_poll_os_ratio=%d, check_timer_countdown=%d, m_num_offloaded_rfds=%d," + " m_n_all_ready_fds=%d, m_n_ready_rfds=%d, m_n_ready_wfds=%d, m_n_ready_efds=%d, multiple_polling_loops=%d", + poll_os_countdown, m_n_sysvar_select_poll_os_ratio, check_timer_countdown, *m_p_num_all_offloaded_fds, + m_n_all_ready_fds, m_n_ready_rfds, m_n_ready_wfds, m_n_ready_efds, multiple_polling_loops); + + if (handle_os_countdown(poll_os_countdown)) { + // Break if non-offloaded data was found. + break; + } + + /* + * Poll offloaded sockets. 
+ * If this is successful we must exit - wait_os() might mess the results. + */ + //__log_func("before check_all_offloaded_sockets"); + if (check_all_offloaded_sockets()) + break; + + + /* + * Update elapsed time & Check for timeout or expiry of polling loops duration + * Update start time on first entry + */ + if (check_timer_countdown <= 1) { + timer_update(); + if (is_timeout(m_elapsed)) { + __if_dbg("2nd scenario timeout (loop %d, elapsed %d)", poll_counter, m_elapsed.tv_usec); + __if_dbg("timeout (loop %d, elapsed %d)", poll_counter, m_elapsed.tv_usec); + break; + } + + if (finite_polling && (tv_cmp(&poll_duration, &m_elapsed, <=))) { + __if_dbg("2nd scenario reached max poll duration (loop %d, elapsed %d)", poll_counter, m_elapsed.tv_usec); + __if_dbg("timeout reached max poll duration (loop %d, elapsed %d)", poll_counter, m_elapsed.tv_usec); + break; + } + + // Check the timer each 512 offloaded fd's checked + check_timer_countdown = 512; + + __if_dbg("2nd scenario timer update (loop %d, elapsed %d)", poll_counter, m_elapsed.tv_usec); + } + + // update timer check with referance to number of offlaoded sockets in loop + check_timer_countdown -= *m_p_num_all_offloaded_fds; + //check_timer_countdown -= m_num_offloaded_wfds; //TODO: consider the appropriate factor + poll_counter++; + + if (g_b_exit || is_sig_pending()) { + errno = EINTR; + vma_throw_object(io_mux_call::io_error); + } + } while (m_n_all_ready_fds == 0 && multiple_polling_loops); + + if (m_b_sysvar_select_handle_cpu_usage_stats) { + // handle polling cpu statistics + gettime(&after_polling_timer); + + //calc accumulated polling time + tv_sub(&after_polling_timer, &before_polling_timer, &delta); + g_polling_time_usec += tv_to_usec(&delta); + + zero_polling_cpu(after_polling_timer); + } + + if (m_n_all_ready_fds) {//TODO: verify! 
+ ++m_p_stats->n_iomux_poll_hit; + __log_func("polling_loops found %d ready fds (rfds=%d, wfds=%d, efds=%d)", m_n_all_ready_fds, m_n_ready_rfds, m_n_ready_wfds, m_n_ready_efds); +#ifdef VMA_TIME_MEASURE + TAKE_T_POLL_END; +#endif + } + else { + ++m_p_stats->n_iomux_poll_miss; + } + + __if_dbg("2nd scenario exit (loop %d, elapsed %d)", poll_counter, m_elapsed.tv_usec); +} + +void io_mux_call::blocking_loops() +{ + int ret; + bool cq_ready = false; + bool woke_up_non_valid = false; + fd_array_t fd_ready_array; + fd_ready_array.fd_max = FD_ARRAY_MAX; + + prepare_to_block(); + + /* + * Loop as long as no fd's are found, and cq is ready. + * If wait() returns without cq ready - timeout expired. + */ + do { + if (g_b_exit || is_sig_pending()) { + errno = EINTR; + vma_throw_object(io_mux_call::io_error); + } + + woke_up_non_valid = false; + + ret = ring_request_notification(); + __log_func("arming cq with poll_sn=%lx ret=%d", m_poll_sn, ret); + if (ret < 0) { + vma_throw_object(io_mux_call::io_error); + } + else if (ret > 0) { + // arm failed - process pending wce + cq_ready = true; + fd_ready_array.fd_count = 0; + check_all_offloaded_sockets(); + } + else /* ret == 0 */ { + + timer_update(); + + // arming was successful - block on cq + __log_func("going to sleep (elapsed time: %d sec, %d usec)", m_elapsed.tv_sec, m_elapsed.tv_usec); + if (check_all_offloaded_sockets()) { + continue; + } + + cq_ready = wait(m_elapsed); + __log_func("wait() returned %d, m_n_all_ready_fds=%d", cq_ready, m_n_all_ready_fds); + if (cq_ready) { + fd_ready_array.fd_count = 0; + ring_wait_for_notification_and_process_element(&fd_ready_array); + // tcp sockets can be accept ready! 
+ __log_func("before check_all_offloaded_sockets"); + check_all_offloaded_sockets(); + // This hurts epoll and doesn't seem to make a different for the rest + //check_rfd_ready_array(&fd_ready_array); + } else if (!m_n_all_ready_fds && !is_timeout(m_elapsed)) { + __log_func("woke up by wake up mechanism, check current events"); + check_all_offloaded_sockets(); + if(!m_n_all_ready_fds) { + woke_up_non_valid = true; + __log_func("woke up by wake up mechanism but the events are no longer valid"); + } + } + } + } while (!m_n_all_ready_fds && (woke_up_non_valid || cq_ready) && !is_timeout(m_elapsed)); //TODO: consider sum r + w +} + +int io_mux_call::call() +{ + //TODO: need stats adjustments for write... + + __log_funcall(""); + + if (!m_b_sysvar_select_poll_os_force // TODO: evaluate/consider this logic + && (*m_p_num_all_offloaded_fds == 0)) + { + // 1st scenario + timer_update(); + wait_os(false); + if (g_b_exit || is_sig_pending()) { + errno = EINTR; + vma_throw_object(io_mux_call::io_error); + } + m_p_stats->n_iomux_os_rx_ready += m_n_ready_rfds; //TODO: check + + //wake up mechanism can bring up events of later joined offloaded sockets + if(*m_p_num_all_offloaded_fds) { + check_all_offloaded_sockets(); + if (m_n_all_ready_fds) goto done; + else { //false wake-up, and we already discovered that we should be in 2nd scenario + timer_update(); + if (is_timeout(m_elapsed)) goto done; + } + } else { + goto done; + } + } + + // 2nd scenario + polling_loops(); + + // 3rd scenario + if (!m_n_all_ready_fds && !is_timeout(m_elapsed)) { + blocking_loops(); + } + + done: + + if (m_n_all_ready_fds == 0) {//TODO: check + // An error throws an exception + ++m_p_stats->n_iomux_timeouts; + } + + __log_func("return %d", m_n_all_ready_fds); + return m_n_all_ready_fds; // TODO: consider sum r + w +} + +//check if we found anything in the constructor of select and poll +//override in epoll +bool io_mux_call::immidiate_return(int &poll_os_countdown){ + + prepare_to_poll(); + + 
if(m_n_all_ready_fds){ + m_n_ready_rfds = 0; //will be counted again in check_rfd_ready_array() + m_n_all_ready_fds = 0; + check_rfd_ready_array(&m_fd_ready_array); + ring_poll_and_process_element(); + return true; + } + + /* + * Give OS priority in 1 of SELECT_SKIP_OS times + * In all other times, OS is never polled first (even if ratio is 1). + */ + if (--m_n_skip_os_count <= 0) { + m_n_skip_os_count = m_n_sysvar_select_skip_os_fd_check; + poll_os_countdown = 0; + } else { + poll_os_countdown = m_n_sysvar_select_poll_os_ratio; + } + + return false; +} + +int io_mux_call::ring_poll_and_process_element() +{ + //TODO: (select, poll) this access all CQs, it is better to check only relevant ones + return g_p_net_device_table_mgr->global_ring_poll_and_process_element(&m_poll_sn, NULL); +} + +int io_mux_call::ring_request_notification() +{ + return g_p_net_device_table_mgr->global_ring_request_notification(m_poll_sn); +} + +int io_mux_call::ring_wait_for_notification_and_process_element(void* pv_fd_ready_array) +{ + return g_p_net_device_table_mgr->global_ring_wait_for_notification_and_process_element(&m_poll_sn, pv_fd_ready_array); +} + +bool io_mux_call::is_sig_pending() +{ + if (!m_sigmask) return false; + + if (m_check_sig_pending_ratio >= CHECK_INTERRUPT_RATIO) { + m_check_sig_pending_ratio = 0; + } else { + m_check_sig_pending_ratio++; + return false; + } + + sigset_t set_pending, set_andn; + sigemptyset(&set_pending); + sigemptyset(&set_andn); + + if (sigpending(&set_pending)) { + __log_err("sigpending() failed (errno = %d %m)", errno); + return false; + } + + sigandnset(&set_andn, &set_pending, m_sigmask); + + //good flow - first option - no signals + if (sigisemptyset(&set_andn)) { + __log_funcall("no pending signals which the user is waiting for"); + return false; + } + + //good flow - second options - pending signals - deliver them + sigsuspend(m_sigmask); + + return true; +} diff --git a/src/vma/iomux/io_mux_call.h b/src/vma/iomux/io_mux_call.h new file mode 
100644 index 0000000..4faf9f6 --- /dev/null +++ b/src/vma/iomux/io_mux_call.h @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef _IO_MUX_CALL_H +#define _IO_MUX_CALL_H + +#include +#include + +#include +#include +#include +#include + +//from sigset.h +#ifndef sigandnset +#define sigandnset(dest, left, right) \ + (__extension__ ({ int __cnt = _SIGSET_NWORDS; \ + sigset_t *__dest = (dest); \ + const sigset_t *__left = (left); \ + const sigset_t *__right = (right); \ + while (--__cnt >= 0) \ + __dest->__val[__cnt] = (__left->__val[__cnt] \ + & ~(__right->__val[__cnt])); \ + 0; })) +#endif + +#define CHECK_INTERRUPT_RATIO 0 + +extern timeval g_last_zero_polling_time; //the last time g_polling_time_usec was zeroed + +/** + * @class mux_call + * Base class for IO multiplexing system calls - select,poll,epoll_wait + */ +class io_mux_call +{ +public: + + enum offloaded_mode_t { + OFF_NONE = 0x0, + OFF_READ = 0x1, + OFF_WRITE = 0x2, + OFF_RDWR = OFF_READ | OFF_WRITE // offloaded for both read & write + }; + + + /** + * Create a multiplexing call. + * @param fds_buffer Pointer to a buffer large enough to hold all fds. + */ + io_mux_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer, int num_fds = 0, const sigset_t *sigmask = NULL); // = 0 is only temp + virtual ~io_mux_call() {}; + + /** + * Sets an offloaded file descriptor as ready. + * @param fd_index Index in offloaded_fds array. + * @return Whether fd was added. + * Also updates m_n_ready_rfds + */ + virtual void set_offloaded_rfd_ready(int fd_index) = 0; + virtual void set_offloaded_wfd_ready(int fd_index) = 0; + + /** + * Sets a file descriptor as ready. + * @param fd_index Index in offloaded_fds array. + * @return Whether fd was added. + * Also updates m_n_ready_rfds + */ + virtual void set_rfd_ready(int fd) = 0; + virtual void set_wfd_ready(int fd) = 0; + virtual void set_efd_ready(int fd, int errors) = 0; + /** + * Prepare to poll on fds + */ + virtual void prepare_to_poll() {}; + + /** + * Prepare to block on fds. + * Set m_cq_epfd. 
+ */ + virtual void prepare_to_block() = 0; + + /** + * Waits on original file descriptors only. + * Updates m_n_all_ready_fds. + * @param zero_timeout If true, wait with zero timeout. + * If false, wait with original timeout. + * @throws io_mux_call::io_error + */ + virtual bool wait_os(bool zero_timeout) = 0; + + /** + * Blocks until any fd (or cq_epfd) are ready, or timeout expires. + * Updates the timeout with time remaining. + * Updates m_n_all_ready_fds. + * + * @param elapsed Time elapsed since the call start. + * Should wait at most (timeout - elapsed). + * @return true if cq_epfd is ready. + * @throws io_mux_call::io_error + */ + virtual bool wait(const timeval &elapsed) = 0; + + /** + * Checks if there is a timeout (used in polling loops). + * @param elapsed Time elapsed since the call start. + * @return true if elapsed > timeout, false otherwise. + */ + virtual bool is_timeout(const timeval &elapsed) = 0; + + /** + * Call the function. + * @return Number of ready fds. + * @throws io_mux_call::io_error + * + * This is how it works: + * No offloaded sockets - redirect the call to OS. + * Otherwise: + * Loop N times until found or timeout: Poll all offloaded sockets, if + * nothing is found poll OS. + * If nothing is found yet: + * Loop until found or timeout: Arm the CQ and block on offloaded sockets + * plus CQ epfd. If CQ is found, poll offloaded sockets. If something else + * is found, return it. + */ + int call(); + + static inline void update_fd_array(fd_array_t* p_fd_array, int fd) + { + if (p_fd_array && (p_fd_array->fd_count < p_fd_array->fd_max)) { + // Check that fd doesn't exist in the array + for (int i=(p_fd_array->fd_count - 1); i>=0; i--) { + if (p_fd_array->fd_list[i] == fd) { + return; + } + } + p_fd_array->fd_list[p_fd_array->fd_count] = fd; + p_fd_array->fd_count++; + } + + } + + virtual bool immidiate_return(int &poll_os_countdown); + /** + * @class io_error + * Exception by OS IO functions. 
+ */ + + class io_error : public vma_exception { + public: + io_error(const char* _message, const char* _function, const char* _filename, int _lineno, int _errnum) throw() + : vma_exception(_message, _function, _filename, _lineno, _errnum) + { + + } + }; + +private: + + /** + * Go over offloaded fd's and check if their sockinfo is ready. + * If ready, calls set_offloaded_rfd_ready() & set_offloaded_wfd_ready() on that fd. + * @return Whether an fd is ready. + */ + virtual bool check_all_offloaded_sockets(); + inline void check_offloaded_rsockets(); + inline void check_offloaded_wsockets(); + inline void check_offloaded_esockets(); + + /** + * Loop: Poll the CQ and check for ready fds + */ + void polling_loops(); + + /** + * Loop: Block on CQ and check for ready fds + */ + void blocking_loops(); + + /** + * Internal timer update + * Used to update the elapsed time for is_timeout() calls + */ + inline void timer_update(); + + /** + * Check if the polling CPU var needs to be zeroed + * (internal for the statistics) + */ + inline void zero_polling_cpu(timeval current); + + /** + * Go over fd_ready_array and set all fd's in it as ready. + * @return Whether anything was found in the array. 
+ */ + inline void check_rfd_ready_array(fd_array_t *fd_ready_array); + + /** + * check if we have signal pending and the call need to be interrupted + */ + inline bool is_sig_pending(); + + /// counts the number times os poll was skipped + static int m_n_skip_os_count; + + int m_check_sig_pending_ratio; + + const uint32_t m_n_sysvar_select_skip_os_fd_check; + const uint32_t m_n_sysvar_select_poll_os_ratio; + const int32_t m_n_sysvar_select_poll_num; + const bool m_b_sysvar_select_poll_os_force; + const bool m_b_sysvar_select_handle_cpu_usage_stats; + +public: +protected: + + virtual int ring_poll_and_process_element(); + + virtual int ring_request_notification(); + + virtual int ring_wait_for_notification_and_process_element(void* pv_fd_ready_array); + + virtual bool handle_os_countdown(int &poll_os_countdown); + + /// Pointer to an array of all offloaded fd's + int *m_p_all_offloaded_fds; + offloaded_mode_t *m_p_offloaded_modes; + + //--- read handling + /// Number of offloaded fd's + int m_num_all_offloaded_fds; + + + /// Pointer to the number of offloaded fd's + int *m_p_num_all_offloaded_fds; + + //-- + /// CQ epoll file descriptor (wrapper) + int m_cqepfd; + + /// poll sn + uint64_t m_poll_sn; + + /// vma statistics. each implementation must initialize this. 
+ iomux_func_stats_t *m_p_stats; + + /// timer managment + timeval m_start, m_elapsed; + + /// number of total ready fds (r + w + x) + int m_n_all_ready_fds; + + // TODO: consider removing m_n_ready_rfds & m_n_ready_wfds + /// number of ready r fds + int m_n_ready_rfds; + + /// number of ready w fds + int m_n_ready_wfds; + + /// number of ready e fds + int m_n_ready_efds; + + /// collect the ready fds in the begining of the call + fd_array_t m_fd_ready_array; + + const sigset_t* m_sigmask; +}; + +#endif diff --git a/src/vma/iomux/poll_call.cpp b/src/vma/iomux/poll_call.cpp new file mode 100644 index 0000000..42130c8 --- /dev/null +++ b/src/vma/iomux/poll_call.cpp @@ -0,0 +1,312 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "poll_call.h" + +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "poll_call:" + +iomux_func_stats_t g_poll_stats; + +poll_call::poll_call(int *off_rfds_buffer, offloaded_mode_t *off_modes_buffer, int *lookup_buffer, pollfd *working_fds_arr, + pollfd *fds, nfds_t nfds, int timeout, const sigset_t *__sigmask /* = NULL */) : + io_mux_call(off_rfds_buffer, off_modes_buffer, 0, __sigmask), + m_nfds(nfds), m_timeout(timeout), + m_lookup_buffer(lookup_buffer), m_orig_fds(fds) +{ + nfds_t i; + int fd; + m_fds = NULL; + + // create stats + m_p_stats = &g_poll_stats; + vma_stats_instance_get_poll_block(m_p_stats); + + // Collect offloaded fds and remove all tcp (skip_os) sockets from m_fds + for (i = 0; i < m_nfds; ++i) { + //Very important to initialize this to 0 it is not done be default + m_orig_fds[i].revents = 0; + + //We need to initialize m_fds[i].revents in case we already copied it from m_orig_fds + if(m_fds) + m_fds[i].revents = 0; + + fd = m_orig_fds[i].fd; + socket_fd_api* temp_sock_fd_api = fd_collection_get_sockfd(fd); + if (temp_sock_fd_api && (temp_sock_fd_api->get_type()==FD_TYPE_SOCKET)) { + offloaded_mode_t off_mode = OFF_NONE; + if (m_orig_fds[i].events & POLLIN) + off_mode = (offloaded_mode_t)(off_mode | OFF_READ); + if (m_orig_fds[i].events & POLLOUT) + off_mode = (offloaded_mode_t)(off_mode | OFF_WRITE); + + if (off_mode) { + __log_func("---> fd=%d IS SET for read or write!", fd); + m_lookup_buffer[m_num_all_offloaded_fds] = i; + m_p_all_offloaded_fds[m_num_all_offloaded_fds] = fd; + m_p_offloaded_modes[m_num_all_offloaded_fds] = off_mode; + ++m_num_all_offloaded_fds; + + //We will do copy only in case we have at least one 
offloaded socket + if(! m_fds) + { + m_fds = working_fds_arr; + //m_fds will be working array and m_orig_fds is the pointer to user fds - we cannot modify it + memcpy(m_fds, m_orig_fds, m_nfds * sizeof(fds[0])); + } + + if (temp_sock_fd_api->skip_os_select()) { + __log_func("fd=%d must be skipped from os r poll()", fd); + m_fds[i].fd = -1; + } else if (m_orig_fds[i].events & POLLIN){ + if(temp_sock_fd_api->is_readable(NULL)){ + io_mux_call::update_fd_array(&m_fd_ready_array, fd); + m_n_ready_rfds++; + m_n_all_ready_fds++; + }else{ + // Instructing the socket to sample the OS immediately to prevent hitting EAGAIN on recvfrom(), + // after iomux returned a shadow fd as ready (only for non-blocking sockets) + temp_sock_fd_api->set_immediate_os_sample(); + } + } + } + } + } + + //TODO: No need to have two arrays m_fds and m_orig_fds in case there is no offloaded sockets + if(! m_num_all_offloaded_fds) + m_fds = m_orig_fds; + __log_func("num all offloaded_fds=%d", m_num_all_offloaded_fds); +} + +void poll_call::prepare_to_block() +{ + m_cqepfd = g_p_net_device_table_mgr->global_ring_epfd_get(); + + // add cq + m_fds[m_nfds].events = POLLIN; + m_fds[m_nfds].revents = 0; + m_fds[m_nfds].fd = m_cqepfd; + +} + +bool poll_call::wait_os(bool zero_timeout) +{ + __log_func("calling os poll: %d", m_nfds); + if (m_sigmask) { + struct timespec to, *pto = NULL; + if (zero_timeout) { + to.tv_sec = to.tv_nsec = 0; + pto = &to; + } else if (m_timeout >= 0) { + to.tv_sec = m_timeout / 1000; + to.tv_nsec = (m_timeout % 1000) * 1000000; + pto = &to; + } + m_n_all_ready_fds = orig_os_api.ppoll(m_fds, m_nfds, pto, m_sigmask); + } else { + m_n_all_ready_fds = orig_os_api.poll(m_fds, m_nfds, zero_timeout ? 
0 : m_timeout); + } + if (m_n_all_ready_fds < 0) { + vma_throw_object(io_mux_call::io_error); + } + + if (m_n_all_ready_fds > 0) { + __log_dbg("wait_os() returned with %d", m_n_all_ready_fds); + copy_to_orig_fds(); + } + return false; // No cq_fd in poll() event +} + +bool poll_call::wait(const timeval &elapsed) +{ + // poll fds and cq + int timeout; + struct timespec to, *pto = NULL; + + if (m_timeout < 0) { + timeout = m_timeout; + } + else { + timeout = m_timeout - tv_to_msec(&elapsed); + if (timeout < 0) { + // Already reached timeout + return false; + } + } + if (m_sigmask) { + to.tv_sec = m_timeout / 1000; + to.tv_nsec = (m_timeout % 1000) * 1000000; + pto = &to; + m_n_all_ready_fds = orig_os_api.ppoll(m_fds, m_nfds + 1, pto, m_sigmask); + }else { + m_n_all_ready_fds = orig_os_api.poll(m_fds, m_nfds + 1, timeout); + } + + if (m_n_all_ready_fds > 0 && m_fds[m_nfds].revents) { + // CQ was returned - remove it from the count + --m_n_all_ready_fds; + if(m_n_all_ready_fds > 0) + copy_to_orig_fds(); + return true; + } + + if (m_n_all_ready_fds < 0) { + vma_throw_object(io_mux_call::io_error); + } + + copy_to_orig_fds(); + return false; +} + +bool poll_call::is_timeout(const timeval &elapsed) +{ + return m_timeout >= 0 && m_timeout <= tv_to_msec(&elapsed); +} + +void poll_call::set_offloaded_rfd_ready(int fd_index) +{ + if (m_p_offloaded_modes[fd_index] & OFF_READ){ + + int evt_index = m_lookup_buffer[fd_index]; + if (!m_orig_fds[evt_index].revents) + ++m_n_all_ready_fds; + if ((m_orig_fds[evt_index].events & POLLIN) && + !(m_orig_fds[evt_index].revents & POLLIN)){ + m_orig_fds[evt_index].revents |= POLLIN; + ++m_n_ready_rfds; + } + } +} + +void poll_call::set_offloaded_wfd_ready(int fd_index) +{ + if (m_p_offloaded_modes[fd_index] & OFF_WRITE) { + int evt_index = m_lookup_buffer[fd_index]; + if (!m_orig_fds[evt_index].revents) + ++m_n_all_ready_fds; + if ((m_orig_fds[evt_index].events & POLLOUT) && + !(m_orig_fds[evt_index].revents & POLLOUT) && + 
!(m_orig_fds[evt_index].revents & POLLHUP)){ + /* POLLOUT and POLLHUP are mutually exclusive */ + m_orig_fds[evt_index].revents |= POLLOUT; + ++m_n_ready_wfds; + } + } +} + +void poll_call::set_offloaded_efd_ready(int fd_index, int errors) +{ + if (m_p_offloaded_modes[fd_index] & OFF_RDWR) { + int evt_index = m_lookup_buffer[fd_index]; + if (!m_orig_fds[evt_index].revents) + ++m_n_all_ready_fds; + bool got_errors = false; + if ((errors & POLLHUP) && + !(m_orig_fds[evt_index].revents & POLLHUP)){ + m_orig_fds[evt_index].revents |= POLLHUP; + if (m_orig_fds[evt_index].revents & POLLOUT) { + /* POLLOUT and POLLHUP are mutually exclusive */ + m_orig_fds[evt_index].revents &= ~POLLOUT; + } + got_errors = true; + } + if ((errors & POLLERR) && + !(m_orig_fds[evt_index].revents & POLLERR)){ + m_orig_fds[evt_index].revents |= POLLERR; + got_errors = true; + } + if (got_errors) { + ++m_n_ready_efds; + } + } +} + +void poll_call::set_rfd_ready(int fd) +{ + int fd_index; + + // TODO make this more efficient + for (fd_index = 0; fd_index < *m_p_num_all_offloaded_fds; ++fd_index) { + if (m_p_all_offloaded_fds[fd_index] == fd) { + set_offloaded_rfd_ready(fd_index); + } + } +} +void poll_call::set_wfd_ready(int fd) +{ + int fd_index; + + // TODO make this more efficient + for (fd_index = 0; fd_index < *m_p_num_all_offloaded_fds; ++fd_index) { + if (m_p_all_offloaded_fds[fd_index] == fd) { + set_offloaded_wfd_ready(fd_index); + } + } +} + +void poll_call::set_efd_ready(int fd, int errors) +{ + int fd_index; + + // TODO make this more efficient + for (fd_index = 0; fd_index < *m_p_num_all_offloaded_fds; ++fd_index) { + if (m_p_all_offloaded_fds[fd_index] == fd) { + set_offloaded_efd_ready(fd_index, errors); + } + } +} + +void poll_call::copy_to_orig_fds() +{ + //No need to copy anything in case there are no offloaded sockets. + if(! 
m_num_all_offloaded_fds) + return; + int ready_fds = m_n_all_ready_fds; + for (nfds_t i = 0; i< m_nfds ; i++) + { + if(m_fds[i].revents) + { + m_orig_fds[i].revents = m_fds[i].revents; + ready_fds--; + if(!ready_fds) + return; + } + } +} diff --git a/src/vma/iomux/poll_call.h b/src/vma/iomux/poll_call.h new file mode 100644 index 0000000..0cabd73 --- /dev/null +++ b/src/vma/iomux/poll_call.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef _POLL_CALL_H +#define _POLL_CALL_H + +#include + +#include "io_mux_call.h" + +/** + * @class poll_call + * Functor for poll() + */ +class poll_call : public io_mux_call +{ +public: + /** + * Create a poll call. + * @param rfds_buffer Array of at least nfds ints. + * @param lookup_buffer Array of at least nfds ints. + * @param extra_fds_buffer Array of at least (1 + nfds) pollfd-s. + * + * Rest of the arguments are the same as for poll() library function. + * @throws io_mux_call::io_error + */ + poll_call(int *off_rfds_buffer, offloaded_mode_t *off_modes_buffer, int *lookup_buffer, + pollfd *working_fds_arr, pollfd *fds, nfds_t nfds, int timeout, const sigset_t *__sigmask = NULL); + + /// @override + virtual void set_offloaded_rfd_ready(int fd_index); + virtual void set_offloaded_wfd_ready(int fd_index); + virtual void set_offloaded_efd_ready(int fd_index, int errors); + + /// @override + virtual void prepare_to_block(); + + /// @override + virtual bool wait_os(bool zero_timeout); + + /// @override + virtual bool wait(const timeval &elapsed); + + /// @override + virtual bool is_timeout(const timeval &elapsed); + + /// @override + virtual void set_rfd_ready(int fd); + + /// @override + virtual void set_wfd_ready(int fd); + + /// @override + virtual void set_efd_ready(int fd, int errors); + +private: + /// Parameters for the call + pollfd * m_fds; + const nfds_t m_nfds; + int m_timeout; + + int * const m_lookup_buffer; + pollfd * const m_orig_fds; + + void copy_to_orig_fds(); + +}; + +#endif diff --git a/src/vma/iomux/select_call.cpp b/src/vma/iomux/select_call.cpp new file mode 100644 index 0000000..e191964 --- /dev/null +++ b/src/vma/iomux/select_call.cpp @@ -0,0 +1,347 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "select_call.h" + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include +#include +#include +#include + +#define MODULE_NAME "select_call:" + + +#define FD_COPY(__fddst, __fdsrc, __nfds) \ + memcpy(__FDS_BITS(__fddst), __FDS_BITS(__fdsrc), ((__nfds) + 7) >> 3) + +#undef FD_ZERO // Remove select.h origianl FD_ZERO and define our own with limit size +#define FD_ZERO(__fddst, __nfds) \ + memset(__FDS_BITS(__fddst), 0, ((__nfds) + 7) >> 3) +iomux_func_stats_t g_select_stats; + +select_call::select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer, + int nfds, fd_set *readfds, fd_set *writefds, + fd_set *exceptfds, timeval *timeout, const sigset_t *__sigmask /* = NULL */) : + io_mux_call(off_fds_buffer, off_modes_buffer, nfds, __sigmask), + m_nfds(nfds), m_readfds(readfds), m_writefds(writefds), + m_exceptfds(exceptfds), m_timeout(timeout), m_nfds_with_cq(0), m_b_run_prepare_to_poll(false) +{ + int fd; + //socket_fd_api* temp_sock_fd_api = NULL; + + if (m_nfds > FD_SETSIZE) { + errno = ENOMEM; + vma_throw_object(io_mux_call::io_error); + } + + // create stats + m_p_stats = &g_select_stats; + vma_stats_instance_get_select_block(m_p_stats); + + bool offloaded_read = !!m_readfds; + bool offloaded_write = !!m_writefds; + + if (offloaded_read || offloaded_write) { + FD_ZERO(&m_os_rfds, m_nfds); + FD_ZERO(&m_os_wfds, m_nfds); + + //covers the case of select(readfds = NULL) + if(!m_readfds) { + FD_ZERO(&m_cq_rfds, m_nfds); + m_readfds = &m_cq_rfds; + } + + // get offloaded fds in read set + for (fd = 0; fd < m_nfds; ++fd) { + + bool check_read = offloaded_read && FD_ISSET(fd, m_readfds); + bool check_write = offloaded_write && FD_ISSET(fd, m_writefds); + + socket_fd_api* psock = fd_collection_get_sockfd(fd); + + if (psock && psock->get_type() == FD_TYPE_SOCKET) { + + offloaded_mode_t off_mode = OFF_NONE; + if (check_read) off_mode = (offloaded_mode_t)(off_mode | OFF_READ); + if (check_write) off_mode = (offloaded_mode_t)(off_mode | 
OFF_WRITE); + + if (off_mode) { + __log_func("---> fd=%d IS SET for read or write!", fd); + + m_p_all_offloaded_fds[m_num_all_offloaded_fds] = fd; + m_p_offloaded_modes[m_num_all_offloaded_fds] = off_mode; + m_num_all_offloaded_fds++; + if (! psock->skip_os_select()) { + if (check_read) { + FD_SET(fd, &m_os_rfds); + if (psock->is_readable(NULL)) { + io_mux_call::update_fd_array(&m_fd_ready_array, fd); + m_n_ready_rfds++; + m_n_all_ready_fds++; + } else { + // Instructing the socket to sample the OS immediately to prevent hitting EAGAIN on recvfrom(), + // after iomux returned a shadow fd as ready (only for non-blocking sockets) + psock->set_immediate_os_sample(); + } + } + if (check_write) { + FD_SET(fd, &m_os_wfds); + } + } + else + __log_func("fd=%d must be skipped from os r select()", fd); + + } + } + else { + if (check_read) { + FD_SET(fd, &m_os_rfds); + } + if (check_write) { + FD_SET(fd, &m_os_wfds); + } + } + + } + } + __log_func("num all offloaded_fds=%d", m_num_all_offloaded_fds); +} + + +void select_call::prepare_to_poll() +{ + /* + * Create copies of all sets and zero out the originals. + * This is needed because polling might be successful. + * + * If the read set is zero, use the local copy every time. + * This is OK because it will hold only the CQ, and wait() + * clears the CQ from the set after orig_select() call. + * + * m_readfds is non-NULL here because there are offloaded sockets. 
+ */ + + // copy sets, and zero out the originals + if (m_readfds) { + FD_COPY(&m_orig_readfds, m_readfds, m_nfds); + FD_ZERO(m_readfds, m_nfds); + } + + if (m_writefds) { + FD_COPY(&m_orig_writefds, m_writefds, m_nfds); + FD_ZERO(m_writefds, m_nfds); + } + if (m_exceptfds) { + FD_COPY(&m_orig_exceptfds, m_exceptfds, m_nfds); + FD_ZERO(m_exceptfds, m_nfds); + } + m_b_run_prepare_to_poll = true; +} + +void select_call::prepare_to_block() +{ + m_cqepfd = g_p_net_device_table_mgr->global_ring_epfd_get(); + m_nfds_with_cq = max(m_cqepfd + 1, m_nfds); +} + +bool select_call::wait_os(bool zero_timeout) +{ + timeval to, *pto = NULL; + timespec to_pselect, *pto_pselect = NULL; + +/* Avner: I put it in comment, because this logic is wrong + + // optimization: do not call os select if ALL fds are excluded + // extend check to write/except fds + if (m_rfd_count == m_n_exclude_fds) + return; +*/ + + if (zero_timeout) { + to.tv_sec = to.tv_usec = 0; + pto = &to; + } + else { + pto = m_timeout; + } + + // Restore original sets + if (m_b_run_prepare_to_poll) { + if (m_readfds) FD_COPY(m_readfds, &m_os_rfds, m_nfds); + if (m_writefds) FD_COPY(m_writefds, &m_os_wfds, m_nfds); + if (m_exceptfds)FD_COPY(m_exceptfds, &m_orig_exceptfds, m_nfds); + } + __log_func("calling os select: %d", m_nfds); + if (m_sigmask) { + if (pto) { + to_pselect.tv_sec = pto->tv_sec; + to_pselect.tv_nsec = pto->tv_usec * 1000; + pto_pselect = &to_pselect; + } + m_n_all_ready_fds = orig_os_api.pselect(m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); + } else { + m_n_all_ready_fds = orig_os_api.select(m_nfds, m_readfds, m_writefds, m_exceptfds, pto); + } + if (m_n_all_ready_fds < 0) { + vma_throw_object(io_mux_call::io_error); + } + if (m_n_all_ready_fds > 0) { + __log_func("wait_os() returned with %d", m_n_all_ready_fds); + } + return false; // No cq_fd in select() event +} + +bool select_call::wait(const timeval &elapsed) +{ + timeval timeout, *pto = NULL; + timespec to_pselect, 
*pto_pselect = NULL; + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_n_all_ready_fds > 0) { + __log_panic("wait() called when there are ready fd's!!!"); + // YossiE TODO make this and some more checks as debug assertions + // In all functions + } + BULLSEYE_EXCLUDE_BLOCK_END + + // Restore original sets + if (m_b_run_prepare_to_poll) { + if (m_readfds) FD_COPY(m_readfds, &m_os_rfds, m_nfds); + if (m_writefds) FD_COPY(m_writefds, &m_os_wfds, m_nfds); + if (m_exceptfds)FD_COPY(m_exceptfds, &m_orig_exceptfds, m_nfds); + } + + // Call OS select() on original sets + CQ epfd in read set + if (m_readfds) + FD_SET(m_cqepfd, m_readfds); + if (m_timeout) { + tv_sub(m_timeout, &elapsed, &timeout); + if (timeout.tv_sec < 0 || timeout.tv_usec < 0) { + // Already reached timeout + return false; + } + pto = &timeout; + } + + __log_func("going to wait on select CQ+OS nfds=%d cqfd=%d pto=%p!!!", m_nfds_with_cq, m_cqepfd, pto); + + // ACTUAL CALL TO SELECT + if (m_sigmask) { + if (pto) { + to_pselect.tv_sec = pto->tv_sec; + to_pselect.tv_nsec = pto->tv_usec * 1000; + pto_pselect = &to_pselect; + } + m_n_all_ready_fds = orig_os_api.pselect(m_nfds, m_readfds, m_writefds, m_exceptfds, pto_pselect, m_sigmask); + } else { + m_n_all_ready_fds = orig_os_api.select(m_nfds_with_cq, m_readfds, m_writefds, m_exceptfds, pto); + } + __log_func("done select CQ+OS nfds=%d cqfd=%d pto=%p ready=%d!!!", m_nfds_with_cq, m_cqepfd, pto, m_n_all_ready_fds); + if (m_n_all_ready_fds < 0) { + vma_throw_object(io_mux_call::io_error); + } + + // Clear CQ from the set and don't count it + if (m_readfds) + { + if (FD_ISSET(m_cqepfd, m_readfds)) { + FD_CLR(m_cqepfd, m_readfds); // Not needed if m_readfds is NULL + --m_n_all_ready_fds; + return true; + } + } + return false; +} + +bool select_call::is_timeout(const timeval &elapsed) +{ + return m_timeout && tv_cmp(m_timeout, &elapsed, <=); +} + +void select_call::set_offloaded_rfd_ready(int fd_index) +{ + if (m_p_offloaded_modes[fd_index] & OFF_READ) { //TODO: consider 
removing + int fd = m_p_all_offloaded_fds[fd_index]; + if (!FD_ISSET(fd, m_readfds)) { + FD_SET(fd, m_readfds); + ++m_n_ready_rfds; + ++m_n_all_ready_fds; + __log_func("ready offloaded fd: %d", fd); + } + } +} + +void select_call::set_rfd_ready(int fd) +{ + // This function also checks that fd was in the original read set + if (!FD_ISSET(fd, m_readfds) && FD_ISSET(fd, &m_orig_readfds)) { + FD_SET(fd, m_readfds); + ++m_n_ready_rfds; +// if (!FD_ISSET(fd, m_writefds)) + ++m_n_all_ready_fds; + } +} + +void select_call::set_offloaded_wfd_ready(int fd_index) +{ + if (m_p_offloaded_modes[fd_index] & OFF_WRITE) { //TODO: consider removing + int fd = m_p_all_offloaded_fds[fd_index]; + if (!FD_ISSET(fd, m_writefds)) { + FD_SET(fd, m_writefds); + ++m_n_ready_wfds; + ++m_n_all_ready_fds; + __log_func("ready offloaded w fd: %d", fd); + } + } +} + +void select_call::set_wfd_ready(int fd) +{ + // This function also checks that fd was in the original read set + if (!FD_ISSET(fd, m_writefds) && FD_ISSET(fd, &m_orig_writefds)) { //TODO: why do we need the last 'if'?? + FD_SET(fd, m_writefds); + ++m_n_ready_wfds; +// if (!FD_ISSET(fd, m_readfds)) + ++m_n_all_ready_fds; + __log_func("ready w fd: %d", fd); + } +} + +void select_call::set_efd_ready(int fd, int errors) +{ + /* TODO currently consider errors as ready to write OR read */ + NOT_IN_USE(errors); + NOT_IN_USE(fd); +} diff --git a/src/vma/iomux/select_call.h b/src/vma/iomux/select_call.h new file mode 100644 index 0000000..bb98bbb --- /dev/null +++ b/src/vma/iomux/select_call.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef _SELECT_CALL_H +#define _SELECT_CALL_H + +#include + +#include "io_mux_call.h" + +/** + * @class poll_call + * Functor for poll() + */ +class select_call : public io_mux_call +{ +public: + /** + * Create a select call. + * @param fds_buffer Array of at least nfds ints. + * + * Rest of the arguments are the same as for select() library function. 
+ * @throws io_mux_call::io_error + */ + select_call(int *off_fds_buffer, offloaded_mode_t *off_modes_buffer, + int nfds, fd_set *readfds, fd_set *writefds, fd_set *exceptfds, timeval *timeout, const sigset_t *__sigmask = NULL); + + /// @override + virtual void set_offloaded_rfd_ready(int fd_index); + virtual void set_offloaded_wfd_ready(int fd_index); + + /// @override + virtual void prepare_to_poll(); + + /// @override + virtual void prepare_to_block(); + + /// @override + virtual bool wait_os(bool zero_timeout); + + /// @override + virtual bool wait(const timeval &elapsed); + + /// @override + virtual bool is_timeout(const timeval &elapsed); + + /// @override + virtual void set_rfd_ready(int fd); + virtual void set_wfd_ready(int fd); + virtual void set_efd_ready(int fd, int errors); + +private: + /// Parameters for the call + const int m_nfds; + fd_set * m_readfds; + fd_set * const m_writefds; + fd_set * const m_exceptfds; + timeval * const m_timeout; + + fd_set m_orig_readfds; + fd_set m_orig_writefds; + fd_set m_orig_exceptfds; + int m_nfds_with_cq; + bool m_b_run_prepare_to_poll; +// int *m_exclude_os_fds; +// int m_n_exclude_fds; +// int m_rfd_count; + + fd_set m_os_rfds; + fd_set m_os_wfds; + + fd_set m_cq_rfds; + +}; + +#endif diff --git a/src/vma/libvma.c b/src/vma/libvma.c new file mode 100644 index 0000000..7ce7d0e --- /dev/null +++ b/src/vma/libvma.c @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ *   copyright notice, this list of conditions and the following
+ *   disclaimer in the documentation and/or other materials
+ *   provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+extern int main_init(void);
+extern int main_destroy(void);
+
+/* library init function
+-----------------------------------------------------------------------------
+__attribute__((constructor)) causes the function to be called when
+library is first loaded */
+int __attribute__((constructor)) sock_redirect_lib_load_constructor(void)
+{
+	return main_init();
+}
+
+int __attribute__((destructor)) sock_redirect_lib_load_destructor(void)
+{
+	return main_destroy();
+}
diff --git a/src/vma/lwip/cc.c b/src/vma/lwip/cc.c
new file mode 100644
index 0000000..79dadb6
--- /dev/null
+++ b/src/vma/lwip/cc.c
@@ -0,0 +1,131 @@
+/*-
+ * Copyright (c) 2007-2008
+ *	Swinburne University of Technology, Melbourne, Australia.
+ * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Lawrence Stewart and + * James Healy, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University of + * Technology's Centre for Advanced Internet Architectures, Melbourne, + * Australia, which was made possible in part by a grant from the Cisco + * University Research Program Fund at Community Foundation Silicon Valley. + * More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vma/lwip/cc.h" +#include "vma/lwip/tcp.h" + +#if TCP_CC_ALGO_MOD + +inline void cc_init(struct tcp_pcb *pcb) +{ + if (pcb->cc_algo->init != NULL) { + pcb->cc_algo->init(pcb); + } +} + +inline void cc_destroy(struct tcp_pcb *pcb) +{ + if (pcb->cc_algo->destroy != NULL) { + pcb->cc_algo->destroy(pcb); + } +} + +inline void cc_ack_received(struct tcp_pcb *pcb, uint16_t type) +{ + if (pcb->cc_algo->ack_received != NULL) { + pcb->cc_algo->ack_received(pcb, type); + } +} + +inline void cc_conn_init(struct tcp_pcb *pcb) +{ + if (pcb->cc_algo->conn_init != NULL) { + pcb->cc_algo->conn_init(pcb); + } +} + +inline void cc_cong_signal(struct tcp_pcb *pcb, uint32_t type) +{ + + if (pcb->cc_algo->cong_signal != NULL) { + pcb->cc_algo->cong_signal(pcb, type); + } +} + +inline void cc_post_recovery(struct tcp_pcb *pcb) +{ + if (pcb->cc_algo->post_recovery != NULL) { + pcb->cc_algo->post_recovery(pcb); + } +} + +#endif //TCP_CC_ALGO_MOD diff --git a/src/vma/lwip/cc.h b/src/vma/lwip/cc.h new file mode 100644 index 0000000..7685bd7 --- /dev/null +++ b/src/vma/lwip/cc.h @@ -0,0 +1,161 @@ +/*- + * Copyright (c) 2007-2008 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Lawrence Stewart and + * James Healy, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. 
+ * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University of + * Technology's Centre for Advanced Internet Architectures, Melbourne, + * Australia, which was made possible in part by a grant from the Cisco + * University Research Program Fund at Community Foundation Silicon Valley. + * More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef CC_H_ +#define CC_H_ + + +struct cc_algo; +struct tcp_pcb; + +#include + +/* types of different cc algorithms */ +enum cc_algo_mod { + CC_MOD_LWIP, + CC_MOD_CUBIC, + CC_MOD_NONE +}; + +/* ACK types passed to the ack_received() hook. */ +#define CC_ACK 0x0001 /* Regular in sequence ACK. */ +#define CC_DUPACK 0x0002 /* Duplicate ACK. */ +#define CC_PARTIALACK 0x0004 /* Not yet. */ +#define CC_SACK 0x0008 /* Not yet. */ + +/* + * Congestion signal types passed to the cong_signal() hook. The highest order 8 + * bits (0x01000000 - 0x80000000) are reserved for CC algos to declare their own + * congestion signal types. + */ +#define CC_ECN 0x00000001 /* ECN marked packet received. */ +#define CC_RTO 0x00000002 /* RTO fired. 
*/ +#define CC_RTO_ERR 0x00000004 /* RTO fired in error. */ +#define CC_NDUPACK 0x00000008 /* Threshold of dupack's reached. */ + +#define CC_SIGPRIVMASK 0xFF000000 /* Mask to check if sig is private. */ + + +#define TCP_CA_NAME_MAX 16 /* max congestion control name length */ + +/* + * Structure to hold data and function pointers that together represent a + * congestion control algorithm. + */ +struct cc_algo { + char name[TCP_CA_NAME_MAX]; + + /* Init cc_data */ + int (*init)(struct tcp_pcb *pcb); + + /* Destroy cc_data */ + void (*destroy)(struct tcp_pcb *pcb); + + /* Init variables for a newly established connection. */ + void (*conn_init)(struct tcp_pcb *pcb); + + /* Called on receipt of an ack. */ + void (*ack_received)(struct tcp_pcb *pcb, uint16_t type); + + /* Called on detection of a congestion signal. */ + void (*cong_signal)(struct tcp_pcb *pcb, uint32_t type); + + /* Called after exiting congestion recovery. */ + void (*post_recovery)(struct tcp_pcb *pcb); + + /* Called when data transfer resumes after an idle period. */ + void (*after_idle)(struct tcp_pcb *pcb); + +}; + +extern struct cc_algo lwip_cc_algo; +extern struct cc_algo cubic_cc_algo; +extern struct cc_algo none_cc_algo; + +void cc_init(struct tcp_pcb *pcb); +void cc_destroy(struct tcp_pcb *pcb); +void cc_ack_received(struct tcp_pcb *pcb, uint16_t type); +void cc_conn_init(struct tcp_pcb *pcb); +void cc_cong_signal(struct tcp_pcb *pcb, uint32_t type); +void cc_post_recovery(struct tcp_pcb *pcb); + +#endif /* CC_H_ */ diff --git a/src/vma/lwip/cc_cubic.c b/src/vma/lwip/cc_cubic.c new file mode 100644 index 0000000..8c4190d --- /dev/null +++ b/src/vma/lwip/cc_cubic.c @@ -0,0 +1,402 @@ +/*- + * Copyright (c) 2007-2008 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. 
+ * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Lawrence Stewart and + * James Healy, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. + * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. 
+ * + * $FreeBSD$ + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University of + * Technology's Centre for Advanced Internet Architectures, Melbourne, + * Australia, which was made possible in part by a grant from the Cisco + * University Research Program Fund at Community Foundation Silicon Valley. + * More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "cc_cubic.h" +#include "errno.h" +#include + +#if TCP_CC_ALGO_MOD + +#define ticks tcp_ticks + +static int cubic_cb_init(struct tcp_pcb *pcb); +static void cubic_cb_destroy(struct tcp_pcb *pcb); +static void cubic_ack_received(struct tcp_pcb *pcb, uint16_t type); +static void cubic_cong_signal(struct tcp_pcb *pcb, uint32_t type); +static void cubic_conn_init(struct tcp_pcb *pcb); +static void cubic_post_recovery(struct tcp_pcb *pcb); +static void cubic_record_rtt(struct tcp_pcb *pcb); +static void cubic_ssthresh_update(struct tcp_pcb *pcb); + +struct cubic { + /* Cubic K in fixed point form with CUBIC_SHIFT worth of precision. */ + int64_t K; + /* Sum of RTT samples across an epoch in ticks. */ + tscval_t sum_rtt_ticks; + /* cwnd at the most recent congestion event. */ + unsigned long max_cwnd; + /* cwnd at the previous congestion event. */ + unsigned long prev_max_cwnd; + /* Number of congestion events. */ + uint32_t num_cong_events; + /* Minimum observed rtt in ticks. */ + tscval_t min_rtt_ticks; + /* Mean observed rtt between congestion epochs. */ + tscval_t mean_rtt_ticks; + /* ACKs since last congestion event. */ + uint32_t epoch_ack_count; + /* Time of last congestion event in ticks. */ + tscval_t t_last_cong; +}; + +struct cc_algo cubic_cc_algo = { + .name = "cubic", + .init = cubic_cb_init, + .destroy = cubic_cb_destroy, + .ack_received = cubic_ack_received, + .cong_signal = cubic_cong_signal, + .conn_init = cubic_conn_init, + .post_recovery = cubic_post_recovery +}; + +static void +cubic_ack_received(struct tcp_pcb *pcb, uint16_t type) +{ + struct cubic *cubic_data; + unsigned long w_tf, w_cubic_next; + tscval_t ticks_since_cong; + + cubic_data = pcb->cc_data; + cubic_record_rtt(pcb); + + /* + * Regular ACK and we're not in cong/fast recovery and we're cwnd + * limited and we're either not doing ABC or are slow starting or are + * doing ABC and we've sent a cwnd's worth of bytes. 
+	 */
+	if (type == CC_ACK && !(pcb->flags & TF_INFR) &&
+	    (pcb->cwnd < pcb->snd_wnd)) {
+		/* Use the logic in NewReno ack_received() for slow start. */
+		if (pcb->cwnd <= pcb->ssthresh /*||
+		    cubic_data->min_rtt_ticks == 0*/)
+			pcb->cwnd += pcb->mss;
+		else if (cubic_data->min_rtt_ticks > 0) {
+			ticks_since_cong = ticks - cubic_data->t_last_cong;
+
+			/*
+			 * The mean RTT is used to best reflect the equations in
+			 * the I-D. Using min_rtt in the tf_cwnd calculation
+			 * causes w_tf to grow much faster than it should if the
+			 * RTT is dominated by network buffering rather than
+			 * propagation delay.
+			 */
+			w_tf = tf_cwnd(ticks_since_cong,
+			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd, pcb->mss);
+
+			w_cubic_next = cubic_cwnd(ticks_since_cong +
+			    cubic_data->mean_rtt_ticks, cubic_data->max_cwnd,
+			    pcb->mss, cubic_data->K);
+
+			if (w_cubic_next < w_tf)
+				/*
+				 * TCP-friendly region, follow tf
+				 * cwnd growth.
+				 */
+				pcb->cwnd = w_tf;
+
+			else if (pcb->cwnd < w_cubic_next) {
+				/*
+				 * Concave or convex region, follow CUBIC
+				 * cwnd growth.
+				 */
+				pcb->cwnd += ((w_cubic_next - pcb->cwnd) * pcb->mss) / pcb->cwnd;
+			}
+
+			/*
+			 * If we're not in slow start and we're probing for a
+			 * new cwnd limit at the start of a connection
+			 * (happens when hostcache has a relevant entry),
+			 * keep updating our current estimate of the
+			 * max_cwnd.
+			 */
+			if (cubic_data->num_cong_events == 0 &&
+			    cubic_data->max_cwnd < pcb->cwnd)
+				cubic_data->max_cwnd = pcb->cwnd;
+		}
+	}
+}
+
+static void
+cubic_cb_destroy(struct tcp_pcb *pcb)
+{
+	if (pcb->cc_data != NULL) {
+		free(pcb->cc_data);
+		pcb->cc_data = NULL;
+	}
+}
+
+static int
+cubic_cb_init(struct tcp_pcb *pcb)
+{
+	struct cubic *cubic_data;
+
+	cubic_data = malloc(sizeof(struct cubic));
+	if (cubic_data == NULL)
+		return (ENOMEM);
+	memset(cubic_data, 0, sizeof(*cubic_data));
+
+	/* Init some key variables with sensible defaults. */
+	cubic_data->t_last_cong = ticks;
+	cubic_data->min_rtt_ticks = 0;
+	cubic_data->mean_rtt_ticks = 1;
+
+	pcb->cc_data = cubic_data;
+
+	return (0);
+}
+
+/*
+ * Perform any necessary tasks before we enter congestion recovery.
+ */
+static void
+cubic_cong_signal(struct tcp_pcb *pcb, uint32_t type)
+{
+	struct cubic *cubic_data = pcb->cc_data;
+
+	switch (type) {
+	case CC_NDUPACK:
+
+		if (!(pcb->flags & TF_INFR)) {
+			cubic_ssthresh_update(pcb);
+			cubic_data->num_cong_events++;
+			cubic_data->prev_max_cwnd = cubic_data->max_cwnd;
+			cubic_data->max_cwnd = pcb->cwnd;
+		}
+		break;
+
+	case CC_RTO:
+		/* Set ssthresh to half of the minimum of the current
+		 * cwnd and the advertised window */
+		if (pcb->cwnd > pcb->snd_wnd) {
+			pcb->ssthresh = pcb->snd_wnd / 2;
+		} else {
+			pcb->ssthresh = pcb->cwnd / 2;
+		}
+
+		/* The minimum value for ssthresh should be 2 MSS */
+		if ((u32_t)pcb->ssthresh < (u32_t)2*pcb->mss) {
+			LWIP_DEBUGF(TCP_FR_DEBUG,
+			    ("tcp_receive: The minimum value for ssthresh %"U16_F
+			     " should be min 2 mss %"U16_F"...\n",
+			     pcb->ssthresh, 2*pcb->mss));
+			pcb->ssthresh = 2*pcb->mss;
+		}
+
+		pcb->cwnd = pcb->mss;
+
+		/*
+		 * Grab the current time and record it so we know when the
+		 * most recent congestion event was. Only record it when the
+		 * timeout has fired more than once, as there is a reasonable
+		 * chance the first one is a false alarm and may not indicate
+		 * congestion.
+		 */
+		if (pcb->nrtx >= 1)
+			cubic_data->num_cong_events++;
+		cubic_data->t_last_cong = ticks;
+
+		break;
+	}
+}
+
+static void
+cubic_conn_init(struct tcp_pcb *pcb)
+{
+	struct cubic *cubic_data = pcb->cc_data;
+
+	pcb->cwnd = ((pcb->cwnd == 1) ? (pcb->mss * 2) : pcb->mss);
+	pcb->ssthresh = pcb->mss * 3;
+	/*
+	 * Ensure we have a sane initial value for max_cwnd recorded. Without
+	 * this here bad things happen when entries from the TCP hostcache
+	 * get used.
+	 */
+	cubic_data->max_cwnd = pcb->cwnd;
+}
+
+/*
+ * Perform any necessary tasks before we exit congestion recovery.
+ */ +static void +cubic_post_recovery(struct tcp_pcb *pcb) +{ + struct cubic *cubic_data = pcb->cc_data; + + /* Fast convergence heuristic. */ + if (cubic_data->max_cwnd < cubic_data->prev_max_cwnd) + cubic_data->max_cwnd = (cubic_data->max_cwnd * CUBIC_FC_FACTOR) >> CUBIC_SHIFT; + + if (pcb->flags & TF_INFR) { + /* + * If inflight data is less than ssthresh, set cwnd + * conservatively to avoid a burst of data, as suggested in + * the NewReno RFC. Otherwise, use the CUBIC method. + * + * XXXLAS: Find a way to do this without needing curack + */ + if (pcb->last_unacked && TCP_SEQ_GT(pcb->lastack + pcb->ssthresh, pcb->last_unacked->seqno)) + pcb->cwnd = pcb->last_unacked->seqno - pcb->lastack + pcb->mss; + else { + /* Update cwnd based on beta and adjusted max_cwnd. */ + if (((CUBIC_BETA * cubic_data->max_cwnd) >> CUBIC_SHIFT) > 1) + pcb->cwnd = ((CUBIC_BETA * cubic_data->max_cwnd) >> CUBIC_SHIFT); + else + pcb->cwnd = pcb->mss; + } + } + cubic_data->t_last_cong = ticks; + + /* Calculate the average RTT between congestion epochs. */ + if (cubic_data->epoch_ack_count > 0 && + cubic_data->sum_rtt_ticks >= cubic_data->epoch_ack_count) { + cubic_data->mean_rtt_ticks = (cubic_data->sum_rtt_ticks/cubic_data->epoch_ack_count); + } + + cubic_data->epoch_ack_count = 0; + cubic_data->sum_rtt_ticks = 0; + cubic_data->K = cubic_k(cubic_data->max_cwnd / pcb->mss); +} + +/* + * Record the min RTT and sum samples for the epoch average RTT calculation. + */ +static void +cubic_record_rtt(struct tcp_pcb *pcb) +{ + struct cubic *cubic_data = pcb->cc_data; + tscval_t t_srtt_ticks; + + /* Ignore srtt until a min number of samples have been taken. */ + if (pcb->t_rttupdated >= CUBIC_MIN_RTT_SAMPLES) { + + t_srtt_ticks = (pcb->sa >> 3); + + /* + * Record the current SRTT as our minrtt if it's the smallest + * we've seen or minrtt is currently equal to its initialised + * value. + * + * XXXLAS: Should there be some hysteresis for minrtt? 
+ */ + if ((t_srtt_ticks < cubic_data->min_rtt_ticks || + cubic_data->min_rtt_ticks == 0)) { + if (t_srtt_ticks > 1) + cubic_data->min_rtt_ticks = t_srtt_ticks; + else + cubic_data->min_rtt_ticks = 1; + + /* + * If the connection is within its first congestion + * epoch, ensure we prime mean_rtt_ticks with a + * reasonable value until the epoch average RTT is + * calculated in cubic_post_recovery(). + */ + if (cubic_data->min_rtt_ticks > cubic_data->mean_rtt_ticks) { + cubic_data->mean_rtt_ticks = cubic_data->min_rtt_ticks; + } + } + + /* Sum samples for epoch average RTT calculation. */ + cubic_data->sum_rtt_ticks += t_srtt_ticks; + cubic_data->epoch_ack_count++; + } +} + +/* + * Update the ssthresh in the event of congestion. + */ +static void +cubic_ssthresh_update(struct tcp_pcb *pcb) +{ + struct cubic *cubic_data = pcb->cc_data; + + /* + * On the first congestion event, set ssthresh to cwnd * 0.5, on + * subsequent congestion events, set it to cwnd * beta. + */ + if (cubic_data->num_cong_events == 0) + pcb->ssthresh = pcb->cwnd >> 1; + else + pcb->ssthresh = (pcb->cwnd * CUBIC_BETA) >> CUBIC_SHIFT; +} + +#endif //TCP_CC_ALGO_MOD diff --git a/src/vma/lwip/cc_cubic.h b/src/vma/lwip/cc_cubic.h new file mode 100644 index 0000000..41a6100 --- /dev/null +++ b/src/vma/lwip/cc_cubic.h @@ -0,0 +1,218 @@ +/*- + * Copyright (c) 2007-2008 + * Swinburne University of Technology, Melbourne, Australia. + * Copyright (c) 2009-2010 Lawrence Stewart + * Copyright (c) 2010 The FreeBSD Foundation + * All rights reserved. + * + * This software was developed at the Centre for Advanced Internet + * Architectures, Swinburne University of Technology, by Lawrence Stewart and + * James Healy, made possible in part by a grant from the Cisco University + * Research Program Fund at Community Foundation Silicon Valley. 
+ * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University of + * Technology's Centre for Advanced Internet Architectures, Melbourne, + * Australia, which was made possible in part by a grant from the Cisco + * University Research Program Fund at Community Foundation Silicon Valley. 
/*
 * More details are available at:
 *   http://caia.swin.edu.au/urp/newtcp/
 */

/*
 * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.  You may choose to be licensed under the terms of the GNU
 * General Public License (GPL) Version 2, available from the file
 * COPYING in the main directory of this source tree, or the
 * BSD license below:
 *
 *     Redistribution and use in source and binary forms, with or
 *     without modification, are permitted provided that the following
 *     conditions are met:
 *
 *      - Redistributions of source code must retain the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer.
 *
 *      - Redistributions in binary form must reproduce the above
 *        copyright notice, this list of conditions and the following
 *        disclaimer in the documentation and/or other materials
 *        provided with the distribution.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
 * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
 * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
 * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
 * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */

#ifndef CC_CUBIC_H_
#define CC_CUBIC_H_

/*
 * FIX: the original header carried a truncated, target-less "#include"
 * directive; the code below needs <stdint.h> for int64_t/uintN_t.
 * The project headers are guarded with __has_include so this header
 * also compiles in isolation (e.g. in unit tests).
 */
#if defined(__has_include)
# if __has_include("lwip/cc.h")
#  include "lwip/cc.h"
#  include "lwip/tcp_impl.h"
# endif
#else
# include "lwip/cc.h"
# include "lwip/tcp_impl.h"
#endif
#include <stdint.h>

/*
 * once we add support for ECN and ABC rfc in VMA/LWIP, need to add support
 * in the algorithm.
 */

/* TSC-style tick counter used for congestion-epoch timing. */
typedef unsigned long long tscval_t;

/* Ticks per second -- matches the VMA internal thread timer rate. */
#define hz 100

/* Number of bits of precision for fixed point math calcs. */
#define CUBIC_SHIFT		8

#define CUBIC_SHIFT_4		32

/* 0.5 << CUBIC_SHIFT. */
#define RENO_BETA		128

/* ~0.8 << CUBIC_SHIFT. */
#define CUBIC_BETA		204

/* ~0.2 << CUBIC_SHIFT. */
#define ONE_SUB_CUBIC_BETA	51

/* 3 * ONE_SUB_CUBIC_BETA. */
#define THREE_X_PT2		153

/* (2 << CUBIC_SHIFT) - ONE_SUB_CUBIC_BETA. */
#define TWO_SUB_PT2		461

/* ~0.4 << CUBIC_SHIFT. */
#define CUBIC_C_FACTOR		102

/* CUBIC fast convergence factor: ~0.9 << CUBIC_SHIFT. */
#define CUBIC_FC_FACTOR		230

/* Don't trust s_rtt until this many rtt samples have been taken. */
#define CUBIC_MIN_RTT_SAMPLES	8

/*
 * Implementation based on the formulae found in the CUBIC Internet Draft
 * "draft-rhee-tcpm-cubic-02".
 *
 * Note BETA used in cc_cubic is equal to (1-beta) in the I-D.
 */

/*
 * Compute the CUBIC K value used in the cwnd calculation (eqn 2 of the
 * I-D), i.e. the time at which the cubic curve crosses the pre-loss
 * window.  Method adapted from Apple Computer Technical Report #KT-32;
 * the result carries CUBIC_SHIFT bits of fixed-point precision.
 *
 * @param wmax_pkts  window at the last congestion event, in packets
 * @return K in ticks, fixed point (<< CUBIC_SHIFT)
 */
static inline int64_t
cubic_k(unsigned long wmax_pkts)
{
	int64_t s, K;
	uint16_t p = 0;

	/* (wmax * beta)/C with CUBIC_SHIFT worth of precision. */
	s = ((wmax_pkts * ONE_SUB_CUBIC_BETA) << CUBIC_SHIFT) / CUBIC_C_FACTOR;

	/* Rebase s into [1/8, 1) << CUBIC_SHIFT, remembering the scale in p. */
	for (; s >= 256; p++)
		s >>= 3;

	/*
	 * Magic constants from the Apple TR with appropriate shifts:
	 * 275 == 1.072302 << CUBIC_SHIFT, 98 == 0.3812513 << CUBIC_SHIFT,
	 * 120 == 0.46946116 << CUBIC_SHIFT.
	 */
	K = (((s * 275) >> CUBIC_SHIFT) + 98) -
	    (((s * s * 120) >> CUBIC_SHIFT) >> CUBIC_SHIFT);

	/* Multiply by 2^p to undo the rebasing of s from above. */
	return (K << p);
}

/*
 * Compute the new cwnd value using an implementation of eqn 1 from the I-D
 * (thanks to Kip Macy for help debugging this function).
 *
 * XXXLAS: Characterise bounds for overflow.
 *
 * @param ticks_since_cong  ticks elapsed since the congestion event
 * @param wmax              cwnd (bytes) at the last congestion event
 * @param smss              sender maximum segment size
 * @param K                 value from cubic_k(), fixed point
 * @return the CUBIC cwnd, in bytes
 */
static inline unsigned long
cubic_cwnd(tscval_t ticks_since_cong, unsigned long wmax, uint32_t smss, int64_t K)
{
	int64_t cwnd;

	/* K is in fixed point form with CUBIC_SHIFT worth of precision. */

	/* t - K, with CUBIC_SHIFT worth of precision. */
	cwnd = ((int64_t)(ticks_since_cong << CUBIC_SHIFT) - (K * hz)) / hz;

	/* (t - K)^3, with CUBIC_SHIFT^3 worth of precision. */
	cwnd *= (cwnd * cwnd);

	/*
	 * C(t - K)^3 + wmax
	 * The down shift by CUBIC_SHIFT_4 is because cwnd has 4 lots of
	 * CUBIC_SHIFT included in the value. 3 from the cubing of cwnd above,
	 * and an extra from multiplying through by CUBIC_C_FACTOR.
	 */
	cwnd = ((cwnd * CUBIC_C_FACTOR * smss) >> CUBIC_SHIFT_4) + wmax;

	return ((unsigned long)cwnd);
}

/*
 * Compute an approximation of the "TCP friendly" cwnd some number of ticks
 * after a congestion event, designed to yield the same average cwnd as
 * NewReno while using CUBIC's beta of 0.8 (eqn 4 of the I-D).
 *
 * @param ticks_since_cong  ticks elapsed since the congestion event
 * @param rtt_ticks         average RTT (ticks) over the previous epoch
 * @param wmax              cwnd (bytes) at the last congestion event
 * @param smss              sender maximum segment size
 * @return the Reno-friendly cwnd, in bytes
 */
static inline unsigned long
tf_cwnd(tscval_t ticks_since_cong, tscval_t rtt_ticks, unsigned long wmax,
    uint32_t smss)
{
	return (((wmax * CUBIC_BETA) + (((THREE_X_PT2 * ticks_since_cong *
	    smss) << CUBIC_SHIFT) / TWO_SUB_PT2 / rtt_ticks)) >> CUBIC_SHIFT);
}

#endif /* CC_CUBIC_H_ */
+ * + * Portions of this software were developed at the Centre for Advanced + * Internet Architectures, Swinburne University of Technology, Melbourne, + * Australia by David Hayes under sponsorship from the FreeBSD Foundation. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND + * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE + * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS + * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) + * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT + * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY + * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF + * SUCH DAMAGE. + * + * $FreeBSD$ + */ + +/* + * This software was first released in 2007 by James Healy and Lawrence Stewart + * whilst working on the NewTCP research project at Swinburne University of + * Technology's Centre for Advanced Internet Architectures, Melbourne, + * Australia, which was made possible in part by a grant from the Cisco + * University Research Program Fund at Community Foundation Silicon Valley. 
+ * More details are available at: + * http://caia.swin.edu.au/urp/newtcp/ + */ + +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vma/lwip/cc.h" +#include "vma/lwip/tcp.h" + +#if TCP_CC_ALGO_MOD + +static void lwip_ack_received(struct tcp_pcb *pcb, uint16_t type); +static void lwip_cong_signal(struct tcp_pcb *pcb, uint32_t type); +static void lwip_conn_init(struct tcp_pcb *pcb); +static void lwip_post_recovery(struct tcp_pcb *pcb); + +struct cc_algo lwip_cc_algo = { + .name = "lwip", + .ack_received = lwip_ack_received, + .cong_signal = lwip_cong_signal, + .conn_init = lwip_conn_init, + .post_recovery = lwip_post_recovery +}; + +static void +lwip_ack_received(struct tcp_pcb *pcb, uint16_t type) +{ + + /* Inflate the congestion window, but not if it means that + the value overflows. */ + + if (type == CC_DUPACK) { + if ((u32_t)(pcb->cwnd + pcb->mss) > pcb->cwnd) { + pcb->cwnd += pcb->mss; + } + } else if (type == CC_ACK) { + if (pcb->cwnd < pcb->ssthresh) { + if ((u32_t)(pcb->cwnd + pcb->mss) > pcb->cwnd) { + pcb->cwnd += pcb->mss; + } + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_receive: slow start cwnd %"U32_F"\n", pcb->cwnd)); + } else { + u32_t new_cwnd = (pcb->cwnd + ((u32_t)pcb->mss * (u32_t)pcb->mss) / pcb->cwnd); + if (new_cwnd > pcb->cwnd) { + pcb->cwnd = new_cwnd; + } + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_receive: congestion avoidance cwnd %"U32_F"\n", pcb->cwnd)); + } + } +} + +static void +lwip_cong_signal(struct tcp_pcb *pcb, uint32_t type) +{ + /* Set ssthresh to half of the minimum of the current + * cwnd and the advertised window */ + if (pcb->cwnd > pcb->snd_wnd) { + pcb->ssthresh = pcb->snd_wnd / 2; + } else { + pcb->ssthresh = pcb->cwnd / 2; + } + + /* The minimum value for ssthresh should be 2 MSS */ + if ((u32_t)pcb->ssthresh < (u32_t)2*pcb->mss) { + LWIP_DEBUGF(TCP_FR_DEBUG, + ("tcp_receive: The minimum value for ssthresh %"U16_F + " should be min 2 mss %"U16_F"...\n", + pcb->ssthresh, 2*pcb->mss)); + pcb->ssthresh = 2*pcb->mss; + } + + if (type == CC_NDUPACK) { + pcb->cwnd = pcb->ssthresh + 3 * pcb->mss; + } else if (type == CC_RTO) { + pcb->cwnd = pcb->mss; + } 
+} + +static void +lwip_post_recovery(struct tcp_pcb *pcb) +{ + pcb->cwnd = pcb->ssthresh; +} + +static void +lwip_conn_init(struct tcp_pcb *pcb) +{ + pcb->cwnd = ((pcb->cwnd == 1) ? (pcb->mss * 2) : pcb->mss); +} + +#endif //TCP_CC_ALGO_MOD diff --git a/src/vma/lwip/cc_none.c b/src/vma/lwip/cc_none.c new file mode 100644 index 0000000..4f92ff0 --- /dev/null +++ b/src/vma/lwip/cc_none.c @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "vma/lwip/cc.h" +#include "vma/lwip/tcp.h" + +#if TCP_CC_ALGO_MOD + +static void none_cc_conn_init(struct tcp_pcb *pcb); + +struct cc_algo none_cc_algo = { + .name = "none_cc", + .conn_init = none_cc_conn_init, +}; + +static void +none_cc_conn_init(struct tcp_pcb *pcb) +{ + pcb->cwnd = UINT32_MAX; +} + +#endif //TCP_CC_ALGO_MOD diff --git a/src/vma/lwip/def.h b/src/vma/lwip/def.h new file mode 100644 index 0000000..715aac9 --- /dev/null +++ b/src/vma/lwip/def.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ +#ifndef __LWIP_DEF_H__ +#define __LWIP_DEF_H__ + +/* arch.h might define NULL already */ + +#include "vma/lwip/opt.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define LWIP_MAX(x , y) (((x) > (y)) ? (x) : (y)) +#define LWIP_MIN(x , y) (((x) < (y)) ? (x) : (y)) + +#ifndef NULL +#define NULL ((void *)0) +#endif + +/** Get the absolute difference between 2 u32_t values (correcting overflows) + * 'a' is expected to be 'higher' (without overflow) than 'b'. */ +#define LWIP_U32_DIFF(a, b) (((a) >= (b)) ? 
((a) - (b)) : (((a) + ((b) ^ 0xFFFFFFFF) + 1))) + +/* Endianess-optimized shifting of two u8_t to create one u16_t */ +#if BYTE_ORDER == LITTLE_ENDIAN +#define LWIP_MAKE_U16(a, b) ((a << 8) | b) +#else +#define LWIP_MAKE_U16(a, b) ((b << 8) | a) +#endif + +#ifndef LWIP_PLATFORM_BYTESWAP +#define LWIP_PLATFORM_BYTESWAP 0 +#endif + +#ifndef LWIP_PREFIX_BYTEORDER_FUNCS +/* workaround for naming collisions on some platforms */ + +#ifdef htons +#undef htons +#endif /* htons */ +#ifdef htonl +#undef htonl +#endif /* htonl */ +#ifdef ntohs +#undef ntohs +#endif /* ntohs */ +#ifdef ntohl +#undef ntohl +#endif /* ntohl */ + +#define htons(x) lwip_htons(x) +#define ntohs(x) lwip_ntohs(x) +#define htonl(x) lwip_htonl(x) +#define ntohl(x) lwip_ntohl(x) +#endif /* LWIP_PREFIX_BYTEORDER_FUNCS */ + +#if BYTE_ORDER == BIG_ENDIAN +#define lwip_htons(x) (x) +#define lwip_ntohs(x) (x) +#define lwip_htonl(x) (x) +#define lwip_ntohl(x) (x) +#define PP_HTONS(x) (x) +#define PP_NTOHS(x) (x) +#define PP_HTONL(x) (x) +#define PP_NTOHL(x) (x) +#else /* BYTE_ORDER != BIG_ENDIAN */ +#if LWIP_PLATFORM_BYTESWAP +#define lwip_htons(x) LWIP_PLATFORM_HTONS(x) +#define lwip_ntohs(x) LWIP_PLATFORM_HTONS(x) +#define lwip_htonl(x) LWIP_PLATFORM_HTONL(x) +#define lwip_ntohl(x) LWIP_PLATFORM_HTONL(x) +#else /* LWIP_PLATFORM_BYTESWAP */ +u16_t lwip_htons(u16_t x); +u16_t lwip_ntohs(u16_t x); +u32_t lwip_htonl(u32_t x); +u32_t lwip_ntohl(u32_t x); +#endif /* LWIP_PLATFORM_BYTESWAP */ + +/* These macros should be calculated by the preprocessor and are used + with compile-time constants only (so that there is no little-endian + overhead at runtime). 
*/ +#define PP_HTONS(x) ((((x) & 0xff) << 8) | (((x) & 0xff00) >> 8)) +#define PP_NTOHS(x) PP_HTONS(x) +#define PP_HTONL(x) ((((x) & 0xff) << 24) | \ + (((x) & 0xff00) << 8) | \ + (((x) & 0xff0000UL) >> 8) | \ + (((x) & 0xff000000UL) >> 24)) +#define PP_NTOHL(x) PP_HTONL(x) + +#endif /* BYTE_ORDER == BIG_ENDIAN */ + +#ifdef __cplusplus +} +#endif + +#endif /* __LWIP_DEF_H__ */ + diff --git a/src/vma/lwip/err.h b/src/vma/lwip/err.h new file mode 100644 index 0000000..84cace2 --- /dev/null +++ b/src/vma/lwip/err.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ +#ifndef __LWIP_ERR_H__ +#define __LWIP_ERR_H__ + +#include "vma/lwip/opt.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/** Define LWIP_ERR_T in cc.h if you want to use + * a different type for your platform (must be signed). */ +#ifdef LWIP_ERR_T +typedef LWIP_ERR_T err_t; +#else /* LWIP_ERR_T */ +typedef s8_t err_t; +#endif /* LWIP_ERR_T*/ + +/* Definitions for error constants. */ + +#define ERR_OK 0 /* No error, everything OK. */ +#define ERR_MEM -1 /* Out of memory error. */ +#define ERR_BUF -2 /* Buffer error. */ +#define ERR_TIMEOUT -3 /* Timeout. */ +#define ERR_RTE -4 /* Routing problem. */ +#define ERR_INPROGRESS -5 /* Operation in progress */ +#define ERR_VAL -6 /* Illegal value. */ +#define ERR_WOULDBLOCK -7 /* Operation would block. */ + +#define ERR_IS_FATAL(e) ((e) < ERR_VAL) + +#define ERR_ABRT -8 /* Connection aborted. */ +#define ERR_RST -9 /* Connection reset. */ +#define ERR_CLSD -10 /* Connection closed. */ +#define ERR_CONN -11 /* Not connected. */ + +#define ERR_ARG -12 /* Illegal argument. */ + +#define ERR_USE -13 /* Address in use. */ + +#define ERR_IF -14 /* Low-level netif error */ +#define ERR_ISCONN -15 /* Already connected. 
*/ + + +#ifdef LWIP_DEBUG +extern const char *lwip_strerr(err_t err); +#else +#define lwip_strerr(x) "" +#endif /* LWIP_DEBUG */ + +#ifdef __cplusplus +} +#endif + +#endif /* __LWIP_ERR_H__ */ diff --git a/src/vma/lwip/init.c b/src/vma/lwip/init.c new file mode 100644 index 0000000..a2c875f --- /dev/null +++ b/src/vma/lwip/init.c @@ -0,0 +1,58 @@ +/** + * @file + * Modules initialization + * + */ + +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. 
+ * + * Author: Adam Dunkels + * + */ + +#include "lwip/opt.h" + +#include "lwip/init.h" +#include "lwip/stats.h" +#include "lwip/pbuf.h" +#include "lwip/ip.h" +#include "lwip/tcp_impl.h" + + +/** + * Perform Sanity check of user-configurable values, and initialize all modules. + */ +void +lwip_init(void) +{ + /* Modules initialization */ + stats_init(); + pbuf_init(); + tcp_init(); +} diff --git a/src/vma/lwip/init.h b/src/vma/lwip/init.h new file mode 100644 index 0000000..c0869cf --- /dev/null +++ b/src/vma/lwip/init.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ +#ifndef __LWIP_INIT_H__ +#define __LWIP_INIT_H__ + +#include "lwip/opt.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* Modules initialization */ +void lwip_init(void); + +#ifdef __cplusplus +} +#endif + +#endif /* __LWIP_INIT_H__ */ diff --git a/src/vma/lwip/ip.h b/src/vma/lwip/ip.h new file mode 100644 index 0000000..540546c --- /dev/null +++ b/src/vma/lwip/ip.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ +#ifndef __LWIP_IP_H__ +#define __LWIP_IP_H__ + +#include "vma/lwip/opt.h" +#include "vma/lwip/ip_addr.h" +#include "vma/lwip/pbuf.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define IP_HLEN 20 + +#define IP_PROTO_TCP 6 + +/* This is passed as the destination address to ip_output_if (not + to ip_output), meaning that an IP header already is constructed + in the pbuf. This is used when TCP retransmits. */ +#ifdef IP_HDRINCL +#undef IP_HDRINCL +#endif /* IP_HDRINCL */ +#define IP_HDRINCL NULL + +#define IP_PCB_ADDRHINT + +/* This is the common part of all PCB types. It needs to be at the + beginning of a PCB type definition. It is located here so that + changes to this common part are made in one location instead of + having to change all PCB structs. */ +#define IP_PCB \ + /* ip addresses in network byte order */ \ + ip_addr_t local_ip; \ + ip_addr_t remote_ip; \ + /* Socket options */ \ + u8_t so_options; \ + /* Type Of Service */ \ + u8_t tos; \ + /* Time To Live */ \ + u8_t ttl \ + /* link layer address resolution hint */ \ + IP_PCB_ADDRHINT + +struct ip_pcb { +/* Common members of all PCB types */ + IP_PCB; +}; + +/* + * Option flags per-socket. These are the same like SO_XXX. 
+ */ +/*#define SOF_DEBUG (u8_t)0x01U Unimplemented: turn on debugging info recording */ +#define SOF_ACCEPTCONN (u8_t)0x02U /* socket has had listen() */ +#define SOF_REUSEADDR (u8_t)0x04U /* allow local address reuse */ +#define SOF_KEEPALIVE (u8_t)0x08U /* keep connections alive */ +/*#define SOF_DONTROUTE (u8_t)0x10U Unimplemented: just use interface addresses */ +#define SOF_BROADCAST (u8_t)0x20U /* permit to send and to receive broadcast messages (see IP_SOF_BROADCAST option) */ +/*#define SOF_USELOOPBACK (u8_t)0x40U Unimplemented: bypass hardware when possible */ +#define SOF_LINGER (u8_t)0x80U /* linger on close if data present */ +/*#define SOF_OOBINLINE (u16_t)0x0100U Unimplemented: leave received OOB data in line */ +/*#define SOF_REUSEPORT (u16_t)0x0200U Unimplemented: allow local address & port reuse */ + +/* These flags are inherited (e.g. from a listen-pcb to a connection-pcb): */ +#define SOF_INHERITED (SOF_REUSEADDR|SOF_KEEPALIVE|SOF_LINGER/*|SOF_DEBUG|SOF_DONTROUTE|SOF_OOBINLINE*/) + +PACK_STRUCT_BEGIN +struct ip_hdr { + /* version / header length / type of service */ + PACK_STRUCT_FIELD(u16_t _v_hl_tos); + /* total length */ + PACK_STRUCT_FIELD(u16_t _len); + /* identification */ + PACK_STRUCT_FIELD(u16_t _id); + /* fragment offset field */ + PACK_STRUCT_FIELD(u16_t _offset); +#define IP_RF 0x8000 /* reserved fragment flag */ +#define IP_DF 0x4000 /* dont fragment flag */ +#define IP_MF 0x2000 /* more fragments flag */ +#define IP_OFFMASK 0x1fff /* mask for fragmenting bits */ + /* time to live */ + PACK_STRUCT_FIELD(u8_t _ttl); + /* protocol*/ + PACK_STRUCT_FIELD(u8_t _proto); + /* checksum */ + PACK_STRUCT_FIELD(u16_t _chksum); + /* source and destination IP addresses */ + PACK_STRUCT_FIELD(ip_addr_p_t src); + PACK_STRUCT_FIELD(ip_addr_p_t dest); +} PACK_STRUCT_STRUCT; +PACK_STRUCT_END + +#define IPH_V(hdr) (ntohs((hdr)->_v_hl_tos) >> 12) +#define IPH_HL(hdr) ((ntohs((hdr)->_v_hl_tos) >> 8) & 0x0f) +#define IPH_TOS(hdr) (ntohs((hdr)->_v_hl_tos) 
& 0xff) +#define IPH_LEN(hdr) ((hdr)->_len) +#define IPH_ID(hdr) ((hdr)->_id) +#define IPH_OFFSET(hdr) ((hdr)->_offset) +#define IPH_TTL(hdr) ((hdr)->_ttl) +#define IPH_PROTO(hdr) ((hdr)->_proto) +#define IPH_CHKSUM(hdr) ((hdr)->_chksum) + +#define IPH_VHLTOS_SET(hdr, v, hl, tos) (hdr)->_v_hl_tos = (htons(((v) << 12) | ((hl) << 8) | (tos))) +#define IPH_LEN_SET(hdr, len) (hdr)->_len = (len) +#define IPH_ID_SET(hdr, id) (hdr)->_id = (id) +#define IPH_OFFSET_SET(hdr, off) (hdr)->_offset = (off) +#define IPH_TTL_SET(hdr, ttl) (hdr)->_ttl = (u8_t)(ttl) +#define IPH_PROTO_SET(hdr, proto) (hdr)->_proto = (u8_t)(proto) +#define IPH_CHKSUM_SET(hdr, chksum) (hdr)->_chksum = (chksum) + +/** Header of the input packet currently being processed. */ +extern const struct ip_hdr *current_header; +/** Source IP address of current_header */ +extern __thread ip_addr_t current_iphdr_src; +/** Destination IP address of current_header */ +extern __thread ip_addr_t current_iphdr_dest; + +/** Get the interface that received the current packet. + * This function must only be called from a receive callback (udp_recv, + * raw_recv, tcp_accept). It will return NULL otherwise. */ +#define ip_current_netif() (current_netif) +/** Get the IP header of the current packet. + * This function must only be called from a receive callback (udp_recv, + * raw_recv, tcp_accept). It will return NULL otherwise. */ +#define ip_current_header() (current_header) +/** Source IP address of current_header */ +#define ip_current_src_addr() (¤t_iphdr_src) +/** Destination IP address of current_header */ +#define ip_current_dest_addr() (¤t_iphdr_dest) + +#ifdef __cplusplus +} +#endif + +#endif /* __LWIP_IP_H__ */ + + diff --git a/src/vma/lwip/ip_addr.h b/src/vma/lwip/ip_addr.h new file mode 100644 index 0000000..0eb9685 --- /dev/null +++ b/src/vma/lwip/ip_addr.h @@ -0,0 +1,190 @@ +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. 
+ * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ +#ifndef __LWIP_IP_ADDR_H__ +#define __LWIP_IP_ADDR_H__ + +#include "vma/lwip/opt.h" +#include "vma/lwip/def.h" + +#ifdef __cplusplus +extern "C" { +#endif + +/* This is the aligned version of ip_addr_t, + used as local variable, on the stack, etc. 
*/ +struct ip_addr { + u32_t addr; +}; + +PACK_STRUCT_BEGIN +struct ip_addr_packed { + PACK_STRUCT_FIELD(u32_t addr); +} PACK_STRUCT_STRUCT; +PACK_STRUCT_END + +/** ip_addr_t uses a struct for convenience only, so that the same defines can + * operate both on ip_addr_t as well as on ip_addr_p_t. */ +typedef struct ip_addr ip_addr_t; +typedef struct ip_addr_packed ip_addr_p_t; + + +PACK_STRUCT_BEGIN +struct ip_addr2 { + PACK_STRUCT_FIELD(u16_t addrw[2]); +} PACK_STRUCT_STRUCT; +PACK_STRUCT_END + +/* Forward declaration to not include netif.h */ +struct netif; + +extern const ip_addr_t ip_addr_any; +extern const ip_addr_t ip_addr_broadcast; + +/** IP_ADDR_ can be used as a fixed IP address + * for the wildcard and the broadcast address + */ +#define IP_ADDR_ANY ((ip_addr_t *)&ip_addr_any) +#define IP_ADDR_BROADCAST ((ip_addr_t *)&ip_addr_broadcast) + +/** 255.255.255.255 */ +#define IPADDR_NONE ((u32_t)0xffffffffUL) +/** 127.0.0.1 */ +#define IPADDR_LOOPBACK ((u32_t)0x7f000001UL) +/** 0.0.0.0 */ +#define IPADDR_ANY ((u32_t)0x00000000UL) +/** 255.255.255.255 */ +#define IPADDR_BROADCAST ((u32_t)0xffffffffUL) + + + +#if BYTE_ORDER == BIG_ENDIAN +/** Set an IP address given by the four byte-parts */ +#define IP4_ADDR(ipaddr, a,b,c,d) \ + (ipaddr)->addr = ((u32_t)((a) & 0xff) << 24) | \ + ((u32_t)((b) & 0xff) << 16) | \ + ((u32_t)((c) & 0xff) << 8) | \ + (u32_t)((d) & 0xff) +#else +/** Set an IP address given by the four byte-parts. + Little-endian version that prevents the use of htonl. */ +#define IP4_ADDR(ipaddr, a,b,c,d) \ + (ipaddr)->addr = ((u32_t)((d) & 0xff) << 24) | \ + ((u32_t)((c) & 0xff) << 16) | \ + ((u32_t)((b) & 0xff) << 8) | \ + (u32_t)((a) & 0xff) +#endif + +/** MEMCPY-like copying of IP addresses where addresses are known to be + * 16-bit-aligned if the port is correctly configured (so a port could define + * this to copying 2 u16_t's) - no NULL-pointer-checking needed. 
*/ +#ifndef IPADDR2_COPY +#define IPADDR2_COPY(dest, src) SMEMCPY(dest, src, sizeof(ip_addr_t)) +#endif + +/** Copy IP address - faster than ip_addr_set: no NULL check */ +#define ip_addr_copy(dest, src) ((dest).addr = (src).addr) +/** Safely copy one IP address to another (src may be NULL) */ +#define ip_addr_set(dest, src) ((dest)->addr = \ + ((src) == NULL ? 0 : \ + (src)->addr)) +/** Set complete address to zero */ +#define ip_addr_set_zero(ipaddr) ((ipaddr)->addr = 0) +/** Set address to IPADDR_ANY (no need for htonl()) */ +#define ip_addr_set_any(ipaddr) ((ipaddr)->addr = IPADDR_ANY) +/** Set address to loopback address */ +#define ip_addr_set_loopback(ipaddr) ((ipaddr)->addr = PP_HTONL(IPADDR_LOOPBACK)) +/** Safely copy one IP address to another and change byte order + * from host- to network-order. */ +#define ip_addr_set_hton(dest, src) ((dest)->addr = \ + ((src) == NULL ? 0:\ + htonl((src)->addr))) +/** IPv4 only: set the IP address given as an u32_t */ +#define ip4_addr_set_u32(dest_ipaddr, src_u32) ((dest_ipaddr)->addr = (src_u32)) +/** IPv4 only: get the IP address as an u32_t */ +#define ip4_addr_get_u32(src_ipaddr) ((src_ipaddr)->addr) + +/** Get the network address by combining host address with netmask */ +#define ip_addr_get_network(target, host, netmask) ((target)->addr = ((host)->addr) & ((netmask)->addr)) + +/** + * Determine if two address are on the same network. 
+ * + * @arg addr1 IP address 1 + * @arg addr2 IP address 2 + * @arg mask network identifier mask + * @return !0 if the network identifiers of both address match + */ +#define ip_addr_netcmp(addr1, addr2, mask) (((addr1)->addr & \ + (mask)->addr) == \ + ((addr2)->addr & \ + (mask)->addr)) +#define ip_addr_cmp(addr1, addr2) ((addr1)->addr == (addr2)->addr) + +#define ip_addr_isany(addr1) ((addr1) == NULL || (addr1)->addr == IPADDR_ANY) + +#define ip_addr_isbroadcast(ipaddr, netif) ip4_addr_isbroadcast((ipaddr)->addr, (netif)) +u8_t ip4_addr_isbroadcast(u32_t addr, const struct netif *netif); + +#define ip_addr_netmask_valid(netmask) ip4_addr_netmask_valid((netmask)->addr) +u8_t ip4_addr_netmask_valid(u32_t netmask); + +#define ip_addr_ismulticast(addr1) (((addr1)->addr & PP_HTONL(0xf0000000UL)) == PP_HTONL(0xe0000000UL)) + +#define ip_addr_islinklocal(addr1) (((addr1)->addr & PP_HTONL(0xffff0000UL)) == PP_HTONL(0xa9fe0000UL)) + +#define ip_addr_debug_print(debug, ipaddr) \ + LWIP_DEBUGF(debug, ("%" U16_F ".%" U16_F ".%" U16_F ".%" U16_F, \ + ipaddr != NULL ? ip4_addr1_16(ipaddr) : 0, \ + ipaddr != NULL ? ip4_addr2_16(ipaddr) : 0, \ + ipaddr != NULL ? ip4_addr3_16(ipaddr) : 0, \ + ipaddr != NULL ? ip4_addr4_16(ipaddr) : 0)) + +/* Get one byte from the 4-byte address */ +#define ip4_addr1(ipaddr) (((u8_t*)(ipaddr))[0]) +#define ip4_addr2(ipaddr) (((u8_t*)(ipaddr))[1]) +#define ip4_addr3(ipaddr) (((u8_t*)(ipaddr))[2]) +#define ip4_addr4(ipaddr) (((u8_t*)(ipaddr))[3]) +/* These are cast to u16_t, with the intent that they are often arguments + * to printf using the U16_F format from cc.h. 
*/ +#define ip4_addr1_16(ipaddr) ((u16_t)ip4_addr1(ipaddr)) +#define ip4_addr2_16(ipaddr) ((u16_t)ip4_addr2(ipaddr)) +#define ip4_addr3_16(ipaddr) ((u16_t)ip4_addr3(ipaddr)) +#define ip4_addr4_16(ipaddr) ((u16_t)ip4_addr4(ipaddr)) + +/** For backwards compatibility */ +#define ip_ntoa(ipaddr) ipaddr_ntoa(ipaddr) + +#ifdef __cplusplus +} +#endif + +#endif /* __LWIP_IP_ADDR_H__ */ diff --git a/src/vma/lwip/opt.h b/src/vma/lwip/opt.h new file mode 100644 index 0000000..013c154 --- /dev/null +++ b/src/vma/lwip/opt.h @@ -0,0 +1,1079 @@ +/** + * @file + * + * lwIP Options Configuration + */ + +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ +#ifndef __LWIP_OPT_H__ +#define __LWIP_OPT_H__ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +/* + ------------------------------------ + ---------- Memory options ---------- + ------------------------------------ +*/ +/** + * MEM_ALIGNMENT: should be set to the alignment of the CPU + * 4 byte alignment -> #define MEM_ALIGNMENT 4 + * 2 byte alignment -> #define MEM_ALIGNMENT 2 + */ +#define MEM_ALIGNMENT 4 + +/** + * MEM_SIZE: the size of the heap memory. If the application will send + * a lot of data that needs to be copied, this should be set high. + */ +//16000 +#ifdef _LWIP_MIN_MEM_MODE +#define MEM_SIZE 16000 //128000 +#else +#define MEM_SIZE 512000 //128000 +#endif + + +/** + * MEM_USE_POOLS==1: Use an alternative to malloc() by allocating from a set + * of memory pools of various sizes. When mem_malloc is called, an element of + * the smallest pool that can provide the length needed is returned. + * To use this, MEMP_USE_CUSTOM_POOLS also has to be enabled. + */ +#define MEM_USE_POOLS 1 +#define MEMP_USE_CUSTOM_POOLS 1 + +/* + ------------------------------------------------ + ---------- Internal Memory Pool Sizes ---------- + ------------------------------------------------ +*/ +/** + * MEMP_NUM_PBUF: the number of memp struct pbufs (used for PBUF_ROM and PBUF_REF). 
+ * If the application sends a lot of data out of ROM (or other static memory), + * this should be set high. + */ +//30 +#ifdef _LWIP_MIN_MEM_MODE +#define MEMP_NUM_PBUF 30 +#else +#define MEMP_NUM_PBUF 0 //1024 +#endif + +/** + * MEMP_NUM_TCP_PCB: the number of simulatenously active TCP connections. + * (requires the LWIP_TCP option) + */ +#define MEMP_NUM_TCP_PCB 1 //32768 + +/** + * MEMP_NUM_TCP_PCB_LISTEN: the number of listening TCP connections. + * (requires the LWIP_TCP option) + */ +#define MEMP_NUM_TCP_PCB_LISTEN 1 //1024 + +/** + * MEMP_NUM_TCP_SEG: the number of simultaneously queued TCP segments. + * (requires the LWIP_TCP option) + */ +//64 +#ifdef _LWIP_MIN_MEM_MODE +#define MEMP_NUM_TCP_SEG 64 +#else +#define MEMP_NUM_TCP_SEG 0 //16384 +#endif + + +/** + * PBUF_POOL_SIZE: the number of buffers in the pbuf pool. + */ +//32 +#ifdef _LWIP_MIN_MEM_MODE +#define PBUF_POOL_SIZE 32 +#else +#define PBUF_POOL_SIZE 0 //256000 +#endif + + +/** ETH_PAD_SIZE: number of bytes added before the ethernet header to ensure + * alignment of payload after that header. Since the header is 14 bytes long, + * without this padding e.g. addresses in the IP header will not be aligned + * on a 32-bit boundary, so setting this to 2 can speed up 32-bit-platforms. + */ +#define ETH_PAD_SIZE 0 + + + +/* + -------------------------------- + ---------- IP options ---------- + -------------------------------- +*/ +/** + * IP_DEFAULT_TTL: Default value for Time-To-Live used by transport layers. + */ +#define IP_DEFAULT_TTL 255 + +/* + --------------------------------- + ---------- TCP options ---------- + --------------------------------- +*/ +/** + * LWIP_TCP==1: Turn on TCP. + */ +#define LWIP_TCP 1 + +/** + * TCP_QUICKACK_THRESHOLD: TCP quickack threshold (bytes) + * Quickack will be sent for payload <= TCP_QUICKACK_THRESHOLD. + * if TCP_QUICKACK_THRESHOLD = 0, quickack threshold is disabled. + * The threshold is effective only when TCP_QUICKACK is enabled. 
+ */ +#define TCP_QUICKACK_THRESHOLD 0 + +/** + * TCP_WND: The size of a TCP window. This must be at least + * (2 * TCP_MSS) for things to work well + */ +#define TCP_WND 0xFFFF + +/** + * TCP_MSS: TCP Maximum segment size. (default is 536, a conservative default, + * you might want to increase this.) + * For the receive side, this MSS is advertised to the remote side + * when opening a connection. For the transmit size, this MSS sets + * an upper limit on the MSS advertised by the remote host. + */ +/* + * If you don't want to use lwip_tcp_mss for setting the mss during runtime, define TCP_MSS to the DEFAULT_TCP_MSS + */ +#define CONST_TCP_MSS 1460 +#define LWIP_TCP_MSS (lwip_tcp_mss) +//#define TCP_MSS CONST_TCP_MSS + +/** + * TCP_SND_BUF: TCP sender buffer space (bytes). + */ +//4096 +#ifdef _LWIP_MIN_MEM_MODE +#define TCP_SND_BUF 4096 //256*1024 +#else +#define TCP_SND_BUF 1000000 //100000 //256000 +#endif + +#define TCP_SND_BUF_NO_NAGLE 256000 + +/* + ---------------------------------- + ---------- Pbuf options ---------- + ---------------------------------- +*/ +/** + * PBUF_LINK_HLEN: the number of bytes that should be allocated for a + * link level header. The default is 14, the standard value for + * Ethernet. + */ +#define PBUF_LINK_HLEN 20 + +/* + ---------------------------------------- + ---------- Statistics options ---------- + ---------------------------------------- +*/ +/** + * LWIP_STATS==1: Enable statistics collection in lwip_stats. + * NOTE: enabling stats adds about 300-400ns to latency + */ +#define LWIP_STATS 0 + +/** + * LWIP_STATS_DISPLAY==1: Compile in the statistics output functions. + */ +#define LWIP_STATS_DISPLAY 0 +// use 32 bit counters in stats +#define LWIP_STATS_LARGE 0 + + +/* Misc */ + +#define LWIP_TIMEVAL_PRIVATE 0 + + +/* + -------------------------------------- + ---------- Checksum options ---------- + -------------------------------------- +*/ +// Sasha: disable software tx checksums. 
Use hca hw csum offload instead +/** + * CHECKSUM_GEN_IP==1: Generate checksums in software for outgoing IP packets. + */ +#define CHECKSUM_GEN_IP 0 + +/** + * CHECKSUM_GEN_UDP==1: Generate checksums in software for outgoing UDP packets. + */ +#define CHECKSUM_GEN_UDP 0 + +/** + * CHECKSUM_GEN_TCP==1: Generate checksums in software for outgoing TCP packets. + */ +#define CHECKSUM_GEN_TCP 0 + +/** + * CHECKSUM_CHECK_IP==1: Check checksums in software for incoming IP packets. + */ +#define CHECKSUM_CHECK_IP 0 + +/** + * CHECKSUM_CHECK_UDP==1: Check checksums in software for incoming UDP packets. + */ +#define CHECKSUM_CHECK_UDP 0 + +/** + * CHECKSUM_CHECK_TCP==1: Check checksums in software for incoming TCP packets. + */ +#define CHECKSUM_CHECK_TCP 0 + +/** + * LWIP_CHECKSUM_ON_COPY==1: Calculate checksum when copying data from + * application buffers to pbufs. + */ +#define LWIP_CHECKSUM_ON_COPY 1 + + +// replace lwip byte swapping to optimized one +#include + +#define LWIP_PLATFORM_BYTESWAP 1 +#define LWIP_PLATFORM_HTONS(x) bswap_16(x) +#define LWIP_PLATFORM_HTONL(x) bswap_32(x) + +#define LWIP_3RD_PARTY_L3 1 +#define LWIP_3RD_PARTY_BUFS 1 + +//enable LWIP DEBUG here +#if 1 +//#define PBUF_DEBUG LWIP_DBG_ON +//#define TCP_DEBUG LWIP_DBG_ON +//#define TCP_INPUT_DEBUG LWIP_DBG_ON +//#define TCP_FR_DEBUG LWIP_DBG_ON +//#define TCP_RTO_DEBUG LWIP_DBG_ON +//#define TCP_CWND_DEBUG LWIP_DBG_ON +//#define TCP_WND_DEBUG LWIP_DBG_ON +//#define TCP_OUTPUT_DEBUG LWIP_DBG_ON +//#define TCP_RST_DEBUG LWIP_DBG_ON +//#define TCP_QLEN_DEBUG LWIP_DBG_ON +//#define TCP_TSO_DEBUG LWIP_DBG_ON +#endif + +/* + ----------------------------------------------- + ---------- Platform specific locking ---------- + ----------------------------------------------- +*/ + +/** + * SYS_LIGHTWEIGHT_PROT==1: if you want inter-task protection for certain + * critical regions during buffer allocation, deallocation and memory + * allocation and deallocation. 
+ */ +#ifndef SYS_LIGHTWEIGHT_PROT +#define SYS_LIGHTWEIGHT_PROT 0 +#endif + +/** + * NO_SYS==1: Provides VERY minimal functionality. Otherwise, + * use lwIP facilities. + */ +#ifndef NO_SYS +#define NO_SYS 0 +#endif + +/** + * NO_SYS_NO_TIMERS==1: Drop support for sys_timeout when NO_SYS==1 + * Mainly for compatibility to old versions. + */ +#ifndef NO_SYS_NO_TIMERS +#define NO_SYS_NO_TIMERS 0 +#endif + +/** + * MEMCPY: override this if you have a faster implementation at hand than the + * one included in your C library + */ +#ifndef MEMCPY +#define MEMCPY(dst,src,len) memcpy(dst,src,len) +#endif + +/** + * SMEMCPY: override this with care! Some compilers (e.g. gcc) can inline a + * call to memcpy() if the length is known at compile time and is small. + */ +#ifndef SMEMCPY +#define SMEMCPY(dst,src,len) memcpy(dst,src,len) +#endif + +/* + ------------------------------------ + ---------- Memory options ---------- + ------------------------------------ +*/ +/** + * MEM_LIBC_MALLOC==1: Use malloc/free/realloc provided by your C-library + * instead of the lwip internal allocator. Can save code size if you + * already use it. + */ +#ifndef MEM_LIBC_MALLOC +#define MEM_LIBC_MALLOC 0 +#endif + +/** +* MEMP_MEM_MALLOC==1: Use mem_malloc/mem_free instead of the lwip pool allocator. +* Especially useful with MEM_LIBC_MALLOC but handle with care regarding execution +* speed and usage from interrupts! +*/ +#ifndef MEMP_MEM_MALLOC +#define MEMP_MEM_MALLOC 0 +#endif + +/** + * MEM_ALIGNMENT: should be set to the alignment of the CPU + * 4 byte alignment -> #define MEM_ALIGNMENT 4 + * 2 byte alignment -> #define MEM_ALIGNMENT 2 + */ +#ifndef MEM_ALIGNMENT +#define MEM_ALIGNMENT 1 +#endif + +/** + * MEM_SIZE: the size of the heap memory. If the application will send + * a lot of data that needs to be copied, this should be set high. + */ +#ifndef MEM_SIZE +#define MEM_SIZE 1600 +#endif + +/** + * MEMP_SEPARATE_POOLS: if defined to 1, each pool is placed in its own array. 
+ * This can be used to individually change the location of each pool. + * Default is one big array for all pools + */ +#ifndef MEMP_SEPARATE_POOLS +#define MEMP_SEPARATE_POOLS 0 +#endif + +/** + * MEMP_OVERFLOW_CHECK: memp overflow protection reserves a configurable + * amount of bytes before and after each memp element in every pool and fills + * it with a prominent default value. + * MEMP_OVERFLOW_CHECK == 0 no checking + * MEMP_OVERFLOW_CHECK == 1 checks each element when it is freed + * MEMP_OVERFLOW_CHECK >= 2 checks each element in every pool every time + * memp_malloc() or memp_free() is called (useful but slow!) + */ +#ifndef MEMP_OVERFLOW_CHECK +#define MEMP_OVERFLOW_CHECK 0 +#endif + +/** + * MEMP_SANITY_CHECK==1: run a sanity check after each memp_free() to make + * sure that there are no cycles in the linked lists. + */ +#ifndef MEMP_SANITY_CHECK +#define MEMP_SANITY_CHECK 0 +#endif + +/** + * MEM_USE_POOLS==1: Use an alternative to malloc() by allocating from a set + * of memory pools of various sizes. When mem_malloc is called, an element of + * the smallest pool that can provide the length needed is returned. + * To use this, MEMP_USE_CUSTOM_POOLS also has to be enabled. + */ +#ifndef MEM_USE_POOLS +#define MEM_USE_POOLS 0 +#endif + +/** + * MEM_USE_POOLS_TRY_BIGGER_POOL==1: if one malloc-pool is empty, try the next + * bigger pool - WARNING: THIS MIGHT WASTE MEMORY but it can make a system more + * reliable. */ +#ifndef MEM_USE_POOLS_TRY_BIGGER_POOL +#define MEM_USE_POOLS_TRY_BIGGER_POOL 1 +#endif + +/** + * MEMP_USE_CUSTOM_POOLS==1: whether to include a user file lwippools.h + * that defines additional pools beyond the "standard" ones required + * by lwIP. If you set this to 1, you must have lwippools.h in your + * include path somewhere. 
+ */ +#ifndef MEMP_USE_CUSTOM_POOLS +#define MEMP_USE_CUSTOM_POOLS 0 +#endif + +/** + * Set this to 1 if you want to free PBUF_RAM pbufs (or call mem_free()) from + * interrupt context (or another context that doesn't allow waiting for a + * semaphore). + * If set to 1, mem_malloc will be protected by a semaphore and SYS_ARCH_PROTECT, + * while mem_free will only use SYS_ARCH_PROTECT. mem_malloc SYS_ARCH_UNPROTECTs + * with each loop so that mem_free can run. + * + * ATTENTION: As you can see from the above description, this leads to dis-/ + * enabling interrupts often, which can be slow! Also, on low memory, mem_malloc + * can need longer. + * + * If you don't want that, at least for NO_SYS=0, you can still use the following + * functions to enqueue a deallocation call which then runs in the tcpip_thread + * context: + * - pbuf_free_callback(p); + * - mem_free_callback(m); + */ +#ifndef LWIP_ALLOW_MEM_FREE_FROM_OTHER_CONTEXT +#define LWIP_ALLOW_MEM_FREE_FROM_OTHER_CONTEXT 0 +#endif + +/* + ------------------------------------------------ + ---------- Internal Memory Pool Sizes ---------- + ------------------------------------------------ +*/ +/** + * MEMP_NUM_PBUF: the number of memp struct pbufs (used for PBUF_ROM and PBUF_REF). + * If the application sends a lot of data out of ROM (or other static memory), + * this should be set high. + */ +#ifndef MEMP_NUM_PBUF +#define MEMP_NUM_PBUF 16 +#endif + +/** + * MEMP_NUM_TCP_PCB: the number of simultaneously active TCP connections. + * (requires the LWIP_TCP option) + */ +#ifndef MEMP_NUM_TCP_PCB +#define MEMP_NUM_TCP_PCB 5 +#endif + +/** + * MEMP_NUM_TCP_PCB_LISTEN: the number of listening TCP connections. + * (requires the LWIP_TCP option) + */ +#ifndef MEMP_NUM_TCP_PCB_LISTEN +#define MEMP_NUM_TCP_PCB_LISTEN 8 +#endif + +/** + * MEMP_NUM_TCP_SEG: the number of simultaneously queued TCP segments. 
+ * (requires the LWIP_TCP option) + */ +#ifndef MEMP_NUM_TCP_SEG +#define MEMP_NUM_TCP_SEG 16 +#endif + +/** + * PBUF_POOL_SIZE: the number of buffers in the pbuf pool. + */ +#ifndef PBUF_POOL_SIZE +#define PBUF_POOL_SIZE 16 +#endif + +/** ETH_PAD_SIZE: number of bytes added before the ethernet header to ensure + * alignment of payload after that header. Since the header is 14 bytes long, + * without this padding e.g. addresses in the IP header will not be aligned + * on a 32-bit boundary, so setting this to 2 can speed up 32-bit-platforms. + */ +#ifndef ETH_PAD_SIZE +#define ETH_PAD_SIZE 0 +#endif + +/** + * IP_DEFAULT_TTL: Default value for Time-To-Live used by transport layers. + */ +#ifndef IP_DEFAULT_TTL +#define IP_DEFAULT_TTL 255 +#endif + +/* + --------------------------------- + ---------- TCP options ---------- + --------------------------------- +*/ +/** + * LWIP_TCP==1: Turn on TCP. + */ +#ifndef LWIP_TCP +#define LWIP_TCP 1 +#endif + +/** + * TCP_TTL: Default Time-To-Live value. + */ +#ifndef TCP_TTL +#define TCP_TTL (IP_DEFAULT_TTL) +#endif + +/** + * TCP_WND: The size of a TCP window. This must be at least + * (2 * TCP_MSS) for things to work well + */ +#ifndef TCP_WND +#error make sure TCP_WND is not defined here +/* If ever this definition is effective - please note that LWIP_TCP_MSS may be 0 */ +#define TCP_WND (4 * LWIP_TCP_MSS) +#endif + +/* + * use custom congestion control algorithms + */ +#ifndef TCP_CC_ALGO_MOD +#define TCP_CC_ALGO_MOD 1 +#endif + + /** + * window scaling parameter + */ +#define TCP_WND_SCALED(pcb) (TCP_WND << (pcb)->rcv_scale) + +/** + * TCP_MAXRTX: Maximum number of retransmissions of data segments. + */ +#ifndef TCP_MAXRTX +#define TCP_MAXRTX 12 +#endif + +/** + * TCP_SYNMAXRTX: Maximum number of retransmissions of SYN segments. + */ +#ifndef TCP_SYNMAXRTX +#define TCP_SYNMAXRTX 6 +#endif + +/** + * TCP_QUEUE_OOSEQ==1: TCP will queue segments that arrive out of order. + * Define to 0 if your device is low on memory. 
+ */ +#ifndef TCP_QUEUE_OOSEQ +#define TCP_QUEUE_OOSEQ (LWIP_TCP) +#endif + +/** + * TCP_CALCULATE_EFF_SEND_MSS: "The maximum size of a segment that TCP really + * sends, the 'effective send MSS,' MUST be the smaller of the send MSS (which + * reflects the available reassembly buffer size at the remote host) and the + * largest size permitted by the IP layer" (RFC 1122) + * Setting this to 1 enables code that checks TCP_MSS against the MTU of the + * netif used for a connection and limits the MSS if it would be too big otherwise. + */ +#ifndef TCP_CALCULATE_EFF_SEND_MSS +#define TCP_CALCULATE_EFF_SEND_MSS 1 +#endif + + +/** + * TCP_SND_BUF: TCP sender buffer space (bytes). + */ +#ifndef TCP_SND_BUF +#define TCP_SND_BUF 256 +#endif + +/** + * TCP_SND_QUEUELEN: TCP sender buffer space (pbufs). This must be at least + * as much as (2 * TCP_SND_BUF/TCP_MSS) for things to work. + */ +#ifndef TCP_SND_QUEUELEN +#define CONST_TCP_SND_QUEUELEN (4 * (TCP_SND_BUF)/(CONST_TCP_MSS)) +#endif + +/** + * TCP_SNDLOWAT: TCP writable space (bytes). This must be less than + * TCP_SND_BUF. It is the amount of space which must be available in the + * TCP snd_buf for select to return writable (combined with TCP_SNDQUEUELOWAT). + */ +#ifndef TCP_SNDLOWAT +#define TCP_SNDLOWAT ((TCP_SND_BUF)/2) +#endif + +/** + * TCP_SNDQUEUELOWAT: TCP writable bufs (pbuf count). This must be less + * than TCP_SND_QUEUELEN. If the number of pbufs queued on a pcb drops below + * this number, select returns writable (combined with TCP_SNDLOWAT). + */ +#ifndef TCP_SNDQUEUELOWAT +#define TCP_SNDQUEUELOWAT ((TCP_SND_QUEUELEN)/2) +#endif + +/** + * TCP_OVERSIZE: The maximum number of bytes that tcp_write may + * allocate ahead of time in an attempt to create shorter pbuf chains + * for transmission. The meaningful range is 0 to TCP_MSS. Some + * suggested values are: + * + * 0: Disable oversized allocation. Each tcp_write() allocates a new + pbuf (old behaviour). 
+ * 1: Allocate size-aligned pbufs with minimal excess. Use this if your + * scatter-gather DMA requires aligned fragments. + * 128: Limit the pbuf/memory overhead to 20%. + * TCP_MSS: Try to create unfragmented TCP packets. + * TCP_MSS/4: Try to create 4 fragments or less per TCP packet. + */ +#ifndef TCP_OVERSIZE +#define TCP_OVERSIZE CONST_TCP_MSS +#endif + +/** + * LWIP_TCP_TIMESTAMPS==1: support the TCP timestamp option. + */ +#ifndef LWIP_TCP_TIMESTAMPS +#define LWIP_TCP_TIMESTAMPS 1 +#endif + +/** + * TCP_WND_UPDATE_THRESHOLD: difference in window to trigger an + * explicit window update + */ +#ifndef TCP_WND_UPDATE_THRESHOLD +#define TCP_WND_UPDATE_THRESHOLD (pcb->rcv_wnd_max / 4) +#endif + +/** + * LWIP_EVENT_API and LWIP_CALLBACK_API: Only one of these should be set to 1. + * LWIP_EVENT_API==1: The user defines lwip_tcp_event() to receive all + * events (accept, sent, etc) that happen in the system. + * LWIP_CALLBACK_API==1: The PCB callback function is called directly + * for the event. + */ +#ifndef LWIP_EVENT_API +#define LWIP_EVENT_API 0 +#define LWIP_CALLBACK_API 1 +#else +#define LWIP_EVENT_API 1 +#define LWIP_CALLBACK_API 0 +#endif + + +/* + ---------------------------------- + ---------- Pbuf options ---------- + ---------------------------------- +*/ +/** + * PBUF_LINK_HLEN: the number of bytes that should be allocated for a + * link level header. The default is 14, the standard value for + * Ethernet. + */ +#ifndef PBUF_LINK_HLEN +#define PBUF_LINK_HLEN (14 + ETH_PAD_SIZE) +#endif + + +/* + ------------------------------------ + ---------- Socket options ---------- + ------------------------------------ +*/ +/** + * LWIP_TCP_KEEPALIVE==1: Enable TCP_KEEPIDLE, TCP_KEEPINTVL and TCP_KEEPCNT + * options processing. Note that TCP_KEEPIDLE and TCP_KEEPINTVL have to be set + * in seconds. 
(does not require sockets.c, and will affect tcp.c) + */ +#ifndef LWIP_TCP_KEEPALIVE +#define LWIP_TCP_KEEPALIVE 0 +#endif + +/* + ---------------------------------------- + ---------- Statistics options ---------- + ---------------------------------------- +*/ +/** + * LWIP_STATS==1: Enable statistics collection in lwip_stats. + */ +#ifndef LWIP_STATS +#define LWIP_STATS 1 +#endif + +#if LWIP_STATS + +/** + * LWIP_STATS_DISPLAY==1: Compile in the statistics output functions. + */ +#ifndef LWIP_STATS_DISPLAY +#define LWIP_STATS_DISPLAY 0 +#endif + +/** + * LINK_STATS==1: Enable link stats. + */ +#ifndef LINK_STATS +#define LINK_STATS 1 +#endif + +/** + * ETHARP_STATS==1: Enable etharp stats. + */ +#ifndef ETHARP_STATS +#define ETHARP_STATS (LWIP_ARP) +#endif + +/** + * IP_STATS==1: Enable IP stats. + */ +#ifndef IP_STATS +#define IP_STATS 1 +#endif + +/** + * IPFRAG_STATS==1: Enable IP fragmentation stats. Default is + * on if using either frag or reass. + */ +#ifndef IPFRAG_STATS +#define IPFRAG_STATS (IP_REASSEMBLY || IP_FRAG) +#endif + +/** + * TCP_STATS==1: Enable TCP stats. Default is on if TCP + * enabled, otherwise off. + */ +#ifndef TCP_STATS +#define TCP_STATS (LWIP_TCP) +#endif + +#else + +#define LINK_STATS 0 +#define IP_STATS 0 +#define IPFRAG_STATS 0 +#define TCP_STATS 0 +#define LWIP_STATS_DISPLAY 0 + +#endif /* LWIP_STATS */ + +/* + -------------------------------------- + ---------- Checksum options ---------- + -------------------------------------- +*/ +/** + * CHECKSUM_GEN_IP==1: Generate checksums in software for outgoing IP packets. + */ +#ifndef CHECKSUM_GEN_IP +#define CHECKSUM_GEN_IP 1 +#endif + +/** + * CHECKSUM_GEN_UDP==1: Generate checksums in software for outgoing UDP packets. + */ +#ifndef CHECKSUM_GEN_UDP +#define CHECKSUM_GEN_UDP 1 +#endif + +/** + * CHECKSUM_GEN_TCP==1: Generate checksums in software for outgoing TCP packets. 
+ */ +#ifndef CHECKSUM_GEN_TCP +#define CHECKSUM_GEN_TCP 1 +#endif + +/** + * CHECKSUM_CHECK_IP==1: Check checksums in software for incoming IP packets. + */ +#ifndef CHECKSUM_CHECK_IP +#define CHECKSUM_CHECK_IP 1 +#endif + +/** + * CHECKSUM_CHECK_UDP==1: Check checksums in software for incoming UDP packets. + */ +#ifndef CHECKSUM_CHECK_UDP +#define CHECKSUM_CHECK_UDP 1 +#endif + +/** + * CHECKSUM_CHECK_TCP==1: Check checksums in software for incoming TCP packets. + */ +#ifndef CHECKSUM_CHECK_TCP +#define CHECKSUM_CHECK_TCP 1 +#endif + +/** + * LWIP_CHECKSUM_ON_COPY==1: Calculate checksum when copying data from + * application buffers to pbufs. + */ +#ifndef LWIP_CHECKSUM_ON_COPY +#define LWIP_CHECKSUM_ON_COPY 0 +#endif + +/** + * LWIP_TSO: Enable Large Segment Offload capability. + */ +#ifndef LWIP_TSO +#ifdef DEFINED_TSO +#define LWIP_TSO 1 +#else +#define LWIP_TSO 0 +#endif /* DEFINED_TSO */ +#endif + +/* Define platform endianness */ +#ifndef BYTE_ORDER +#define BYTE_ORDER LITTLE_ENDIAN +#endif /* BYTE_ORDER */ + +/* Define generic types used in lwIP */ +typedef unsigned char u8_t; +typedef signed char s8_t; +typedef unsigned short u16_t; +typedef signed short s16_t; +typedef unsigned int u32_t; +typedef signed int s32_t; + +typedef unsigned long mem_ptr_t; + +/* Define (sn)printf formatters for these lwIP types */ +#define X8_F "02x" +#define U16_F "hu" +#define S16_F "hd" +#define X16_F "hx" +#define U32_F "u" +#define S32_F "d" +#define X32_F "x" + +/* If only we could use C99 and get %zu */ +#if defined(__x86_64__) +#define SZT_F "lu" +#else +#define SZT_F "u" +#endif + +/* Compiler hints for packing structures */ +#define PACK_STRUCT_FIELD(x) x +#define PACK_STRUCT_STRUCT __attribute__((packed)) +#define PACK_STRUCT_BEGIN +#define PACK_STRUCT_END + +/* prototypes for printf() and abort() */ +#include +#include + +#define LWIP_PLATFORM_ASSERT(x) do {printf("Assertion \"%s\" failed at line %d in %s\n", \ + x, __LINE__, __FILE__); fflush(NULL);} while(0) + 
+//#define LWIP_PLATFORM_DIAG(x) __log_err x +//#define LWIP_PLATFORM_ASSERT(x) __log_panic(x) +// disable assertions +#define LWIP_NOASSERT + +#define LWIP_RAND() ((u32_t)rand()) + + +#ifndef LWIP_UNUSED_ARG +#define LWIP_UNUSED_ARG(x) (void)x +#endif /* LWIP_UNUSED_ARG */ + +/* + --------------------------------------- + ---------- Debugging options ---------- + --------------------------------------- +*/ + +/** lower two bits indicate debug level + * - 0 all + * - 1 warning + * - 2 serious + * - 3 severe + */ +#define LWIP_DBG_LEVEL_ALL 0x00 +#define LWIP_DBG_LEVEL_OFF LWIP_DBG_LEVEL_ALL /* compatibility define only */ +#define LWIP_DBG_LEVEL_WARNING 0x01 /* bad checksums, dropped packets, ... */ +#define LWIP_DBG_LEVEL_SERIOUS 0x02 /* memory allocation failures, ... */ +#define LWIP_DBG_LEVEL_SEVERE 0x03 +#define LWIP_DBG_MASK_LEVEL 0x03 + +/** flag for LWIP_DEBUGF to enable that debug message */ +#define LWIP_DBG_ON 0x80U +/** flag for LWIP_DEBUGF to disable that debug message */ +#define LWIP_DBG_OFF 0x00U + +/** flag for LWIP_DEBUGF indicating a tracing message (to follow program flow) */ +#define LWIP_DBG_TRACE 0x40U +/** flag for LWIP_DEBUGF indicating a state debug message (to follow module states) */ +#define LWIP_DBG_STATE 0x20U +/** flag for LWIP_DEBUGF indicating newly added code, not thoroughly tested yet */ +#define LWIP_DBG_FRESH 0x10U +/** flag for LWIP_DEBUGF to halt after printing this debug message */ +#define LWIP_DBG_HALT 0x08U + +#define LWIP_ASSERT(message, assertion) + +/** if "expression" isn't true, then print "message" and abort process*/ +#ifndef LWIP_ERROR_ABORT +#define LWIP_ERROR_ABORT(message, expression, handler) do { if (!(expression)) { \ + LWIP_PLATFORM_ASSERT(message); abort(); handler;}} while(0) +#endif /* LWIP_ERROR_ABORT */ + +/** if "expression" isn't true, then print "message" and execute "handler" expression */ +#ifndef LWIP_ERROR +#define LWIP_ERROR(message, expression, handler) do { if (!(expression)) { \ + 
LWIP_PLATFORM_ASSERT(message); handler;}} while(0) +#endif /* LWIP_ERROR */ + +/** + * LWIP_DBG_MIN_LEVEL: After masking, the value of the debug is + * compared against this value. If it is smaller, then debugging + * messages are written. + */ +#ifndef LWIP_DBG_MIN_LEVEL +#define LWIP_DBG_MIN_LEVEL LWIP_DBG_LEVEL_ALL +#endif + +/** + * LWIP_DBG_TYPES_ON: A mask that can be used to globally enable/disable + * debug messages of certain types. + */ +#ifndef LWIP_DBG_TYPES_ON +#define LWIP_DBG_TYPES_ON LWIP_DBG_ON +#endif + +/** + * PBUF_DEBUG: Enable debugging in pbuf.c. + */ +#ifndef PBUF_DEBUG +#define PBUF_DEBUG LWIP_DBG_OFF +#endif + +/** + * TCP_DEBUG: Enable debugging for TCP. + */ +#ifndef TCP_DEBUG +#define TCP_DEBUG LWIP_DBG_OFF +#endif + +/** + * TCP_INPUT_DEBUG: Enable debugging in tcp_in.c for incoming debug. + */ +#ifndef TCP_INPUT_DEBUG +#define TCP_INPUT_DEBUG LWIP_DBG_OFF +#endif + +/** + * TCP_FR_DEBUG: Enable debugging in tcp_in.c for fast retransmit. + */ +#ifndef TCP_FR_DEBUG +#define TCP_FR_DEBUG LWIP_DBG_OFF +#endif + +/** + * TCP_RTO_DEBUG: Enable debugging in TCP for retransmit + * timeout. + */ +#ifndef TCP_RTO_DEBUG +#define TCP_RTO_DEBUG LWIP_DBG_OFF +#endif + +/** + * TCP_CWND_DEBUG: Enable debugging for TCP congestion window. + */ +#ifndef TCP_CWND_DEBUG +#define TCP_CWND_DEBUG LWIP_DBG_OFF +#endif + +/** + * TCP_WND_DEBUG: Enable debugging in tcp_in.c for window updating. + */ +#ifndef TCP_WND_DEBUG +#define TCP_WND_DEBUG LWIP_DBG_OFF +#endif + +/** + * TCP_OUTPUT_DEBUG: Enable debugging in tcp_out.c output functions. + */ +#ifndef TCP_OUTPUT_DEBUG +#define TCP_OUTPUT_DEBUG LWIP_DBG_OFF +#endif + +/** + * TCP_RST_DEBUG: Enable debugging for TCP with the RST message. + */ +#ifndef TCP_RST_DEBUG +#define TCP_RST_DEBUG LWIP_DBG_OFF +#endif + +/** + * TCP_QLEN_DEBUG: Enable debugging for TCP queue lengths. + */ +#ifndef TCP_QLEN_DEBUG +#define TCP_QLEN_DEBUG LWIP_DBG_OFF +#endif + +/** + * TCP_TSO_DEBUG: Enable debugging for TSO. 
+ */ +#ifndef TCP_TSO_DEBUG +#define TCP_TSO_DEBUG LWIP_DBG_OFF +#endif + +#define LWIP_DEBUG_ENABLE PBUF_DEBUG | TCP_DEBUG | TCP_INPUT_DEBUG | TCP_FR_DEBUG | TCP_RTO_DEBUG \ + | TCP_CWND_DEBUG | TCP_WND_DEBUG | TCP_OUTPUT_DEBUG | TCP_RST_DEBUG | TCP_QLEN_DEBUG \ + | TCP_TSO_DEBUG + +#if LWIP_DEBUG_ENABLE + +/* Platform specific diagnostic output */ +#define LWIP_PLATFORM_DIAG(x) do {printf x; fflush(0);} while(0) + +/** print debug message only if debug message type is enabled... + * AND is of correct type AND is at least LWIP_DBG_LEVEL + */ +#define LWIP_DEBUGF(debug, message) do { \ + if ( \ + ((debug) & LWIP_DBG_ON) && \ + ((debug) & LWIP_DBG_TYPES_ON) && \ + ((s16_t)((debug) & LWIP_DBG_MASK_LEVEL) >= LWIP_DBG_MIN_LEVEL)) { \ + LWIP_PLATFORM_DIAG(message); \ + if ((debug) & LWIP_DBG_HALT) { \ + while(1); \ + } \ + } \ + } while(0) + +#else /* LWIP_DEBUG_ENABLE */ +#define LWIP_PLATFORM_DIAG(x) +#define LWIP_DEBUGF(debug, message) +#endif /* LWIP_DEBUG_ENABLE */ + +#endif /* __LWIP_OPT_H__ */ diff --git a/src/vma/lwip/pbuf.c b/src/vma/lwip/pbuf.c new file mode 100644 index 0000000..7a90513 --- /dev/null +++ b/src/vma/lwip/pbuf.c @@ -0,0 +1,419 @@ +/** + * @file + * Packet buffer management + * + * Packets are built from the pbuf data structure. It supports dynamic + * memory allocation for packet contents or can reference externally + * managed packet contents both in RAM and ROM. Quick allocation for + * incoming packets is provided through pools with fixed sized pbufs. + * + * A packet may span over multiple pbufs, chained as a singly linked + * list. This is called a "pbuf chain". + * + * Multiple packets may be queued, also using this singly linked list. + * This is called a "packet queue". + * + * So, a packet queue consists of one or more pbuf chains, each of + * which consist of one or more pbufs. CURRENTLY, PACKET QUEUES ARE + * NOT SUPPORTED!!! Use helper structs to queue multiple packets. 
+ * + * The differences between a pbuf chain and a packet queue are very + * precise but subtle. + * + * The last pbuf of a packet has a ->tot_len field that equals the + * ->len field. It can be found by traversing the list. If the last + * pbuf of a packet has a ->next field other than NULL, more packets + * are on the queue. + * + * Therefore, looping through a pbuf of a single packet, has an + * loop end condition (tot_len == p->len), NOT (next == NULL). + */ + +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. 
+ * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ + +#include "vma/lwip/opt.h" + +#include "vma/lwip/stats.h" +#include "vma/lwip/pbuf.h" + + +#if TCP_QUEUE_OOSEQ +#include "vma/lwip/tcp_impl.h" +#endif + +/** + * Shrink a pbuf chain to a desired length. + * + * @param p pbuf to shrink. + * @param new_len desired new length of pbuf chain + * + * Depending on the desired length, the first few pbufs in a chain might + * be skipped and left unchanged. The new last pbuf in the chain will be + * resized, and any remaining pbufs will be freed. + * + * @note If the pbuf is ROM/REF, only the ->tot_len and ->len fields are adjusted. + * @note May not be called on a packet queue. + * + * @note Despite its name, pbuf_realloc cannot grow the size of a pbuf (chain). + */ +void +pbuf_realloc(struct pbuf *p, u16_t new_len) +{ + struct pbuf *q; + u16_t rem_len; /* remaining length */ + s32_t grow; + + LWIP_ASSERT("pbuf_realloc: p != NULL", p != NULL); + LWIP_ASSERT("pbuf_realloc: sane p->type", p->type == PBUF_POOL || + p->type == PBUF_ROM || + p->type == PBUF_RAM || + p->type == PBUF_REF); + + /* desired length larger than current length? */ + if (new_len >= p->tot_len) { + /* enlarging not yet supported */ + return; + } + + /* the pbuf chain grows by (new_len - p->tot_len) bytes + * (which may be negative in case of shrinking) */ + grow = new_len - p->tot_len; + + /* first, step over any pbufs that should remain in the chain */ + rem_len = new_len; + q = p; + /* should this pbuf be kept? 
 */
+ while (rem_len > q->len) {
+ /* decrease remaining length by pbuf length */
+ rem_len -= q->len;
+ /* decrease total length indicator */
+ LWIP_ASSERT("grow < max_u16_t", grow < 0xffff);
+ q->tot_len += grow;
+ /* proceed to next pbuf in chain */
+ q = q->next;
+ LWIP_ASSERT("pbuf_realloc: q != NULL", q != NULL);
+ }
+ /* we have now reached the new last pbuf (in q) */
+ /* rem_len == desired length for pbuf q */
+
+ /* shrink allocated memory for PBUF_RAM */
+ /* (other types merely adjust their length fields) */
+ if ((q->type == PBUF_RAM) && (rem_len != q->len)) {
+ LWIP_ASSERT("pbuf_realloc: we don't need to be here ",0);
+ }
+ /* adjust length fields for new last pbuf */
+ q->len = rem_len;
+ q->tot_len = q->len;
+
+ /* any remaining pbufs in chain? */
+ if (q->next != NULL) {
+ /* free remaining pbufs in chain */
+ pbuf_free(q->next);
+ }
+ /* q is last packet in chain */
+ q->next = NULL;
+
+}
+
+/**
+ * Adjusts the payload pointer to hide or reveal headers in the payload.
+ *
+ * Adjusts the ->payload pointer so that space for a header
+ * (dis)appears in the pbuf payload.
+ *
+ * The ->payload, ->tot_len and ->len fields are adjusted.
+ *
+ * @param p pbuf to change the header size.
+ * @param header_size_increment Number of bytes to increment header size which
+ * increases the size of the pbuf. New space is on the front.
+ * (Using a negative value decreases the header size.)
+ * If hdr_size_inc is 0, this function does nothing and returns successful.
+ *
+ * PBUF_ROM and PBUF_REF type buffers cannot have their sizes increased, so
+ * the call will fail. A check is made that the increase in header size does
+ * not move the payload pointer in front of the start of the buffer.
+ * @return non-zero on failure, zero on success.
+ * + */ +u8_t +pbuf_header(struct pbuf *p, s16_t header_size_increment) +{ + u16_t type; + void *payload; + u16_t increment_magnitude; + + LWIP_ASSERT("p != NULL", p != NULL); + if ((header_size_increment == 0) || (p == NULL)) { + return 0; + } + + if (header_size_increment < 0){ + increment_magnitude = -header_size_increment; + /* Check that we aren't going to move off the end of the pbuf */ + LWIP_ERROR("increment_magnitude <= p->len", (increment_magnitude <= p->len), return 1;); + } else { + increment_magnitude = header_size_increment; + } + + type = p->type; + /* remember current payload pointer */ + payload = p->payload; + + /* pbuf types containing payloads? */ + if (type == PBUF_RAM || type == PBUF_POOL) { + /* set new payload pointer */ + p->payload = (u8_t *)p->payload - header_size_increment; + /* pbuf types refering to external payloads? */ + } else if (type == PBUF_REF || type == PBUF_ROM) { + /* hide a header in the payload? */ + if ((header_size_increment < 0) && (increment_magnitude > p->len)) + return 1; + /* AlexV: we need to check that the header EXPANTION is legal for PBUF_REF & PBUF_ROM pbufs! */ + p->payload = (u8_t *)p->payload - header_size_increment; + } else { + /* Unknown type */ + LWIP_ASSERT("bad pbuf type", 0); + return 1; + } + /* modify pbuf length fields */ + p->len += header_size_increment; + p->tot_len += header_size_increment; + + LWIP_DEBUGF(PBUF_DEBUG | LWIP_DBG_TRACE, ("pbuf_header: old %p new %p (%"S16_F")\n", + (void *)payload, (void *)p->payload, header_size_increment)); + (void)payload; /* Fix warning -Wunused-but-set-variable */ + + return 0; +} + +/** + * Dereference a pbuf chain or queue and deallocate any no-longer-used + * pbufs at the head of this chain or queue. + * + * Decrements the pbuf reference count. If it reaches zero, the pbuf is + * deallocated. + * + * For a pbuf chain, this is repeated for each pbuf in the chain, + * up to the first pbuf which has a non-zero reference count after + * decrementing. 
So, when all reference counts are one, the whole + * chain is free'd. + * + * @param p The pbuf (chain) to be dereferenced. + * + * @return the number of pbufs that were de-allocated + * from the head of the chain. + * + * @note MUST NOT be called on a packet queue (Not verified to work yet). + * @note the reference counter of a pbuf equals the number of pointers + * that refer to the pbuf (or into the pbuf). + * + * @internal examples: + * + * Assuming existing chains a->b->c with the following reference + * counts, calling pbuf_free(a) results in: + * + * 1->2->3 becomes ...1->3 + * 3->3->3 becomes 2->3->3 + * 1->1->2 becomes ......1 + * 2->1->1 becomes 1->1->1 + * 1->1->1 becomes ....... + * + */ +u8_t +pbuf_free(struct pbuf *p) +{ + u16_t type; + struct pbuf *q; + u8_t count; + + if (p == NULL) { + LWIP_ASSERT("p != NULL", p != NULL); + /* if assertions are disabled, proceed with debug output */ + LWIP_DEBUGF(PBUF_DEBUG | LWIP_DBG_LEVEL_SERIOUS, + ("pbuf_free(p == NULL) was called.\n")); + return 0; + } + LWIP_DEBUGF(PBUF_DEBUG | LWIP_DBG_TRACE, ("pbuf_free(%p)\n", (void *)p)); + + LWIP_ASSERT("pbuf_free: sane type", + p->type == PBUF_RAM || p->type == PBUF_ROM || + p->type == PBUF_REF || p->type == PBUF_POOL); + + count = 0; + /* de-allocate all consecutive pbufs from the head of the chain that + * obtain a zero reference count after decrementing*/ + while (p != NULL) { + u16_t ref; + /* all pbufs in a chain are referenced at least once */ + LWIP_ASSERT("pbuf_free: p->ref > 0", p->ref > 0); + /* decrease reference count (number of pointers to pbuf) */ + ref = --(p->ref); + /* this pbuf is no longer referenced to? */ + if (ref == 0) { + /* remember next pbuf in chain for next iteration */ + q = p->next; + LWIP_DEBUGF( PBUF_DEBUG | LWIP_DBG_TRACE, ("pbuf_free: deallocating %p\n", (void *)p)); + type = p->type; + /* is this a custom pbuf? 
*/ + if ((p->flags & PBUF_FLAG_IS_CUSTOM) != 0) { + struct pbuf_custom *pc = (struct pbuf_custom*)p; + LWIP_ASSERT("pc->custom_free_function != NULL", pc->custom_free_function != NULL); + pc->custom_free_function(p); + } + count++; + /* proceed to next pbuf */ + p = q; + /* p->ref > 0, this pbuf is still referenced to */ + /* (and so the remaining pbufs in chain as well) */ + } else { + LWIP_DEBUGF( PBUF_DEBUG | LWIP_DBG_TRACE, ("pbuf_free: %p has ref %"U16_F", ending here.\n", (void *)p, ref)); + /* stop walking through the chain */ + p = NULL; + } + } + + (void)type; + /* return number of de-allocated pbufs */ + return count; +} + +/** + * Count number of pbufs in a chain + * + * @param p first pbuf of chain + * @return the number of pbufs in a chain + */ + +u8_t +pbuf_clen(struct pbuf *p) +{ + u8_t len; + + len = 0; + while (p != NULL) { + ++len; + p = p->next; + } + return len; +} + +/** + * Increment the reference count of the pbuf. + * + * @param p pbuf to increase reference counter of + * + */ +void +pbuf_ref(struct pbuf *p) +{ + /* pbuf given? */ + if (p != NULL) { + ++(p->ref); + } +} + +/** + * Concatenate two pbufs (each may be a pbuf chain) and take over + * the caller's reference of the tail pbuf. + * + * @note The caller MAY NOT reference the tail pbuf afterwards. + * Use pbuf_chain() for that purpose in the original lwip project. 
+ *
+ * @see pbuf_chain()
+ */
+
+void
+pbuf_cat(struct pbuf *h, struct pbuf *t)
+{
+ struct pbuf *p;
+
+ LWIP_ERROR_ABORT("(h != NULL) && (t != NULL) (programmer violates API)",
+ ((h != NULL) && (t != NULL)), return;);
+
+ /* proceed to last pbuf of chain */
+ for (p = h; p->next != NULL; p = p->next) {
+ /* add total length of second chain to all totals of first chain */
+ p->tot_len += t->tot_len;
+ }
+ /* { p is last pbuf of first h chain, p->next == NULL } */
+ LWIP_ASSERT("p->tot_len == p->len (of last pbuf in chain)", p->tot_len == p->len);
+ LWIP_ASSERT("p->next == NULL", p->next == NULL);
+ /* add total length of second chain to last pbuf total of first chain */
+ p->tot_len += t->tot_len;
+ /* chain last pbuf of head (p) with first of tail (t) */
+ p->next = t;
+ /* p->next now references t, but the caller will drop its reference to t,
+ * so net there is no change to the reference count of t.
+ */
+}
+
+// window scale needs large pbuf
+/**
+ * This method modifies a 'pbuf chain', so that its total length is
+ * smaller than 64K. The remainder of the original pbuf chain is stored
+ * in *rest.
+ * This function never creates new pbufs, but splits an existing chain
+ * in two parts. The tot_len of the modified packet queue will likely be
+ * smaller than 64K.
+ * 'packet queues' are not supported by this function.
+ */
+void pbuf_split_64k(struct pbuf *p, struct pbuf **rest)
+{
+ if (p == NULL ||
+ p->tot_len < 0xffff) {
+ // pbuf is smaller than 64K
+ *rest = NULL;
+ } else {
+ u32_t tot_len_front = 0;
+ struct pbuf *i = NULL;
+
+ *rest = p;
+ while (*rest != NULL &&
+ tot_len_front + (*rest)->len <= 0xffff) {
+ tot_len_front += (*rest)->len;
+ i = *rest;
+ *rest = (*rest)->next;
+ }
+ /* i now points to last packet of the first segment.
Set next + * pointer to NULL */ + i->next = NULL; + + /* Update the tot_len field in the first part */ + for (i = p; i && i->next != *rest && *rest; i = i->next) { + i->tot_len -= (*rest)->tot_len; + } + + /* tot_len field in rest does not need modifications */ + /* reference counters do not need modifications */ + } +} diff --git a/src/vma/lwip/pbuf.h b/src/vma/lwip/pbuf.h new file mode 100644 index 0000000..d59bf87 --- /dev/null +++ b/src/vma/lwip/pbuf.h @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. 
+ *
+ * This file is part of the lwIP TCP/IP stack.
+ *
+ * Author: Adam Dunkels
+ *
+ */
+
+#ifndef __LWIP_PBUF_H__
+#define __LWIP_PBUF_H__
+
+#include "vma/lwip/opt.h"
+#include "vma/lwip/err.h"
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+/** Align a memory pointer to the alignment defined by MEM_ALIGNMENT
+ * so that ADDR % MEM_ALIGNMENT == 0
+ */
+#ifndef LWIP_MEM_ALIGN
+#define LWIP_MEM_ALIGN(addr) ((void *)(((mem_ptr_t)(addr) + MEM_ALIGNMENT - 1) & ~(mem_ptr_t)(MEM_ALIGNMENT-1)))
+#endif
+
+/** Currently, the pbuf_custom code is only needed for one specific configuration of IP_FRAG */
+#define LWIP_SUPPORT_CUSTOM_PBUF 1
+
+#define PBUF_TRANSPORT_HLEN 20
+#define PBUF_IP_HLEN 20
+
+typedef enum {
+ PBUF_RAM, /* pbuf data is stored in RAM */
+ PBUF_ROM, /* pbuf data is stored in ROM */
+ PBUF_REF, /* pbuf payload refers to externally managed RAM */
+ PBUF_POOL /* pbuf comes from the pbuf pool */
+} pbuf_type;
+
+
+/** indicates this packet's data should be immediately passed to the application */
+#define PBUF_FLAG_PUSH 0x01U
+/** indicates this is a custom pbuf: pbuf_free and pbuf_header handle such
+ a pbuf differently */
+#define PBUF_FLAG_IS_CUSTOM 0x02U
+/** indicates this pbuf is UDP multicast to be looped back */
+#define PBUF_FLAG_MCASTLOOP 0x04U
+
+struct pbuf {
+ /** next pbuf in singly linked pbuf chain */
+ struct pbuf *next;
+
+ /** pointer to the actual data in the buffer */
+ void *payload;
+
+ /** length of this buffer */
+ u16_t len;
+
+ /**
+ * total length of this buffer and all next buffers in chain
+ * belonging to the same packet.
+ *
+ * For non-queue packet chains this is the invariant:
+ * p->tot_len == p->len + (p->next? p->next->tot_len: 0)
+ */
+ u32_t tot_len; // window scale needs large pbuf
+
+ /** pbuf_type as u8_t instead of enum to save space */
+ u8_t /*pbuf_type*/ type;
+
+ /** misc flags */
+ u8_t flags;
+
+ /**
+ * the reference count always equals the number of pointers
+ * that refer to this pbuf.
This can be pointers from an application, + * the stack itself, or pbuf->next pointers from a chain. + */ + u16_t ref; +}; + +#if LWIP_SUPPORT_CUSTOM_PBUF +/** Prototype for a function to free a custom pbuf */ +typedef void (*pbuf_free_custom_fn)(struct pbuf *p); + +/** A custom pbuf: like a pbuf, but following a function pointer to free it. */ +struct pbuf_custom { + /** The actual pbuf */ + struct pbuf pbuf; + /** This function is called when pbuf_free deallocates this pbuf(_custom) */ + pbuf_free_custom_fn custom_free_function; +}; +#endif /* LWIP_SUPPORT_CUSTOM_PBUF */ + +/* Initializes the pbuf module. This call is empty for now, but may not be in future. */ +#define pbuf_init() + +void pbuf_realloc(struct pbuf *p, u16_t size); +u8_t pbuf_header(struct pbuf *p, s16_t header_size); +void pbuf_ref(struct pbuf *p); +u8_t pbuf_free(struct pbuf *p); +u8_t pbuf_clen(struct pbuf *p); +void pbuf_cat(struct pbuf *head, struct pbuf *tail); + +void pbuf_split_64k(struct pbuf *p, struct pbuf **rest); // windows scale needs large pbuf + +#ifdef __cplusplus +} +#endif + +#endif /* __LWIP_PBUF_H__ */ diff --git a/src/vma/lwip/stats.h b/src/vma/lwip/stats.h new file mode 100644 index 0000000..6628991 --- /dev/null +++ b/src/vma/lwip/stats.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ +#ifndef __LWIP_STATS_H__ +#define __LWIP_STATS_H__ + +#include "vma/lwip/opt.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#if LWIP_STATS + +#ifndef LWIP_STATS_LARGE +#define LWIP_STATS_LARGE 0 +#endif + +#if LWIP_STATS_LARGE +#define STAT_COUNTER u32_t +#define STAT_COUNTER_F U32_F +#else +#define STAT_COUNTER u16_t +#define STAT_COUNTER_F U16_F +#endif + +struct stats_proto { + STAT_COUNTER xmit; /* Transmitted packets. */ + STAT_COUNTER recv; /* Received packets. */ + STAT_COUNTER fw; /* Forwarded packets. */ + STAT_COUNTER drop; /* Dropped packets. */ + STAT_COUNTER chkerr; /* Checksum error. */ + STAT_COUNTER lenerr; /* Invalid length error. */ + STAT_COUNTER memerr; /* Out of memory error. */ + STAT_COUNTER rterr; /* Routing error. */ + STAT_COUNTER proterr; /* Protocol error. */ + STAT_COUNTER opterr; /* Error in options. */ + STAT_COUNTER err; /* Misc error. 
*/ + STAT_COUNTER cachehit; +}; + + +struct stats_ { +#if TCP_STATS + struct stats_proto tcp; +#endif +}; + +extern struct stats_ lwip_stats; + +void stats_init(void); + +#define STATS_INC(x) ++lwip_stats.x +#define STATS_DEC(x) --lwip_stats.x +#define STATS_INC_USED(x, y) do { lwip_stats.x.used += y; \ + if (lwip_stats.x.max < lwip_stats.x.used) { \ + lwip_stats.x.max = lwip_stats.x.used; \ + } \ + } while(0) +#else /* LWIP_STATS */ +#define stats_init() +#define STATS_INC(x) +#define STATS_DEC(x) +#define STATS_INC_USED(x) +#endif /* LWIP_STATS */ + +#if TCP_STATS +#define TCP_STATS_INC(x) STATS_INC(x) +#define TCP_STATS_DISPLAY() stats_display_proto(&lwip_stats.tcp, "TCP") +#else +#define TCP_STATS_INC(x) +#define TCP_STATS_DISPLAY() +#endif + + +/* Display of statistics */ +#if LWIP_STATS_DISPLAY +void stats_display(void); +void stats_display_proto(struct stats_proto *proto, char *name); +#else /* LWIP_STATS_DISPLAY */ +#define stats_display() +#define stats_display_proto(proto, name) +#endif /* LWIP_STATS_DISPLAY */ + +#ifdef __cplusplus +} +#endif + +#endif /* __LWIP_STATS_H__ */ diff --git a/src/vma/lwip/tcp.c b/src/vma/lwip/tcp.c new file mode 100644 index 0000000..72b8730 --- /dev/null +++ b/src/vma/lwip/tcp.c @@ -0,0 +1,1493 @@ +/** + * @file + * Transmission Control Protocol for IP + * + * This file contains common functions for the TCP implementation, such as functinos + * for manipulating the data structures and the TCP timer functions. TCP functions + * related to input and output is found in tcp_in.c and tcp_out.c respectively. + * + */ + +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. 
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. 
+ * + * Author: Adam Dunkels + * + */ + +#include "vma/lwip/opt.h" + +#if LWIP_TCP /* don't build if not configured for use in lwipopts.h */ +#include "vma/lwip/cc.h" +#include "vma/lwip/tcp.h" +#include "vma/lwip/tcp_impl.h" +#include "vma/lwip/stats.h" + +#include +#include +#include + +#if LWIP_3RD_PARTY_BUFS +tcp_tx_pbuf_alloc_fn external_tcp_tx_pbuf_alloc; + +void register_tcp_tx_pbuf_alloc(tcp_tx_pbuf_alloc_fn fn) +{ + external_tcp_tx_pbuf_alloc = fn; +} + +tcp_tx_pbuf_free_fn external_tcp_tx_pbuf_free; + +void register_tcp_tx_pbuf_free(tcp_tx_pbuf_free_fn fn) +{ + external_tcp_tx_pbuf_free = fn; +} + +tcp_seg_alloc_fn external_tcp_seg_alloc; + +void register_tcp_seg_alloc(tcp_seg_alloc_fn fn) +{ + external_tcp_seg_alloc = fn; +} + +tcp_seg_free_fn external_tcp_seg_free; + +void register_tcp_seg_free(tcp_seg_free_fn fn) +{ + external_tcp_seg_free = fn; +} +#endif + +/* allow user to be notified upon tcp_state changes */ +tcp_state_observer_fn external_tcp_state_observer; + +void register_tcp_state_observer(tcp_state_observer_fn fn) +{ + external_tcp_state_observer = fn; +} + + +enum cc_algo_mod lwip_cc_algo_module = CC_MOD_LWIP; + +u16_t lwip_tcp_mss = CONST_TCP_MSS; + +u8_t enable_ts_option = 0; +/* slow timer value */ +static u32_t slow_tmr_interval; +/* Incremented every coarse grained timer shot (typically every slow_tmr_interval ms). */ +u32_t tcp_ticks = 0; +const u8_t tcp_backoff[13] = + { 1, 2, 3, 4, 5, 6, 7, 7, 7, 7, 7, 7, 7}; + /* Times per slowtmr hits */ +const u8_t tcp_persist_backoff[7] = { 3, 6, 12, 24, 48, 96, 120 }; + +/** Only used for temporary storage. */ +struct tcp_pcb *tcp_tmp_pcb; + +/** + * + * @param v value to set + */ +void +set_tmr_resolution(u32_t v) +{ + slow_tmr_interval = v * 2; +} +/** + * Called periodically to dispatch TCP timers. 
+ * + */ +void +tcp_tmr(struct tcp_pcb* pcb) +{ + /* Call tcp_fasttmr() every (slow_tmr_interval / 2) ms */ + tcp_fasttmr(pcb); + + if (++(pcb->tcp_timer) & 1) { + /* Call tcp_tmr() every slow_tmr_interval ms, i.e., every other timer + tcp_tmr() is called. */ + tcp_slowtmr(pcb); + } +} + +/** + * Closes the TX side of a connection held by the PCB. + * For tcp_close(), a RST is sent if the application didn't receive all data + * (tcp_recved() not called for all data passed to recv callback). + * + * Listening pcbs are freed and may not be referenced any more. + * Connection pcbs are freed if not yet connected and may not be referenced + * any more. If a connection is established (at least SYN received or in + * a closing state), the connection is closed, and put in a closing state. + * The pcb is then automatically freed in tcp_slowtmr(). It is therefore + * unsafe to reference it. + * + * @param pcb the tcp_pcb to close + * @return ERR_OK if connection has been closed + * another err_t if closing failed and pcb is not freed + */ +static err_t +tcp_close_shutdown(struct tcp_pcb *pcb, u8_t rst_on_unacked_data) +{ + err_t err; + + if (rst_on_unacked_data && ((get_tcp_state(pcb) == ESTABLISHED) || (get_tcp_state(pcb) == CLOSE_WAIT))) { + if ((pcb->refused_data != NULL) || (pcb->rcv_wnd != pcb->rcv_wnd_max)) { + /* Not all data received by application, send RST to tell the remote + side about this. 
*/ + LWIP_ASSERT("pcb->flags & TF_RXCLOSED", pcb->flags & TF_RXCLOSED); + + /* don't call tcp_abort here: we must not deallocate the pcb since + that might not be expected when calling tcp_close */ + tcp_rst(pcb->snd_nxt, pcb->rcv_nxt, pcb->local_port, pcb->remote_port, pcb); + + tcp_pcb_purge(pcb); + + if (get_tcp_state(pcb) == ESTABLISHED) { + /* move to TIME_WAIT since we close actively */ + set_tcp_state(pcb, TIME_WAIT); + } else { + /* CLOSE_WAIT: deallocate the pcb since we already sent a RST for it */ + } + + return ERR_OK; + } + } + + switch (get_tcp_state(pcb)) { + case CLOSED: + /* Closing a pcb in the CLOSED state might seem erroneous, + * however, it is in this state once allocated and as yet unused + * and the user needs some way to free it should the need arise. + * Calling tcp_close() with a pcb that has already been closed, (i.e. twice) + * or for a pcb that has been used and then entered the CLOSED state + * is erroneous, but this should never happen as the pcb has in those cases + * been freed, and so any remaining handles are bogus. */ + err = ERR_OK; + pcb = NULL; + break; + case LISTEN: + err = ERR_OK; + tcp_pcb_remove(pcb); + pcb = NULL; + break; + case SYN_SENT: + err = ERR_OK; + tcp_pcb_remove(pcb); + pcb = NULL; + break; + case SYN_RCVD: + err = tcp_send_fin(pcb); + if (err == ERR_OK) { + set_tcp_state(pcb, FIN_WAIT_1); + } + break; + case ESTABLISHED: + err = tcp_send_fin(pcb); + if (err == ERR_OK) { + set_tcp_state(pcb, FIN_WAIT_1); + } + break; + case CLOSE_WAIT: + err = tcp_send_fin(pcb); + if (err == ERR_OK) { + set_tcp_state(pcb, LAST_ACK); + } + break; + default: + /* Has already been closed, do nothing. */ + err = ERR_OK; + pcb = NULL; + break; + } + + if (pcb != NULL && err == ERR_OK) { + /* To ensure all data has been sent when tcp_close returns, we have + to make sure tcp_output doesn't fail. 
+ Since we don't really have to ensure all data has been sent when tcp_close + returns (unsent data is sent from tcp timer functions, also), we don't care + for the return value of tcp_output for now. */ + /* @todo: When implementing SO_LINGER, this must be changed somehow: + If SOF_LINGER is set, the data should be sent and acked before close returns. + This can only be valid for sequential APIs, not for the raw API. */ + tcp_output(pcb); + } + return err; +} + +/** + * Closes the connection held by the PCB. + * + * Listening pcbs are freed and may not be referenced any more. + * Connection pcbs are freed if not yet connected and may not be referenced + * any more. If a connection is established (at least SYN received or in + * a closing state), the connection is closed, and put in a closing state. + * The pcb is then automatically freed in tcp_slowtmr(). It is therefore + * unsafe to reference it (unless an error is returned). + * + * @param pcb the tcp_pcb to close + * @return ERR_OK if connection has been closed + * another err_t if closing failed and pcb is not freed + */ +err_t +tcp_close(struct tcp_pcb *pcb) +{ +#if TCP_DEBUG + LWIP_DEBUGF(TCP_DEBUG, ("tcp_close: closing in ")); + tcp_debug_print_state(get_tcp_state(pcb)); +#endif /* TCP_DEBUG */ + + if (get_tcp_state(pcb) != LISTEN) { + /* Set a flag not to receive any more data... */ + pcb->flags |= TF_RXCLOSED; + } + /* ... and close */ + return tcp_close_shutdown(pcb, 1); +} + +/** + * Causes all or part of a full-duplex connection of this PCB to be shut down. + * This doesn't deallocate the PCB! + * + * @param pcb PCB to shutdown + * @param shut_rx shut down receive side if this is != 0 + * @param shut_tx shut down send side if this is != 0 + * @return ERR_OK if shutdown succeeded (or the PCB has already been shut down) + * another err_t on error. 
+ */ +err_t +tcp_shutdown(struct tcp_pcb *pcb, int shut_rx, int shut_tx) +{ + if (get_tcp_state(pcb) == LISTEN) { + return ERR_CONN; + } + if (shut_rx) { + /* shut down the receive side: set a flag not to receive any more data... */ + pcb->flags |= TF_RXCLOSED; + if (shut_tx) { + /* shutting down the tx AND rx side is the same as closing for the raw API */ + return tcp_close_shutdown(pcb, 1); + } + /* ... and free buffered data */ + if (pcb->refused_data != NULL) { + pbuf_free(pcb->refused_data); + pcb->refused_data = NULL; + } + } + if (shut_tx) { + /* This can't happen twice since if it succeeds, the pcb's state is changed. + Only close in these states as the others directly deallocate the PCB */ + switch (get_tcp_state(pcb)) { + case SYN_RCVD: + case ESTABLISHED: + case CLOSE_WAIT: + return tcp_close_shutdown(pcb, 0); + default: + /* Not (yet?) connected, cannot shutdown the TX side as that would bring us + into CLOSED state, where the PCB is deallocated. */ + return ERR_CONN; + } + } + /* @todo: return another err_t if not in correct state or already shut? */ + return ERR_OK; +} + +/** + * Abandons a connection and optionally sends a RST to the remote + * host. Deletes the local protocol control block. This is done when + * a connection is killed because of shortage of memory. + * + * @param pcb the tcp_pcb to abort + * @param reset boolean to indicate whether a reset should be sent + */ +void +tcp_abandon(struct tcp_pcb *pcb, int reset) +{ + u32_t seqno, ackno; + u16_t remote_port, local_port; + ip_addr_t remote_ip, local_ip; +#if LWIP_CALLBACK_API + tcp_err_fn errf; +#endif /* LWIP_CALLBACK_API */ + void *errf_arg; + + /* get_tcp_state(pcb) LISTEN not allowed here */ + LWIP_ASSERT("don't call tcp_abort/tcp_abandon for listen-pcbs", + get_tcp_state(pcb) != LISTEN); + /* Figure out on which TCP PCB list we are, and remove us. 
If we + are in an active state, call the receive function associated with + the PCB with a NULL argument, and send an RST to the remote end. */ + if (get_tcp_state(pcb) == TIME_WAIT) { + tcp_pcb_remove(pcb); + } else { + int send_rst = reset && (get_tcp_state(pcb) != CLOSED); + seqno = pcb->snd_nxt; + ackno = pcb->rcv_nxt; + ip_addr_copy(local_ip, pcb->local_ip); + ip_addr_copy(remote_ip, pcb->remote_ip); + local_port = pcb->local_port; + remote_port = pcb->remote_port; +#if LWIP_CALLBACK_API + errf = pcb->errf; +#endif /* LWIP_CALLBACK_API */ + errf_arg = pcb->my_container; + tcp_pcb_remove(pcb); + if (pcb->unacked != NULL) { + tcp_tx_segs_free(pcb, pcb->unacked); + pcb->unacked = NULL; + } + if (pcb->unsent != NULL) { + tcp_tx_segs_free(pcb, pcb->unsent); + pcb->unsent = NULL; + } +#if TCP_QUEUE_OOSEQ + if (pcb->ooseq != NULL) { + tcp_segs_free(pcb, pcb->ooseq); + } +#endif /* TCP_QUEUE_OOSEQ */ + TCP_EVENT_ERR(errf, errf_arg, ERR_ABRT); + if (send_rst) { + LWIP_DEBUGF(TCP_RST_DEBUG, ("tcp_abandon: sending RST\n")); + tcp_rst(seqno, ackno, local_port, remote_port, pcb); + } + } + (void)local_ip; /* Fix warning -Wunused-but-set-variable */ + (void)remote_ip; /* Fix warning -Wunused-but-set-variable */ +} + +/** + * Aborts the connection by sending a RST (reset) segment to the remote + * host. The pcb is deallocated. This function never fails. + * + * ATTENTION: When calling this from one of the TCP callbacks, make + * sure you always return ERR_ABRT (and never return ERR_ABRT otherwise + * or you will risk accessing deallocated memory or memory leaks! + * + * @param pcb the tcp pcb to abort + */ +void +tcp_abort(struct tcp_pcb *pcb) +{ + tcp_abandon(pcb, 1); +} + +/** + * Binds the connection to a local portnumber and IP address. If the + * IP address is not given (i.e., ipaddr == NULL), the IP address of + * the outgoing network interface is used instead. + * + * @param pcb the tcp_pcb to bind (no check is done whether this pcb is + * already bound!) 
+ * @param ipaddr the local ip address to bind to (use IP_ADDR_ANY to bind + * to any local address + * @param port the local port to bind to + * @return ERR_USE if the port is already in use + * ERR_OK if bound + */ +err_t +tcp_bind(struct tcp_pcb *pcb, ip_addr_t *ipaddr, u16_t port) +{ + LWIP_ERROR("tcp_bind: can only bind in state CLOSED", get_tcp_state(pcb) == CLOSED, return ERR_ISCONN); + + if (!ip_addr_isany(ipaddr)) { + pcb->local_ip = *ipaddr; + } + pcb->local_port = port; + LWIP_DEBUGF(TCP_DEBUG, ("tcp_bind: bind to port %"U16_F"\n", port)); + + return ERR_OK; +} +#if LWIP_CALLBACK_API +/** + * Default accept callback if no accept callback is specified by the user. + */ +static err_t +tcp_accept_null(void *arg, struct tcp_pcb *pcb, err_t err) +{ + LWIP_UNUSED_ARG(arg); + LWIP_UNUSED_ARG(pcb); + LWIP_UNUSED_ARG(err); + + return ERR_ABRT; +} +#endif /* LWIP_CALLBACK_API */ + +/** + * Set the state of the connection to be LISTEN, which means that it + * is able to accept incoming connections. + * + * @param listen_pcb used for listening + * @param pcb the original tcp_pcb + * @return ERR_ISCONN if the conn_pcb is already in LISTEN state + * and ERR_OK on success + * + */ +err_t +tcp_listen(struct tcp_pcb_listen *listen_pcb, struct tcp_pcb *pcb) +{ + /* + * LWIP_ERROR("tcp_listen: conn_pcb already connected", get_tcp_state(pcb) == CLOSED, ERR_ISCONN); + */ + + /* already listening? 
*/ + if (!listen_pcb || (!pcb || get_tcp_state(pcb) == LISTEN)) { + return ERR_ISCONN; + } + listen_pcb->callback_arg = pcb->callback_arg; + listen_pcb->local_port = pcb->local_port; + set_tcp_state(listen_pcb, LISTEN); + listen_pcb->prio = pcb->prio; + listen_pcb->so_options = pcb->so_options; + listen_pcb->so_options |= SOF_ACCEPTCONN; + listen_pcb->ttl = pcb->ttl; + listen_pcb->tos = pcb->tos; + ip_addr_copy(listen_pcb->local_ip, pcb->local_ip); +#if LWIP_CALLBACK_API + listen_pcb->accept = tcp_accept_null; +#endif /* LWIP_CALLBACK_API */ + return ERR_OK; + +} + +/** + * Update the state that tracks the available window space to advertise. + * + * Returns how much extra window would be advertised if we sent an + * update now. + */ +u32_t tcp_update_rcv_ann_wnd(struct tcp_pcb *pcb) +{ + u32_t new_right_edge = pcb->rcv_nxt + pcb->rcv_wnd; + + if (TCP_SEQ_GEQ(new_right_edge, pcb->rcv_ann_right_edge + LWIP_MIN((pcb->rcv_wnd_max / 2), pcb->mss))) { + /* we can advertise more window */ + pcb->rcv_ann_wnd = pcb->rcv_wnd; + return new_right_edge - pcb->rcv_ann_right_edge; + } else { + if (TCP_SEQ_GT(pcb->rcv_nxt, pcb->rcv_ann_right_edge)) { + /* Can happen due to other end sending out of advertised window, + * but within actual available (but not yet advertised) window */ + pcb->rcv_ann_wnd = 0; + } else { + /* keep the right edge of window constant */ + u32_t new_rcv_ann_wnd = pcb->rcv_ann_right_edge - pcb->rcv_nxt; + LWIP_ASSERT("new_rcv_ann_wnd <= 0xffff00", new_rcv_ann_wnd <= 0xffff00); + pcb->rcv_ann_wnd = new_rcv_ann_wnd; + } + return 0; + } +} + +/** + * This function should be called by the application when it has + * processed the data. The purpose is to advertise a larger window + * when the data has been processed. 
+ * + * @param pcb the tcp_pcb for which data is read + * @param len the amount of bytes that have been read by the application + */ +void +tcp_recved(struct tcp_pcb *pcb, u32_t len) +{ + u32_t wnd_inflation; + + LWIP_ASSERT("tcp_recved: len would wrap rcv_wnd\n", + len <= 0xffffffffU - pcb->rcv_wnd ); + + pcb->rcv_wnd += len; + if (pcb->rcv_wnd > pcb->rcv_wnd_max) { + pcb->rcv_wnd = pcb->rcv_wnd_max; + } else if(pcb->rcv_wnd == 0) { + /* rcv_wnd overflowed */ + if ((get_tcp_state(pcb) == CLOSE_WAIT) || (get_tcp_state(pcb) == LAST_ACK)) { + /* In passive close, we allow this, since the FIN bit is added to rcv_wnd + by the stack itself, since it is not mandatory for an application + to call tcp_recved() for the FIN bit, but e.g. the netconn API does so. */ + pcb->rcv_wnd = pcb->rcv_wnd_max; + } else { + LWIP_ASSERT("tcp_recved: len wrapped rcv_wnd\n", 0); + } + } + + wnd_inflation = tcp_update_rcv_ann_wnd(pcb); + + /* If the change in the right edge of window is significant (default + * watermark is TCP_WND/4), then send an explicit update now. + * Otherwise wait for a packet to be sent in the normal course of + * events (or more window to be available later) */ + if (wnd_inflation >= TCP_WND_UPDATE_THRESHOLD) { + tcp_ack_now(pcb); + tcp_output(pcb); + } + + LWIP_DEBUGF(TCP_DEBUG, ("tcp_recved: recveived %"U16_F" bytes, wnd %"U16_F" (%"U16_F").\n", + len, pcb->rcv_wnd, TCP_WND_SCALED(pcb) - pcb->rcv_wnd)); +} + +/** + * Connects to another host. The function given as the "connected" + * argument will be called when the connection has been established. 
+ * + * @param pcb the tcp_pcb used to establish the connection + * @param ipaddr the remote ip address to connect to + * @param port the remote tcp port to connect to + * @param connected callback function to call when connected (or on error) + * @return ERR_VAL if invalid arguments are given + * ERR_OK if connect request has been sent + * other err_t values if connect request couldn't be sent + */ +err_t +tcp_connect(struct tcp_pcb *pcb, ip_addr_t *ipaddr, u16_t port, + tcp_connected_fn connected) +{ + err_t ret; + u32_t iss; + + LWIP_ERROR("tcp_connect: can only connected from state CLOSED", get_tcp_state(pcb) == CLOSED, return ERR_ISCONN); + + LWIP_DEBUGF(TCP_DEBUG, ("tcp_connect to port %"U16_F"\n", port)); + if (ipaddr != NULL) { + pcb->remote_ip = *ipaddr; + } else { + return ERR_VAL; + } + pcb->remote_port = port; + + /* check if we have a route to the remote host */ + if (ip_addr_isany(&(pcb->local_ip))) { + LWIP_ASSERT("tcp_connect: need to find route to host", 0); + } + + if (pcb->local_port == 0) { + return ERR_VAL; + } + iss = tcp_next_iss(); + pcb->rcv_nxt = 0; + pcb->snd_nxt = iss; + pcb->lastack = iss - 1; + pcb->snd_lbb = iss - 1; + pcb->rcv_ann_right_edge = pcb->rcv_nxt; + pcb->snd_wnd = TCP_WND; + /* + * For effective and advertized MSS without MTU consideration: + * If MSS is configured - do not accept a higher value than 536 + * If MSS is not configured assume minimum value of 536 + * The send MSS is updated when an MSS option is received + */ + u16_t snd_mss = pcb->advtsd_mss = (LWIP_TCP_MSS) ? ((LWIP_TCP_MSS > 536) ? 536 : LWIP_TCP_MSS) : 536; + UPDATE_PCB_BY_MSS(pcb, snd_mss); +#if TCP_CALCULATE_EFF_SEND_MSS + /* + * For advertized MSS with MTU knowledge - it is highly likely that it can be derived from the MTU towards the remote IP address. + * Otherwise (if unlikely MTU==0) + * If LWIP_TCP_MSS>0 use it as MSS + * If LWIP_TCP_MSS==0 set advertized MSS value to default 536 + */ + pcb->advtsd_mss = (LWIP_TCP_MSS > 0) ? 
tcp_eff_send_mss(LWIP_TCP_MSS, pcb) : tcp_mss_follow_mtu_with_default(536, pcb); + /* + * For effective MSS with MTU knowledge - get the minimum between pcb->mss and the MSS derived from the + * MTU towards the remote IP address + * */ + u16_t eff_mss = tcp_eff_send_mss(pcb->mss, pcb); + UPDATE_PCB_BY_MSS(pcb, eff_mss); +#endif /* TCP_CALCULATE_EFF_SEND_MSS */ + pcb->cwnd = 1; + pcb->ssthresh = pcb->mss * 10; + pcb->connected = connected; + + /* Send a SYN together with the MSS option. */ + ret = tcp_enqueue_flags(pcb, TCP_SYN); + if (ret == ERR_OK) { + /* SYN segment was enqueued, changed the pcbs state now */ + set_tcp_state(pcb, SYN_SENT); + + tcp_output(pcb); + } + return ret; +} + +/** + * Called every slow_tmr_interval ms and implements the retransmission timer and the timer that + * closes the psb if it in TIME_WAIT state for enough time. It also increments + * various timers such as the inactivity timer in PCB. + * + * Automatically called from tcp_tmr(). + */ +void +tcp_slowtmr(struct tcp_pcb* pcb) +{ +#if !TCP_CC_ALGO_MOD + u32_t eff_wnd; +#endif //!TCP_CC_ALGO_MOD + u8_t pcb_remove; /* flag if a PCB should be removed */ + u8_t pcb_reset; /* flag if a RST should be sent when removing */ + err_t err; + + err = ERR_OK; + + if (pcb == NULL) { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: no active pcbs\n")); + } + + if (pcb && PCB_IN_ACTIVE_STATE(pcb)) { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: processing active pcb\n")); + LWIP_ASSERT("tcp_slowtmr: active get_tcp_state(pcb) != CLOSED\n", get_tcp_state(pcb) != CLOSED); + LWIP_ASSERT("tcp_slowtmr: active get_tcp_state(pcb) != LISTEN\n", get_tcp_state(pcb) != LISTEN); + LWIP_ASSERT("tcp_slowtmr: active get_tcp_state(pcb) != TIME-WAIT\n", get_tcp_state(pcb) != TIME_WAIT); + + pcb_remove = 0; + pcb_reset = 0; + + if (get_tcp_state(pcb) == SYN_SENT && pcb->nrtx == TCP_SYNMAXRTX) { + ++pcb_remove; + err = ERR_TIMEOUT; + LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: max SYN retries reached\n")); + } + else if (pcb->nrtx == 
TCP_MAXRTX) { + ++pcb_remove; + err = ERR_ABRT; + LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: max DATA retries reached\n")); + } else { + if (pcb->persist_backoff > 0) { + /* If snd_wnd is zero and pcb->unacked is NULL , use persist timer to send 1 byte probes + * instead of using the standard retransmission mechanism. */ + pcb->persist_cnt++; + if (pcb->persist_cnt >= tcp_persist_backoff[pcb->persist_backoff-1]) { + pcb->persist_cnt = 0; + if (pcb->persist_backoff < sizeof(tcp_persist_backoff)) { + pcb->persist_backoff++; + } + /* Use tcp_keepalive() instead of tcp_zero_window_probe() to probe for window update + * without sending any data (which will force us to split the segment). + * tcp_zero_window_probe(pcb); */ + tcp_keepalive(pcb); + } + } else { + /* Increase the retransmission timer if it is running */ + if(pcb->rtime >= 0) + ++pcb->rtime; + + if (pcb->unacked != NULL && pcb->rtime >= pcb->rto) { + /* Time for a retransmission. */ + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_slowtmr: rtime %"S16_F + " pcb->rto %"S16_F"\n", + pcb->rtime, pcb->rto)); + + /* Double retransmission time-out unless we are trying to + * connect to somebody (i.e., we are in SYN_SENT). */ + if (get_tcp_state(pcb) != SYN_SENT) { + pcb->rto = ((pcb->sa >> 3) + pcb->sv) << tcp_backoff[pcb->nrtx]; + } + + /* Reset the retransmission timer. */ + pcb->rtime = 0; + +#if TCP_CC_ALGO_MOD + cc_cong_signal(pcb, CC_RTO); +#else + /* Reduce congestion window and ssthresh. 
*/ + eff_wnd = LWIP_MIN(pcb->cwnd, pcb->snd_wnd); + pcb->ssthresh = eff_wnd >> 1; + if (pcb->ssthresh < (u32_t)(pcb->mss << 1)) { + pcb->ssthresh = (pcb->mss << 1); + } + pcb->cwnd = pcb->mss; +#endif + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_slowtmr: cwnd %"U16_F + " ssthresh %"U16_F"\n", + pcb->cwnd, pcb->ssthresh)); + + /* The following needs to be called AFTER cwnd is set to one + mss - STJ */ + tcp_rexmit_rto(pcb); + } + } + } + /* Check if this PCB has stayed too long in FIN-WAIT-2 */ + if (get_tcp_state(pcb) == FIN_WAIT_2) { + /* If this PCB is in FIN_WAIT_2 because of SHUT_WR don't let it time out. */ + if (pcb->flags & TF_RXCLOSED) { + /* PCB was fully closed (either through close() or SHUT_RDWR): + normal FIN-WAIT timeout handling. */ + if ((u32_t)(tcp_ticks - pcb->tmr) > + TCP_FIN_WAIT_TIMEOUT / slow_tmr_interval) { + ++pcb_remove; + err = ERR_ABRT; + LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: removing pcb stuck in FIN-WAIT-2\n")); + } + } + } + + /* Check if KEEPALIVE should be sent */ + if((pcb->so_options & SOF_KEEPALIVE) && + ((get_tcp_state(pcb) == ESTABLISHED) || + (get_tcp_state(pcb) == CLOSE_WAIT))) { +#if LWIP_TCP_KEEPALIVE + if((u32_t)(tcp_ticks - pcb->tmr) > + (pcb->keep_idle + (pcb->keep_cnt*pcb->keep_intvl)) + / slow_tmr_interval) +#else + if((u32_t)(tcp_ticks - pcb->tmr) > + (pcb->keep_idle + TCP_MAXIDLE) / slow_tmr_interval) +#endif /* LWIP_TCP_KEEPALIVE */ + { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: KEEPALIVE timeout. 
Aborting connection to %"U16_F".%"U16_F".%"U16_F".%"U16_F".\n", + ip4_addr1_16(&pcb->remote_ip), ip4_addr2_16(&pcb->remote_ip), + ip4_addr3_16(&pcb->remote_ip), ip4_addr4_16(&pcb->remote_ip))); + + ++pcb_remove; + err = ERR_ABRT; + ++pcb_reset; + } +#if LWIP_TCP_KEEPALIVE + else if((u32_t)(tcp_ticks - pcb->tmr) > + (pcb->keep_idle + pcb->keep_cnt_sent * pcb->keep_intvl) + / slow_tmr_interval) +#else + else if((u32_t)(tcp_ticks - pcb->tmr) > + (pcb->keep_idle + pcb->keep_cnt_sent * TCP_KEEPINTVL_DEFAULT) + / slow_tmr_interval) +#endif /* LWIP_TCP_KEEPALIVE */ + { + tcp_keepalive(pcb); + pcb->keep_cnt_sent++; + } + } + + /* If this PCB has queued out of sequence data, but has been + inactive for too long, will drop the data (it will eventually + be retransmitted). */ +#if TCP_QUEUE_OOSEQ + if (pcb->ooseq != NULL && + (u32_t)tcp_ticks - pcb->tmr >= pcb->rto * TCP_OOSEQ_TIMEOUT) { + tcp_segs_free(pcb, pcb->ooseq); + pcb->ooseq = NULL; + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_slowtmr: dropping OOSEQ queued data\n")); + } +#endif /* TCP_QUEUE_OOSEQ */ + + /* Check if this PCB has stayed too long in SYN-RCVD */ + if (get_tcp_state(pcb) == SYN_RCVD) { + if ((u32_t)(tcp_ticks - pcb->tmr) > + TCP_SYN_RCVD_TIMEOUT / slow_tmr_interval) { + ++pcb_remove; + err = ERR_ABRT; + LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: removing pcb stuck in SYN-RCVD\n")); + } + } + + /* Check if this PCB has stayed too long in LAST-ACK */ + if (get_tcp_state(pcb) == LAST_ACK) { + if ((u32_t)(tcp_ticks - pcb->tmr) > 2 * TCP_MSL / slow_tmr_interval) { + ++pcb_remove; + err = ERR_ABRT; + LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: removing pcb stuck in LAST-ACK\n")); + } + } + + /* If the PCB should be removed, do it. */ + if (pcb_remove) { + tcp_pcb_purge(pcb); + + TCP_EVENT_ERR(pcb->errf, pcb->my_container, err); + + if (pcb_reset) { + tcp_rst(pcb->snd_nxt, pcb->rcv_nxt, pcb->local_port, pcb->remote_port, pcb); + } + set_tcp_state(pcb, CLOSED); + } else { + /* We check if we should poll the connection. 
*/ + ++pcb->polltmr; + if (pcb->polltmr >= pcb->pollinterval) { + pcb->polltmr = 0; + LWIP_DEBUGF(TCP_DEBUG, ("tcp_slowtmr: polling application\n")); + TCP_EVENT_POLL(pcb, err); + /* if err == ERR_ABRT, 'prev' is already deallocated */ + if (err == ERR_OK) { + tcp_output(pcb); + } + } + } + } + + + if (pcb && PCB_IN_TIME_WAIT_STATE(pcb)) { + LWIP_ASSERT("tcp_slowtmr: TIME-WAIT get_tcp_state(pcb) == TIME-WAIT", get_tcp_state(pcb) == TIME_WAIT); + pcb_remove = 0; + + /* Check if this PCB has stayed long enough in TIME-WAIT */ + if ((u32_t)(tcp_ticks - pcb->tmr) > 2 * TCP_MSL / slow_tmr_interval) { + ++pcb_remove; + /* err = ERR_ABRT; */ /* Note: suppress warning 'err' is never read */ + } + + /* If the PCB should be removed, do it. */ + if (pcb_remove) { + tcp_pcb_purge(pcb); + + set_tcp_state(pcb, CLOSED); + } + } +} + + +/** + * Is called every slow_tmr_interval and process data previously + * "refused" by upper layer (application) and sends delayed ACKs. + * + * Automatically called from tcp_tmr(). + */ +void +tcp_fasttmr(struct tcp_pcb* pcb) +{ + if(pcb != NULL && PCB_IN_ACTIVE_STATE(pcb)) { + /* If there is data which was previously "refused" by upper layer */ + while (pcb->refused_data != NULL) { // 'while' instead of 'if' because windows scale uses large pbuf + struct pbuf *rest; + /* Notify again application with data previously received. 
*/ + err_t err; + pbuf_split_64k(pcb->refused_data, &rest); + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_fasttmr: notify kept packet\n")); + TCP_EVENT_RECV(pcb, pcb->refused_data, ERR_OK, err); + if (err == ERR_OK) { + pcb->refused_data = rest; + } else { + if (rest) { + pbuf_cat(pcb->refused_data, rest); /* undo splitting */ + } + if (err == ERR_ABRT) { + /* if err == ERR_ABRT, 'pcb' is already deallocated */ + pcb = NULL; + } + break; + } + } + + /* send delayed ACKs */ + if (pcb && (pcb->flags & TF_ACK_DELAY)) { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_fasttmr: delayed ACK\n")); + tcp_ack_now(pcb); + tcp_output(pcb); + pcb->flags &= ~(TF_ACK_DELAY | TF_ACK_NOW); + } + } +} + +/** + * Deallocates a list of TCP segments (tcp_seg structures). + * + * @param seg tcp_seg list of TCP segments to free + */ +void +tcp_segs_free(struct tcp_pcb *pcb, struct tcp_seg *seg) +{ + while (seg != NULL) { + struct tcp_seg *next = seg->next; + seg->next = NULL; + tcp_seg_free(pcb, seg); + seg = next; + } +} + +/** + * Frees a TCP segment (tcp_seg structure). + * + * @param seg single tcp_seg to free + */ +void +tcp_seg_free(struct tcp_pcb *pcb, struct tcp_seg *seg) +{ + if (seg != NULL) { + if (seg->p != NULL) { + pbuf_free(seg->p); +#if TCP_DEBUG + seg->p = NULL; +#endif /* TCP_DEBUG */ + } + external_tcp_seg_free(pcb, seg); + } +} + +/** + * Deallocates a list of TCP segments (tcp_seg structures). + * + * @param seg tcp_seg list of TCP segments to free + */ +void +tcp_tx_segs_free(struct tcp_pcb * pcb, struct tcp_seg *seg) +{ + while (seg != NULL) { + struct tcp_seg *next = seg->next; + seg->next = NULL; + tcp_tx_seg_free(pcb, seg); + seg = next; + } +} + +/** + * Frees a TCP segment (tcp_seg structure). 
+ * + * @param seg single tcp_seg to free + */ +void +tcp_tx_seg_free(struct tcp_pcb * pcb, struct tcp_seg *seg) +{ + if (seg != NULL) { + if (seg->p != NULL) { + tcp_tx_pbuf_free(pcb, seg->p); + } + external_tcp_seg_free(pcb, seg); + } +} + +#if TCP_QUEUE_OOSEQ +/** + * Returns a copy of the given TCP segment. + * The pbuf and data are not copied, only the pointers + * + * @param seg the old tcp_seg + * @return a copy of seg + */ +struct tcp_seg * +tcp_seg_copy(struct tcp_pcb* pcb, struct tcp_seg *seg) +{ + struct tcp_seg *cseg; + + cseg = external_tcp_seg_alloc(pcb); + if (cseg == NULL) { + return NULL; + } + SMEMCPY((u8_t *)cseg, (const u8_t *)seg, sizeof(struct tcp_seg)); + pbuf_ref(cseg->p); + return cseg; +} +#endif /* TCP_QUEUE_OOSEQ */ + +#if LWIP_CALLBACK_API +/** + * Default receive callback that is called if the user didn't register + * a recv callback for the pcb. + */ +err_t +tcp_recv_null(void *arg, struct tcp_pcb *pcb, struct pbuf *p, err_t err) +{ + LWIP_UNUSED_ARG(arg); + if (p != NULL) { + tcp_recved(pcb, (u32_t)p->tot_len); + pbuf_free(p); + } else if (err == ERR_OK) { + return tcp_close(pcb); + } + return ERR_OK; +} +#endif /* LWIP_CALLBACK_API */ + +void tcp_pcb_init (struct tcp_pcb* pcb, u8_t prio) +{ + u32_t iss; + + memset(pcb, 0, sizeof(*pcb)); + pcb->max_snd_buff = TCP_SND_BUF; + pcb->prio = prio; + pcb->snd_buf = pcb->max_snd_buff; + pcb->snd_queuelen = 0; + pcb->snd_scale = 0; + pcb->rcv_scale = 0; + pcb->rcv_wnd = TCP_WND_SCALED(pcb); + pcb->rcv_ann_wnd = TCP_WND_SCALED(pcb); + pcb->rcv_wnd_max = TCP_WND_SCALED(pcb); + pcb->rcv_wnd_max_desired = TCP_WND_SCALED(pcb); + pcb->tos = 0; + pcb->ttl = TCP_TTL; + /* As initial send MSS, we use TCP_MSS but limit it to 536. + The send MSS is updated when an MSS option is received. */ + u16_t snd_mss = pcb->advtsd_mss = (LWIP_TCP_MSS) ? ((LWIP_TCP_MSS > 536) ? 
536 : LWIP_TCP_MSS) : 536; + UPDATE_PCB_BY_MSS(pcb, snd_mss); + pcb->max_unsent_len = pcb->max_tcp_snd_queuelen; + pcb->rto = 3000 / slow_tmr_interval; + pcb->sa = 0; + pcb->sv = 3000 / slow_tmr_interval; + pcb->rtime = -1; +#if TCP_CC_ALGO_MOD + switch (lwip_cc_algo_module) { + case CC_MOD_CUBIC: + pcb->cc_algo = &cubic_cc_algo; + break; + case CC_MOD_NONE: + pcb->cc_algo = &none_cc_algo; + break; + case CC_MOD_LWIP: + default: + pcb->cc_algo = &lwip_cc_algo; + break; + } + cc_init(pcb); +#endif + pcb->cwnd = 1; + iss = tcp_next_iss(); + pcb->snd_wl2 = iss; + pcb->snd_nxt = iss; + pcb->lastack = iss; + pcb->snd_lbb = iss; + pcb->tmr = tcp_ticks; + pcb->snd_sml_snt = 0; + pcb->snd_sml_add = 0; + + pcb->polltmr = 0; + pcb->tcp_timer = 0; +#if LWIP_CALLBACK_API + pcb->recv = tcp_recv_null; +#endif /* LWIP_CALLBACK_API */ + + /* Init KEEPALIVE timer */ + pcb->keep_idle = TCP_KEEPIDLE_DEFAULT; + +#if LWIP_TCP_KEEPALIVE + pcb->keep_intvl = TCP_KEEPINTVL_DEFAULT; + pcb->keep_cnt = TCP_KEEPCNT_DEFAULT; +#endif /* LWIP_TCP_KEEPALIVE */ + + pcb->keep_cnt_sent = 0; + pcb->quickack = 0; + pcb->enable_ts_opt = enable_ts_option; + pcb->seg_alloc = NULL; + pcb->pbuf_alloc = NULL; +} + +struct pbuf * +tcp_tx_pbuf_alloc(struct tcp_pcb * pcb, u16_t length, pbuf_type type) +{ + struct pbuf * p; + + if (!pcb->pbuf_alloc) { + + // pbuf_alloc is not valid, we should allocate a new pbuf. + p = external_tcp_tx_pbuf_alloc(pcb); + if (!p) return NULL; + + p->next = NULL; + p->type = type; + /* set reference count */ + p->ref = 1; + /* set flags */ + p->flags = 0; + } else { + // pbuf_alloc is valid, we dont need to allocate a new pbuf element. + p = pcb->pbuf_alloc; + pcb->pbuf_alloc = NULL; + } + + /* Set up internal structure of the pbuf. 
*/ + p->len = p->tot_len = length; + + return p; +} + +// Release preallocated buffers +void tcp_tx_preallocted_buffers_free(struct tcp_pcb * pcb) +{ + if (pcb->seg_alloc) { + tcp_tx_seg_free(pcb, pcb->seg_alloc); + pcb->seg_alloc = NULL; + } + + if (pcb->pbuf_alloc) { + tcp_tx_pbuf_free(pcb, pcb->pbuf_alloc); + pcb->pbuf_alloc = NULL; + } +} + +void +tcp_tx_pbuf_free(struct tcp_pcb * pcb, struct pbuf * p) +{ + struct pbuf * p_next = NULL; + while (p) { + p_next = p->next; + p->next = NULL; + if (p->type == PBUF_RAM) { + external_tcp_tx_pbuf_free(pcb, p); + } else { + pbuf_free(p); + } + p = p_next; + } +} + +/** + * Used to specify the argument that should be passed callback + * functions. + * + * @param pcb tcp_pcb to set the callback argument + * @param arg void pointer argument to pass to callback functions + */ +void +tcp_arg(struct tcp_pcb *pcb, void *arg) +{ + pcb->callback_arg = arg; +} +#if LWIP_CALLBACK_API + +/** + * Used to specify the function that should be called when a TCP + * connection receives data. + * + * @param pcb tcp_pcb to set the recv callback + * @param recv callback function to call for this pcb when data is received + */ +void +tcp_recv(struct tcp_pcb *pcb, tcp_recv_fn recv) +{ + pcb->recv = recv; +} + +/** + * Used to specify the function that should be called when TCP data + * has been successfully delivered to the remote host. + * + * @param pcb tcp_pcb to set the sent callback + * @param sent callback function to call for this pcb when data is successfully sent + */ +void +tcp_sent(struct tcp_pcb *pcb, tcp_sent_fn sent) +{ + pcb->sent = sent; +} + +/** + * Used to specify the function that should be called when a fatal error + * has occured on the connection. 
+ * + * @param pcb tcp_pcb to set the err callback + * @param err callback function to call for this pcb when a fatal error + * has occured on the connection + */ +void +tcp_err(struct tcp_pcb *pcb, tcp_err_fn err) +{ + pcb->errf = err; +} + +/** + * Used for specifying the function that should be called when a + * LISTENing connection has been connected to another host. + * + * @param pcb tcp_pcb to set the accept callback + * @param accept callback function to call for this pcb when LISTENing + * connection has been connected to another host + */ +void +tcp_accept(struct tcp_pcb *pcb, tcp_accept_fn accept) +{ + pcb->accept = accept; +} + +/** + * Used for specifying the function that should be called + * for sending packets. + * + * @param pcb tcp_pcb to set the outputcallback + * @param output callback function + */ +void +tcp_ip_output(struct tcp_pcb *pcb, ip_output_fn ip_output) +{ + pcb->ip_output = ip_output; +} + +/** + * Used for specifying the function that should be called when a + * SYN was received. + * + * @param pcb tcp_pcb to set the accept callback + * @param accept callback function to call for this pcb when SYN + * is received + */ +void +tcp_syn_handled(struct tcp_pcb_listen *pcb, tcp_syn_handled_fn syn_handled) +{ + pcb->syn_handled_cb = syn_handled; +} + +/** + * Used for specifying the function that should be called to clone pcb + * + * @param listen pcb to clone + * @param clone callback function to call in order to clone the pcb + */ +void +tcp_clone_conn(struct tcp_pcb_listen *pcb, tcp_clone_conn_fn clone_conn) +{ + pcb->clone_conn = clone_conn; +} +#endif /* LWIP_CALLBACK_API */ + + +/** + * Used to specify the function that should be called periodically + * from TCP. The interval is specified in terms of the TCP coarse + * timer interval, which is called twice a second. 
+ * + */ +void +tcp_poll(struct tcp_pcb *pcb, tcp_poll_fn poll, u8_t interval) +{ +#if LWIP_CALLBACK_API + pcb->poll = poll; +#else /* LWIP_CALLBACK_API */ + LWIP_UNUSED_ARG(poll); +#endif /* LWIP_CALLBACK_API */ + pcb->pollinterval = interval; +} + +/** + * Purges a TCP PCB. Removes any buffered data and frees the buffer memory + * (pcb->ooseq, pcb->unsent and pcb->unacked are freed). + * + * @param pcb tcp_pcb to purge. The pcb itself is not deallocated! + */ +void +tcp_pcb_purge(struct tcp_pcb *pcb) +{ + if (get_tcp_state(pcb) != CLOSED && + get_tcp_state(pcb) != TIME_WAIT && + get_tcp_state(pcb) != LISTEN) { + + LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge\n")); + + if (pcb->refused_data != NULL) { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge: data left on ->refused_data\n")); + pbuf_free(pcb->refused_data); + pcb->refused_data = NULL; + } + if (pcb->unsent != NULL) { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge: not all data sent\n")); + } + if (pcb->unacked != NULL) { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge: data left on ->unacked\n")); + } +#if TCP_QUEUE_OOSEQ + if (pcb->ooseq != NULL) { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_pcb_purge: data left on ->ooseq\n")); + } + tcp_segs_free(pcb, pcb->ooseq); + pcb->ooseq = NULL; +#endif /* TCP_QUEUE_OOSEQ */ + + /* Stop the retransmission timer as it will expect data on unacked + queue if it fires */ + pcb->rtime = -1; + + tcp_tx_segs_free(pcb, pcb->unsent); + tcp_tx_segs_free(pcb, pcb->unacked); + pcb->unacked = pcb->unsent = NULL; +#if TCP_OVERSIZE + pcb->unsent_oversize = 0; +#endif /* TCP_OVERSIZE */ +#if TCP_CC_ALGO_MOD + cc_destroy(pcb); +#endif + } +} + +/** + * Purges the PCB and removes it from a PCB list. Any delayed ACKs are sent first. + * + * @param pcblist PCB list to purge. + * @param pcb tcp_pcb to purge. The pcb itself is NOT deallocated! 
+ */ +void +tcp_pcb_remove(struct tcp_pcb *pcb) +{ + tcp_pcb_purge(pcb); + + /* if there is an outstanding delayed ACKs, send it */ + if (get_tcp_state(pcb) != TIME_WAIT && + get_tcp_state(pcb) != LISTEN && + pcb->flags & TF_ACK_DELAY) { + pcb->flags |= TF_ACK_NOW; + tcp_output(pcb); + } + + if (get_tcp_state(pcb) != LISTEN) { + LWIP_ASSERT("unsent segments leaking", pcb->unsent == NULL); + LWIP_ASSERT("unacked segments leaking", pcb->unacked == NULL); +#if TCP_QUEUE_OOSEQ + LWIP_ASSERT("ooseq segments leaking", pcb->ooseq == NULL); +#endif /* TCP_QUEUE_OOSEQ */ + } + + set_tcp_state(pcb, CLOSED); + + LWIP_ASSERT("tcp_pcb_remove: tcp_pcbs_sane()", tcp_pcbs_sane()); +} + +/** + * Calculates a new initial sequence number for new connections. + * + * @return u32_t pseudo random sequence number + */ +u32_t +tcp_next_iss(void) +{ + static u32_t iss = 6510; + + iss += tcp_ticks; /* XXX */ + return iss; +} + +#if TCP_CALCULATE_EFF_SEND_MSS +/** + * Calcluates the effective send mss that can be used for a specific IP address + * by using ip_route to determine the netif used to send to the address and + * calculating the minimum of TCP_MSS and that netif's mtu (if set). + */ +u16_t +tcp_eff_send_mss(u16_t sendmss, struct tcp_pcb *pcb) +{ + u16_t mtu; + + mtu = external_ip_route_mtu(pcb); + if (mtu != 0) { + sendmss = LWIP_MIN(sendmss, mtu - IP_HLEN - TCP_HLEN); + } + return sendmss; +} + +/** + * Calcluates the send mss that can be used for a specific IP address + * by using ip_route to determine the netif used to send to the address. 
+ * In case MTU is unkonw - return the default MSS + */ +u16_t +tcp_mss_follow_mtu_with_default(u16_t defsendmss, struct tcp_pcb *pcb) +{ + u16_t mtu; + + mtu = external_ip_route_mtu(pcb); + if (mtu != 0) { + defsendmss = mtu - IP_HLEN - TCP_HLEN; + defsendmss = LWIP_MAX(defsendmss, 1); /* MSS must be a positive number */ + } + return defsendmss; +} +#endif /* TCP_CALCULATE_EFF_SEND_MSS */ + +#if TCP_DEBUG || TCP_INPUT_DEBUG || TCP_OUTPUT_DEBUG +/** + * Print a tcp header for debugging purposes. + * + * @param tcphdr pointer to a struct tcp_hdr + */ +void +tcp_debug_print(struct tcp_hdr *tcphdr) +{ + LWIP_DEBUGF(TCP_DEBUG, ("TCP header:\n")); + LWIP_DEBUGF(TCP_DEBUG, ("+-------------------------------+\n")); + LWIP_DEBUGF(TCP_DEBUG, ("| %5"U16_F" | %5"U16_F" | (src port, dest port)\n", + ntohs(tcphdr->src), ntohs(tcphdr->dest))); + LWIP_DEBUGF(TCP_DEBUG, ("+-------------------------------+\n")); + LWIP_DEBUGF(TCP_DEBUG, ("| %010"U32_F" | (seq no)\n", + ntohl(tcphdr->seqno))); + LWIP_DEBUGF(TCP_DEBUG, ("+-------------------------------+\n")); + LWIP_DEBUGF(TCP_DEBUG, ("| %010"U32_F" | (ack no)\n", + ntohl(tcphdr->ackno))); + LWIP_DEBUGF(TCP_DEBUG, ("+-------------------------------+\n")); + LWIP_DEBUGF(TCP_DEBUG, ("| %2"U16_F" | |%"U16_F"%"U16_F"%"U16_F"%"U16_F"%"U16_F"%"U16_F"| %5"U16_F" | (hdrlen, flags (", + TCPH_HDRLEN(tcphdr), + TCPH_FLAGS(tcphdr) >> 5 & 1, + TCPH_FLAGS(tcphdr) >> 4 & 1, + TCPH_FLAGS(tcphdr) >> 3 & 1, + TCPH_FLAGS(tcphdr) >> 2 & 1, + TCPH_FLAGS(tcphdr) >> 1 & 1, + TCPH_FLAGS(tcphdr) & 1, + ntohs(tcphdr->wnd))); + tcp_debug_print_flags(TCPH_FLAGS(tcphdr)); + LWIP_DEBUGF(TCP_DEBUG, ("), win)\n")); + LWIP_DEBUGF(TCP_DEBUG, ("+-------------------------------+\n")); + LWIP_DEBUGF(TCP_DEBUG, ("| 0x%04"X16_F" | %5"U16_F" | (chksum, urgp)\n", + ntohs(tcphdr->chksum), ntohs(tcphdr->urgp))); + LWIP_DEBUGF(TCP_DEBUG, ("+-------------------------------+\n")); +} + +/** + * Print a tcp state for debugging purposes. 
+ * + * @param s enum tcp_state to print + */ +void +tcp_debug_print_state(enum tcp_state s) +{ + LWIP_UNUSED_ARG(s); + LWIP_DEBUGF(TCP_DEBUG, ("State: %s\n", tcp_state_str[s])); +} + +/** + * Print tcp flags for debugging purposes. + * + * @param flags tcp flags, all active flags are printed + */ +void +tcp_debug_print_flags(u8_t flags) +{ + if (flags & TCP_FIN) { + LWIP_DEBUGF(TCP_DEBUG, ("FIN ")); + } + if (flags & TCP_SYN) { + LWIP_DEBUGF(TCP_DEBUG, ("SYN ")); + } + if (flags & TCP_RST) { + LWIP_DEBUGF(TCP_DEBUG, ("RST ")); + } + if (flags & TCP_PSH) { + LWIP_DEBUGF(TCP_DEBUG, ("PSH ")); + } + if (flags & TCP_ACK) { + LWIP_DEBUGF(TCP_DEBUG, ("ACK ")); + } + if (flags & TCP_URG) { + LWIP_DEBUGF(TCP_DEBUG, ("URG ")); + } + if (flags & TCP_ECE) { + LWIP_DEBUGF(TCP_DEBUG, ("ECE ")); + } + if (flags & TCP_CWR) { + LWIP_DEBUGF(TCP_DEBUG, ("CWR ")); + } + LWIP_DEBUGF(TCP_DEBUG, ("\n")); +} + +/** + * Print all tcp_pcbs in every list for debugging purposes. + */ +void +tcp_debug_print_pcbs(void) +{ + LWIP_DEBUGF(TCP_DEBUG, ("Listen PCB states: REMOVED\n")); +} +#endif /* TCP_DEBUG */ + +#endif /* LWIP_TCP */ diff --git a/src/vma/lwip/tcp.h b/src/vma/lwip/tcp.h new file mode 100644 index 0000000..22d8bb7 --- /dev/null +++ b/src/vma/lwip/tcp.h @@ -0,0 +1,556 @@ +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. 
The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ +#ifndef __LWIP_TCP_H__ +#define __LWIP_TCP_H__ + +#include + +#include "vma/lwip/opt.h" + +#if LWIP_TCP /* don't build if not configured for use in lwipopts.h */ + +#include "vma/lwip/pbuf.h" +#include "vma/lwip/ip.h" + +#ifdef __cplusplus +extern "C" { +#endif + +typedef u32_t (*sys_now_fn)(void); +void register_sys_now(sys_now_fn fn); + +#define LWIP_MEM_ALIGN_SIZE(size) (((size) + MEM_ALIGNMENT - 1) & ~(MEM_ALIGNMENT-1)) + +extern u16_t lwip_tcp_mss; + +#if LWIP_3RD_PARTY_L3 +#if LWIP_TSO +typedef err_t (*ip_output_fn)(struct pbuf *p, void* p_conn, u16_t flags); +#else +typedef err_t (*ip_output_fn)(struct pbuf *p, void* p_conn, int is_rexmit, u8_t is_dummy); +#endif /* LWIP_TSO */ +void register_ip_output(ip_output_fn fn); + +typedef ssize_t (*sys_readv_fn)(int __fd, const struct iovec *iov, int iovcnt); +void register_sys_readv(sys_readv_fn fn); + +#endif + +#if LWIP_3RD_PARTY_BUFS +typedef struct pbuf * (*tcp_tx_pbuf_alloc_fn)(void* p_conn); + +void 
register_tcp_tx_pbuf_alloc(tcp_tx_pbuf_alloc_fn fn); + +typedef void (*tcp_tx_pbuf_free_fn)(void* p_conn, struct pbuf * p); + +void register_tcp_tx_pbuf_free(tcp_tx_pbuf_free_fn fn); + +typedef struct tcp_seg * (*tcp_seg_alloc_fn)(void* p_conn); + +void register_tcp_seg_alloc(tcp_seg_alloc_fn fn); + +typedef void (*tcp_seg_free_fn)(void* p_conn, struct tcp_seg * seg); + +void register_tcp_seg_free(tcp_seg_free_fn fn); + + +extern tcp_tx_pbuf_alloc_fn external_tcp_tx_pbuf_alloc; +extern tcp_tx_pbuf_free_fn external_tcp_tx_pbuf_free; +extern tcp_seg_alloc_fn external_tcp_seg_alloc; +extern tcp_seg_free_fn external_tcp_seg_free; +#endif + + +struct tcp_pcb; + +#include "vma/lwip/cc.h" + +extern enum cc_algo_mod lwip_cc_algo_module; + +/** Function prototype for tcp accept callback functions. Called when a new + * connection can be accepted on a listening pcb. + * + * @param arg Additional argument to pass to the callback function (@see tcp_arg()) + * @param newpcb The new connection pcb + * @param err An error code if there has been an error accepting. + * Only return ERR_ABRT if you have called tcp_abort from within the + * callback function! + */ +typedef err_t (*tcp_accept_fn)(void *arg, struct tcp_pcb *newpcb, err_t err); + +/** Function prototype for tcp syn received callback functions. Called when a new + * syn is received. + * + * @param arg Additional argument to pass to the callback function (@see tcp_arg()) + * @param newpcb The new connection pcb + * @param err An error code if there has been an error. + * Only return ERR_ABRT if you have called tcp_abort from within the + * callback function! + */ +typedef err_t (*tcp_syn_handled_fn)(void *arg, struct tcp_pcb *newpcb, err_t err); + +/** Function prototype for tcp clone callback functions. Called to clone listen pcb + * on connection establishment. 
+ * @param arg Additional argument to pass to the callback function (@see tcp_arg()) + * @param newpcb The new connection pcb + * @param err An error code if there has been an error. + * Only return ERR_ABRT if you have called tcp_abort from within the + * callback function! + * + */ +typedef err_t (*tcp_clone_conn_fn)(void *arg, struct tcp_pcb **newpcb, err_t err); + + +/** Function prototype for tcp receive callback functions. Called when data has + * been received. + * + * @param arg Additional argument to pass to the callback function (@see tcp_arg()) + * @param tpcb The connection pcb which received data + * @param p The received data (or NULL when the connection has been closed!) + * @param err An error code if there has been an error receiving + * Only return ERR_ABRT if you have called tcp_abort from within the + * callback function! + */ +typedef err_t (*tcp_recv_fn)(void *arg, struct tcp_pcb *tpcb, + struct pbuf *p, err_t err); + +/** Function prototype for tcp sent callback functions. Called when sent data has + * been acknowledged by the remote side. Use it to free corresponding resources. + * This also means that the pcb has now space available to send new data. + * + * @param arg Additional argument to pass to the callback function (@see tcp_arg()) + * @param tpcb The connection pcb for which data has been acknowledged + * @param len The amount of bytes acknowledged + * @return ERR_OK: try to send some data by calling tcp_output + * Only return ERR_ABRT if you have called tcp_abort from within the + * callback function! + */ +typedef err_t (*tcp_sent_fn)(void *arg, struct tcp_pcb *tpcb, + u16_t len); + +/** Function prototype for tcp poll callback functions. Called periodically as + * specified by @see tcp_poll. 
+ * + * @param arg Additional argument to pass to the callback function (@see tcp_arg()) + * @param tpcb tcp pcb + * @return ERR_OK: try to send some data by calling tcp_output + * Only return ERR_ABRT if you have called tcp_abort from within the + * callback function! + */ +typedef err_t (*tcp_poll_fn)(void *arg, struct tcp_pcb *tpcb); + +/** Function prototype for tcp error callback functions. Called when the pcb + * receives a RST or is unexpectedly closed for any other reason. + * + * @note The corresponding pcb is already freed when this callback is called! + * + * @param arg Additional argument to pass to the callback function (@see tcp_arg()) + * @param err Error code to indicate why the pcb has been closed + * ERR_ABRT: aborted through tcp_abort or by a TCP timer + * ERR_RST: the connection was reset by the remote host + */ +typedef void (*tcp_err_fn)(void *arg, err_t err); + +/** Function prototype for tcp connected callback functions. Called when a pcb + * is connected to the remote side after initiating a connection attempt by + * calling tcp_connect(). + * + * @param arg Additional argument to pass to the callback function (@see tcp_arg()) + * @param tpcb The connection pcb which is connected + * @param err An unused error code, always ERR_OK currently ;-) TODO! + * Only return ERR_ABRT if you have called tcp_abort from within the + * callback function! + * + * @note When a connection attempt fails, the error callback is currently called! 
+ */
+typedef err_t (*tcp_connected_fn)(void *arg, struct tcp_pcb *tpcb, err_t err);
+
+enum tcp_state {
+ CLOSED = 0,
+ LISTEN = 1,
+ SYN_SENT = 2,
+ SYN_RCVD = 3,
+ ESTABLISHED = 4,
+ FIN_WAIT_1 = 5,
+ FIN_WAIT_2 = 6,
+ CLOSE_WAIT = 7,
+ CLOSING = 8,
+ LAST_ACK = 9,
+ TIME_WAIT = 10
+};
+
+static const char * const tcp_state_str[] = {
+ "CLOSED",
+ "LISTEN",
+ "SYN_SENT",
+ "SYN_RCVD",
+ "ESTABLISHED",
+ "FIN_WAIT_1",
+ "FIN_WAIT_2",
+ "CLOSE_WAIT",
+ "CLOSING",
+ "LAST_ACK",
+ "TIME_WAIT"
+};
+
+#define PCB_IN_CLOSED_STATE(pcb) (get_tcp_state(pcb) == CLOSED)
+#define PCB_IN_LISTEN_STATE(pcb) (get_tcp_state(pcb) == LISTEN)
+#define PCB_IN_ACTIVE_STATE(pcb) (get_tcp_state(pcb) > LISTEN && get_tcp_state(pcb) < TIME_WAIT)
+#define PCB_IN_TIME_WAIT_STATE(pcb) (get_tcp_state(pcb) == TIME_WAIT)
+
+#if LWIP_CALLBACK_API
+ /* Function to call when a listener has been connected.
+ * @param arg user-supplied argument (tcp_pcb.callback_arg)
+ * @param pcb a new tcp_pcb that now is connected
+ * @param err an error argument (TODO: that is current always ERR_OK?)
+ * @return ERR_OK: accept the new connection,
+ * any other err_t aborts the new connection
+ */
+#define DEF_ACCEPT_CALLBACK tcp_accept_fn accept;
+#else /* LWIP_CALLBACK_API */
+#define DEF_ACCEPT_CALLBACK
+#endif /* LWIP_CALLBACK_API */
+
+
+/* allow user to be notified upon tcp_state changes */
+typedef void (*tcp_state_observer_fn)(void* pcb_container, enum tcp_state new_state);
+void register_tcp_state_observer(tcp_state_observer_fn fn);
+extern tcp_state_observer_fn external_tcp_state_observer;
+
+/**
+ * members common to struct tcp_pcb and struct tcp_listen_pcb
+ */
+#define TCP_PCB_COMMON(type) \
+ enum tcp_state private_state; /* TCP state - should only be touched thru get/set functions */ \
+ u8_t prio; \
+ void *callback_arg; \
+ void *my_container; \
+ /* Function to be called when sending data. 
*/ \ + ip_output_fn ip_output; \ + /* the accept callback for listen- and normal pcbs, if LWIP_CALLBACK_API */ \ + DEF_ACCEPT_CALLBACK \ + /* ports are in host byte order */ \ + u16_t local_port; \ + u32_t rcv_wnd; /* receiver window available */ \ + u32_t rcv_ann_wnd; /* receiver window to announce */ \ + u32_t rcv_wnd_max; /* maximum available receive window */ \ + u32_t rcv_wnd_max_desired; + +#define RCV_WND_SCALE(pcb, wnd) (((wnd) >> (pcb)->rcv_scale)) +#define SND_WND_SCALE(pcb, wnd) ((u32_t)(wnd) << (pcb)->snd_scale) + +#define TCPWND_MIN16(x) ((u16_t)LWIP_MIN((x), 0xFFFF)) +#define VMA_NO_TCP_PCB_LISTEN_STRUCT 1 +/* Note: max_tcp_snd_queuelen is now a multiple by 16 (was 4 before) to match max_unsent_len */ +#define UPDATE_PCB_BY_MSS(pcb, snd_mss) \ + (pcb)->mss = (snd_mss); \ + (pcb)->max_tcp_snd_queuelen = (16*((pcb)->max_snd_buff)/((pcb)->mss)) ; \ + (pcb)->max_unsent_len = (16*((pcb)->max_snd_buff)/((pcb)->mss)); \ + (pcb)->tcp_oversize_val = (pcb)->mss; + +/* the TCP protocol control block */ +struct tcp_pcb { +/** common PCB members */ + IP_PCB; +/** protocol specific PCB members */ + TCP_PCB_COMMON(struct tcp_pcb); + + /* ports are in host byte order */ + u16_t remote_port; + + u16_t flags; +#define TF_ACK_DELAY ((u16_t)0x0001U) /* Delayed ACK. */ +#define TF_ACK_NOW ((u16_t)0x0002U) /* Immediate ACK. */ +#define TF_INFR ((u16_t)0x0004U) /* In fast recovery. */ +#define TF_TIMESTAMP ((u16_t)0x0008U) /* Timestamp option enabled */ +#define TF_RXCLOSED ((u16_t)0x0010U) /* rx closed by tcp_shutdown */ +#define TF_FIN ((u16_t)0x0020U) /* Connection was closed locally (FIN segment enqueued). 
*/ +#define TF_NODELAY ((u16_t)0x0040U) /* Disable Nagle algorithm */ +#define TF_NAGLEMEMERR ((u16_t)0x0080U) /* nagle enabled, memerr, try to output to prevent delayed ACK to happen */ +#define TF_WND_SCALE ((u16_t)0x0100U) /* Window Scale option enabled */ + + /* the rest of the fields are in host byte order + as we have to do some math with them */ + /* receiver variables */ + u32_t rcv_nxt; /* next seqno expected */ + u32_t rcv_ann_right_edge; /* announced right edge of window */ + + /* Timers */ + u8_t tcp_timer; /* Timer counter to handle calling slow-timer from tcp_tmr() */ + u32_t tmr; + u8_t polltmr, pollinterval; + + /* Retransmission timer. */ + s16_t rtime; + + u16_t mss; /* maximum segment size */ + u16_t advtsd_mss; /* advertised maximum segment size */ + + /* RTT (round trip time) estimation variables */ + u32_t rttest; /* RTT estimate in 10ms ticks */ + u32_t rtseq; /* sequence number being timed */ +#if TCP_CC_ALGO_MOD + u32_t t_rttupdated; /* number of RTT estimations taken so far */ +#endif + s16_t sa, sv; /* @todo document this */ + + s16_t rto; /* retransmission time-out */ + u8_t nrtx; /* number of retransmissions */ + + /* fast retransmit/recovery */ + u32_t lastack; /* Highest acknowledged seqno. */ + u8_t dupacks; + + /* congestion avoidance/control variables */ +#if TCP_CC_ALGO_MOD + struct cc_algo* cc_algo; + void* cc_data; +#endif + u32_t cwnd; + u32_t ssthresh; + + /* sender variables */ + u32_t snd_nxt; /* next new seqno to be sent */ + u32_t snd_wnd; /* sender window */ + u32_t snd_wnd_max; /* the maximum sender window announced by the remote host */ + u32_t snd_wl1, snd_wl2; /* Sequence and acknowledgement numbers of last + window update. */ + u32_t snd_lbb; /* Sequence number of next byte to be buffered. */ + + u32_t acked; + + u32_t snd_buf; /* Available buffer space for sending (in bytes). 
*/ + u32_t max_snd_buff; + + u32_t snd_sml_snt; /* maintain state for minshall's algorithm */ + u32_t snd_sml_add; /* maintain state for minshall's algorithm */ + +#define TCP_SNDQUEUELEN_OVERFLOW (0xffffffU-3) + u32_t snd_queuelen; /* Available buffer space for sending (in tcp_segs). */ + u32_t max_tcp_snd_queuelen; + +#if TCP_OVERSIZE + /* Extra bytes available at the end of the last pbuf in unsent. */ + u16_t unsent_oversize; + u16_t tcp_oversize_val; +#endif /* TCP_OVERSIZE */ + u16_t max_unsent_len; + /* These are ordered by sequence number: */ + struct tcp_seg *unsent; /* Unsent (queued) segments. */ + struct tcp_seg *last_unsent; /* Last unsent (queued) segment. */ + struct tcp_seg *unacked; /* Sent but unacknowledged segments. */ + struct tcp_seg *last_unacked; /* Last element in unacknowledged segments list. */ +#if TCP_QUEUE_OOSEQ + struct tcp_seg *ooseq; /* Received out of sequence segments. */ +#endif /* TCP_QUEUE_OOSEQ */ + + struct pbuf *refused_data; /* Data previously received but not yet taken by upper layer */ + struct tcp_seg *seg_alloc; /* Available tcp_seg element for use */ + struct pbuf *pbuf_alloc; /* Available pbuf element for use */ + +#if LWIP_CALLBACK_API + /* Function to be called when more send buffer space is available. */ + tcp_sent_fn sent; + /* Function to be called when (in-sequence) data has arrived. */ + tcp_recv_fn recv; + /* Function to be called when a connection has been set up. */ + tcp_connected_fn connected; + /* Function which is called periodically. */ + tcp_poll_fn poll; + /* Function to be called whenever a fatal error occurs. 
*/ + tcp_err_fn errf; +#endif /* LWIP_CALLBACK_API */ + + u8_t enable_ts_opt; +#if LWIP_TCP_TIMESTAMPS + u32_t ts_lastacksent; + u32_t ts_recent; +#endif /* LWIP_TCP_TIMESTAMPS */ + + /* idle time before KEEPALIVE is sent */ + u32_t keep_idle; +#if LWIP_TCP_KEEPALIVE + u32_t keep_intvl; + u32_t keep_cnt; +#endif /* LWIP_TCP_KEEPALIVE */ + + /* Persist timer counter */ + u32_t persist_cnt; + /* Persist timer back-off */ + u8_t persist_backoff; + + /* KEEPALIVE counter */ + u8_t keep_cnt_sent; + + u8_t snd_scale; + u8_t rcv_scale; +#ifdef VMA_NO_TCP_PCB_LISTEN_STRUCT + tcp_syn_handled_fn syn_handled_cb; + tcp_clone_conn_fn clone_conn; + +#endif /* VMA_NO_TCP_PCB_LISTEN_STRUCT */ + + /* Delayed ACK control: number of quick acks */ + u8_t quickack; + +#if LWIP_TSO + /* TSO description */ + struct { + /* Maximum length of memory buffer */ + u32_t max_buf_sz; + + /* Maximum length of TCP payload for TSO */ + u32_t max_payload_sz; + + /* Maximum length of header for TSO */ + u16_t max_header_sz; + + /* Maximum number of SGE */ + u32_t max_send_sge; + } tso; +#endif /* LWIP_TSO */ +}; + +typedef u16_t (*ip_route_mtu_fn)(struct tcp_pcb *pcb); +void register_ip_route_mtu(ip_route_mtu_fn fn); + +#ifdef VMA_NO_TCP_PCB_LISTEN_STRUCT +#define tcp_pcb_listen tcp_pcb +#else +struct tcp_pcb_listen { +/* Common members of all PCB types */ + IP_PCB; +/* Protocol specific PCB members */ + TCP_PCB_COMMON(struct tcp_pcb_listen); + tcp_syn_handled_fn syn_handled_cb; + tcp_clone_conn_fn clone_conn; +}; +#endif /* VMA_NO_TCP_PCB_LISTEN_STRUCT */ + +#if LWIP_EVENT_API + +enum lwip_event { + LWIP_EVENT_ACCEPT, + LWIP_EVENT_SENT, + LWIP_EVENT_RECV, + LWIP_EVENT_CONNECTED, + LWIP_EVENT_POLL, + LWIP_EVENT_ERR +}; + +err_t lwip_tcp_event(void *arg, struct tcp_pcb *pcb, + enum lwip_event, + struct pbuf *p, + u16_t size, + err_t err); + +#endif /* LWIP_EVENT_API */ + +#if defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4)) +#pragma GCC visibility push(hidden) 
+#endif + + +/*Initialization of tcp_pcb structure*/ +void tcp_pcb_init (struct tcp_pcb* pcb, u8_t prio); + +void tcp_arg (struct tcp_pcb *pcb, void *arg); +void tcp_ip_output (struct tcp_pcb *pcb, ip_output_fn ip_output); +void tcp_accept (struct tcp_pcb *pcb, tcp_accept_fn accept); +void tcp_syn_handled (struct tcp_pcb_listen *pcb, tcp_syn_handled_fn syn_handled); +void tcp_clone_conn (struct tcp_pcb_listen *pcb, tcp_clone_conn_fn clone_conn); +void tcp_recv (struct tcp_pcb *pcb, tcp_recv_fn recv); +void tcp_sent (struct tcp_pcb *pcb, tcp_sent_fn sent); +void tcp_poll (struct tcp_pcb *pcb, tcp_poll_fn poll, u8_t interval); +void tcp_err (struct tcp_pcb *pcb, tcp_err_fn err); + +#define tcp_mss(pcb) (((pcb)->flags & TF_TIMESTAMP) ? ((pcb)->mss - 12) : (pcb)->mss) +#define tcp_sndbuf(pcb) ((pcb)->snd_buf) +#define tcp_sndqueuelen(pcb) ((pcb)->snd_queuelen) +#define tcp_nagle_disable(pcb) ((pcb)->flags |= TF_NODELAY) +#define tcp_nagle_enable(pcb) ((pcb)->flags &= ~TF_NODELAY) +#define tcp_nagle_disabled(pcb) (((pcb)->flags & TF_NODELAY) != 0) + +#if LWIP_TSO +#define tcp_tso(pcb) ((pcb)->tso.max_payload_sz) +#else +#define tcp_tso(pcb) (0) +#endif /* LWIP_TSO */ + +#define tcp_accepted(pcb) LWIP_ASSERT("get_tcp_state(pcb) == LISTEN (called for wrong pcb?)", \ + get_tcp_state(pcb) == LISTEN) + +void tcp_recved (struct tcp_pcb *pcb, u32_t len); +err_t tcp_bind (struct tcp_pcb *pcb, ip_addr_t *ipaddr, + u16_t port); +err_t tcp_connect (struct tcp_pcb *pcb, ip_addr_t *ipaddr, + u16_t port, tcp_connected_fn connected); + +err_t tcp_listen(struct tcp_pcb_listen *listen_pcb, struct tcp_pcb *conn_pcb); + +void tcp_abort (struct tcp_pcb *pcb); +err_t tcp_close (struct tcp_pcb *pcb); +err_t tcp_shutdown(struct tcp_pcb *pcb, int shut_rx, int shut_tx); + +/* Flags for "apiflags" parameter in tcp_write */ +#define TCP_WRITE_FLAG_COPY 0x01 +#define TCP_WRITE_FLAG_MORE 0x02 +#define TCP_WRITE_REXMIT 0x08 +#define TCP_WRITE_DUMMY 0x10 +#define TCP_WRITE_TSO 0x20 +#define 
TCP_WRITE_FILE 0x40 + +err_t tcp_write (struct tcp_pcb *pcb, const void *dataptr, u32_t len, + u8_t apiflags); + +#define TCP_PRIO_MIN 1 +#define TCP_PRIO_NORMAL 64 +#define TCP_PRIO_MAX 127 + +err_t tcp_output (struct tcp_pcb *pcb); + +s32_t tcp_is_wnd_available(struct tcp_pcb *pcb, u32_t data_len); + +#if defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4)) +#pragma GCC visibility pop +#endif + +#define get_tcp_state(pcb) ((pcb)->private_state) +#define set_tcp_state(pcb, state) external_tcp_state_observer((pcb)->my_container, (pcb)->private_state = state) + +#ifdef __cplusplus +} +#endif + +#endif /* LWIP_TCP */ + +#endif /* __LWIP_TCP_H__ */ diff --git a/src/vma/lwip/tcp_impl.h b/src/vma/lwip/tcp_impl.h new file mode 100644 index 0000000..2419178 --- /dev/null +++ b/src/vma/lwip/tcp_impl.h @@ -0,0 +1,497 @@ +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ +#ifndef __LWIP_TCP_IMPL_H__ +#define __LWIP_TCP_IMPL_H__ + +#include "vma/lwip/opt.h" + +#if LWIP_TCP /* don't build if not configured for use in lwipopts.h */ + +#include "vma/lwip/tcp.h" + +#ifdef __cplusplus +extern "C" { +#endif + +#define tcp_init() /* Compatibility define, no init needed. */ + +/* Functions for interfacing with TCP: */ +#if defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4)) +#pragma GCC visibility push(hidden) +#endif + +void tcp_tmr (struct tcp_pcb* pcb); /* Must be called every (slow_tmr_interval / 2) ms. */ +/* It is also possible to call these two functions at the right + intervals (instead of calling tcp_tmr()). 
*/ +void tcp_slowtmr (struct tcp_pcb* pcb); +void tcp_fasttmr (struct tcp_pcb* pcb); + +#if LWIP_3RD_PARTY_L3 +void L3_level_tcp_input (struct pbuf *p, struct tcp_pcb *pcb); +#endif +/* Used within the TCP code only: */ +struct tcp_pcb * tcp_alloc (u8_t prio); +struct pbuf * tcp_tx_pbuf_alloc(struct tcp_pcb * pcb, u16_t length, pbuf_type type); +void tcp_tx_preallocted_buffers_free(struct tcp_pcb * pcb); +void tcp_tx_pbuf_free(struct tcp_pcb * pcb, struct pbuf * pbuf); +void tcp_abandon (struct tcp_pcb *pcb, int reset); +err_t tcp_send_empty_ack(struct tcp_pcb *pcb); +void tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd); +void tcp_rexmit (struct tcp_pcb *pcb); +void tcp_rexmit_rto (struct tcp_pcb *pcb); +void tcp_rexmit_fast (struct tcp_pcb *pcb); +u32_t tcp_update_rcv_ann_wnd(struct tcp_pcb *pcb); +void set_tmr_resolution(u32_t v); + +#if defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4)) +#pragma GCC visibility pop +#endif + +/** + * This is the Nagle algorithm: try to combine user data to send as few TCP + * segments as possible. Only send if + * - no previously transmitted data on the connection remains unacknowledged or + * - the TF_NODELAY flag is set (nagle algorithm turned off for this pcb) or + * - the only unsent segment is at least pcb->mss bytes long (or there is more + * than one unsent segment - with lwIP, this can happen although unsent->len < mss) + * - or if we are in fast-retransmit (TF_INFR) + */ +#define tcp_do_output_nagle(tpcb) ((((tpcb)->unacked == NULL) || \ + ((tpcb)->flags & (TF_NODELAY | TF_INFR)) || \ + (((tpcb)->unsent != NULL) && (((tpcb)->unsent->next != NULL) || \ + ((tpcb)->unsent->len >= (tpcb)->mss))) || \ + ((tcp_sndbuf(tpcb) == 0) || (tcp_sndqueuelen(tpcb) >= (tpcb)->max_tcp_snd_queuelen)) \ + ) ? 1 : 0) +#define tcp_output_nagle(tpcb) (tcp_do_output_nagle(tpcb) ? 
tcp_output(tpcb) : ERR_OK) + + +#define TCP_SEQ_LT(a,b) ((s32_t)((u32_t)(a)-(u32_t)(b)) < 0) +#define TCP_SEQ_LEQ(a,b) ((s32_t)((u32_t)(a)-(u32_t)(b)) <= 0) +#define TCP_SEQ_GT(a,b) ((s32_t)((u32_t)(a)-(u32_t)(b)) > 0) +#define TCP_SEQ_GEQ(a,b) ((s32_t)((u32_t)(a)-(u32_t)(b)) >= 0) +/* is b<=a<=c? */ +#if 0 /* see bug #10548 */ +#define TCP_SEQ_BETWEEN(a,b,c) ((c)-(b) >= (a)-(b)) +#endif +#define TCP_SEQ_BETWEEN(a,b,c) (TCP_SEQ_GEQ(a,b) && TCP_SEQ_LEQ(a,c)) +#define TCP_FIN 0x01U +#define TCP_SYN 0x02U +#define TCP_RST 0x04U +#define TCP_PSH 0x08U +#define TCP_ACK 0x10U +#define TCP_URG 0x20U +#define TCP_ECE 0x40U +#define TCP_CWR 0x80U + +#define TCP_FLAGS 0x3fU + +/* Length of the TCP header, excluding options. */ +#define TCP_HLEN 20 + +#define TCP_FIN_WAIT_TIMEOUT 20000 /* milliseconds */ +#define TCP_SYN_RCVD_TIMEOUT 20000 /* milliseconds */ + +#define TCP_OOSEQ_TIMEOUT 6U /* x RTO */ + +#ifndef TCP_MSL +#define TCP_MSL 60000UL /* The maximum segment lifetime in milliseconds */ +#endif + +/* Keepalive values, compliant with RFC 1122. Don't change this unless you know what you're doing */ +#ifndef TCP_KEEPIDLE_DEFAULT +#define TCP_KEEPIDLE_DEFAULT 7200000UL /* Default KEEPALIVE timer in milliseconds */ +#endif + +#ifndef TCP_KEEPINTVL_DEFAULT +#define TCP_KEEPINTVL_DEFAULT 75000UL /* Default Time between KEEPALIVE probes in milliseconds */ +#endif + +#ifndef TCP_KEEPCNT_DEFAULT +#define TCP_KEEPCNT_DEFAULT 9U /* Default Counter for KEEPALIVE probes */ +#endif + +#define TCP_MAXIDLE TCP_KEEPCNT_DEFAULT * TCP_KEEPINTVL_DEFAULT /* Maximum KEEPALIVE probe time */ + +/* Fields are (of course) in network byte order. + * Some fields are converted to host byte order in tcp_input(). 
+ */
+PACK_STRUCT_BEGIN
+struct tcp_hdr {
+ PACK_STRUCT_FIELD(u16_t src);
+ PACK_STRUCT_FIELD(u16_t dest);
+ PACK_STRUCT_FIELD(u32_t seqno);
+ PACK_STRUCT_FIELD(u32_t ackno);
+ PACK_STRUCT_FIELD(u16_t _hdrlen_rsvd_flags);
+ PACK_STRUCT_FIELD(u16_t wnd);
+ PACK_STRUCT_FIELD(u16_t chksum);
+ PACK_STRUCT_FIELD(u16_t urgp);
+} PACK_STRUCT_STRUCT;
+PACK_STRUCT_END
+
+#define TCPH_OFFSET(phdr) (ntohs((phdr)->_hdrlen_rsvd_flags) >> 8)
+#define TCPH_HDRLEN(phdr) (ntohs((phdr)->_hdrlen_rsvd_flags) >> 12)
+#define TCPH_FLAGS(phdr) (ntohs((phdr)->_hdrlen_rsvd_flags) & TCP_FLAGS)
+
+#define TCPH_OFFSET_SET(phdr, offset) (phdr)->_hdrlen_rsvd_flags = htons(((offset) << 8) | TCPH_FLAGS(phdr))
+#define TCPH_HDRLEN_SET(phdr, len) (phdr)->_hdrlen_rsvd_flags = htons(((len) << 12) | TCPH_FLAGS(phdr))
+#define TCPH_FLAGS_SET(phdr, flags) (phdr)->_hdrlen_rsvd_flags = (((phdr)->_hdrlen_rsvd_flags & PP_HTONS((u16_t)(~(u16_t)(TCP_FLAGS)))) | htons(flags))
+#define TCPH_HDRLEN_FLAGS_SET(phdr, len, flags) (phdr)->_hdrlen_rsvd_flags = htons(((len) << 12) | (flags))
+
+#define TCPH_SET_FLAG(phdr, flags ) (phdr)->_hdrlen_rsvd_flags = ((phdr)->_hdrlen_rsvd_flags | htons(flags))
+#define TCPH_UNSET_FLAG(phdr, flags) (phdr)->_hdrlen_rsvd_flags = htons(ntohs((phdr)->_hdrlen_rsvd_flags) | (TCPH_FLAGS(phdr) & ~(flags)) )
+
+#define TCP_TCPLEN(seg) ((seg)->len + (((TCPH_FLAGS((seg)->tcphdr) & (TCP_FIN | TCP_SYN)) != 0) ? 1U : 0U))
+
+/** Flags used on input processing, not on pcb->flags
+*/
+#define TF_RESET (u8_t)0x08U /* Connection was reset. */
+#define TF_CLOSED (u8_t)0x10U /* Connection was successfully closed. */
+#define TF_GOT_FIN (u8_t)0x20U /* Connection was closed by the remote end. 
*/ + + +#if LWIP_EVENT_API + +#define TCP_EVENT_ACCEPT(pcb,err,ret) ret = lwip_tcp_event((pcb)->callback_arg, (pcb),\ + LWIP_EVENT_ACCEPT, NULL, 0, err) +#define TCP_EVENT_SENT(pcb,space,ret) ret = lwip_tcp_event((pcb)->callback_arg, (pcb),\ + LWIP_EVENT_SENT, NULL, space, ERR_OK) +#define TCP_EVENT_RECV(pcb,p,err,ret) ret = lwip_tcp_event((pcb)->callback_arg, (pcb),\ + LWIP_EVENT_RECV, (p), 0, (err)) +#define TCP_EVENT_CLOSED(pcb,ret) ret = lwip_tcp_event((pcb)->callback_arg, (pcb),\ + LWIP_EVENT_RECV, NULL, 0, ERR_OK) +#define TCP_EVENT_CONNECTED(pcb,err,ret) ret = lwip_tcp_event((pcb)->callback_arg, (pcb),\ + LWIP_EVENT_CONNECTED, NULL, 0, (err)) +#define TCP_EVENT_POLL(pcb,ret) ret = lwip_tcp_event((pcb)->callback_arg, (pcb),\ + LWIP_EVENT_POLL, NULL, 0, ERR_OK) +#define TCP_EVENT_ERR(errf,arg,err) lwip_tcp_event((arg), NULL, \ + LWIP_EVENT_ERR, NULL, 0, (err)) + +#else /* LWIP_EVENT_API */ + +#define TCP_EVENT_ACCEPT(pcb,err,ret) \ + do { \ + if((pcb)->accept != NULL) \ + (ret) = (pcb)->accept((pcb)->callback_arg,(pcb),(err)); \ + else (ret) = ERR_ARG; \ + } while (0) + +#define TCP_EVENT_SYN_RECEIVED(pcb,p_npcb,err,ret) \ + do { \ + if((pcb)->syn_handled_cb != NULL) \ + (ret) = (pcb)->syn_handled_cb((pcb)->callback_arg,(p_npcb),(err)); \ + else (ret) = ERR_ARG; \ + } while (0) + +#define TCP_EVENT_CLONE_PCB(pcb,p_npcb,err,ret) \ + do { \ + if((pcb)->clone_conn != NULL) \ + (ret) = (pcb)->clone_conn((pcb)->callback_arg,(p_npcb),(err)); \ + else (ret) = ERR_ARG; \ + } while (0) + +#define TCP_EVENT_SENT(pcb,space,ret) \ + do { \ + if((pcb)->sent != NULL) \ + (ret) = (pcb)->sent((pcb)->callback_arg,(pcb),(space)); \ + else (ret) = ERR_OK; \ + } while (0) + +#define TCP_EVENT_RECV(pcb,p,err,ret) \ + do { \ + if((pcb)->recv != NULL) { \ + (ret) = (pcb)->recv((pcb)->callback_arg,(pcb),(p),(err));\ + } else { \ + (ret) = tcp_recv_null(NULL, (pcb), (p), (err)); \ + } \ + } while (0) + +#define TCP_EVENT_CLOSED(pcb,ret) \ + do { \ + if(((pcb)->recv != NULL)) { \ + 
(ret) = (pcb)->recv((pcb)->callback_arg,(pcb),NULL,ERR_OK);\
+ } else { \
+ (ret) = ERR_OK; \
+ } \
+ } while (0)
+
+#define TCP_EVENT_CONNECTED(pcb,err,ret) \
+ do { \
+ if((pcb)->connected != NULL) \
+ (ret) = (pcb)->connected((pcb)->callback_arg,(pcb),(err)); \
+ else (ret) = ERR_OK; \
+ } while (0)
+
+#define TCP_EVENT_POLL(pcb,ret) \
+ do { \
+ if((pcb)->poll != NULL) \
+ (ret) = (pcb)->poll((pcb)->callback_arg,(pcb)); \
+ else (ret) = ERR_OK; \
+ } while (0)
+
+#define TCP_EVENT_ERR(errf,arg,err) \
+ do { \
+ if((errf) != NULL) \
+ (errf)((arg),(err)); \
+ } while (0)
+
+#endif /* LWIP_EVENT_API */
+
+/** Enabled extra-check for TCP_OVERSIZE if LWIP_DEBUG is enabled */
+#if TCP_OVERSIZE && defined(LWIP_DEBUG)
+#define TCP_OVERSIZE_DBGCHECK 1
+#else
+#define TCP_OVERSIZE_DBGCHECK 0
+#endif
+
+/** Don't generate checksum on copy if CHECKSUM_GEN_TCP is disabled */
+#define TCP_CHECKSUM_ON_COPY (LWIP_CHECKSUM_ON_COPY && CHECKSUM_GEN_TCP)
+
+/* This structure represents a TCP segment on the unsent, unacked and ooseq queues */
+struct tcp_seg {
+ struct tcp_seg *next; /* used when putting segments on a queue */
+ struct pbuf *p; /* buffer containing data + TCP header */
+#if LWIP_TSO
+ u32_t seqno;
+ u32_t len; /* the TCP length of this segment should allow >64K size */
+#else
+ void *dataptr; /* pointer to the TCP data in the pbuf */
+ u32_t seqno;
+ u16_t len; /* the TCP length of this segment should allow >64K size */
+#endif /* LWIP_TSO */
+
+#if TCP_OVERSIZE_DBGCHECK
+ u16_t oversize_left; /* Extra bytes available at the end of the last
+ pbuf in unsent (used for asserting vs.
+ tcp_pcb.unsent_oversized only) */
+#endif /* TCP_OVERSIZE_DBGCHECK */
+#if TCP_CHECKSUM_ON_COPY
+ u16_t chksum;
+ u8_t chksum_swapped;
+#endif /* TCP_CHECKSUM_ON_COPY */
+ u8_t flags;
+#define TF_SEG_OPTS_MSS (u8_t)0x01U /* Include MSS option. */
+#define TF_SEG_OPTS_TS (u8_t)0x02U /* Include timestamp option. 
*/ +#define TF_SEG_DATA_CHECKSUMMED (u8_t)0x04U /* ALL data (not the header) is + checksummed into 'chksum' */ +#define TF_SEG_OPTS_WNDSCALE (u8_t)0x08U /* Include window scaling option */ +#define TF_SEG_OPTS_DUMMY_MSG (u8_t)TCP_WRITE_DUMMY /* Include dummy send option */ +#define TF_SEG_OPTS_TSO (u8_t)TCP_WRITE_TSO /* Use TSO send mode */ + + struct tcp_hdr *tcphdr; /* the TCP header */ +}; + +#define LWIP_IS_DUMMY_SEGMENT(seg) (seg->flags & TF_SEG_OPTS_DUMMY_MSG) + +#if LWIP_TCP_TIMESTAMPS +#define LWIP_TCP_OPT_LEN_TS 10 +#endif + +/* This macro calculates total length of tcp additional options + * basing on option flags + */ +#define LWIP_TCP_OPT_LENGTH(flags) \ + (flags & TF_SEG_OPTS_MSS ? 4 : 0) + \ + (flags & TF_SEG_OPTS_WNDSCALE ? 1+3 : 0) + \ + (flags & TF_SEG_OPTS_TS ? 12 : 0) + +/* This macro calculates total length of tcp header including + * additional options + */ +#define LWIP_TCP_HDRLEN(_tcphdr) (TCPH_HDRLEN(((struct tcp_hdr *)(_tcphdr))) * 4) + +/** This returns a TCP header option for MSS in an u32_t */ +#define TCP_BUILD_MSS_OPTION(x, mss) (x) = PP_HTONL(((u32_t)2 << 24) | \ + ((u32_t)4 << 16) | \ + (((u32_t)mss / 256) << 8) | \ + (mss & 255)) + +/** This returns a TCP header option for WINDOW SCALING in an u32_t - NOTE: the 1 at MSB serves as NOOP */ +#define TCP_BUILD_WNDSCALE_OPTION(x, scale) (x) = PP_HTONL( ( ((u32_t)1 << 24) | \ + ((u32_t)3 << 16) | \ + ((u32_t)3 << 8) ) | \ + ((u32_t)scale )) + +/* Global variables: */ +extern struct tcp_pcb *tcp_input_pcb; +extern int32_t enable_wnd_scale; +extern u32_t rcv_wnd_scale; +extern u8_t enable_ts_option; +extern u32_t tcp_ticks; +extern ip_route_mtu_fn external_ip_route_mtu; + + +extern struct tcp_pcb *tcp_tmp_pcb; /* Only used for temporary storage. */ + +/* Define two macros, TCP_REG and TCP_RMV that registers a TCP PCB + with a PCB list or removes a PCB from a list, respectively. 
*/ +#ifndef TCP_DEBUG_PCB_LISTS +#define TCP_DEBUG_PCB_LISTS 0 +#endif +#if TCP_DEBUG_PCB_LISTS +#define TCP_REG(pcbs, npcb) do {\ + LWIP_DEBUGF(TCP_DEBUG, ("TCP_REG %p local port %d\n", (npcb), (npcb)->local_port)); \ + LWIP_PLATFORM_DIAG(("%s:%d TCP_REG %p local port %d\n", __FUNCTION__, __LINE__, (npcb), (npcb)->local_port)); \ + for(tcp_tmp_pcb = *(pcbs); \ + tcp_tmp_pcb != NULL; \ + tcp_tmp_pcb = tcp_tmp_pcb->next) { \ + LWIP_ASSERT("TCP_REG: already registered\n", tcp_tmp_pcb != (npcb)); \ + } \ + LWIP_ASSERT("TCP_REG: get_tcp_state(pcb) != CLOSED", ((npcb)->state != CLOSED)); \ + (npcb)->next = *(pcbs); \ + LWIP_ASSERT("TCP_REG: npcb->next != npcb", (npcb)->next != (npcb)); \ + *(pcbs) = (npcb); \ + LWIP_ASSERT("TCP_RMV: tcp_pcbs sane", tcp_pcbs_sane()); \ + tcp_timer_needed(); \ + } while(0) +#define TCP_RMV(pcbs, npcb) do { \ + LWIP_DEBUGF(TCP_DEBUG, ("TCP_RMV: removing %p from %p\n", (npcb), *(pcbs))); \ + LWIP_PLATFORM_DIAG(("%s:%d TCP_RMV: removing %p from %p\n", __FUNCTION__, __LINE__, (npcb), *(pcbs))); \ + if(*(pcbs) == (npcb)) { \ + *(pcbs) = (*pcbs)->next; \ + } else for(tcp_tmp_pcb = *(pcbs); tcp_tmp_pcb != NULL; tcp_tmp_pcb = tcp_tmp_pcb->next) { \ + if(tcp_tmp_pcb->next == (npcb)) { \ + tcp_tmp_pcb->next = (npcb)->next; \ + break; \ + } \ + } \ + (npcb)->next = NULL; \ + LWIP_ASSERT("TCP_RMV: tcp_pcbs sane", tcp_pcbs_sane()); \ + LWIP_DEBUGF(TCP_DEBUG, ("TCP_RMV: removed %p from %p\n", (npcb), *(pcbs))); \ + } while(0) + +#else /* LWIP_DEBUG */ + +#define TCP_REG(pcbs, npcb) \ + do { \ + (npcb)->next = *pcbs; \ + *(pcbs) = (npcb); \ + } while (0) + +#define TCP_RMV(pcbs, npcb) \ + do { \ + if(*(pcbs) == (npcb)) { \ + (*(pcbs)) = (*pcbs)->next; \ + } \ + else { \ + for(tcp_tmp_pcb = *pcbs; \ + tcp_tmp_pcb != NULL; \ + tcp_tmp_pcb = tcp_tmp_pcb->next) { \ + if(tcp_tmp_pcb->next == (npcb)) { \ + tcp_tmp_pcb->next = (npcb)->next; \ + break; \ + } \ + } \ + } \ + (npcb)->next = NULL; \ + } while(0) + +#endif /* LWIP_DEBUG */ + +#if defined(__GNUC__) 
&& (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4)) +#pragma GCC visibility push(hidden) +#endif +/* Internal functions: */ +struct tcp_pcb *tcp_pcb_copy(struct tcp_pcb *pcb); +void tcp_pcb_purge(struct tcp_pcb *pcb); +void tcp_pcb_remove(struct tcp_pcb *pcb); + +void tcp_segs_free(struct tcp_pcb *pcb, struct tcp_seg *seg); +void tcp_seg_free(struct tcp_pcb *pcb, struct tcp_seg *seg); +void tcp_tx_segs_free(struct tcp_pcb * pcb, struct tcp_seg *seg); +void tcp_tx_seg_free(struct tcp_pcb * pcb, struct tcp_seg *seg); +struct tcp_seg *tcp_seg_copy(struct tcp_pcb* pcb, struct tcp_seg *seg); + +#define tcp_ack(pcb) \ + do { \ + if((pcb)->flags & TF_ACK_DELAY) { \ + (pcb)->flags &= ~TF_ACK_DELAY; \ + (pcb)->flags |= TF_ACK_NOW; \ + } \ + else { \ + (pcb)->flags |= TF_ACK_DELAY; \ + } \ + } while (0) + +#define tcp_ack_now(pcb) \ + do { \ + (pcb)->flags |= TF_ACK_NOW; \ + } while (0) + +err_t tcp_send_fin(struct tcp_pcb *pcb); +err_t tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags); + +void tcp_rst(u32_t seqno, u32_t ackno, + u16_t local_port, u16_t remote_port, struct tcp_pcb *pcb); + +u32_t tcp_next_iss(void); + +void tcp_keepalive(struct tcp_pcb *pcb); +void tcp_zero_window_probe(struct tcp_pcb *pcb); + +#if TCP_CALCULATE_EFF_SEND_MSS +u16_t tcp_eff_send_mss(u16_t sendmss, struct tcp_pcb *pcb); +#endif /* TCP_CALCULATE_EFF_SEND_MSS */ +u16_t tcp_mss_follow_mtu_with_default(u16_t sendmss, struct tcp_pcb *pcb); + +#if LWIP_CALLBACK_API +err_t tcp_recv_null(void *arg, struct tcp_pcb *pcb, struct pbuf *p, err_t err); +#endif /* LWIP_CALLBACK_API */ + +#if TCP_DEBUG || TCP_INPUT_DEBUG || TCP_OUTPUT_DEBUG +void tcp_debug_print(struct tcp_hdr *tcphdr); +void tcp_debug_print_flags(u8_t flags); +void tcp_debug_print_state(enum tcp_state s); +void tcp_debug_print_pcbs(void); +s16_t tcp_pcbs_sane(void); +#else +#define tcp_debug_print(tcphdr) +#define tcp_debug_print_flags(flags) +#define tcp_debug_print_state(s) +#define tcp_debug_print_pcbs() +#define 
tcp_pcbs_sane() 1 +#endif /* TCP_DEBUG */ + +/** External function (implemented in timers.c), called when TCP detects + * that a timer is needed (i.e. active- or time-wait-pcb found). */ +void tcp_timer_needed(void); + +#if defined(__GNUC__) && (((__GNUC__ == 4) && (__GNUC_MINOR__ >= 4)) || (__GNUC__ > 4)) +#pragma GCC visibility pop +#endif + +#ifdef __cplusplus +} +#endif + +#endif /* LWIP_TCP */ + +#endif /* __LWIP_TCP_H__ */ diff --git a/src/vma/lwip/tcp_in.c b/src/vma/lwip/tcp_in.c new file mode 100644 index 0000000..4a93c77 --- /dev/null +++ b/src/vma/lwip/tcp_in.c @@ -0,0 +1,1701 @@ +/** + * @file + * Transmission Control Protocol, incoming traffic + * + * The input processing functions of the TCP layer. + * + * These functions are generally called in the order (ip_input() ->) + * tcp_input() -> * tcp_process() -> tcp_receive() (-> application). + * + */ + +/* + * Copyright (c) 2001-2004 Swedish Institute of Computer Science. + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without modification, + * are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, + * this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ + +#include "vma/lwip/opt.h" + +#if LWIP_TCP /* don't build if not configured for use in lwipopts.h */ + +#include "vma/lwip/tcp_impl.h" +#include "vma/lwip/stats.h" + +#include + +typedef struct tcp_in_data { + struct pbuf *recv_data; + struct tcp_hdr *tcphdr; + struct ip_hdr *iphdr; + u32_t seqno; + u32_t ackno; + struct tcp_seg inseg; + u16_t tcplen; + u8_t flags; + u8_t recv_flags; +} tcp_in_data; + +struct tcp_pcb *tcp_input_pcb; + +/* Forward declarations. */ +static err_t tcp_process(struct tcp_pcb *pcb, tcp_in_data* in_data); +static void tcp_receive(struct tcp_pcb *pcb, tcp_in_data* in_data); +static void tcp_parseopt(struct tcp_pcb *pcb, tcp_in_data* in_data); + +static err_t tcp_listen_input(struct tcp_pcb_listen *pcb, tcp_in_data* in_data); +static err_t tcp_timewait_input(struct tcp_pcb *pcb, tcp_in_data* in_data); +static s8_t tcp_quickack(struct tcp_pcb *pcb, tcp_in_data* in_data); + +/** + * Send quickack if TCP_QUICKACK is enabled + * Change LWIP_TCP_QUICKACK_THRESHOLD value in order to send quickacks + * depending on the payload size. 
+ */ +s8_t +tcp_quickack(struct tcp_pcb *pcb, tcp_in_data* in_data) +{ +#if TCP_QUICKACK_THRESHOLD + return pcb->quickack && in_data->tcplen <= TCP_QUICKACK_THRESHOLD; +#else + LWIP_UNUSED_ARG(in_data); + return pcb->quickack; +#endif +} + +#if LWIP_3RD_PARTY_L3 +void +L3_level_tcp_input(struct pbuf *p, struct tcp_pcb* pcb) +{ + u8_t hdrlen; + err_t err; + u16_t iphdr_len; + tcp_in_data in_data; + + TCP_STATS_INC(tcp.recv); + in_data.iphdr = (struct ip_hdr *)p->payload; + + + iphdr_len = ntohs(IPH_LEN(in_data.iphdr)); + /* Trim pbuf. This should have been done at the netif layer, + * but we'll do it anyway just to be sure that its done. */ + pbuf_realloc(p, iphdr_len); + + in_data.tcphdr = (struct tcp_hdr *)((u8_t *)p->payload + IPH_HL(in_data.iphdr) * 4); + +#if TCP_INPUT_DEBUG + tcp_debug_print(in_data.tcphdr); +#endif + + + /* remove header from payload */ + if (pbuf_header(p, -((s16_t)(IPH_HL(in_data.iphdr) * 4))) || (p->tot_len < sizeof(struct tcp_hdr))) { + /* drop short packets */ + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: short packet (%"U16_F" bytes) discarded\n", (u16_t)p->tot_len)); + TCP_STATS_INC(tcp.lenerr); + TCP_STATS_INC(tcp.drop); + pbuf_free(p); + return; + } + + /* Move the payload pointer in the pbuf so that it points to the + TCP data instead of the TCP header. */ + hdrlen = TCPH_HDRLEN(in_data.tcphdr); + if(pbuf_header(p, -(hdrlen * 4))){ + /* drop short packets */ + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: short packet\n")); + TCP_STATS_INC(tcp.lenerr); + TCP_STATS_INC(tcp.drop); + pbuf_free(p); + return; + } + + /* Convert fields in TCP header to host byte order. 
*/ + in_data.tcphdr->src = ntohs(in_data.tcphdr->src); + in_data.tcphdr->dest = ntohs(in_data.tcphdr->dest); + in_data.seqno = in_data.tcphdr->seqno = ntohl(in_data.tcphdr->seqno); + in_data.ackno = in_data.tcphdr->ackno = ntohl(in_data.tcphdr->ackno); + in_data.tcphdr->wnd = ntohs(in_data.tcphdr->wnd); + + in_data.flags = TCPH_FLAGS(in_data.tcphdr); + in_data.tcplen = p->tot_len + ((in_data.flags & (TCP_FIN | TCP_SYN)) ? 1 : 0); + + if (pcb != NULL) { + + if (PCB_IN_ACTIVE_STATE(pcb)) + { + /* The incoming segment belongs to a connection. */ + #if TCP_INPUT_DEBUG + #if TCP_DEBUG + tcp_debug_print_state(get_tcp_state(pcb)); + #endif /* TCP_DEBUG */ + #endif /* TCP_INPUT_DEBUG */ + + /* Set up a tcp_seg structure. */ + in_data.inseg.next = NULL; + in_data.inseg.len = p->tot_len; +#if LWIP_TSO +#else + in_data.inseg.dataptr = p->payload; +#endif /* LWIP_TSO */ + in_data.inseg.p = p; + in_data.inseg.tcphdr = in_data.tcphdr; + + in_data.recv_data = NULL; + in_data.recv_flags = 0; + + /* If there is data which was previously "refused" by upper layer */ + while (pcb->refused_data != NULL) { // 'while' instead of 'if' because windows scale uses large pbuf + struct pbuf *rest; + pbuf_split_64k(pcb->refused_data, &rest); + + /* Notify again application with data previously received. */ + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: notify kept packet\n")); + TCP_EVENT_RECV(pcb, pcb->refused_data, ERR_OK, err); + if (err == ERR_OK) { + pcb->refused_data = rest; + } else { + if (rest) { + pbuf_cat(pcb->refused_data, rest); /* undo splitting */ + } + /* if err == ERR_ABRT, 'pcb' is already deallocated */ + /* drop incoming packets, because pcb is "full" */ + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: drop incoming packets, because pcb is \"full\"\n")); + TCP_STATS_INC(tcp.drop);; + pbuf_free(p); + return; + } + } + tcp_input_pcb = pcb; + err = tcp_process(pcb, &in_data); + /* A return value of ERR_ABRT means that tcp_abort() was called + and that the pcb has been freed. 
If so, we don't do anything. */ + if (err != ERR_ABRT) { + if (in_data.recv_flags & TF_RESET) { + /* TF_RESET means that the connection was reset by the other + end. We then call the error callback to inform the + application that the connection is dead before we + deallocate the PCB. */ + TCP_EVENT_ERR(pcb->errf, pcb->my_container, ERR_RST); + tcp_pcb_remove(pcb); + } else if (in_data.recv_flags & TF_CLOSED) { + /* The connection has been closed and we will deallocate the + PCB. */ + tcp_pcb_remove(pcb); + } else { + /* If the application has registered a "sent" function to be + called when new send buffer space is available, we call it + now. */ + if (pcb->acked > 0) { + TCP_EVENT_SENT(pcb, pcb->acked, err); + if (err == ERR_ABRT) { + goto aborted; + } + } + + while (in_data.recv_data != NULL) { // 'while' instead of 'if' because windows scale uses large pbuf + struct pbuf *rest = NULL; + if (pcb->flags & TF_RXCLOSED) { + /* received data although already closed -> abort (send RST) to + notify the remote host that not all data has been processed */ + pbuf_free(in_data.recv_data); + tcp_abort(pcb); + goto aborted; + } + pbuf_split_64k(in_data.recv_data, &rest); + if (in_data.flags & TCP_PSH) { + in_data.recv_data->flags |= PBUF_FLAG_PUSH; + } + /* Notify application that data has been received. */ + TCP_EVENT_RECV(pcb, in_data.recv_data, ERR_OK, err); + if (err == ERR_ABRT) { + if (rest) { + pbuf_cat(in_data.recv_data, rest); /* undo splitting */ + } + goto aborted; + } + /* If the upper layer can't receive this data, store it */ + if (err != ERR_OK) { + if (rest) { + pbuf_cat(in_data.recv_data, rest); /* undo splitting */ + } + pcb->refused_data = in_data.recv_data; + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: keep incoming packet, because pcb is \"full\"\n")); + break; + } else { + in_data.recv_data = rest; + } + } + + /* If a FIN segment was received, we call the callback + function with a NULL buffer to indicate EOF. 
*/ + if (in_data.recv_flags & TF_GOT_FIN) { + /* correct rcv_wnd as the application won't call tcp_recved() + for the FIN's seqno */ + if (pcb->rcv_wnd != pcb->rcv_wnd_max) { + pcb->rcv_wnd++; + } + TCP_EVENT_CLOSED(pcb, err); + if (err == ERR_ABRT) { + goto aborted; + } + } + + tcp_input_pcb = NULL; + /* Try to send something out. */ + tcp_output(pcb); + #if TCP_INPUT_DEBUG + #if TCP_DEBUG + tcp_debug_print_state(get_tcp_state(pcb)); + #endif /* TCP_DEBUG */ + #endif /* TCP_INPUT_DEBUG */ + } + } + /* Jump target if pcb has been aborted in a callback (by calling tcp_abort()). + Below this line, 'pcb' may not be dereferenced! */ + aborted: + tcp_input_pcb = NULL; + in_data.recv_data = NULL; + + /* give up our reference to inseg.p */ + if (in_data.inseg.p != NULL) + { + pbuf_free(in_data.inseg.p); + in_data.inseg.p = NULL; + } + } + else if (PCB_IN_LISTEN_STATE(pcb)) { + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: packed for LISTENing connection.\n")); + // TODO: tcp_listen_input creates a pcb and puts in the active pcb list. + // how should we approach? + tcp_listen_input((struct tcp_pcb_listen*)pcb, &in_data); + pbuf_free(p); + } + else if (PCB_IN_TIME_WAIT_STATE(pcb)){ + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_input: packed for TIME_WAITing connection.\n")); + tcp_timewait_input(pcb, &in_data); + pbuf_free(p); + } + else { + LWIP_DEBUGF(TCP_RST_DEBUG, ("tcp_input: illegal get_tcp_state(pcb).\n")); + pbuf_free(p); + } + } else { + + /* If no matching PCB was found, send a TCP RST (reset) to the + sender. 
*/ + LWIP_DEBUGF(TCP_RST_DEBUG, ("tcp_input: no PCB match found, resetting.\n")); + if (!(TCPH_FLAGS(in_data.tcphdr) & TCP_RST)) { + TCP_STATS_INC(tcp.proterr); + TCP_STATS_INC(tcp.drop); + tcp_rst(in_data.ackno, in_data.seqno + in_data.tcplen, in_data.tcphdr->dest, + in_data.tcphdr->src, pcb); + } + pbuf_free(p); + } + + LWIP_ASSERT("tcp_input: tcp_pcbs_sane()", tcp_pcbs_sane()); +} +#endif //LWIP_3RD_PARTY_L3 +/** + * Called by tcp_input() when a segment arrives for a listening + * connection (from tcp_input()). + * + * @param pcb the tcp_pcb_listen for which a segment arrived + * @return ERR_OK if the segment was processed + * another err_t on error + * + * @note the return value is not (yet?) used in tcp_input() + * @note the segment which arrived is saved in global variables, therefore only the pcb + * involved is passed as a parameter to this function + */ +static err_t +tcp_listen_input(struct tcp_pcb_listen *pcb, tcp_in_data* in_data) +{ + struct tcp_pcb *npcb = NULL; + err_t rc; + + if (in_data->flags & TCP_RST) { + /* An incoming RST should be ignored. Return. */ + return ERR_OK; + } + + if (in_data->flags & TCP_FIN) { + /* An incoming FIN should be ignored. Return. */ + return ERR_OK; + } + + /* In the LISTEN state, we check for incoming SYN segments, + creates a new PCB, and responds with a SYN|ACK. */ + if (in_data->flags & TCP_ACK) { + /* For incoming segments with the ACK flag set, respond with a + RST. 
*/ + LWIP_DEBUGF(TCP_RST_DEBUG, ("tcp_listen_input: ACK in LISTEN, sending reset\n")); + tcp_rst(in_data->ackno + 1, in_data->seqno + in_data->tcplen, + in_data->tcphdr->dest, in_data->tcphdr->src, NULL); + } else if (in_data->flags & TCP_SYN) { + LWIP_DEBUGF(TCP_DEBUG, ("TCP connection request %"U16_F" -> %"U16_F".\n", in_data->tcphdr->src, in_data->tcphdr->dest)); + + TCP_EVENT_CLONE_PCB(pcb,&npcb,ERR_OK,rc); + /* If a new PCB could not be created (probably due to lack of memory), + we don't do anything, but rely on the sender will retransmit the + SYN at a time when we have more memory available. */ + if (npcb == NULL) { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_listen_input: could not allocate PCB\n")); + TCP_STATS_INC(tcp.memerr); + return ERR_MEM; + } + /* Set up the new PCB. */ + ip_addr_copy(npcb->local_ip, in_data->iphdr->dest); + npcb->local_port = pcb->local_port; + ip_addr_copy(npcb->remote_ip, in_data->iphdr->src); + npcb->remote_port = in_data->tcphdr->src; + set_tcp_state(npcb, SYN_RCVD); + npcb->rcv_nxt = in_data->seqno + 1; + npcb->rcv_ann_right_edge = npcb->rcv_nxt; + npcb->snd_wnd = in_data->tcphdr->wnd; + npcb->ssthresh = npcb->snd_wnd; + npcb->snd_wl1 = in_data->seqno - 1;/* initialise to seqno-1 to force window update */ + npcb->callback_arg = pcb->callback_arg; +#if LWIP_CALLBACK_API + npcb->accept = pcb->accept; +#endif /* LWIP_CALLBACK_API */ + /* inherit socket options */ + npcb->so_options = pcb->so_options & SOF_INHERITED; + + npcb->snd_scale = 0; + npcb->rcv_scale = 0; + + /* calculate advtsd_mss before parsing MSS option such that the resulting mss will take into account the updated advertized MSS */ + npcb->advtsd_mss = (LWIP_TCP_MSS > 0) ? tcp_eff_send_mss(LWIP_TCP_MSS, npcb) : tcp_mss_follow_mtu_with_default(536, npcb); + + /* Parse any options in the SYN. 
*/ + tcp_parseopt(npcb, in_data); + + npcb->rcv_wnd = TCP_WND_SCALED(npcb); + npcb->rcv_ann_wnd = TCP_WND_SCALED(npcb); + npcb->rcv_wnd_max = TCP_WND_SCALED(npcb); + npcb->rcv_wnd_max_desired = TCP_WND_SCALED(npcb); + + npcb->snd_wnd = SND_WND_SCALE(npcb, in_data->tcphdr->wnd); + npcb->snd_wnd_max = npcb->snd_wnd; + npcb->ssthresh = npcb->snd_wnd; +#if TCP_CALCULATE_EFF_SEND_MSS + u16_t snd_mss = tcp_eff_send_mss(npcb->mss, npcb); + UPDATE_PCB_BY_MSS(npcb, snd_mss); +#endif /* TCP_CALCULATE_EFF_SEND_MSS */ + + /* Register the new PCB so that we can begin sending segments + for it. */ + TCP_EVENT_SYN_RECEIVED(pcb, npcb, ERR_OK, rc); + if (rc != ERR_OK) { + return rc; + } + + /* Send a SYN|ACK together with the MSS option. */ + rc = tcp_enqueue_flags(npcb, TCP_SYN | TCP_ACK); + if (rc != ERR_OK) { + tcp_abandon(npcb, 0); + return rc; + } + return tcp_output(npcb); + } + return ERR_OK; +} + +/** + * Called by tcp_input() when a segment arrives for a connection in + * TIME_WAIT. + * + * @param pcb the tcp_pcb for which a segment arrived + * + * @note the segment which arrived is saved in global variables, therefore only the pcb + * involved is passed as a parameter to this function + */ +static err_t +tcp_timewait_input(struct tcp_pcb *pcb, tcp_in_data* in_data) +{ + /* RFC 1337: in TIME_WAIT, ignore RST and ACK FINs + any 'acceptable' segments */ + /* RFC 793 3.9 Event Processing - Segment Arrives: + * - first check sequence number - we skip that one in TIME_WAIT (always + * acceptable since we only send ACKs) + * - second check the RST bit (... 
return) */ + if (in_data->flags & TCP_RST) { + return ERR_OK; + } + /* - fourth, check the SYN bit, */ + if (in_data->flags & TCP_SYN) { + /* If an incoming segment is not acceptable, an acknowledgment + should be sent in reply */ + if (TCP_SEQ_BETWEEN(in_data->seqno, pcb->rcv_nxt, pcb->rcv_nxt+pcb->rcv_wnd)) { + /* If the SYN is in the window it is an error, send a reset */ + tcp_rst(in_data->ackno, in_data->seqno + in_data->tcplen, + in_data->tcphdr->dest, in_data->tcphdr->src, pcb); + return ERR_OK; + } + } else if (in_data->flags & TCP_FIN) { + /* - eighth, check the FIN bit: Remain in the TIME-WAIT state. + Restart the 2 MSL time-wait timeout.*/ + pcb->tmr = tcp_ticks; + } + + if ((in_data->tcplen > 0)) { + /* Acknowledge data, FIN or out-of-window SYN */ + pcb->flags |= TF_ACK_NOW; + return tcp_output(pcb); + } + return ERR_OK; +} + +/** + * Implements the TCP state machine. Called by tcp_input. In some + * states tcp_receive() is called to receive data. The tcp_seg + * argument will be freed by the caller (tcp_input()) unless the + * recv_data pointer in the pcb is set. + * + * @param pcb the tcp_pcb for which a segment arrived + * + * @note the segment which arrived is saved in global variables, therefore only the pcb + * involved is passed as a parameter to this function + */ +static err_t +tcp_process(struct tcp_pcb *pcb, tcp_in_data* in_data) +{ + struct tcp_seg *rseg; + u8_t acceptable = 0; + err_t err; + + /* Process incoming RST segments. */ + if (in_data->flags & TCP_RST) { + /* First, determine if the reset is acceptable. 
*/ + if (get_tcp_state(pcb) == SYN_SENT) { + if (in_data->ackno == pcb->snd_nxt) { + acceptable = 1; + } + } else { + if (TCP_SEQ_BETWEEN(in_data->seqno, pcb->rcv_nxt, + pcb->rcv_nxt+pcb->rcv_wnd)) { + acceptable = 1; + } + } + + if (acceptable) { + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_process: Connection RESET\n")); + LWIP_ASSERT("tcp_input: get_tcp_state(pcb) != CLOSED", get_tcp_state(pcb) != CLOSED); + in_data->recv_flags |= TF_RESET; + pcb->flags &= ~TF_ACK_DELAY; + return ERR_RST; + } else { + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_process: unacceptable reset seqno %"U32_F" rcv_nxt %"U32_F"\n", + in_data->seqno, pcb->rcv_nxt)); + LWIP_DEBUGF(TCP_DEBUG, ("tcp_process: unacceptable reset seqno %"U32_F" rcv_nxt %"U32_F"\n", + in_data->seqno, pcb->rcv_nxt)); + return ERR_OK; + } + } + + if ((in_data->flags & TCP_SYN) && (get_tcp_state(pcb) != SYN_SENT && get_tcp_state(pcb) != SYN_RCVD)) { + /* Cope with new connection attempt after remote end crashed */ + tcp_ack_now(pcb); + return ERR_OK; + } + + if ((pcb->flags & TF_RXCLOSED) == 0) { + /* Update the PCB (in)activity timer unless rx is closed (see tcp_shutdown) */ + pcb->tmr = tcp_ticks; + } + pcb->keep_cnt_sent = 0; + + tcp_parseopt(pcb, in_data); + + /* Do different things depending on the TCP state. */ + switch (get_tcp_state(pcb)) { + case SYN_SENT: + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("SYN-SENT: ackno %"U32_F" pcb->snd_nxt %"U32_F" unacked %"U32_F"\n", in_data->ackno, + pcb->snd_nxt, ntohl(pcb->unacked->tcphdr->seqno))); + /* received SYN ACK with expected sequence number? 
*/ + if ((in_data->flags & TCP_ACK) && (in_data->flags & TCP_SYN) + && in_data->ackno == pcb->unacked->seqno + 1) { + // pcb->snd_buf++; SND_BUF_FOR_SYN_FIN + pcb->rcv_nxt = in_data->seqno + 1; + pcb->rcv_ann_right_edge = pcb->rcv_nxt; + pcb->lastack = in_data->ackno; + pcb->snd_wnd = SND_WND_SCALE(pcb, in_data->tcphdr->wnd); // Which means: tcphdr->wnd << pcb->snd_scale; + pcb->snd_wnd_max = pcb->snd_wnd; + pcb->snd_wl1 = in_data->seqno - 1; /* initialise to seqno - 1 to force window update */ + set_tcp_state(pcb, ESTABLISHED); + +#if TCP_CALCULATE_EFF_SEND_MSS + u16_t eff_mss = tcp_eff_send_mss(pcb->mss, pcb); + UPDATE_PCB_BY_MSS(pcb, eff_mss); +#endif /* TCP_CALCULATE_EFF_SEND_MSS */ + + /* Set ssthresh again after changing pcb->mss (already set in tcp_connect + * but for the default value of pcb->mss) */ + pcb->ssthresh = pcb->mss * 10; +#if TCP_CC_ALGO_MOD + cc_conn_init(pcb); +#else + pcb->cwnd = ((pcb->cwnd == 1) ? (pcb->mss * 2) : pcb->mss); +#endif + LWIP_ASSERT("pcb->snd_queuelen > 0", (pcb->snd_queuelen > 0)); + --pcb->snd_queuelen; + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_process: SYN-SENT --queuelen %"U16_F"\n", (u16_t)pcb->snd_queuelen)); + rseg = pcb->unacked; + pcb->unacked = rseg->next; + + /* If there's nothing left to acknowledge, stop the retransmit + timer, otherwise reset it to start again */ + if(pcb->unacked == NULL) { + pcb->rtime = -1; + } else { + pcb->rtime = 0; + pcb->nrtx = 0; + } + + tcp_tx_seg_free(pcb, rseg); + + /* Call the user specified function to call when sucessfully + * connected. */ + TCP_EVENT_CONNECTED(pcb, ERR_OK, err); + if (err == ERR_ABRT) { + return ERR_ABRT; + } + tcp_ack_now(pcb); + } + /* received ACK? possibly a half-open connection */ + else if (in_data->flags & TCP_ACK) { + /* send a RST to bring the other side in a non-synchronized state. 
*/ + tcp_rst(in_data->ackno, in_data->seqno + in_data->tcplen, + in_data->tcphdr->dest, in_data->tcphdr->src, pcb); + } + break; + case SYN_RCVD: + if (in_data->flags & TCP_ACK) { + /* expected ACK number? */ + if (TCP_SEQ_BETWEEN(in_data->ackno, pcb->lastack+1, pcb->snd_nxt)) { + u32_t old_cwnd; + set_tcp_state(pcb, ESTABLISHED); + LWIP_DEBUGF(TCP_DEBUG, ("TCP connection established %"U16_F" -> %"U16_F".\n", in_data->inseg.tcphdr->src, in_data->inseg.tcphdr->dest)); +#if LWIP_CALLBACK_API + LWIP_ASSERT("pcb->accept != NULL", pcb->accept != NULL); +#endif + /* Call the accept function. */ + TCP_EVENT_ACCEPT(pcb, ERR_OK, err); + if (err != ERR_OK) { + /* If the accept function returns with an error, we abort + * the connection. */ + /* Already aborted? */ + if (err != ERR_ABRT) { + tcp_abort(pcb); + } + return ERR_ABRT; + } + old_cwnd = pcb->cwnd; + /* If there was any data contained within this ACK, + * we'd better pass it on to the application as well. */ + tcp_receive(pcb, in_data); + + /* Prevent ACK for SYN to generate a sent event */ + if (pcb->acked != 0) { + pcb->acked--; + } +#if TCP_CC_ALGO_MOD + pcb->cwnd = old_cwnd; + cc_conn_init(pcb); +#else + pcb->cwnd = ((old_cwnd == 1) ? 
(pcb->mss * 2) : pcb->mss); +#endif + if (in_data->recv_flags & TF_GOT_FIN) { + tcp_ack_now(pcb); + set_tcp_state(pcb, CLOSE_WAIT); + } + } else { + /* incorrect ACK number, send RST */ + tcp_rst(in_data->ackno, in_data->seqno + in_data->tcplen, + in_data->tcphdr->dest, in_data->tcphdr->src, pcb); + } + } else if ((in_data->flags & TCP_SYN) && (in_data->seqno == pcb->rcv_nxt - 1)) { + /* Looks like another copy of the SYN - retransmit our SYN-ACK */ + tcp_rexmit(pcb); + } + break; + case CLOSE_WAIT: + /* FALLTHROUGH */ + case ESTABLISHED: + tcp_receive(pcb, in_data); + if (in_data->recv_flags & TF_GOT_FIN) { /* passive close */ + tcp_ack_now(pcb); + set_tcp_state(pcb, CLOSE_WAIT); + } + break; + case FIN_WAIT_1: + tcp_receive(pcb, in_data); + if (in_data->recv_flags & TF_GOT_FIN) { + if ((in_data->flags & TCP_ACK) && (in_data->ackno == pcb->snd_nxt)) { + LWIP_DEBUGF(TCP_DEBUG, + ("TCP connection closed: FIN_WAIT_1 %"U16_F" -> %"U16_F".\n", in_data->inseg.tcphdr->src, in_data->inseg.tcphdr->dest)); + tcp_ack_now(pcb); + tcp_pcb_purge(pcb); + set_tcp_state(pcb, TIME_WAIT); + } else { + tcp_ack_now(pcb); + set_tcp_state(pcb, CLOSING); + } + } else if ((in_data->flags & TCP_ACK) && (in_data->ackno == pcb->snd_nxt)) { + set_tcp_state(pcb, FIN_WAIT_2); + } + break; + case FIN_WAIT_2: + tcp_receive(pcb, in_data); + if (in_data->recv_flags & TF_GOT_FIN) { + LWIP_DEBUGF(TCP_DEBUG, ("TCP connection closed: FIN_WAIT_2 %"U16_F" -> %"U16_F".\n", in_data->inseg.tcphdr->src, in_data->inseg.tcphdr->dest)); + tcp_ack_now(pcb); + tcp_pcb_purge(pcb); + set_tcp_state(pcb, TIME_WAIT); + } + break; + case CLOSING: + tcp_receive(pcb, in_data); + if (in_data->flags & TCP_ACK && in_data->ackno == pcb->snd_nxt) { + LWIP_DEBUGF(TCP_DEBUG, ("TCP connection closed: CLOSING %"U16_F" -> %"U16_F".\n", in_data->inseg.tcphdr->src, in_data->inseg.tcphdr->dest)); + tcp_pcb_purge(pcb); + set_tcp_state(pcb, TIME_WAIT); + } + break; + case LAST_ACK: + tcp_receive(pcb, in_data); + if (in_data->flags & 
TCP_ACK && in_data->ackno == pcb->snd_nxt) { + LWIP_DEBUGF(TCP_DEBUG, ("TCP connection closed: LAST_ACK %"U16_F" -> %"U16_F".\n", in_data->inseg.tcphdr->src, in_data->inseg.tcphdr->dest)); + /* bugfix #21699: don't set_tcp_state to CLOSED here or we risk leaking segments */ + in_data->recv_flags |= TF_CLOSED; + } + break; + default: + break; + } + return ERR_OK; +} + +#if TCP_QUEUE_OOSEQ +/** + * Insert segment into the list (segments covered with new one will be deleted) + * + * Called from tcp_receive() + */ +static void +tcp_oos_insert_segment(struct tcp_pcb *pcb, struct tcp_seg *cseg, struct tcp_seg *next, tcp_in_data* in_data) +{ + struct tcp_seg *old_seg; + + if (TCPH_FLAGS(cseg->tcphdr) & TCP_FIN) { + /* received segment overlaps all following segments */ + tcp_segs_free(pcb, next); + next = NULL; + } + else { + /* delete some following segments + oos queue may have segments with FIN flag */ + while (next && + TCP_SEQ_GEQ((in_data->seqno + cseg->len), + (next->tcphdr->seqno + next->len))) { + /* cseg with FIN already processed */ + if (TCPH_FLAGS(next->tcphdr) & TCP_FIN) { + TCPH_SET_FLAG(cseg->tcphdr, TCP_FIN); + } + old_seg = next; + next = next->next; + tcp_seg_free(pcb, old_seg); + } + if (next && + TCP_SEQ_GT(in_data->seqno + cseg->len, next->tcphdr->seqno)) { + /* We need to trim the incoming segment. */ +#if LWIP_TSO + cseg->len = (u32_t)(next->tcphdr->seqno - in_data->seqno); +#else + cseg->len = (u16_t)(next->tcphdr->seqno - in_data->seqno); +#endif /* LWIP_TSO */ + pbuf_realloc(cseg->p, cseg->len); + } + } + cseg->next = next; +} +#endif /* TCP_QUEUE_OOSEQ */ + +#if LWIP_TSO +/** + * Called by tcp_output() to shrink TCP segment to lastackno. + * This call should process retransmitted TSO segment. 
+ * + * @param pcb the tcp_pcb for the TCP connection used to send the segment + * @param seg the tcp_seg to send + * @param ackqno current ackqno + * @return number of freed pbufs + */ +static u32_t +tcp_shrink_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t ackno) +{ + struct pbuf *cur_p = NULL; + struct pbuf *p = NULL; + u32_t len = 0; + u32_t count = 0; + u8_t optflags = 0; + u8_t optlen = 0; + + if ((NULL == seg) || (NULL == seg->p) || + !(TCP_SEQ_GT(ackno, seg->seqno) && TCP_SEQ_LT(ackno, seg->seqno + TCP_TCPLEN(seg)))) { + return count; + } + + /* Just shrink first pbuf */ + if (TCP_SEQ_GT((seg->seqno + seg->p->len - TCP_HLEN), ackno)) { + u8_t *dataptr = (u8_t *)seg->tcphdr + LWIP_TCP_HDRLEN(seg->tcphdr); + len = ackno - seg->seqno; + seg->len -= len; + seg->p->len -= len; + seg->p->tot_len -= len; + seg->seqno = ackno; + seg->tcphdr->seqno = htonl(seg->seqno); + MEMCPY(dataptr, dataptr + len, seg->p->len); + return count; + } + +#if LWIP_TCP_TIMESTAMPS + if ((pcb->flags & TF_TIMESTAMP)) { + optflags |= TF_SEG_OPTS_TS; + } +#endif /* LWIP_TCP_TIMESTAMPS */ + + optlen += LWIP_TCP_OPT_LENGTH(optflags); + + cur_p = seg->p->next; + + if (cur_p) { + /* Process more than first pbuf */ + seg->len -= (seg->p->len - TCP_HLEN - optlen); + seg->p->tot_len -= (seg->p->len - TCP_HLEN - optlen); + seg->seqno += (seg->p->len - TCP_HLEN - optlen); + seg->tcphdr->seqno = htonl(seg->seqno); + seg->p->len = TCP_HLEN + optlen; + } + + while (cur_p) { + if (TCP_SEQ_GT((seg->seqno + cur_p->len), ackno)) { + break; + } else { + seg->len -= cur_p->len; + seg->p->tot_len -= cur_p->len; + seg->p->next = cur_p->next; + seg->seqno += cur_p->len; + seg->tcphdr->seqno = htonl(seg->seqno); + + p = cur_p; + cur_p = p->next; + seg->p->next = cur_p; + p->next = NULL; + + if (p->type == PBUF_RAM) { + external_tcp_tx_pbuf_free(pcb, p); + } else { + pbuf_free(p); + } + count++; + } + } + + if (cur_p) { + u8_t *dataptr = (u8_t *)seg->tcphdr + LWIP_TCP_HDRLEN(seg->tcphdr); + len = ackno 
- seg->seqno; + seg->len -= len; + seg->p->len = TCP_HLEN + optlen + cur_p->len - len; + seg->p->tot_len -= len; + seg->seqno = ackno; + seg->tcphdr->seqno = htonl(seg->seqno); + MEMCPY(dataptr, cur_p->payload + len, cur_p->len - len); + + p = cur_p; + cur_p = p->next; + seg->p->next = cur_p; + p->next = NULL; + + if (p->type == PBUF_RAM) { + external_tcp_tx_pbuf_free(pcb, p); + } else { + pbuf_free(p); + } + count++; + } + +#if TCP_TSO_DEBUG + LWIP_DEBUGF(TCP_TSO_DEBUG | LWIP_DBG_TRACE, + ("tcp_shrink: count: %-5d unsent %s\n", + count, _dump_seg(pcb->unsent))); +#endif /* TCP_TSO_DEBUG */ + + return count; +} +#endif /* LWIP_TSO */ + +/** + * Called by tcp_process. Checks if the given segment is an ACK for outstanding + * data, and if so frees the memory of the buffered data. Next, is places the + * segment on any of the receive queues (pcb->recved or pcb->ooseq). If the segment + * is buffered, the pbuf is referenced by pbuf_ref so that it will not be freed until + * i it has been removed from the buffer. + * + * If the incoming segment constitutes an ACK for a segment that was used for RTT + * estimation, the RTT is estimated here as well. + * + * Called from tcp_process(). + */ +static void +tcp_receive(struct tcp_pcb *pcb, tcp_in_data* in_data) +{ + struct tcp_seg *next; +#if TCP_QUEUE_OOSEQ + struct tcp_seg *prev, *cseg; +#endif /* TCP_QUEUE_OOSEQ */ + struct pbuf *p; + s32_t off; + s16_t m; + u32_t right_wnd_edge; + u16_t new_tot_len; + int found_dupack = 0; + s8_t persist = 0; + + if (in_data->flags & TCP_ACK) { + right_wnd_edge = pcb->snd_wnd + pcb->snd_wl2; + + /* Update window. 
*/ + if (TCP_SEQ_LT(pcb->snd_wl1, in_data->seqno) || + (pcb->snd_wl1 == in_data->seqno && TCP_SEQ_LT(pcb->snd_wl2, in_data->ackno)) || + (pcb->snd_wl2 == in_data->ackno && SND_WND_SCALE(pcb, in_data->tcphdr->wnd) > pcb->snd_wnd)) { + pcb->snd_wnd = SND_WND_SCALE(pcb, in_data->tcphdr->wnd); // Which means: tcphdr->wnd << pcb->snd_scale; + /* keep track of the biggest window announced by the remote host to calculate + the maximum segment size */ + if (pcb->snd_wnd_max < pcb->snd_wnd) { + pcb->snd_wnd_max = pcb->snd_wnd; + } + pcb->snd_wl1 = in_data->seqno; + pcb->snd_wl2 = in_data->ackno; + if (pcb->snd_wnd == 0) { + if (pcb->persist_backoff == 0) { + persist = 1; + } + } else if (pcb->persist_backoff > 0) { + /* stop persist timer */ + pcb->persist_backoff = 0; + } + LWIP_DEBUGF(TCP_WND_DEBUG, ("tcp_receive: window update %"U16_F"\n", pcb->snd_wnd)); +#if TCP_WND_DEBUG + } else { + if (pcb->snd_wnd != in_data->tcphdr->wnd) { + LWIP_DEBUGF(TCP_WND_DEBUG, + ("tcp_receive: no window update lastack %"U32_F" ackno %" + U32_F" wl1 %"U32_F" seqno %"U32_F" wl2 %"U32_F"\n", + pcb->lastack, in_data->ackno, pcb->snd_wl1, in_data->seqno, pcb->snd_wl2)); + } +#endif /* TCP_WND_DEBUG */ + } + + /* (From Stevens TCP/IP Illustrated Vol II, p970.) Its only a + * duplicate ack if: + * 1) It doesn't ACK new data + * 2) length of received packet is zero (i.e. 
no payload) + * 3) the advertised window hasn't changed + * 4) There is outstanding unacknowledged data (retransmission timer running) + * 5) The ACK is == biggest ACK sequence number so far seen (snd_una) + * + * If it passes all five, should process as a dupack: + * a) dupacks < 3: do nothing + * b) dupacks == 3: fast retransmit + * c) dupacks > 3: increase cwnd + * + * If it only passes 1-3, should reset dupack counter (and add to + * stats, which we don't do in lwIP) + * + * If it only passes 1, should reset dupack counter + * + */ + + /* Clause 1 */ + if (TCP_SEQ_LEQ(in_data->ackno, pcb->lastack)) { + pcb->acked = 0; + /* Clause 2 */ + if (in_data->tcplen == 0) { + /* Clause 3 */ + if (pcb->snd_wl2 + pcb->snd_wnd == right_wnd_edge){ + /* Clause 4 */ + if (pcb->rtime >= 0) { + /* Clause 5 */ + if (pcb->lastack == in_data->ackno) { + found_dupack = 1; + if ((u8_t)(pcb->dupacks + 1) > pcb->dupacks) + ++pcb->dupacks; + if (pcb->dupacks > 3) { +#if TCP_CC_ALGO_MOD + cc_ack_received(pcb, CC_DUPACK); +#else + /* Inflate the congestion window, but not if it means that + the value overflows. */ + if ((u32_t)(pcb->cwnd + pcb->mss) > pcb->cwnd) { + pcb->cwnd += pcb->mss; + } +#endif //TCP_CC_ALGO_MOD + } else if (pcb->dupacks == 3) { + /* Do fast retransmit */ + tcp_rexmit_fast(pcb); +#if TCP_CC_ALGO_MOD + cc_ack_received(pcb, 0); + //cc_ack_received(pcb, CC_DUPACK); +#endif + } + } + } + } + } + /* If Clause (1) or more is true, but not a duplicate ack, reset + * count of consecutive duplicate acks */ + if (!found_dupack) { + pcb->dupacks = 0; + } + } else if (TCP_SEQ_BETWEEN(in_data->ackno, pcb->lastack+1, pcb->snd_nxt)){ + /* We come here when the ACK acknowledges new data. */ + + /* Reset the "IN Fast Retransmit" flag, since we are no longer + in fast retransmit. Also reset the congestion window to the + slow start threshold. 
*/ + if (pcb->flags & TF_INFR) { +#if TCP_CC_ALGO_MOD + cc_post_recovery(pcb); +#else + pcb->cwnd = pcb->ssthresh; +#endif + pcb->flags &= ~TF_INFR; + } + + /* Reset the number of retransmissions. */ + pcb->nrtx = 0; + + /* Reset the retransmission time-out. */ + pcb->rto = (pcb->sa >> 3) + pcb->sv; + + /* Update the send buffer space. Diff between the two can never exceed 64K? */ + pcb->acked = (u32_t)(in_data->ackno - pcb->lastack); + + pcb->snd_buf += pcb->acked; + + /* Reset the fast retransmit variables. */ + pcb->dupacks = 0; + pcb->lastack = in_data->ackno; + + /* Update the congestion control variables (cwnd and + ssthresh). */ + if (get_tcp_state(pcb) >= ESTABLISHED) { +#if TCP_CC_ALGO_MOD + cc_ack_received(pcb, CC_ACK); +#else + if (pcb->cwnd < pcb->ssthresh) { + if ((u32_t)(pcb->cwnd + pcb->mss) > pcb->cwnd) { + pcb->cwnd += pcb->mss; + } + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_receive: slow start cwnd %"U32_F"\n", pcb->cwnd)); + } else { + u32_t new_cwnd = (pcb->cwnd + ((u32_t)pcb->mss * (u32_t)pcb->mss) / pcb->cwnd); + if (new_cwnd > pcb->cwnd) { + pcb->cwnd = new_cwnd; + } + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_receive: congestion avoidance cwnd %"U32_F"\n", pcb->cwnd)); + } +#endif //TCP_CC_ALGO_MOD + } + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_receive: ACK for %"U32_F", unacked->seqno %"U32_F":%"U32_F"\n", + in_data->ackno, + pcb->unacked != NULL? + ntohl(pcb->unacked->tcphdr->seqno): 0, + pcb->unacked != NULL? + ntohl(pcb->unacked->tcphdr->seqno) + TCP_TCPLEN(pcb->unacked): 0)); + + /* Remove segment from the unacknowledged list if the incoming + ACK acknowlegdes them. */ +#if LWIP_TSO + while (pcb->unacked != NULL) { + + /* The purpose of this processing is to avoid to send again + * data from TSO segment that is partially acknowledged. + * This TSO segment was not released in tcp_receive() because + * input data processing releases whole acknowledged segment only. 
+ */ + if (pcb->unacked->flags & TF_SEG_OPTS_TSO) { + pcb->snd_queuelen -= tcp_shrink_segment(pcb, pcb->unacked, in_data->ackno); + } + + if (!(TCP_SEQ_LEQ(pcb->unacked->seqno + TCP_TCPLEN(pcb->unacked), in_data->ackno))) { + break; + } +#else + while (pcb->unacked != NULL && + TCP_SEQ_LEQ(pcb->unacked->seqno + TCP_TCPLEN(pcb->unacked), in_data->ackno)) { +#endif /* LWIP_TSO */ + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_receive: removing %"U32_F":%"U32_F" from pcb->unacked\n", + ntohl(pcb->unacked->tcphdr->seqno), + ntohl(pcb->unacked->tcphdr->seqno) + + TCP_TCPLEN(pcb->unacked))); + + next = pcb->unacked; + pcb->unacked = pcb->unacked->next; + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_receive: queuelen %"U32_F" ... ", (u32_t)pcb->snd_queuelen)); + LWIP_ASSERT("pcb->snd_queuelen >= pbuf_clen(next->p)", (pcb->snd_queuelen >= pbuf_clen(next->p))); + /* Prevent ACK for FIN to generate a sent event */ + if ((pcb->acked != 0) && ((TCPH_FLAGS(next->tcphdr) & TCP_FIN) != 0)) { + pcb->acked--; + } + + pcb->snd_queuelen -= pbuf_clen(next->p); + tcp_tx_seg_free(pcb, next); + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("%"U32_F" (after freeing unacked)\n", (u32_t)pcb->snd_queuelen)); + } + + /* If there's nothing left to acknowledge, stop the retransmit + timer, otherwise reset it to start again */ + if(pcb->unacked == NULL) { + if (persist) { + /* start persist timer */ + pcb->persist_cnt = 0; + pcb->persist_backoff = 1; + } + pcb->rtime = -1; + } else { + pcb->rtime = 0; + } + + pcb->polltmr = 0; + } else { + /* Out of sequence ACK, didn't really ack anything */ + pcb->acked = 0; + tcp_send_empty_ack(pcb); + } + + /* We go through the ->unsent list to see if any of the segments + on the list are acknowledged by the ACK. This may seem + strange since an "unsent" segment shouldn't be acked. The + rationale is that lwIP puts all outstanding segments on the + ->unsent list after a retransmission, so these segments may + in fact have been sent once. 
*/ + while (pcb->unsent != NULL && + TCP_SEQ_BETWEEN(in_data->ackno, ntohl(pcb->unsent->tcphdr->seqno) + + TCP_TCPLEN(pcb->unsent), pcb->snd_nxt)) { + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_receive: removing %"U32_F":%"U32_F" from pcb->unsent\n", + ntohl(pcb->unsent->tcphdr->seqno), ntohl(pcb->unsent->tcphdr->seqno) + + TCP_TCPLEN(pcb->unsent))); + + next = pcb->unsent; + pcb->unsent = pcb->unsent->next; +#if TCP_OVERSIZE + if (pcb->unsent == NULL) { + pcb->unsent_oversize = 0; + } +#endif /* TCP_OVERSIZE */ + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_receive: queuelen %"U32_F" ... ", (u32_t)pcb->snd_queuelen)); + LWIP_ASSERT("pcb->snd_queuelen >= pbuf_clen(next->p)", (pcb->snd_queuelen >= pbuf_clen(next->p))); + /* Prevent ACK for FIN to generate a sent event */ + if ((pcb->acked != 0) && ((TCPH_FLAGS(next->tcphdr) & TCP_FIN) != 0)) { + pcb->acked--; + } + pcb->snd_queuelen -= pbuf_clen(next->p); + tcp_tx_seg_free(pcb, next); + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("%"U16_F" (after freeing unsent)\n", (u32_t)pcb->snd_queuelen)); + if (pcb->snd_queuelen != 0) { + LWIP_ASSERT("tcp_receive: valid queue length", + pcb->unacked != NULL || pcb->unsent != NULL); + } + } + /* End of ACK for new data processing. */ + + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_receive: pcb->rttest %"U32_F" rtseq %"U32_F" ackno %"U32_F"\n", + pcb->rttest, pcb->rtseq, in_data->ackno)); + + /* RTT estimation calculations. This is done by checking if the + incoming segment acknowledges the segment we use to take a + round-trip time measurement. */ + if (pcb->rttest && TCP_SEQ_LT(pcb->rtseq, in_data->ackno)) { + /* diff between this shouldn't exceed 32K since this are tcp timer ticks + and a round-trip shouldn't be that long... 
*/ +#if TCP_CC_ALGO_MOD + pcb->t_rttupdated++; +#endif + m = (s16_t)(tcp_ticks - pcb->rttest); + + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_receive: experienced rtt %"U16_F" ticks (%"U16_F" msec).\n", + m, m * slow_tmr_interval)); + + /* This is taken directly from VJs original code in his paper */ + m = m - (pcb->sa >> 3); + pcb->sa += m; + if (m < 0) { + m = -m; + } + m = m - (pcb->sv >> 2); + pcb->sv += m; + pcb->rto = (pcb->sa >> 3) + pcb->sv; + + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_receive: RTO %"U16_F" (%"U16_F" milliseconds)\n", + pcb->rto, pcb->rto * slow_tmr_interval)); + + pcb->rttest = 0; + } + } + + /* If the incoming segment contains data, we must process it + further unless the pcb already received a FIN. + (RFC 793, chapter 3.9, "SEGMENT ARRIVES" in states CLOSE-WAIT, CLOSING, + LAST-ACK and TIME-WAIT: "Ignore the segment text.") */ + if ((in_data->tcplen > 0) && (get_tcp_state(pcb) < CLOSE_WAIT)) { + /* This code basically does three things: + + +) If the incoming segment contains data that is the next + in-sequence data, this data is passed to the application. This + might involve trimming the first edge of the data. The rcv_nxt + variable and the advertised window are adjusted. + + +) If the incoming segment has data that is above the next + sequence number expected (->rcv_nxt), the segment is placed on + the ->ooseq queue. This is done by finding the appropriate + place in the ->ooseq queue (which is ordered by sequence + number) and trim the segment in both ends if needed. An + immediate ACK is sent to indicate that we received an + out-of-sequence segment. + + +) Finally, we check if the first segment on the ->ooseq queue + now is in sequence (i.e., if rcv_nxt >= ooseq->seqno). If + rcv_nxt > ooseq->seqno, we must trim the first edge of the + segment on ->ooseq before we adjust rcv_nxt. The data in the + segments that are now on sequence are chained onto the + incoming segment so that we only need to call the application + once. 
+ */ + + /* First, we check if we must trim the first edge. We have to do + this if the sequence number of the incoming segment is less + than rcv_nxt, and the sequence number plus the length of the + segment is larger than rcv_nxt. */ + /* if (TCP_SEQ_LT(seqno, pcb->rcv_nxt)){ + if (TCP_SEQ_LT(pcb->rcv_nxt, seqno + tcplen)) {*/ + if (TCP_SEQ_BETWEEN(pcb->rcv_nxt, in_data->seqno + 1, in_data->seqno + in_data->tcplen - 1)){ + /* Trimming the first edge is done by pushing the payload + pointer in the pbuf downwards. This is somewhat tricky since + we do not want to discard the full contents of the pbuf up to + the new starting point of the data since we have to keep the + TCP header which is present in the first pbuf in the chain. + + What is done is really quite a nasty hack: the first pbuf in + the pbuf chain is pointed to by inseg.p. Since we need to be + able to deallocate the whole pbuf, we cannot change this + inseg.p pointer to point to any of the later pbufs in the + chain. Instead, we point the ->payload pointer in the first + pbuf to data in one of the later pbufs. We also set the + inseg.data pointer to point to the right place. This way, the + ->p pointer will still point to the first pbuf, but the + ->p->payload pointer will point to data in another pbuf. 
+ + After we are done with adjusting the pbuf pointers we must + adjust the ->data pointer in the seg and the segment + length.*/ + + off = pcb->rcv_nxt - in_data->seqno; + p = in_data->inseg.p; + LWIP_ASSERT("inseg.p != NULL", in_data->inseg.p); + LWIP_ASSERT("insane offset!", (off < 0x7fff)); + if (in_data->inseg.p->len < off) { + LWIP_ASSERT("pbuf too short!", (((s32_t)in_data->inseg.p->tot_len) >= off)); + new_tot_len = (u16_t)(in_data->inseg.p->tot_len - off); + while (p->len < off) { + off -= p->len; + /* KJM following line changed (with addition of new_tot_len var) + to fix bug #9076 + inseg.p->tot_len -= p->len; */ + p->tot_len = new_tot_len; + p->len = 0; + p = p->next; + } + if(pbuf_header(p, (s16_t)-off)) { + /* Do we need to cope with this failing? Assert for now */ + LWIP_ASSERT("pbuf_header failed", 0); + } + } else { + if(pbuf_header(in_data->inseg.p, (s16_t)-off)) { + /* Do we need to cope with this failing? Assert for now */ + LWIP_ASSERT("pbuf_header failed", 0); + } + } + /* KJM following line changed to use p->payload rather than inseg->p->payload + to fix bug #9076 */ +#if LWIP_TSO +#else + in_data->inseg.dataptr = p->payload; +#endif /* LWIP_TSO */ + in_data->inseg.len -= (u16_t)(pcb->rcv_nxt - in_data->seqno); + in_data->inseg.tcphdr->seqno = in_data->seqno = pcb->rcv_nxt; + } + else { + if (TCP_SEQ_LT(in_data->seqno, pcb->rcv_nxt)){ + /* the whole segment is < rcv_nxt */ + /* must be a duplicate of a packet that has already been correctly handled */ + + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_receive: duplicate seqno %"U32_F"\n", in_data->seqno)); + tcp_ack_now(pcb); + } + } + + /* The sequence number must be within the window (above rcv_nxt + and below rcv_nxt + rcv_wnd) in order to be further + processed. */ + if (TCP_SEQ_BETWEEN(in_data->seqno, pcb->rcv_nxt, + pcb->rcv_nxt + pcb->rcv_wnd - 1)){ + if (pcb->rcv_nxt == in_data->seqno) { + /* The incoming segment is the next in sequence. 
We check if + we have to trim the end of the segment and update rcv_nxt + and pass the data to the application. */ + in_data->tcplen = TCP_TCPLEN(&in_data->inseg); + + if (in_data->tcplen > pcb->rcv_wnd) { + LWIP_DEBUGF(TCP_INPUT_DEBUG, + ("tcp_receive: other end overran receive window" + "seqno %"U32_F" len %"U16_F" right edge %"U32_F"\n", + in_data->seqno, in_data->tcplen, pcb->rcv_nxt + pcb->rcv_wnd)); + if (TCPH_FLAGS(in_data->inseg.tcphdr) & TCP_FIN) { + /* Must remove the FIN from the header as we're trimming + * that byte of sequence-space from the packet */ + TCPH_FLAGS_SET(in_data->inseg.tcphdr, TCPH_FLAGS(in_data->inseg.tcphdr) &~ TCP_FIN); + } + /* Adjust length of segment to fit in the window. */ + in_data->inseg.len = pcb->rcv_wnd; + if (TCPH_FLAGS(in_data->inseg.tcphdr) & TCP_SYN) { + in_data->inseg.len -= 1; + } + pbuf_realloc(in_data->inseg.p, in_data->inseg.len); + in_data->tcplen = TCP_TCPLEN(&in_data->inseg); + LWIP_ASSERT("tcp_receive: segment not trimmed correctly to rcv_wnd\n", + (in_data->seqno + in_data->tcplen) == (pcb->rcv_nxt + pcb->rcv_wnd)); + } +#if TCP_QUEUE_OOSEQ + /* Received in-sequence data, adjust ooseq data if: + - FIN has been received or + - inseq overlaps with ooseq */ + if (pcb->ooseq != NULL) { + if (TCPH_FLAGS(in_data->inseg.tcphdr) & TCP_FIN) { + LWIP_DEBUGF(TCP_INPUT_DEBUG, + ("tcp_receive: received in-order FIN, binning ooseq queue\n")); + /* Received in-order FIN means anything that was received + * out of order must now have been received in-order, so + * bin the ooseq queue */ + while (pcb->ooseq != NULL) { + struct tcp_seg *old_ooseq = pcb->ooseq; + pcb->ooseq = pcb->ooseq->next; + tcp_seg_free(pcb, old_ooseq); + } + } + else { + next = pcb->ooseq; + /* Remove all segments on ooseq that are covered by inseg already. + * FIN is copied from ooseq to inseg if present. 
*/ + while (next && + TCP_SEQ_GEQ(in_data->seqno + in_data->tcplen, + next->tcphdr->seqno + next->len)) { + /* inseg cannot have FIN here (already processed above) */ + if (TCPH_FLAGS(next->tcphdr) & TCP_FIN && + (TCPH_FLAGS(in_data->inseg.tcphdr) & TCP_SYN) == 0) { + TCPH_SET_FLAG(in_data->inseg.tcphdr, TCP_FIN); + in_data->tcplen = TCP_TCPLEN(&in_data->inseg); + } + prev = next; + next = next->next; + tcp_seg_free(pcb, prev); + } + /* Now trim right side of inseg if it overlaps with the first + * segment on ooseq */ + if (next && + TCP_SEQ_GT(in_data->seqno + in_data->tcplen, + next->tcphdr->seqno)) { + /* inseg cannot have FIN here (already processed above) */ +#if LWIP_TSO + in_data->inseg.len = (u32_t)(next->tcphdr->seqno - in_data->seqno); +#else + in_data->inseg.len = (u16_t)(next->tcphdr->seqno - in_data->seqno); +#endif /* LWIP_TSO */ + if (TCPH_FLAGS(in_data->inseg.tcphdr) & TCP_SYN) { + in_data->inseg.len -= 1; + } + pbuf_realloc(in_data->inseg.p, in_data->inseg.len); + in_data->tcplen = TCP_TCPLEN(&in_data->inseg); + LWIP_ASSERT("tcp_receive: segment not trimmed correctly to ooseq queue\n", + (in_data->seqno + in_data->tcplen) == next->in_data->tcphdr->in_data->seqno); + } + pcb->ooseq = next; + } + } +#endif /* TCP_QUEUE_OOSEQ */ + + pcb->rcv_nxt = in_data->seqno + in_data->tcplen; + + /* Update the receiver's (our) window. */ + LWIP_ASSERT("tcp_receive: tcplen > rcv_wnd\n", pcb->rcv_wnd >= in_data->tcplen); + pcb->rcv_wnd -= in_data->tcplen; + + tcp_update_rcv_ann_wnd(pcb); + + /* If there is data in the segment, we make preparations to + pass this up to the application. The ->recv_data variable + is used for holding the pbuf that goes to the + application. The code for reassembling out-of-sequence data + chains its data on this pbuf as well. + + If the segment was a FIN, we set the TF_GOT_FIN flag that will + be used to indicate to the application that the remote side has + closed its end of the connection. 
*/ + if (in_data->inseg.p->tot_len > 0) { + in_data->recv_data = in_data->inseg.p; + /* Since this pbuf now is the responsibility of the + application, we delete our reference to it so that we won't + (mistakingly) deallocate it. */ + in_data->inseg.p = NULL; + } + if (TCPH_FLAGS(in_data->inseg.tcphdr) & TCP_FIN) { + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_receive: received FIN.\n")); + in_data->recv_flags |= TF_GOT_FIN; + } + +#if TCP_QUEUE_OOSEQ + /* We now check if we have segments on the ->ooseq queue that + are now in sequence. */ + while (pcb->ooseq != NULL && + pcb->ooseq->tcphdr->seqno == pcb->rcv_nxt) { + + cseg = pcb->ooseq; + in_data->seqno = pcb->ooseq->tcphdr->seqno; + + pcb->rcv_nxt += TCP_TCPLEN(cseg); + LWIP_ASSERT("tcp_receive: ooseq tcplen > rcv_wnd\n", + pcb->rcv_wnd >= TCP_TCPLEN(cseg)); + pcb->rcv_wnd -= TCP_TCPLEN(cseg); + + tcp_update_rcv_ann_wnd(pcb); + + if (cseg->p->tot_len > 0) { + /* Chain this pbuf onto the pbuf that we will pass to + the application. */ + if (in_data->recv_data) { + pbuf_cat(in_data->recv_data, cseg->p); + } else { + in_data->recv_data = cseg->p; + } + cseg->p = NULL; + } + if (TCPH_FLAGS(cseg->tcphdr) & TCP_FIN) { + LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_receive: dequeued FIN.\n")); + in_data->recv_flags |= TF_GOT_FIN; + if (get_tcp_state(pcb) == ESTABLISHED) { /* force passive close or we can move to active close */ + set_tcp_state(pcb, CLOSE_WAIT); + } + } + + pcb->ooseq = cseg->next; + tcp_seg_free(pcb, cseg); + } +#endif /* TCP_QUEUE_OOSEQ */ + + + /* Acknowledge the segment(s). */ + if ((in_data->recv_data && in_data->recv_data->next) || tcp_quickack(pcb, in_data)) { + tcp_ack_now(pcb); + } else { + tcp_ack(pcb); + } + + } else { + /* We get here if the incoming segment is out-of-sequence. */ + tcp_send_empty_ack(pcb); +#if TCP_QUEUE_OOSEQ + /* We queue the segment on the ->ooseq queue. 
*/ + if (pcb->ooseq == NULL) { + pcb->ooseq = tcp_seg_copy(pcb, &in_data->inseg); + } else { + /* If the queue is not empty, we walk through the queue and + try to find a place where the sequence number of the + incoming segment is between the sequence numbers of the + previous and the next segment on the ->ooseq queue. That is + the place where we put the incoming segment. If needed, we + trim the second edges of the previous and the incoming + segment so that it will fit into the sequence. + + If the incoming segment has the same sequence number as a + segment on the ->ooseq queue, we discard the segment that + contains less data. */ + + prev = NULL; + for(next = pcb->ooseq; next != NULL; next = next->next) { + if (in_data->seqno == next->tcphdr->seqno) { + /* The sequence number of the incoming segment is the + same as the sequence number of the segment on + ->ooseq. We check the lengths to see which one to + discard. */ + if (in_data->inseg.len > next->len) { + /* The incoming segment is larger than the old + segment. We replace some segments with the new + one. */ + cseg = tcp_seg_copy(pcb, &in_data->inseg); + if (cseg != NULL) { + if (prev != NULL) { + prev->next = cseg; + } else { + pcb->ooseq = cseg; + } + tcp_oos_insert_segment(pcb, cseg, next, in_data); + } + break; + } else { + /* Either the lenghts are the same or the incoming + segment was smaller than the old one; in either + case, we ditch the incoming segment. */ + break; + } + } else { + if (prev == NULL) { + if (TCP_SEQ_LT(in_data->seqno, next->tcphdr->seqno)) { + /* The sequence number of the incoming segment is lower + than the sequence number of the first segment on the + queue. We put the incoming segment first on the + queue. 
*/ + cseg = tcp_seg_copy(pcb, &in_data->inseg); + if (cseg != NULL) { + pcb->ooseq = cseg; + tcp_oos_insert_segment(pcb, cseg, next, in_data); + } + break; + } + } else { + /*if (TCP_SEQ_LT(prev->tcphdr->seqno, seqno) && + TCP_SEQ_LT(seqno, next->tcphdr->seqno)) {*/ + if (TCP_SEQ_BETWEEN(in_data->seqno, prev->tcphdr->seqno+1, next->tcphdr->seqno-1)) { + /* The sequence number of the incoming segment is in + between the sequence numbers of the previous and + the next segment on ->ooseq. We trim trim the previous + segment, delete next segments that included in received segment + and trim received, if needed. */ + cseg = tcp_seg_copy(pcb, &in_data->inseg); + if (cseg != NULL) { + if (TCP_SEQ_GT(prev->tcphdr->seqno + prev->len, in_data->seqno)) { + /* We need to trim the prev segment. */ +#if LWIP_TSO + prev->len = (u32_t)(in_data->seqno - prev->tcphdr->seqno); +#else + prev->len = (u16_t)(in_data->seqno - prev->tcphdr->seqno); +#endif /* LWIP_TSO */ + pbuf_realloc(prev->p, prev->len); + } + prev->next = cseg; + tcp_oos_insert_segment(pcb, cseg, next, in_data); + } + break; + } + } + /* If the "next" segment is the last segment on the + ooseq queue, we add the incoming segment to the end + of the list. */ + if (next->next == NULL && + TCP_SEQ_GT(in_data->seqno, next->tcphdr->seqno)) { + if (TCPH_FLAGS(next->tcphdr) & TCP_FIN) { + /* segment "next" already contains all data */ + break; + } + next->next = tcp_seg_copy(pcb, &in_data->inseg); + if (next->next != NULL) { + if (TCP_SEQ_GT(next->tcphdr->seqno + next->len, in_data->seqno)) { + /* We need to trim the last segment. 
*/ +#if LWIP_TSO + next->len = (u32_t)(in_data->seqno - next->tcphdr->seqno); +#else + next->len = (u16_t)(in_data->seqno - next->tcphdr->seqno); +#endif /* LWIP_TSO */ + pbuf_realloc(next->p, next->len); + } + /* check if the remote side overruns our receive window */ + if ((u32_t)in_data->tcplen + in_data->seqno > pcb->rcv_nxt + (u32_t)pcb->rcv_wnd) { + LWIP_DEBUGF(TCP_INPUT_DEBUG, + ("tcp_receive: other end overran receive window" + "seqno %"U32_F" len %"U16_F" right edge %"U32_F"\n", + in_data->seqno, in_data->tcplen, pcb->rcv_nxt + pcb->rcv_wnd)); + if (TCPH_FLAGS(next->next->tcphdr) & TCP_FIN) { + /* Must remove the FIN from the header as we're trimming + * that byte of sequence-space from the packet */ + TCPH_FLAGS_SET(next->next->tcphdr, TCPH_FLAGS(next->next->tcphdr) &~ TCP_FIN); + } + /* Adjust length of segment to fit in the window. */ + next->next->len = pcb->rcv_nxt + pcb->rcv_wnd - in_data->seqno; + pbuf_realloc(next->next->p, next->next->len); + in_data->tcplen = TCP_TCPLEN(next->next); + LWIP_ASSERT("tcp_receive: segment not trimmed correctly to rcv_wnd\n", + (in_data->seqno + in_data->tcplen) == (pcb->rcv_nxt + pcb->rcv_wnd)); + } + } + break; + } + } + prev = next; + } + } +#endif /* TCP_QUEUE_OOSEQ */ + + } + } else { + /* The incoming segment is not withing the window. */ + tcp_send_empty_ack(pcb); + } + } else { + /* Segments with length 0 is taken care of here. Segments that + fall out of the window are ACKed. */ + /*if (TCP_SEQ_GT(pcb->rcv_nxt, seqno) || + TCP_SEQ_GEQ(seqno, pcb->rcv_nxt + pcb->rcv_wnd)) {*/ + if(!TCP_SEQ_BETWEEN(in_data->seqno, pcb->rcv_nxt, pcb->rcv_nxt + pcb->rcv_wnd-1)){ + tcp_ack_now(pcb); + } + } +} + +/** + * Parses the options contained in the incoming segment. + * + * Called from tcp_listen_input() and tcp_process(). + * Currently, only the MSS and window scaling options are supported! 
+ *
+ * @param pcb the tcp_pcb for which a segment arrived
+ */
+static void
+tcp_parseopt(struct tcp_pcb *pcb, tcp_in_data* in_data)
+{
+  u16_t c, max_c;
+  u16_t mss;
+  u16_t snd_mss;
+  u8_t *opts, opt;
+#if LWIP_TCP_TIMESTAMPS
+  u32_t tsval;
+#endif
+
+  /* Options begin immediately after the fixed 20-byte TCP header. */
+  opts = (u8_t *)in_data->tcphdr + TCP_HLEN;
+
+  /* Parse the TCP MSS option, if present. */
+  if(TCPH_HDRLEN(in_data->tcphdr) > 0x5) {
+    /* Option bytes available = (header length in 32-bit words - 5) * 4. */
+    max_c = (TCPH_HDRLEN(in_data->tcphdr) - 5) << 2;
+    /* c is the cursor into opts[]; each case advances it past its option. */
+    for (c = 0; c < max_c; ) {
+      opt = opts[c];
+      switch (opt) {
+      case 0x00:
+        /* End of options. */
+        LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: EOL\n"));
+        return;
+      case 0x01:
+        /* NOP option (single-byte padding). */
+        ++c;
+        LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: NOP\n"));
+        break;
+      case 0x02:
+        LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: MSS\n"));
+        if (opts[c + 1] != 0x04 || c + 0x04 > max_c) {
+          /* Bad length */
+          LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: bad length\n"));
+          return;
+        }
+        /* Check if the incoming flag is SYN: MSS is only honored on SYN. */
+        if(in_data->flags & TCP_SYN) {
+          /* An MSS option with the right option length (big-endian 16-bit value). */
+          mss = (opts[c + 2] << 8) | opts[c + 3];
+          /* Limit the mss to the configured TCP_MSS and prevent division by zero */
+          snd_mss = ((mss > pcb->advtsd_mss) || (mss == 0)) ? pcb->advtsd_mss : mss;
+          UPDATE_PCB_BY_MSS(pcb, snd_mss);
+        }
+        /* Advance to next option */
+        c += 0x04;
+        break;
+      case 0x03:
+        LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: WND SCALE\n"));
+        if (opts[c + 1] != 0x03 || (c + 0x03 > max_c)) {
+          /* Bad length */
+          LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: bad length\n"));
+          return;
+        }
+        /* If syn was received with wnd scale option,
+           activate wnd scale opt, but only if this is not a retransmission */
+        if(enable_wnd_scale && (in_data->flags & TCP_SYN) && !(pcb->flags & TF_WND_SCALE)) {
+          /* Clamp the shift count to 14, the maximum allowed by RFC 1323. */
+          pcb->snd_scale = opts[c + 2] > 14U ? 14U : opts[c + 2];
+          pcb->rcv_scale = rcv_wnd_scale;
+          pcb->flags |= TF_WND_SCALE;
+        }
+        /* Advance to next option */
+        c += 0x03;
+        break;
+#if LWIP_TCP_TIMESTAMPS
+      case 0x08:
+        LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: TS\n"));
+        if (opts[c + 1] != 0x0A || c + 0x0A > max_c) {
+          /* Bad length */
+          LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: bad length\n"));
+          return;
+        }
+        /* TCP timestamp option with valid length */
+        tsval = (opts[c+2]) | (opts[c+3] << 8) |
+          (opts[c+4] << 16) | (opts[c+5] << 24);
+        if (in_data->flags & TCP_SYN) {
+          if (pcb->enable_ts_opt) {
+            pcb->ts_recent = ntohl(tsval);
+            pcb->flags |= TF_TIMESTAMP;
+          }
+        } else if (TCP_SEQ_BETWEEN(pcb->ts_lastacksent, in_data->seqno, in_data->seqno+in_data->tcplen)) {
+          pcb->ts_recent = ntohl(tsval);
+        }
+        /* Advance to next option */
+        c += 0x0A;
+        break;
+#endif
+      default:
+        LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: other\n"));
+        if (opts[c + 1] == 0) {
+          LWIP_DEBUGF(TCP_INPUT_DEBUG, ("tcp_parseopt: bad length\n"));
+          /* If the length field is zero, the options are malformed
+             and we don't process them further. */
+          return;
+        }
+        /* All other options have a length field, so that we easily
+           can skip past them. */
+        c += opts[c + 1];
+      }
+    }
+  }
+}
+
+#endif /* LWIP_TCP */
diff --git a/src/vma/lwip/tcp_out.c b/src/vma/lwip/tcp_out.c
new file mode 100644
index 0000000..5ef12c7
--- /dev/null
+++ b/src/vma/lwip/tcp_out.c
@@ -0,0 +1,2079 @@
+/**
+ * @file
+ * Transmission Control Protocol, outgoing traffic
+ *
+ * The output functions of TCP.
+ *
+ */
+
+/*
+ * Copyright (c) 2001-2004 Swedish Institute of Computer Science.
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without modification,
+ * are permitted provided that the following conditions are met:
+ *
+ * 1. Redistributions of source code must retain the above copyright notice,
+ *    this list of conditions and the following disclaimer.
+ * 2.
Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * 3. The name of the author may not be used to endorse or promote products + * derived from this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR ``AS IS'' AND ANY EXPRESS OR IMPLIED + * WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT + * SHALL THE AUTHOR BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, + * EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT + * OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS + * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN + * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING + * IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY + * OF SUCH DAMAGE. + * + * This file is part of the lwIP TCP/IP stack. + * + * Author: Adam Dunkels + * + */ + +#include "vma/lwip/opt.h" + +#if LWIP_TCP /* don't build if not configured for use in lwipopts.h */ + +#include "vma/lwip/tcp_impl.h" +#include "vma/lwip/stats.h" + +#include +#include + +/* Define some copy-macros for checksum-on-copy so that the code looks + nicer by preventing too many ifdef's. 
*/
+#if TCP_CHECKSUM_ON_COPY
+/* Copy data and fold the partial checksum into the segment as we go. */
+#define TCP_DATA_COPY(dst, src, len, seg) do { \
+  tcp_seg_add_chksum(LWIP_CHKSUM_COPY(dst, src, len), \
+                     len, &seg->chksum, &seg->chksum_swapped); \
+  seg->flags |= TF_SEG_DATA_CHECKSUMMED; } while(0)
+#define TCP_DATA_COPY2(dst, src, len, chksum, chksum_swapped) \
+  tcp_seg_add_chksum(LWIP_CHKSUM_COPY(dst, src, len), len, chksum, chksum_swapped);
+#else /* TCP_CHECKSUM_ON_COPY*/
+/* Without checksum-on-copy these degrade to a plain copy. */
+#define TCP_DATA_COPY(dst, src, len, seg) MEMCPY(dst, src, len)
+#define TCP_DATA_COPY2(dst, src, len, chksum, chksum_swapped) MEMCPY(dst, src, len)
+#endif /* TCP_CHECKSUM_ON_COPY*/
+
+/** Define this to 1 for an extra check that the output checksum is valid
+ * (useful when the checksum is generated by the application, not the stack) */
+#ifndef TCP_CHECKSUM_ON_COPY_SANITY_CHECK
+#define TCP_CHECKSUM_ON_COPY_SANITY_CHECK 0
+#endif
+
+#if LWIP_DEBUG_ENABLE
+/** Summarize a segment chain for debug traces: counts segments and pbufs
+ * and totals their lengths. Returns a pointer to a per-thread static
+ * buffer, overwritten by the next call on the same thread. */
+static char* _dump_seg(struct tcp_seg *seg)
+{
+  static __thread char _tcp_dump_buf[100];
+  struct tcp_seg *cur_seg = NULL;
+  struct pbuf *cur_pbuf = NULL;
+  int seg_num = 0;
+  int pbuf_num = 0;
+  int seg_len = 0;
+  int pbuf_len = 0;
+
+  /* Walk the segment list; for each segment also walk its pbuf chain. */
+  cur_seg = seg;
+  while (cur_seg) {
+    seg_len += cur_seg->len;
+    seg_num++;
+    cur_pbuf = cur_seg->p;
+    while (cur_pbuf) {
+      pbuf_len += cur_pbuf->len;
+      pbuf_num++;
+      cur_pbuf = cur_pbuf->next;
+    }
+    cur_seg = cur_seg->next;
+  }
+
+  snprintf(_tcp_dump_buf, sizeof(_tcp_dump_buf),
+           "[seg] num: %-2d len: %-6d [pbuf] num: %-2d len: %-6d",
+           seg_num, seg_len, pbuf_num, pbuf_len);
+  return _tcp_dump_buf;
+}
+#endif /* LWIP_DEBUG_ENABLE */
+
+/* Callback installed by the host application to supply the current time. */
+sys_now_fn sys_now;
+void register_sys_now(sys_now_fn fn)
+{
+  sys_now = fn;
+}
+
+#if LWIP_3RD_PARTY_L3
+/* Hooks letting an external L3 implementation send packets and report MTU. */
+ip_output_fn external_ip_output;
+
+void register_ip_output(ip_output_fn fn)
+{
+  external_ip_output = fn;
+}
+
+ip_route_mtu_fn external_ip_route_mtu;
+
+void register_ip_route_mtu(ip_route_mtu_fn fn)
+{
+  external_ip_route_mtu = fn;
+}
+
+/* Scatter-gather read callback provided by the host application. */
+sys_readv_fn sys_readv;
+void register_sys_readv(sys_readv_fn fn)
+{
+  sys_readv = fn;
+}
+#endif
+
+/* Forward declarations.*/
+static void tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb);
+
+/** Allocate a pbuf and create a tcphdr at p->payload, used for output
+ * functions other than the default tcp_output -> tcp_output_segment
+ * (e.g. tcp_send_empty_ack, etc.)
+ *
+ * The header is filled with the pcb's ports, rcv_nxt as ackno, the scaled
+ * announced receive window, and the TCP_ACK flag; chksum and urgp are zeroed.
+ *
+ * @param pcb tcp pcb for which to send a packet (used to initialize tcp_hdr)
+ * @param optlen length of header-options
+ * @param datalen length of tcp data to reserve in pbuf
+ * @param seqno_be seqno in network byte order (big-endian)
+ * @return pbuf with p->payload being the tcp_hdr, or NULL on allocation failure
+ */
+static struct pbuf *
+tcp_output_alloc_header(struct tcp_pcb *pcb, u16_t optlen, u16_t datalen,
+                      u32_t seqno_be /* already in network byte order */)
+{
+  struct tcp_hdr *tcphdr;
+  struct pbuf *p = tcp_tx_pbuf_alloc(pcb, optlen + datalen, PBUF_RAM);
+  if (p != NULL) {
+    /* NOTE(review): pbuf_header() return value is unchecked here -
+       presumably tx pbufs always reserve TCP_HLEN of headroom; confirm. */
+    pbuf_header(p, TCP_HLEN);
+    LWIP_ASSERT("check that first pbuf can hold struct tcp_hdr",
+                 (p->len >= TCP_HLEN + optlen));
+    tcphdr = (struct tcp_hdr *)p->payload;
+    tcphdr->src = htons(pcb->local_port);
+    tcphdr->dest = htons(pcb->remote_port);
+    tcphdr->seqno = seqno_be;
+    tcphdr->ackno = htonl(pcb->rcv_nxt);
+    TCPH_HDRLEN_FLAGS_SET(tcphdr, (5 + optlen / 4), TCP_ACK);
+    tcphdr->wnd = htons(TCPWND_MIN16(RCV_WND_SCALE(pcb, pcb->rcv_ann_wnd)));
+    tcphdr->chksum = 0;
+    tcphdr->urgp = 0;
+
+    /* If we're sending a packet, update the announced right window edge */
+    pcb->rcv_ann_right_edge = pcb->rcv_nxt + pcb->rcv_ann_wnd;
+  }
+  return p;
+}
+
+/**
+ * Called by tcp_close() to send a segment including FIN flag but not data.
+ * + * @param pcb the tcp_pcb over which to send a segment + * @return ERR_OK if sent, another err_t otherwise + */ +err_t +tcp_send_fin(struct tcp_pcb *pcb) +{ + /* first, try to add the fin to the last unsent segment */ + if (pcb->unsent != NULL) { + struct tcp_seg *last_unsent; + for (last_unsent = pcb->unsent; last_unsent->next != NULL; + last_unsent = last_unsent->next); + + if ((TCPH_FLAGS(last_unsent->tcphdr) & (TCP_SYN | TCP_FIN | TCP_RST)) == 0) { + /* no SYN/FIN/RST flag in the header, we can add the FIN flag */ + TCPH_SET_FLAG(last_unsent->tcphdr, TCP_FIN); + pcb->flags |= TF_FIN; + return ERR_OK; + } + } + /* no data, no length, flags, copy=1, no optdata */ + return tcp_enqueue_flags(pcb, TCP_FIN); +} + +/** + * Create a TCP segment with prefilled header. + * + * Called by tcp_write and tcp_enqueue_flags. + * + * @param pcb Protocol control block for the TCP connection. + * @param p pbuf that is used to hold the TCP header. + * @param flags TCP flags for header. + * @param seqno TCP sequence number of this packet + * @param optflags options to include in TCP header + * @return a new tcp_seg pointing to p, or NULL. + * The TCP header is filled in except ackno and wnd. + * p is freed on failure. + */ +static struct tcp_seg * +tcp_create_segment(struct tcp_pcb *pcb, struct pbuf *p, u8_t flags, u32_t seqno, u8_t optflags) +{ + struct tcp_seg *seg; + u8_t optlen = LWIP_TCP_OPT_LENGTH(optflags); + + if (!pcb->seg_alloc) { + // seg_alloc is not valid, we should allocate a new segment. 
+ if ((seg = external_tcp_seg_alloc(pcb)) == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_create_segment: no memory.\n")); + tcp_tx_pbuf_free(pcb, p); + return NULL; + } + + seg->next = NULL; +#if TCP_OVERSIZE_DBGCHECK + seg->oversize_left = 0; +#endif /* TCP_OVERSIZE_DBGCHECK */ +#if TCP_CHECKSUM_ON_COPY + seg->chksum = 0; + seg->chksum_swapped = 0; + /* check optflags */ + LWIP_ASSERT("invalid optflags passed: TF_SEG_DATA_CHECKSUMMED", + (optflags & TF_SEG_DATA_CHECKSUMMED) == 0); +#endif /* TCP_CHECKSUM_ON_COPY */ + + if (p == NULL) { + // Request a new segment in order to update seg_alloc for the next packet. + seg->p = NULL; + return seg; + } + } else { + // seg_alloc is valid, we dont need to allocate a new segment element. + seg = pcb->seg_alloc; + pcb->seg_alloc = NULL; + } + + seg->flags = optflags; + seg->p = p; +#if LWIP_TSO +#else + seg->dataptr = p->payload; +#endif /* LWIP_TSO */ + seg->len = p->tot_len - optlen; + seg->seqno = seqno; + + /* build TCP header */ + if (pbuf_header(p, TCP_HLEN)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_create_segment: no room for TCP header in pbuf.\n")); + TCP_STATS_INC(tcp.err); + tcp_tx_seg_free(pcb, seg); + return NULL; + } + seg->tcphdr = (struct tcp_hdr *)seg->p->payload; + seg->tcphdr->src = htons(pcb->local_port); + seg->tcphdr->dest = htons(pcb->remote_port); + seg->tcphdr->seqno = htonl(seqno); + /* ackno is set in tcp_output */ + TCPH_HDRLEN_FLAGS_SET(seg->tcphdr, (5 + optlen / 4), flags); + /* wnd and chksum are set in tcp_output */ + seg->tcphdr->urgp = 0; + return seg; +} + +/** + * Allocate a PBUF_RAM pbuf, perhaps with extra space at the end. + * + * This function is like pbuf_alloc(layer, length, PBUF_RAM) except + * there may be extra bytes available at the end. + * + * @param length size of the pbuf's payload. + * @param max_length maximum usable size of payload+oversize. + * @param oversize pointer to a u16_t that will receive the number of usable tail bytes. 
+ * @param pcb The TCP connection that willo enqueue the pbuf. + * @param tcp_write_flag_more true if TCP_WRITE_FLAG_MORE flag was enabled. + * @param first_seg true when this pbuf will be used in the first enqueued segment. + * @param + */ +static struct pbuf * +tcp_pbuf_prealloc(u16_t length, u16_t max_length, + u16_t *oversize, struct tcp_pcb *pcb, u8_t tcp_write_flag_more, + u8_t first_seg) +{ + struct pbuf *p; + u16_t alloc = length; + + if (length < max_length) { + /* Should we allocate an oversized pbuf, or just the minimum + * length required? If tcp_write is going to be called again + * before this segment is transmitted, we want the oversized + * buffer. If the segment will be transmitted immediately, we can + * save memory by allocating only length. We use a simple + * heuristic based on the following information: + * + * Will the Nagle algorithm defer transmission of this segment? + */ + if (tcp_write_flag_more || + (!(pcb->flags & TF_NODELAY) && + (!first_seg || + pcb->unsent != NULL || + pcb->unacked != NULL))) { + alloc = LWIP_MIN(max_length, LWIP_MEM_ALIGN_SIZE(length + pcb->tcp_oversize_val)); + } + } + p = tcp_tx_pbuf_alloc(pcb, alloc, PBUF_RAM); + if (p == NULL) { + return NULL; + } + LWIP_ASSERT("need unchained pbuf", p->next == NULL); + *oversize = p->len - length; + /* trim p->len to the currently used size */ + p->len = p->tot_len = length; + return p; +} + +/** Checks if tcp_write is allowed or not (checks state, snd_buf and snd_queuelen). + * + * @param pcb the tcp pcb to check for + * @param len length of data to send (checked agains snd_buf) + * @return ERR_OK if tcp_write is allowed to proceed, another err_t otherwise + */ +static err_t +tcp_write_checks(struct tcp_pcb *pcb, u32_t len) +{ + /* connection is in invalid state for data transmission? 
*/ + if ((get_tcp_state(pcb) != ESTABLISHED) && + (get_tcp_state(pcb) != CLOSE_WAIT) && + (get_tcp_state(pcb) != SYN_SENT) && + (get_tcp_state(pcb) != SYN_RCVD)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_STATE | LWIP_DBG_LEVEL_SEVERE, ("tcp_write() called in invalid state\n")); + return ERR_CONN; + } else if (len == 0) { + return ERR_OK; + } + + /* fail on too much data */ + if (len > pcb->snd_buf) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, ("tcp_write: too much data (len=%"U32_F" > snd_buf=%"U32_F")\n", + len, pcb->snd_buf)); + pcb->flags |= TF_NAGLEMEMERR; + return ERR_MEM; + } + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_write: queuelen: %"U32_F"\n", (u32_t)pcb->snd_queuelen)); + + /* If total number of pbufs on the unsent/unacked queues exceeds the + * configured maximum, return an error */ + /* check for configured max queuelen and possible overflow */ + if ((pcb->snd_queuelen >= pcb->max_unsent_len) || (pcb->snd_queuelen > TCP_SNDQUEUELEN_OVERFLOW)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, ("tcp_write: too long queue %"U32_F" (max %"U32_F")\n", + pcb->snd_queuelen, pcb->max_unsent_len)); + TCP_STATS_INC(tcp.memerr); + pcb->flags |= TF_NAGLEMEMERR; + return ERR_MEM; + } + if (pcb->snd_queuelen != 0) { + } else { + LWIP_ASSERT("tcp_write: no pbufs on queue => both queues empty", + pcb->unacked == NULL && pcb->unsent == NULL); + } + return ERR_OK; +} + +#if LWIP_TSO +static inline u16_t tcp_xmit_size_goal(struct tcp_pcb *pcb, int use_max) +{ + u16_t size = pcb->mss; + +#if LWIP_TCP_TIMESTAMPS + if ((pcb->flags & TF_TIMESTAMP)) { + /* ensure that segments can hold at least one data byte... 
*/ + size = LWIP_MAX(size, LWIP_TCP_OPT_LEN_TS + 1); + } +#endif /* LWIP_TCP_TIMESTAMPS */ + +#if LWIP_TSO + if (use_max && tcp_tso(pcb) && pcb->tso.max_buf_sz) { + /* use maximum buffer size in case TSO */ + size = LWIP_MAX(size, pcb->tso.max_buf_sz); + } +#endif /* LWIP_TSO */ + + /* don't allocate segments bigger than half the maximum window we ever received */ + size = LWIP_MIN(size, (pcb->snd_wnd_max >> 1)); + + return size; +} +#endif /* LWIP_TSO */ + +/** + * Write data for sending (but does not send it immediately). + * + * It waits in the expectation of more data being sent soon (as + * it can send them more efficiently by combining them together). + * To prompt the system to send data now, call tcp_output() after + * calling tcp_write(). + * + * @param pcb Protocol control block for the TCP connection to enqueue data for. + * @param arg Pointer to the data to be enqueued for sending. + * @param len Data length in bytes + * @param apiflags combination of following flags : + * - TCP_WRITE_FLAG_COPY (0x01) data will be copied into memory belonging to the stack + * - TCP_WRITE_FLAG_MORE (0x02) for TCP connection, PSH flag will be set on last segment sent + * - TCP_WRITE_DUMMY (0x10) indicates if the packet is a dummy packet + * - TCP_WRITE_FILE (0x40) data should be taken from file + * @return ERR_OK if enqueued, another err_t on error + */ +err_t +tcp_write(struct tcp_pcb *pcb, const void *arg, u32_t len, u8_t apiflags) +{ + struct pbuf *concat_p = NULL; + struct tcp_seg *seg = NULL, *prev_seg = NULL, *queue = NULL; + u32_t pos = 0; /* position in 'arg' data */ + u32_t queuelen; + u8_t optlen = 0; + u8_t optflags = 0; +#if TCP_OVERSIZE + u16_t oversize = 0; + u16_t oversize_used = 0; +#endif /* TCP_OVERSIZE */ +#if TCP_CHECKSUM_ON_COPY + u16_t concat_chksum = 0; + u8_t concat_chksum_swapped = 0; + u16_t concat_chksummed = 0; +#endif /* TCP_CHECKSUM_ON_COPY */ + err_t err; + u16_t mss_local = 0; +#if LWIP_TSO + int tot_p = 0; +#endif /* LWIP_TSO */ + const 
int piov_max_size = 512; + const int piov_max_len = 65536; + struct iovec piov[piov_max_size]; + int piov_cur_index = 0; + int piov_cur_len = 0; + + int byte_queued = pcb->snd_nxt - pcb->lastack; + if ( len < pcb->mss && !(apiflags & TCP_WRITE_DUMMY)) + pcb->snd_sml_add = (pcb->unacked ? pcb->unacked->len : 0) + byte_queued; + + LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("tcp_write(pcb=%p, data=%p, len=%"U16_F", apiflags=%"U16_F")\n", + (void *)pcb, arg, len, (u16_t)apiflags)); + LWIP_ERROR("tcp_write: arg == NULL (programmer violates API)", + arg != NULL, return ERR_ARG;); + + err = tcp_write_checks(pcb, len); + if (err != ERR_OK) { + return err; + } + queuelen = pcb->snd_queuelen; + +#if LWIP_TSO + mss_local = tcp_xmit_size_goal(pcb, 1); +#else + mss_local = LWIP_MIN(pcb->mss, pcb->snd_wnd_max/2); + mss_local = mss_local ? mss_local : pcb->mss; +#endif /* LWIP_TSO */ + + optflags |= (apiflags & TCP_WRITE_DUMMY ? TF_SEG_OPTS_DUMMY_MSG : 0); + +#if LWIP_TCP_TIMESTAMPS + if ((pcb->flags & TF_TIMESTAMP)) { + optflags |= TF_SEG_OPTS_TS; +#if LWIP_TSO +#else + /* ensure that segments can hold at least one data byte... */ + mss_local = LWIP_MAX(mss_local, LWIP_TCP_OPT_LEN_TS + 1); +#endif /* LWIP_TSO */ + } +#endif /* LWIP_TCP_TIMESTAMPS */ + + optlen = LWIP_TCP_OPT_LENGTH( optflags ); + + /* + * TCP segmentation is done in three phases with increasing complexity: + * + * 1. Copy data directly into an oversized pbuf. + * 2. Chain a new pbuf to the end of pcb->unsent. + * 3. Create new segments. + * + * We may run out of memory at any point. In that case we must + * return ERR_MEM and not change anything in pcb. Therefore, all + * changes are recorded in local variables and committed at the end + * of the function. Some pcb fields are maintained in local copies: + * + * queuelen = pcb->snd_queuelen + * oversize = pcb->unsent_oversize + * + * These variables are set consistently by the phases: + * + * seg points to the last segment tampered with. 
+ * + * pos records progress as data is segmented. + */ + + /* Find the tail of the unsent queue. */ + if (pcb->unsent != NULL) { + u16_t space; + u16_t unsent_optlen; + + if (!pcb->last_unsent || pcb->last_unsent->next) { + /* @todo: this could be sped up by keeping last_unsent in the pcb */ + for (pcb->last_unsent = pcb->unsent; pcb->last_unsent->next != NULL; + pcb->last_unsent = pcb->last_unsent->next); + } + /* Usable space at the end of the last unsent segment */ + unsent_optlen = LWIP_TCP_OPT_LENGTH(pcb->last_unsent->flags); + LWIP_ASSERT("mss_local is too small", mss_local >= pcb->last_unsent->len + unsent_optlen); + space = mss_local - (pcb->last_unsent->len + unsent_optlen); + seg = pcb->last_unsent; +#if LWIP_TSO + tot_p = pbuf_clen(seg->p); +#endif /* LWIP_TSO */ + + /* + * Phase 1: Copy data directly into an oversized pbuf. + * + * The number of bytes copied is recorded in the oversize_used + * variable. The actual copying is done at the bottom of the + * function. + */ +#if TCP_OVERSIZE +#if TCP_OVERSIZE_DBGCHECK + /* check that pcb->unsent_oversize matches last_unsent->unsent_oversize */ + LWIP_ASSERT("unsent_oversize mismatch (pcb vs. last_unsent)", + pcb->unsent_oversize == pcb->last_unsent->oversize_left); +#endif /* TCP_OVERSIZE_DBGCHECK */ + + if (pcb->unsent_oversize > 0) { + if (!(apiflags & TCP_WRITE_FILE)) { + oversize = pcb->unsent_oversize; + LWIP_ASSERT("inconsistent oversize vs. space", oversize_used <= space); + oversize_used = oversize < len ? oversize : len; + pos += oversize_used; + oversize -= oversize_used; + space -= oversize_used; + } + } + /* now we are either finished or oversize is zero */ + LWIP_ASSERT("inconsistend oversize vs. len", (oversize == 0) || (pos == len)); +#endif /* TCP_OVERSIZE */ + + /* + * Phase 2: Chain a new pbuf to the end of pcb->unsent. + * + * We don't extend segments containing SYN/FIN flags or options + * (len==0). The new pbuf is kept in concat_p and pbuf_cat'ed at + * the end. 
+ */ +#if LWIP_TSO + if ((pos < len) && (space > 0) && (pcb->last_unsent->len > 0) && + (tot_p < (int)pcb->tso.max_send_sge)) { +#else + if ((pos < len) && (space > 0) && (pcb->last_unsent->len > 0)) { +#endif /* LWIP_TSO */ + + u16_t seglen = space < len - pos ? space : len - pos; + + /* Create a pbuf with a copy or reference to seglen bytes. We + * can use PBUF_RAW here since the data appears in the middle of + * a segment. A header will never be prepended. */ + /* Data is copied */ + if ((concat_p = tcp_pbuf_prealloc(seglen, space, &oversize, pcb, TCP_WRITE_FLAG_MORE, 1)) == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, + ("tcp_write : could not allocate memory for pbuf copy size %"U16_F"\n", + seglen)); + goto memerr; + } +#if TCP_OVERSIZE_DBGCHECK + pcb->last_unsent->oversize_left += oversize; +#endif /* TCP_OVERSIZE_DBGCHECK */ + TCP_DATA_COPY2(concat_p->payload, (u8_t*)arg + pos, seglen, &concat_chksum, &concat_chksum_swapped); +#if TCP_CHECKSUM_ON_COPY + concat_chksummed += seglen; +#endif /* TCP_CHECKSUM_ON_COPY */ + + pos += seglen; + queuelen += pbuf_clen(concat_p); + } + } else { +#if TCP_OVERSIZE + pcb->last_unsent = NULL; + LWIP_ASSERT("unsent_oversize mismatch (pcb->unsent is NULL)", + pcb->unsent_oversize == 0); +#endif /* TCP_OVERSIZE */ + } + + /* + * Phase 3: Create new segments. + * + * The new segments are chained together in the local 'queue' + * variable, ready to be appended to pcb->unsent. + */ + while (pos < len) { + struct pbuf *p; + u32_t left = len - pos; + u16_t max_len = mss_local - optlen; + u16_t seglen = left > max_len ? 
max_len : left; + + /* If copy is set, memory should be allocated and data copied + * into pbuf */ + if ((p = tcp_pbuf_prealloc(seglen + optlen, max_len, &oversize, pcb, TCP_WRITE_FLAG_MORE, queue == NULL)) == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_write : could not allocate memory for pbuf copy size %"U16_F"\n", seglen)); + goto memerr; + } + LWIP_ASSERT("tcp_write: check that first pbuf can hold the complete seglen", + (p->len >= seglen)); + if (apiflags & TCP_WRITE_FILE) { + piov[piov_cur_index].iov_base = (void *)((char *)p->payload + optlen); + piov[piov_cur_index].iov_len = seglen; + + piov_cur_index++; + piov_cur_len += seglen; + if ((left <= seglen ) || (piov_cur_index >= piov_max_size) || (piov_cur_len >= piov_max_len)) { + int ret = 0; + int fd = *(int *)arg; + ret = sys_readv(fd, piov, piov_cur_index); + /* Set as failure any unexpected return values because tcp_write() function + * does not support partial write + */ + if (ret != piov_cur_len) { + goto memerr; + } + piov_cur_index = 0; + piov_cur_len = 0; + } + } else { + TCP_DATA_COPY2((char *)p->payload + optlen, (u8_t*)arg + pos, seglen, &chksum, &chksum_swapped); + } + + queuelen += pbuf_clen(p); + + /* Now that there are more segments queued, we check again if the + * length of the queue exceeds the configured maximum or + * overflows. */ + if ((queuelen > pcb->max_unsent_len) || (queuelen > TCP_SNDQUEUELEN_OVERFLOW)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_write: queue too long %"U32_F" (%"U32_F")\n", queuelen, pcb->max_unsent_len)); + tcp_tx_pbuf_free(pcb, p); + goto memerr; + } + + if ((seg = tcp_create_segment(pcb, p, 0, pcb->snd_lbb + pos, optflags)) == NULL) { + goto memerr; + } +#if TCP_OVERSIZE_DBGCHECK + seg->oversize_left = oversize; +#endif /* TCP_OVERSIZE_DBGCHECK */ +#if TCP_CHECKSUM_ON_COPY + seg->chksum = chksum; + seg->chksum_swapped = chksum_swapped; + seg->flags |= TF_SEG_DATA_CHECKSUMMED; +#endif /* TCP_CHECKSUM_ON_COPY */ + + /* first segment of to-be-queued data? 
*/ + if (queue == NULL) { + queue = seg; + } else { + /* Attach the segment to the end of the queued segments */ + LWIP_ASSERT("prev_seg != NULL", prev_seg != NULL); + prev_seg->next = seg; + } + /* remember last segment of to-be-queued data for next iteration */ + prev_seg = seg; + + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_TRACE, ("tcp_write: queueing %"U32_F":%"U32_F"\n", + ntohl(seg->tcphdr->seqno), + ntohl(seg->tcphdr->seqno) + TCP_TCPLEN(seg))); + + pos += seglen; + } + + /* + * All three segmentation phases were successful. We can commit the + * transaction. + */ + + /* + * Phase 1: If data has been added to the preallocated tail of + * last_unsent, we update the length fields of the pbuf chain. + */ +#if TCP_OVERSIZE + if (oversize_used > 0) { + struct pbuf *p; + /* Bump tot_len of whole chain, len of tail */ + for (p = pcb->last_unsent->p; p; p = p->next) { + p->tot_len += oversize_used; + if (p->next == NULL) { + TCP_DATA_COPY((char *)p->payload + p->len, arg, oversize_used, pcb->last_unsent); + p->len += oversize_used; + } + } + pcb->last_unsent->len += oversize_used; +#if TCP_OVERSIZE_DBGCHECK + pcb->last_unsent->oversize_left -= oversize_used; +#endif /* TCP_OVERSIZE_DBGCHECK */ + } + pcb->unsent_oversize = oversize; +#endif /* TCP_OVERSIZE */ + + /* + * Phase 2: concat_p can be concatenated onto pcb->last_unsent->p + */ + if (concat_p != NULL) { + LWIP_ASSERT("tcp_write: cannot concatenate when pcb->unsent is empty", + (pcb->last_unsent != NULL)); + pbuf_cat(pcb->last_unsent->p, concat_p); + pcb->last_unsent->len += concat_p->tot_len; +#if TCP_CHECKSUM_ON_COPY + if (concat_chksummed) { + tcp_seg_add_chksum(concat_chksum, concat_chksummed, &pcb->last_unsent->chksum, + &pcb->last_unsent->chksum_swapped); + pcb->last_unsent->flags |= TF_SEG_DATA_CHECKSUMMED; + } +#endif /* TCP_CHECKSUM_ON_COPY */ + } + + /* + * Phase 3: Append queue to pcb->unsent. 
Queue may be NULL, but that + * is harmless + */ + if (pcb->last_unsent == NULL) { + pcb->unsent = queue; + } else { + pcb->last_unsent->next = queue; + } + pcb->last_unsent = seg; + + /* + * Finally update the pcb state. + */ + pcb->snd_lbb += len; + pcb->snd_buf -= len; + pcb->snd_queuelen = queuelen; + + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_write: %"S16_F" (after enqueued)\n", + pcb->snd_queuelen)); + if (pcb->snd_queuelen != 0) { + LWIP_ASSERT("tcp_write: valid queue length", + pcb->unacked != NULL || pcb->unsent != NULL); + } + + /* Set the PSH flag in the last segment that we enqueued. */ + if (seg != NULL && seg->tcphdr != NULL) { + TCPH_SET_FLAG(seg->tcphdr, TCP_PSH); + } + + LWIP_DEBUGF(TCP_TSO_DEBUG | LWIP_DBG_TRACE, + ("tcp_write: mss: %-5d unsent %s\n", mss_local, _dump_seg(pcb->unsent))); + + return ERR_OK; +memerr: + pcb->flags |= TF_NAGLEMEMERR; + TCP_STATS_INC(tcp.memerr); + + if (concat_p != NULL) { + tcp_tx_pbuf_free(pcb, concat_p); + } + if (queue != NULL) { + tcp_tx_segs_free(pcb, queue); + } + if (pcb->snd_queuelen != 0) { + LWIP_ASSERT("tcp_write: valid queue length", pcb->unacked != NULL || + pcb->unsent != NULL); + } + LWIP_DEBUGF(TCP_QLEN_DEBUG | LWIP_DBG_STATE, ("tcp_write: %"S16_F" (with mem err)\n", pcb->snd_queuelen)); + return ERR_MEM; +} + +/** + * Enqueue TCP options for transmission. + * + * Called by tcp_connect(), tcp_listen_input(), and tcp_send_ctrl(). + * + * @param pcb Protocol control block for the TCP connection. + * @param flags TCP header flags to set in the outgoing segment. + * @param optdata pointer to TCP options, or NULL. + * @param optlen length of TCP options in bytes. 
+ */ +err_t +tcp_enqueue_flags(struct tcp_pcb *pcb, u8_t flags) +{ + struct pbuf *p; + struct tcp_seg *seg; + u8_t optflags = 0; + u8_t optlen = 0; + + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue_flags: queuelen: %"U16_F"\n", (u16_t)pcb->snd_queuelen)); + + LWIP_ASSERT("tcp_enqueue_flags: need either TCP_SYN or TCP_FIN in flags (programmer violates API)", + (flags & (TCP_SYN | TCP_FIN)) != 0); + + /* check for configured max queuelen and possible overflow (FIN flag should always come through!)*/ + if (((pcb->snd_queuelen >= pcb->max_unsent_len) || (pcb->snd_queuelen > TCP_SNDQUEUELEN_OVERFLOW)) && + ((flags & TCP_FIN) == 0)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, ("tcp_enqueue_flags: too long queue %"U16_F" (max %"U16_F")\n", + pcb->snd_queuelen, pcb->max_unsent_len)); + TCP_STATS_INC(tcp.memerr); + pcb->flags |= TF_NAGLEMEMERR; + return ERR_MEM; + } + + if (flags & TCP_SYN) { + optflags = TF_SEG_OPTS_MSS; + if(enable_wnd_scale && ((get_tcp_state(pcb) != SYN_RCVD) || (pcb->flags & TF_WND_SCALE))) { + /* In a (sent in state SYN_RCVD), the window scale option may only + be sent if we received a window scale option from the remote host. */ + optflags |= TF_SEG_OPTS_WNDSCALE; + } + #if LWIP_TCP_TIMESTAMPS + if (pcb->enable_ts_opt && !(flags & TCP_ACK)) { + // enable initial timestamp announcement only for the connecting side. accepting side reply accordingly. + optflags |= TF_SEG_OPTS_TS; + } + #endif + } +#if LWIP_TCP_TIMESTAMPS + if ((pcb->flags & TF_TIMESTAMP)) { + optflags |= TF_SEG_OPTS_TS; + } +#endif /* LWIP_TCP_TIMESTAMPS */ + optlen = LWIP_TCP_OPT_LENGTH(optflags); + + /* tcp_enqueue_flags is always called with either SYN or FIN in flags. + * We need one available snd_buf byte to do that. + * This means we can't send FIN while snd_buf==0. A better fix would be to + * not include SYN and FIN sequence numbers in the snd_buf count. 
*/ + + /*if (pcb->snd_buf == 0) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 3, ("tcp_enqueue_flags: no send buffer available\n")); + TCP_STATS_INC(tcp.memerr); + return ERR_MEM; + }*/ //to consider snd_buf for syn or fin, unmarked sections with SND_BUF_FOR_SYN_FIN + + /* Allocate pbuf with room for TCP header + options */ + if ((p = tcp_tx_pbuf_alloc(pcb, optlen, PBUF_RAM)) == NULL) { + pcb->flags |= TF_NAGLEMEMERR; + TCP_STATS_INC(tcp.memerr); + return ERR_MEM; + } + LWIP_ASSERT("tcp_enqueue_flags: check that first pbuf can hold optlen", + (p->len >= optlen)); + + /* Allocate memory for tcp_seg, and fill in fields. */ + if ((seg = tcp_create_segment(pcb, p, flags, pcb->snd_lbb, optflags)) == NULL) { + pcb->flags |= TF_NAGLEMEMERR; + TCP_STATS_INC(tcp.memerr); + return ERR_MEM; + } + LWIP_ASSERT("tcp_enqueue_flags: invalid segment length", seg->len == 0); + + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | LWIP_DBG_TRACE, + ("tcp_enqueue_flags: queueing %"U32_F":%"U32_F" (0x%"X16_F")\n", + ntohl(seg->tcphdr->seqno), + ntohl(seg->tcphdr->seqno) + TCP_TCPLEN(seg), + (u16_t)flags)); + + /* Now append seg to pcb->unsent queue */ + if (pcb->unsent == NULL) { + pcb->unsent = seg; + } else { + struct tcp_seg *useg; + for (useg = pcb->unsent; useg->next != NULL; useg = useg->next); + useg->next = seg; + } +#if TCP_OVERSIZE + /* The new unsent tail has no space */ + pcb->unsent_oversize = 0; +#endif /* TCP_OVERSIZE */ + + /* SYN and FIN bump the sequence number */ + if ((flags & TCP_SYN) || (flags & TCP_FIN)) { + pcb->snd_lbb++; + /* optlen does not influence snd_buf */ + // pcb->snd_buf--; SND_BUF_FOR_SYN_FIN + } + if (flags & TCP_FIN) { + pcb->flags |= TF_FIN; + } + + /* update number of segments on the queues */ + pcb->snd_queuelen += pbuf_clen(seg->p); + LWIP_DEBUGF(TCP_QLEN_DEBUG, ("tcp_enqueue_flags: %"S16_F" (after enqueued)\n", pcb->snd_queuelen)); + if (pcb->snd_queuelen != 0) { + LWIP_ASSERT("tcp_enqueue_flags: invalid queue length", + pcb->unacked != NULL || pcb->unsent != NULL); + } + + 
return ERR_OK; +} + + +#if LWIP_TCP_TIMESTAMPS +/* Build a timestamp option (12 bytes long) at the specified options pointer) + * + * @param pcb tcp_pcb + * @param opts option pointer where to store the timestamp option + */ +static void +tcp_build_timestamp_option(struct tcp_pcb *pcb, u32_t *opts) +{ + /* Pad with two NOP options to make everything nicely aligned */ + opts[0] = PP_HTONL(0x0101080A); + opts[1] = htonl(sys_now()); + opts[2] = htonl(pcb->ts_recent); +} +#endif + +/** Send an ACK without data. + * + * @param pcb Protocol control block for the TCP connection to send the ACK + */ +err_t +tcp_send_empty_ack(struct tcp_pcb *pcb) +{ + struct pbuf *p; + struct tcp_hdr *tcphdr; + u8_t optlen = 0; + u32_t *opts; + +#if LWIP_TCP_TIMESTAMPS + if (pcb->flags & TF_TIMESTAMP) { + optlen = LWIP_TCP_OPT_LENGTH(TF_SEG_OPTS_TS); + } +#endif + + p = tcp_output_alloc_header(pcb, optlen, 0, htonl(pcb->snd_nxt)); + if (p == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("tcp_output: (ACK) could not allocate pbuf\n")); + return ERR_BUF; + } + tcphdr = (struct tcp_hdr *)p->payload; + LWIP_DEBUGF(TCP_OUTPUT_DEBUG, + ("tcp_output: sending ACK for %"U32_F"\n", pcb->rcv_nxt)); + /* remove ACK flags from the PCB, as we send an empty ACK now */ + pcb->flags &= ~(TF_ACK_DELAY | TF_ACK_NOW); + + opts = (u32_t *)(void *)(tcphdr + 1); + + /* NB. MSS option is only sent on SYNs, so ignore it here */ +#if LWIP_TCP_TIMESTAMPS + pcb->ts_lastacksent = pcb->rcv_nxt; + + if (pcb->flags & TF_TIMESTAMP) { + tcp_build_timestamp_option(pcb, opts ); + opts += 3; + } +#endif +#if LWIP_TSO + pcb->ip_output(p, pcb, 0); +#else + pcb->ip_output(p, pcb, 0, 0); +#endif /* LWIP_TSO */ + tcp_tx_pbuf_free(pcb, p); + + (void)opts; /* Fix warning -Wunused-but-set-variable */ + + return ERR_OK; +} + +#if LWIP_TSO +/** + * Called by tcp_output() to actually join few following TCP segments + * in one to send a TCP segment over IP using Large Segment Offload method. 
+ * + * @param pcb the tcp_pcb for the TCP connection used to send the segment + * @param seg the tcp_seg to send + * @param wnd current wnd + * @return pbuf with p->payload being the tcp_hdr + */ +static void +tcp_tso_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) +{ + struct tcp_seg *cur_seg = seg; + u32_t max_payload_sz = LWIP_MIN(pcb->tso.max_payload_sz, (wnd - (seg->seqno - pcb->lastack))); + u32_t tot_len = 0; + u8_t flags = seg->flags; + int tot_p = 0; + + /* Ignore retransmitted segments and special segments + */ + if (TCP_SEQ_LT(seg->seqno, pcb->snd_nxt) || + (seg->flags & (TF_SEG_OPTS_TSO | TF_SEG_OPTS_DUMMY_MSG)) || + ((TCPH_FLAGS(seg->tcphdr) & (~(TCP_ACK | TCP_PSH))) != 0)) { + goto err; + } + + while (cur_seg && cur_seg->next && + (cur_seg->flags == flags) && + ((TCPH_FLAGS(cur_seg->tcphdr) & (~(TCP_ACK | TCP_PSH))) == 0)) { + + tot_len += cur_seg->len; + if (tot_len > max_payload_sz) { + goto err; + } + + tot_p += pbuf_clen(cur_seg->p); + if (tot_p > (int)pcb->tso.max_send_sge) { + goto err; + } + + if (seg != cur_seg) { + /* Update the original segment with current segment details */ + seg->next = cur_seg->next; + seg->len += cur_seg->len; + + /* Update the first pbuf of current segment */ + cur_seg->p->payload = (u8_t *)cur_seg->tcphdr + LWIP_TCP_HDRLEN(cur_seg->tcphdr); + cur_seg->p->len = cur_seg->len - (cur_seg->p->tot_len - cur_seg->p->len); + cur_seg->p->tot_len = cur_seg->len; + + /* Concatenate two pbufs (each may be a pbuf chain) and + * update tot_len values for all pbuf in the chain + */ + pbuf_cat(seg->p, cur_seg->p); + + /* Free joined segment w/o releasing pbuf + * tcp_seg_free() and tcp_segs_free() release pbuf chain + */ + external_tcp_seg_free(pcb, cur_seg); + } + cur_seg = seg->next; + } + +err: + + /* All segments that greater than MSS must be processed as TSO segments + * For example it can be actual for segments with large (more than MSS) buffer size + */ + if (seg->len > pcb->mss) { + seg->flags |= 
TF_SEG_OPTS_TSO; + } + +#if TCP_TSO_DEBUG + LWIP_DEBUGF(TCP_TSO_DEBUG | LWIP_DBG_TRACE, + ("tcp_join: max: %-5d unsent %s\n", + max_payload_sz, _dump_seg(pcb->unsent))); +#endif /* TCP_TSO_DEBUG */ + + return; +} + +static struct tcp_seg * +tcp_split_one_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t lentosend, u8_t optflags, u8_t optlen) +{ + struct tcp_seg *cur_seg = NULL; + struct tcp_seg *new_seg = NULL; + struct pbuf *cur_p = NULL; + u16_t max_length = 0; + u16_t oversize = 0; + + cur_seg = seg; + max_length = cur_seg->p->len; + while ((cur_seg->p->len == cur_seg->p->tot_len) && (cur_seg->len > lentosend)) { + + u32_t lentoqueue = cur_seg->len - lentosend; + + /* Allocate memory for p_buf and fill in fields. */ + if (NULL == (cur_p = tcp_pbuf_prealloc(lentoqueue + optlen, max_length, &oversize, pcb, 0, 0))) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_split_one_segment: could not allocate memory for pbuf copy size %"U16_F"\n", (lentoqueue + optlen))); + goto err; + } + + /* Do prefetch to avoid no memory issue during segment creation with + * predefined pbuf. It allows to avoid releasing pbuf during failure processing. + */ + if (!pcb->seg_alloc) { + if (NULL == (pcb->seg_alloc = tcp_create_segment(pcb, NULL, 0, 0, 0))) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_split_one_segment: could not allocate memory for segment\n")); + tcp_tx_pbuf_free(pcb, cur_p); + goto err; + } + } + + /* Copy the data from the original buffer */ + TCP_DATA_COPY2((char *)cur_p->payload + optlen, (u8_t *)cur_seg->tcphdr + LWIP_TCP_HDRLEN(cur_seg->tcphdr) + lentosend, lentoqueue , &chksum, &chksum_swapped); + + /* Update new buffer */ + cur_p->tot_len = cur_seg->p->tot_len - lentosend - TCP_HLEN ; + cur_p->next = cur_seg->p->next; + + /* Fill in tcp_seg (allocation was done before). 
+ * Do not expect NULL but it is possible as far as pbuf_header(p, TCP_HLEN) can return NULL inside tcp_create_segment() + */ + if (NULL == (new_seg = tcp_create_segment(pcb, cur_p, 0, cur_seg->seqno + lentosend, optflags))) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_split_one_segment: could not allocate memory for segment\n")); + goto err; + } + + /* New segment update */ + new_seg->next = cur_seg->next; + new_seg->flags = cur_seg->flags; + + /* Update original buffer */ + cur_seg->p->next = NULL; + cur_seg->p->len = cur_seg->p->len - lentoqueue; + cur_seg->p->tot_len = cur_seg->p->len; + + /* Original segment update */ + cur_seg->next = new_seg; + cur_seg->len = cur_seg->p->len - (TCP_HLEN + optlen); + + cur_seg = new_seg; + + /* Update number of buffer to be send */ + pcb->snd_queuelen++; + + /* Update last unsent segment */ + if (pcb->last_unsent == seg) { + pcb->last_unsent = new_seg; + pcb->unsent_oversize = oversize; + } + } + + return seg; + +err: + if (cur_seg->len > pcb->mss) { + cur_seg->flags |= TF_SEG_OPTS_TSO; + } + return NULL; +} + + /** + * Called by tcp_output() to process TCP segment with ref > 1. + * This call should process retransmitted TSO segment. 
+ * + * @param pcb the tcp_pcb for the TCP connection used to send the segment + * @param seg the tcp_seg to send + * @param wnd current window size + * @return current segment to proceed + */ +static struct tcp_seg * +tcp_rexmit_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) +{ + struct tcp_seg *cur_seg = NULL; + struct tcp_seg *new_seg = NULL; + struct pbuf *cur_p = NULL; + u16_t mss_local = 0; + u8_t optflags = 0; + u8_t optlen = 0; + + /* Use ref = 1 or TCP_SEQ_LT(seg->seqno, pcb->snd_nxt) can be used as + * retransmission attribute + */ + if ((NULL == seg) || (NULL == seg->p) || + ((seg->p->ref == 1) && ((seg->len + seg->seqno - pcb->lastack) <= wnd))) { + return seg; + } + +#if LWIP_TCP_TIMESTAMPS + if ((pcb->flags & TF_TIMESTAMP)) { + optflags |= TF_SEG_OPTS_TS; + } +#endif /* LWIP_TCP_TIMESTAMPS */ + + optlen += LWIP_TCP_OPT_LENGTH(optflags); + + mss_local = tcp_xmit_size_goal(pcb, 0); + + cur_seg = seg; + cur_seg->flags &= (~TF_SEG_OPTS_TSO); + cur_p = cur_seg->p->next; + while (cur_p) { + /* Do prefetch to avoid no memory issue during segment creation with + * predefined pbuf. It allows to avoid releasing pbuf inside tcp_create_segment() + * during failure processing. + */ + if (!pcb->seg_alloc) { + if (NULL == (pcb->seg_alloc = tcp_create_segment(pcb, NULL, 0, 0, 0))) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_split_segment: could not allocate memory for segment\n")); + return seg; + } + } + + cur_p->len += optlen; + cur_p->tot_len = cur_p->len; + cur_p->payload = (u8_t *)cur_p->payload - optlen; + + /* Fill in tcp_seg (allocation was done before). 
+ * Do not expect NULL but it is possible as far as pbuf_header(p, TCP_HLEN) can return NULL inside tcp_create_segment() + */ + if (NULL == (new_seg = tcp_create_segment(pcb, cur_p, 0, cur_seg->seqno + cur_seg->p->len - TCP_HLEN - optlen, optflags))) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_split_one_segment: could not allocate memory for segment\n")); + if (cur_seg->len > pcb->mss) { + cur_seg->flags |= TF_SEG_OPTS_TSO; + } + return seg; + } + + /* New segment update */ + new_seg->next = cur_seg->next; + new_seg->flags = cur_seg->flags; + + /* Original segment update */ + cur_seg->next = new_seg; + cur_seg->len = cur_seg->p->len - TCP_HLEN - optlen; + cur_seg->p->tot_len = cur_seg->p->len; + + cur_seg->p->next = NULL; + if (NULL == tcp_split_one_segment(pcb, cur_seg, mss_local, optflags, optlen)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_split_one_segment: could not allocate memory for segment\n")); + if (new_seg->len > pcb->mss) { + new_seg->flags |= TF_SEG_OPTS_TSO; + } + return seg; + } + cur_seg = new_seg; + + cur_p = cur_seg->p->next; + } + + if (NULL == tcp_split_one_segment(pcb, cur_seg, mss_local, optflags, optlen)) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_split_one_segment: could not allocate memory for segment\n")); + return seg; + } + +#if TCP_TSO_DEBUG + LWIP_DEBUGF(TCP_TSO_DEBUG | LWIP_DBG_TRACE, + ("tcp_rexmit: cwnd: %-5d unsent %s\n", + pcb->cwnd, _dump_seg(pcb->unsent))); +#endif /* TCP_TSO_DEBUG */ + + return seg; +} +#endif /* LWIP_TSO */ + +void +tcp_split_segment(struct tcp_pcb *pcb, struct tcp_seg *seg, u32_t wnd) +{ + struct pbuf *p = NULL; + struct tcp_seg *newseg = NULL; + u32_t lentosend = 0; + u16_t oversize = 0; + u8_t optlen = 0, optflags = 0; + u16_t mss_local = 0; + + if ((NULL == seg) || (NULL == seg->p) || + ((seg->seqno - pcb->lastack) >= wnd) || (seg->p->ref > 1)) { + return ; + } + + lentosend = (wnd - (seg->seqno - pcb->lastack)); + +#if LWIP_TSO + mss_local = tcp_xmit_size_goal(pcb, 0); +#else + /* don't allocate 
segments bigger than half the maximum window we ever received */ + mss_local = LWIP_MIN(pcb->mss, pcb->snd_wnd_max / 2); + mss_local = mss_local ? mss_local : pcb->mss; + +#if LWIP_TCP_TIMESTAMPS + if ((pcb->flags & TF_TIMESTAMP)) { + optflags |= TF_SEG_OPTS_TS; + /* ensure that segments can hold at least one data byte... */ + mss_local = LWIP_MAX(mss_local, LWIP_TCP_OPT_LEN_TS + 1); + } +#endif /* LWIP_TCP_TIMESTAMPS */ +#endif /* LWIP_TSO */ + + optlen += LWIP_TCP_OPT_LENGTH( optflags ); + + if (seg->p->len > ((TCP_HLEN + optlen) + lentosend)) {/* First buffer is too big, split it */ + u32_t lentoqueue = seg->p->len - (TCP_HLEN + optlen) - lentosend; + + if (NULL == (p = tcp_pbuf_prealloc(lentoqueue + optlen, mss_local, &oversize, pcb, 0, 0))) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_split_segment: could not allocate memory for pbuf copy size %"U16_F"\n", (lentoqueue + optlen))); + return; + } + + /* Copy the data from the original buffer */ +#if LWIP_TSO + TCP_DATA_COPY2((char *)p->payload + optlen, (u8_t *)seg->tcphdr + LWIP_TCP_HDRLEN(seg->tcphdr) + lentosend, lentoqueue , &chksum, &chksum_swapped); +#else + TCP_DATA_COPY2((char *)p->payload + optlen, (u8_t *)seg->dataptr + lentosend, lentoqueue , &chksum, &chksum_swapped); +#endif /* LWIP_TSO */ + + /* Update new buffer */ + p->tot_len = seg->p->tot_len - lentosend - TCP_HLEN ; + p->next = seg->p->next; + + /* Allocate memory for tcp_seg and fill in fields. 
*/ + if (NULL == (newseg = tcp_create_segment(pcb, p, 0, seg->seqno + lentosend, optflags))) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_split_segment: could not allocate memory for segment\n")); + return; + } + + /* Update original buffer */ + seg->p->next = NULL; + seg->p->len = seg->p->len - lentoqueue; + seg->p->tot_len = seg->p->len; + + /* New segment update */ + newseg->next = seg->next; + newseg->flags = seg->flags; + + /* Original segment update */ + seg->next = newseg; + seg->len = seg->p->len - (TCP_HLEN + optlen); + + /* Set the PSH flag in the last segment that we enqueued. */ + TCPH_SET_FLAG(newseg->tcphdr, TCP_PSH); + + /* Update number of buffer to be send */ + pcb->snd_queuelen++; + + /* Update last unsent segment */ + if (pcb->last_unsent == seg) { + pcb->last_unsent = newseg; + pcb->unsent_oversize = oversize; + } + } + else if (seg->p->next) { + /* Segment with more than one pbuffer and seg->p->len <= lentosend + split segment pbuff chain. At least one pBuffer will be sent */ + struct pbuf *pnewhead = seg->p->next; + struct pbuf *pnewtail = seg->p; + struct pbuf *ptmp = seg->p; + u32_t headchainlen = seg->p->len; + + while ((headchainlen + pnewhead->len - (TCP_HLEN + optlen))<= lentosend) { + if (pnewtail->ref > 1) { + return; + } + + headchainlen += pnewhead->len; + pnewtail = pnewhead; + pnewhead = pnewhead->next; + + if (NULL == pnewhead) { + LWIP_ASSERT("tcp_split_segment: We should not be here",0); + return; + } + } + + /* Allocate memory for tcp_seg, and fill in fields. 
*/ + if (NULL == (newseg = tcp_create_segment(pcb, pnewhead, 0, seg->seqno + headchainlen - (TCP_HLEN + optlen), optflags))) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG | 2, ("tcp_split_segment: could not allocate memory for segment\n")); + return; + } + + /* Update new tail */ + pnewtail->next = NULL; + + /* New segment update */ + newseg->next = seg->next; + newseg->flags = seg->flags; + + /* Original segment update */ + seg->next = newseg; + seg->len = headchainlen - (TCP_HLEN + optlen); + + /* Update original buffers */ + while (ptmp) { + ptmp->tot_len = headchainlen; + headchainlen -= ptmp->len; + ptmp = ptmp->next; + } + + /* Update last unsent segment */ + if (pcb->last_unsent == seg) { + pcb->last_unsent = newseg; + } + } + else { + LWIP_ASSERT("tcp_split_segment: We should not be here [else]",0); + } + +#if TCP_TSO_DEBUG + LWIP_DEBUGF(TCP_TSO_DEBUG | LWIP_DBG_TRACE, + ("tcp_split: max: %-5d unsent %s\n", + lentosend, _dump_seg(pcb->unsent))); +#endif /* TCP_TSO_DEBUG */ + + return; +} + +/** + * Check whether the input data_len fits the window + * + * @param pcb Protocol control block for the TCP connection to send data + * @parma data_len length to be checked + * @return 1 if input size fits the window, else 0. + */ +s32_t +tcp_is_wnd_available(struct tcp_pcb *pcb, u32_t data_len) +{ + s32_t tot_unacked_len = 0; + s32_t tot_unsent_len = 0; + s32_t wnd = (s32_t)(LWIP_MIN(pcb->snd_wnd, pcb->cwnd)); + s32_t tot_opts_hdrs_len = 0; + +#if LWIP_TCP_TIMESTAMPS + if (pcb->flags & TF_TIMESTAMP) { + /* The default TCP Maximum Segment Size is 536 (LWIP_TCP_MSS) - RFC-879 */ + u16_t mss = pcb->mss ? pcb->mss : LWIP_TCP_MSS; + u16_t mss_local = LWIP_MIN(pcb->mss, pcb->snd_wnd_max / 2); + mss_local = mss_local ? 
mss_local : mss; + tot_opts_hdrs_len = ((LWIP_TCP_OPT_LENGTH(TF_SEG_OPTS_TS)) * (1 + ((data_len - 1) / (mss_local)))); + } +#endif + + if (pcb->unacked) { + tot_unacked_len = pcb->last_unacked->seqno - pcb->unacked->seqno + pcb->last_unacked->len; + } + + if (pcb->unsent) { + tot_unsent_len = pcb->last_unsent->seqno - pcb->unsent->seqno + pcb->last_unsent->len; + } + + return ((wnd - tot_unacked_len) >= (tot_unsent_len + (tot_opts_hdrs_len + (s32_t)data_len))); +} + +/** + * Find out what we can send and send it + * + * @param pcb Protocol control block for the TCP connection to send data + * @return ERR_OK if data has been sent or nothing to send + * another err_t on error + */ +err_t +tcp_output(struct tcp_pcb *pcb) +{ + struct tcp_seg *seg, *useg; + u32_t wnd, snd_nxt; +#if TCP_CWND_DEBUG + s16_t i = 0; +#endif /* TCP_CWND_DEBUG */ + + /* First, check if we are invoked by the TCP input processing + code. If so, we do not output anything. Instead, we rely on the + input processing code to call us when input processing is done + with. */ + if (tcp_input_pcb == pcb) { + return ERR_OK; + } + + wnd = LWIP_MIN(pcb->snd_wnd, pcb->cwnd); + + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_output: snd_wnd %"U32_F", cwnd %"U32_F + ", wnd %"U32_F"\n",pcb->snd_wnd, pcb->cwnd, wnd )); + seg = pcb->unsent; + + /* If the TF_ACK_NOW flag is set and no data will be sent (either + * because the ->unsent queue is empty or because the window does + * not allow it), construct an empty ACK segment and send it. + * + * If data is to be sent, we will just piggyback the ACK (see below). 
+ */ + if ((pcb->flags & TF_ACK_NOW) && + (seg == NULL || + seg->seqno - pcb->lastack + seg->len > wnd)) { + return tcp_send_empty_ack(pcb); + } + +#if TCP_OUTPUT_DEBUG + if (seg == NULL) { + LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("tcp_output: nothing to send (%p)\n", + (void*)pcb->unsent)); + } +#endif /* TCP_OUTPUT_DEBUG */ +#if TCP_CWND_DEBUG + if (seg == NULL) { + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_output: snd_wnd %"U32_F + ", cwnd %"U32_F", wnd %"U32_F + ", seg == NULL, ack %"U32_F"\n", + pcb->snd_wnd, pcb->cwnd, wnd, pcb->lastack)); + } else { + LWIP_DEBUGF(TCP_CWND_DEBUG, + ("tcp_output: snd_wnd %"U32_F", cwnd %"U32_F", wnd %"U32_F + ", effwnd %"U32_F", seq %"U32_F", ack %"U32_F"\n", + pcb->snd_wnd, pcb->cwnd, wnd, + ntohl(seg->tcphdr->seqno) - pcb->lastack + seg->len, + ntohl(seg->tcphdr->seqno), pcb->lastack)); + } +#endif /* TCP_CWND_DEBUG */ +#if TCP_TSO_DEBUG + if (seg) { + LWIP_DEBUGF(TCP_TSO_DEBUG | LWIP_DBG_TRACE, + ("tcp_output: wnd: %-5d unsent %s\n", + wnd, _dump_seg(pcb->unsent))); + } +#endif /* TCP_TSO_DEBUG */ + + while (seg) { +#if LWIP_TSO + /* TSO segment can be in unsent queue only in case retransmission + * The purpose of this processing is to avoid to send TSO segment + * during retransmition. + */ + if (seg->flags & TF_SEG_OPTS_TSO) { + seg = tcp_rexmit_segment(pcb, seg, wnd); + } +#endif /* LWIP_TSO */ + + /* Split the segment in case of a small window */ + if ((NULL == pcb->unacked) && (wnd) && ((seg->len + seg->seqno - pcb->lastack) > wnd)) { + LWIP_ASSERT("tcp_output: no window for dummy packet", !LWIP_IS_DUMMY_SEGMENT(seg)); + tcp_split_segment(pcb, seg, wnd); + } + + /* data available and window allows it to be sent? 
*/ + if (((seg->seqno - pcb->lastack + seg->len) <= wnd)){ + LWIP_ASSERT("RST not expected here!", + (TCPH_FLAGS(seg->tcphdr) & TCP_RST) == 0); + + /* Stop sending if the nagle algorithm would prevent it + * Don't stop: + * - if tcp_write had a memory error before (prevent delayed ACK timeout) or + * - if this is not a dummy segment + * - if FIN was already enqueued for this PCB (SYN is always alone in a segment - + * either seg->next != NULL or pcb->unacked == NULL; + * RST is no sent using tcp_write/tcp_output. + */ + if((tcp_do_output_nagle(pcb) == 0) && + !LWIP_IS_DUMMY_SEGMENT(seg) && + ((pcb->flags & (TF_NAGLEMEMERR | TF_FIN)) == 0)){ + if ( pcb->snd_sml_snt > (pcb->unacked ? pcb->unacked->len : 0) ) { + break; + } + else { + if ( (u32_t)((seg->next ? seg->next->len : 0) + seg->len) <= pcb->snd_sml_add ) { + pcb->snd_sml_snt = pcb->snd_sml_add; + } + } + } + +#if LWIP_TSO + /* Use TSO send operation in case TSO is enabled + * and current segment is not retransmitted + */ + if (tcp_tso(pcb)) { + tcp_tso_segment(pcb, seg, wnd); + } +#endif /* LWIP_TSO */ + + #if TCP_CWND_DEBUG + LWIP_DEBUGF(TCP_CWND_DEBUG, ("tcp_output: snd_wnd %"U32_F", cwnd %"U16_F", wnd %"U32_F", effwnd %"U32_F", seq %"U32_F", ack %"U32_F", i %"S16_F"\n", + pcb->snd_wnd, pcb->cwnd, wnd, + ntohl(seg->tcphdr->seqno) + seg->len - + pcb->lastack, + ntohl(seg->tcphdr->seqno), pcb->lastack, i)); + ++i; + #endif /* TCP_CWND_DEBUG */ + + pcb->unsent = seg->next; + + // Send ack now if the packet is a dummy packet + if (LWIP_IS_DUMMY_SEGMENT(seg) && (pcb->flags & (TF_ACK_DELAY | TF_ACK_NOW))) { + tcp_send_empty_ack(pcb); + } + + if (get_tcp_state(pcb) != SYN_SENT) { + TCPH_SET_FLAG(seg->tcphdr, TCP_ACK); + pcb->flags &= ~(TF_ACK_DELAY | TF_ACK_NOW); + } + + #if TCP_OVERSIZE_DBGCHECK + seg->oversize_left = 0; + #endif /* TCP_OVERSIZE_DBGCHECK */ + + tcp_output_segment(seg, pcb); + snd_nxt = seg->seqno + TCP_TCPLEN(seg); + if (TCP_SEQ_LT(pcb->snd_nxt, snd_nxt) && !LWIP_IS_DUMMY_SEGMENT(seg)) { + 
pcb->snd_nxt = snd_nxt; + } + /* put segment on unacknowledged list if length > 0 */ + if (TCP_TCPLEN(seg) > 0) { + seg->next = NULL; + // unroll dummy segment + if (LWIP_IS_DUMMY_SEGMENT(seg)) { + pcb->snd_lbb -= seg->len; + pcb->snd_buf += seg->len; + pcb->snd_queuelen -= pbuf_clen(seg->p); + tcp_tx_seg_free(pcb, seg); + } else { + /* unacked list is empty? */ + if (pcb->unacked == NULL) { + pcb->unacked = seg; + pcb->last_unacked = seg; + /* unacked list is not empty? */ + } else { + /* In the case of fast retransmit, the packet should not go to the tail + * of the unacked queue, but rather somewhere before it. We need to check for + * this case. -STJ Jul 27, 2004 */ + useg = pcb->last_unacked; + if (TCP_SEQ_LT(seg->seqno, useg->seqno)) { + /* add segment to before tail of unacked list, keeping the list sorted */ + struct tcp_seg **cur_seg = &(pcb->unacked); + while (*cur_seg && + TCP_SEQ_LT((*cur_seg)->seqno, seg->seqno)) { + cur_seg = &((*cur_seg)->next ); + } + LWIP_ASSERT("Value of last_unacked is invalid", + *cur_seg != pcb->last_unacked->next); + seg->next = (*cur_seg); + (*cur_seg) = seg; + } else { + /* add segment to tail of unacked list */ + useg->next = seg; + pcb->last_unacked = seg; + } + } + } + /* do not queue empty segments on the unacked list */ + } else { + tcp_tx_seg_free(pcb, seg); + } + seg = pcb->unsent; + } + else { + break; + } + } + +#if TCP_OVERSIZE + if (pcb->unsent == NULL) { + /* last unsent has been removed, reset unsent_oversize */ + pcb->unsent_oversize = 0; + } +#endif /* TCP_OVERSIZE */ + + pcb->flags &= ~TF_NAGLEMEMERR; + + // Fetch buffers for the next packet. + if (!pcb->seg_alloc) { + // Fetch tcp segment for the next packet. + pcb->seg_alloc = tcp_create_segment(pcb, NULL, 0, 0, 0); + } + + if (!pcb->pbuf_alloc) { + // Fetch pbuf for the next packet. + pcb->pbuf_alloc = tcp_tx_pbuf_alloc(pcb, 0, PBUF_RAM); + } + + return ERR_OK; +} + +/** + * Called by tcp_output() to actually send a TCP segment over IP. 
+ * + * @param seg the tcp_seg to send + * @param pcb the tcp_pcb for the TCP connection used to send the segment + */ +static void +tcp_output_segment(struct tcp_seg *seg, struct tcp_pcb *pcb) +{ + u16_t len; + u32_t *opts; + + /* The TCP header has already been constructed, but the ackno and + wnd fields remain. */ + seg->tcphdr->ackno = htonl(pcb->rcv_nxt); + + + if (seg->flags & TF_SEG_OPTS_WNDSCALE) { + /* The Window field in a SYN segment itself (the only type where we send + the window scale option) is never scaled. */ + seg->tcphdr->wnd = htons(TCPWND_MIN16(pcb->rcv_ann_wnd)); + } else { + /* advertise our receive window size in this TCP segment */ + seg->tcphdr->wnd = htons(TCPWND_MIN16(RCV_WND_SCALE(pcb, pcb->rcv_ann_wnd))); + } + + if (!LWIP_IS_DUMMY_SEGMENT(seg)) { + pcb->rcv_ann_right_edge = pcb->rcv_nxt + pcb->rcv_ann_wnd; + } + /* Add any requested options. NB MSS option is only set on SYN + packets, so ignore it here */ + LWIP_ASSERT("seg->tcphdr not aligned", ((mem_ptr_t)(seg->tcphdr + 1) % 4) == 0); + opts = (u32_t *)(void *)(seg->tcphdr + 1); + if (seg->flags & TF_SEG_OPTS_MSS) { + /* coverity[result_independent_of_operands] */ + TCP_BUILD_MSS_OPTION(*opts, pcb->advtsd_mss); + opts += 1; // Move to the next line (meaning next 32 bit) as this option is 4 bytes long + } + + /* If RCV_SCALE is set then prepare segment for window scaling option */ + if (seg->flags & TF_SEG_OPTS_WNDSCALE) { + TCP_BUILD_WNDSCALE_OPTION(*opts, rcv_wnd_scale); + opts += 1; // Move to the next line (meaning next 32 bit) as this option is 3 bytes long + we added 1 byte NOOP padding => total 4 bytes + } + +#if LWIP_TCP_TIMESTAMPS + if (!LWIP_IS_DUMMY_SEGMENT(seg)) { + pcb->ts_lastacksent = pcb->rcv_nxt; + } + + if (seg->flags & TF_SEG_OPTS_TS) { + tcp_build_timestamp_option(pcb, opts); + /* opts += 3; */ /* Note: suppress warning 'opts' is never read */ // Move to the next line (meaning next 32 bit) as this option is 10 bytes long, 12 with padding (so jump 3 lines) + } 
+#endif + + /* If we don't have a local IP address, we get one by + calling ip_route(). */ + if (ip_addr_isany(&(pcb->local_ip))) { + LWIP_ASSERT("tcp_output_segment: need to find route to host", 0); + } + + /* Set retransmission timer running if it is not currently enabled */ + if (!LWIP_IS_DUMMY_SEGMENT(seg)) { + if(pcb->rtime == -1) { + pcb->rtime = 0; + } + + if (pcb->rttest == 0) { + pcb->rttest = tcp_ticks; + pcb->rtseq = seg->seqno; + + LWIP_DEBUGF(TCP_RTO_DEBUG, ("tcp_output_segment: rtseq %"U32_F"\n", pcb->rtseq)); + } + } + LWIP_DEBUGF(TCP_OUTPUT_DEBUG, ("tcp_output_segment: %"U32_F":%"U32_F"\n", + htonl(seg->tcphdr->seqno), htonl(seg->tcphdr->seqno) + + seg->len)); + + len = (u16_t)((u8_t *)seg->tcphdr - (u8_t *)seg->p->payload); + + seg->p->len -= len; + seg->p->tot_len -= len; + + seg->p->payload = seg->tcphdr; + + seg->tcphdr->chksum = 0; + + TCP_STATS_INC(tcp.xmit); + +#if LWIP_TSO + u16_t flags = 0; + flags |= seg->flags & TF_SEG_OPTS_DUMMY_MSG; + flags |= seg->flags & TF_SEG_OPTS_TSO; + flags |= (TCP_SEQ_LT(seg->seqno, pcb->snd_nxt) ? TCP_WRITE_REXMIT : 0); + pcb->ip_output(seg->p, pcb, flags); +#else + pcb->ip_output(seg->p, pcb, seg->seqno < pcb->snd_nxt, LWIP_IS_DUMMY_SEGMENT(seg)); +#endif /* LWIP_TSO */ +} + +/** + * Send a TCP RESET packet (empty segment with RST flag set) either to + * abort a connection or to show that there is no matching local connection + * for a received segment. + * + * Called by tcp_abort() (to abort a local connection), tcp_input() (if no + * matching local pcb was found), tcp_listen_input() (if incoming segment + * has ACK flag set) and tcp_process() (received segment in the wrong state) + * + * Since a RST segment is in most cases not sent for an active connection, + * tcp_rst() has a number of arguments that are taken from a tcp_pcb for + * most other segment output functions. + * + * The pcb is given only when its valid and from an output context. + * It is used with the external_ip_output function. 
+ * + * @param seqno the sequence number to use for the outgoing segment + * @param ackno the acknowledge number to use for the outgoing segment + * @param local_ip the local IP address to send the segment from + * @param remote_ip the remote IP address to send the segment to + * @param local_port the local TCP port to send the segment from + * @param remote_port the remote TCP port to send the segment to + */ +void +tcp_rst(u32_t seqno, u32_t ackno, u16_t local_port, u16_t remote_port, struct tcp_pcb *pcb) +{ + struct pbuf *p; + struct tcp_hdr *tcphdr; +#if LWIP_3RD_PARTY_BUFS + if (!pcb) return; +#endif + p = tcp_tx_pbuf_alloc(pcb, 0, PBUF_RAM); + if (p == NULL) { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_rst: could not allocate memory for pbuf\n")); + return; + } + pbuf_header(p, TCP_HLEN); + LWIP_ASSERT("check that first pbuf can hold struct tcp_hdr", + (p->len >= sizeof(struct tcp_hdr))); + + tcphdr = (struct tcp_hdr *)p->payload; + tcphdr->src = htons(local_port); + tcphdr->dest = htons(remote_port); + tcphdr->seqno = htonl(seqno); + tcphdr->ackno = htonl(ackno); + TCPH_HDRLEN_FLAGS_SET(tcphdr, TCP_HLEN/4, TCP_RST | TCP_ACK); + tcphdr->wnd = PP_HTONS(( TCP_WND & 0xFFFF )); + tcphdr->chksum = 0; + tcphdr->urgp = 0; + + TCP_STATS_INC(tcp.xmit); + /* Send output with hardcoded TTL since we have no access to the pcb */ +#if LWIP_TSO + if(pcb) pcb->ip_output(p, pcb, 0); +#else + if(pcb) pcb->ip_output(p, pcb, 0, 0); +#endif /* LWIP_TSO */ + /* external_ip_output(p, NULL, local_ip, remote_ip, TCP_TTL, 0, IP_PROTO_TCP) */; + tcp_tx_pbuf_free(pcb, p); + LWIP_DEBUGF(TCP_RST_DEBUG, ("tcp_rst: seqno %"U32_F" ackno %"U32_F".\n", seqno, ackno)); +} + +/** + * Requeue all unacked segments for retransmission + * + * Called by tcp_slowtmr() for slow retransmission. 
+ * + * @param pcb the tcp_pcb for which to re-enqueue all unacked segments + */ +void +tcp_rexmit_rto(struct tcp_pcb *pcb) +{ + struct tcp_seg *seg; + + if (pcb->unacked == NULL) { + return; + } + + /* Move all unacked segments to the head of the unsent queue */ + for (seg = pcb->unacked; seg->next != NULL; seg = seg->next); + /* concatenate unsent queue after unacked queue */ + seg->next = pcb->unsent; +#if TCP_OVERSIZE && TCP_OVERSIZE_DBGCHECK + /* if last unsent changed, we need to update unsent_oversize */ + if (pcb->unsent == NULL) { + pcb->unsent_oversize = seg->oversize_left; + } +#endif /* TCP_OVERSIZE && TCP_OVERSIZE_DBGCHECK*/ + /* unsent queue is the concatenated queue (of unacked, unsent) */ + pcb->unsent = pcb->unacked; + /* unacked queue is now empty */ + pcb->unacked = NULL; + + /* increment number of retransmissions */ + ++pcb->nrtx; + + /* Don't take any RTT measurements after retransmitting. */ + pcb->rttest = 0; + + /* Do the actual retransmission */ + tcp_output(pcb); +} + +/** + * Requeue the first unacked segment for retransmission + * + * Called by tcp_receive() for fast retramsmit. + * + * @param pcb the tcp_pcb for which to retransmit the first unacked segment + */ +void +tcp_rexmit(struct tcp_pcb *pcb) +{ + struct tcp_seg *seg; + struct tcp_seg **cur_seg; + + if (pcb->unacked == NULL) { + return; + } + + /* Move the first unacked segment to the unsent queue */ + /* Keep the unsent queue sorted. */ + seg = pcb->unacked; + pcb->unacked = seg->next; + + cur_seg = &(pcb->unsent); + while (*cur_seg && + TCP_SEQ_LT((*cur_seg)->seqno, seg->seqno)) { + cur_seg = &((*cur_seg)->next ); + } + seg->next = *cur_seg; + *cur_seg = seg; +#if TCP_OVERSIZE + if (seg->next == NULL) { + /* the retransmitted segment is last in unsent, so reset unsent_oversize */ + pcb->unsent_oversize = 0; + } +#endif /* TCP_OVERSIZE */ + + ++pcb->nrtx; + + /* Don't take any rtt measurements after retransmitting. 
*/ + pcb->rttest = 0; +} + + +/** + * Handle retransmission after three dupacks received + * + * @param pcb the tcp_pcb for which to retransmit the first unacked segment + */ +void +tcp_rexmit_fast(struct tcp_pcb *pcb) +{ + if (pcb->unacked != NULL && !(pcb->flags & TF_INFR)) { + /* This is fast retransmit. Retransmit the first unacked segment. */ + LWIP_DEBUGF(TCP_FR_DEBUG, + ("tcp_receive: dupacks %"U16_F" (%"U32_F + "), fast retransmit %"U32_F"\n", + (u16_t)pcb->dupacks, pcb->lastack, + pcb->unacked->seqno)); + tcp_rexmit(pcb); +#if TCP_CC_ALGO_MOD + cc_cong_signal(pcb, CC_NDUPACK); +#else + /* Set ssthresh to half of the minimum of the current + * cwnd and the advertised window */ + if (pcb->cwnd > pcb->snd_wnd) { + pcb->ssthresh = pcb->snd_wnd / 2; + } else { + pcb->ssthresh = pcb->cwnd / 2; + } + + /* The minimum value for ssthresh should be 2 MSS */ + if (pcb->ssthresh < (2U * pcb->mss)) { + LWIP_DEBUGF(TCP_FR_DEBUG, + ("tcp_receive: The minimum value for ssthresh %"U16_F + " should be min 2 mss %"U16_F"...\n", + pcb->ssthresh, 2*pcb->mss)); + pcb->ssthresh = 2*pcb->mss; + } + + pcb->cwnd = pcb->ssthresh + 3 * pcb->mss; +#endif + pcb->flags |= TF_INFR; + } +} + + +/** + * Send keepalive packets to keep a connection active although + * no data is sent over it. 
+ * + * Called by tcp_slowtmr() + * + * @param pcb the tcp_pcb for which to send a keepalive packet + */ +void +tcp_keepalive(struct tcp_pcb *pcb) +{ + struct pbuf *p; + struct tcp_hdr *tcphdr; + + LWIP_DEBUGF(TCP_DEBUG, ("tcp_keepalive: sending KEEPALIVE probe to %"U16_F".%"U16_F".%"U16_F".%"U16_F"\n", + ip4_addr1_16(&pcb->remote_ip), ip4_addr2_16(&pcb->remote_ip), + ip4_addr3_16(&pcb->remote_ip), ip4_addr4_16(&pcb->remote_ip))); + + LWIP_DEBUGF(TCP_DEBUG, ("tcp_keepalive: tcp_ticks %"U32_F" pcb->tmr %"U32_F" pcb->keep_cnt_sent %"U16_F"\n", + tcp_ticks, pcb->tmr, pcb->keep_cnt_sent)); + + p = tcp_output_alloc_header(pcb, 0, 0, htonl(pcb->snd_nxt - 1)); + if(p == NULL) { + LWIP_DEBUGF(TCP_DEBUG, + ("tcp_keepalive: could not allocate memory for pbuf\n")); + return; + } + tcphdr = (struct tcp_hdr *)p->payload; + +#if CHECKSUM_GEN_TCP + tcphdr->chksum = inet_chksum_pseudo(p, &pcb->local_ip, &pcb->remote_ip, + IP_PROTO_TCP, (u16_t)p->tot_len); +#endif + TCP_STATS_INC(tcp.xmit); + + /* Send output to IP */ +#if LWIP_TSO + pcb->ip_output(p, pcb, 0); +#else + pcb->ip_output(p, pcb, 0, 0); +#endif /* LWIP_TSO */ + + tcp_tx_pbuf_free(pcb, p); + + LWIP_DEBUGF(TCP_DEBUG, ("tcp_keepalive: seqno %"U32_F" ackno %"U32_F".\n", + pcb->snd_nxt - 1, pcb->rcv_nxt)); + (void)tcphdr; /* Fix warning -Wunused-but-set-variable*/ +} + + +/** + * Send persist timer zero-window probes to keep a connection active + * when a window update is lost. 
+ * + * Called by tcp_slowtmr() + * + * @param pcb the tcp_pcb for which to send a zero-window probe packet + */ +void +tcp_zero_window_probe(struct tcp_pcb *pcb) +{ + struct pbuf *p; + struct tcp_hdr *tcphdr; + struct tcp_seg *seg; + u16_t len; + u8_t is_fin; + u32_t snd_nxt; + + LWIP_DEBUGF(TCP_DEBUG, + ("tcp_zero_window_probe: sending ZERO WINDOW probe to %" + U16_F".%"U16_F".%"U16_F".%"U16_F"\n", + ip4_addr1_16(&pcb->remote_ip), ip4_addr2_16(&pcb->remote_ip), + ip4_addr3_16(&pcb->remote_ip), ip4_addr4_16(&pcb->remote_ip))); + + LWIP_DEBUGF(TCP_DEBUG, + ("tcp_zero_window_probe: tcp_ticks %"U32_F + " pcb->tmr %"U32_F" pcb->keep_cnt_sent %"U16_F"\n", + tcp_ticks, pcb->tmr, pcb->keep_cnt_sent)); + + /* Only consider unsent, persist timer should be off when there data is in-flight */ + seg = pcb->unsent; + if(seg == NULL) { + /* Not expected, persist timer should be off when the send buffer is empty */ + return; + } + + is_fin = ((TCPH_FLAGS(seg->tcphdr) & TCP_FIN) != 0) && (seg->len == 0); + /* we want to send one seqno: either FIN or data (no options) */ + len = is_fin ? 0 : 1; + + /** + * While sending probe of 1 byte we must split the first unsent segment. + * This change is commented out because tcp_zero_window_probe() was replaced + * with tcp_keepalive(). 
+ * if (len > 0 && seg->len != 1) { + * tcp_split_segment(pcb, seg, seg->seqno - pcb->lastack + 1); + * seg = pcb->unsent; + * } + */ + + p = tcp_output_alloc_header(pcb, 0, len, seg->tcphdr->seqno); + if(p == NULL) { + LWIP_DEBUGF(TCP_DEBUG, ("tcp_zero_window_probe: no memory for pbuf\n")); + return; + } + tcphdr = (struct tcp_hdr *)p->payload; + + if (is_fin) { + /* FIN segment, no data */ + TCPH_FLAGS_SET(tcphdr, TCP_ACK | TCP_FIN); + } else { + /* Data segment, copy in one byte from the head of the unacked queue */ +#if LWIP_TSO + *((char *)p->payload + TCP_HLEN) = *(char *)((u8_t *)seg->tcphdr + LWIP_TCP_HDRLEN(seg->tcphdr)); +#else + *((char *)p->payload + TCP_HLEN) = *(char *)seg->dataptr; +#endif /* LWIP_TSO */ + } + + /* The byte may be acknowledged without the window being opened. */ + snd_nxt = lwip_ntohl(seg->tcphdr->seqno) + 1; + if (TCP_SEQ_LT(pcb->snd_nxt, snd_nxt)) { + pcb->snd_nxt = snd_nxt; + } + +#if CHECKSUM_GEN_TCP + tcphdr->chksum = inet_chksum_pseudo(p, &pcb->local_ip, &pcb->remote_ip, + IP_PROTO_TCP, (u16_t)p->tot_len); +#endif + TCP_STATS_INC(tcp.xmit); + + /* Send output to IP */ +#if LWIP_TSO + pcb->ip_output(p, pcb, 0); +#else + pcb->ip_output(p, pcb, 0, 0); +#endif /* LWIP_TSO */ + + tcp_tx_pbuf_free(pcb, p); + + LWIP_DEBUGF(TCP_DEBUG, ("tcp_zero_window_probe: seqno %"U32_F + " ackno %"U32_F".\n", + pcb->snd_nxt - 1, pcb->rcv_nxt)); +} +#endif /* LWIP_TCP */ diff --git a/src/vma/main.cpp b/src/vma/main.cpp new file mode 100644 index 0000000..c82bde3 --- /dev/null +++ b/src/vma/main.cpp @@ -0,0 +1,914 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include "main.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vlogger/vlogger.h" +#include "utils/rdtsc.h" +#include "vma/util/vma_stats.h" +#include "vma/util/utils.h" +#include "vma/event/event_handler_manager.h" +#include "vma/event/vlogger_timer_handler.h" +#include "vma/dev/buffer_pool.h" +#include "vma/dev/ib_ctx_handler_collection.h" +#include "vma/dev/net_device_table_mgr.h" +#include "vma/dev/ring_profile.h" +#include "vma/proto/ip_frag.h" +#include "vma/proto/vma_lwip.h" +#include "vma/proto/route_table_mgr.h" +#include "vma/proto/rule_table_mgr.h" +#include "vma/proto/igmp_mgr.h" + +#include "vma/proto/neighbour_table_mgr.h" +#include "vma/netlink/netlink_wrapper.h" +#include "vma/event/command.h" + +#include "vma/sock/sock-redirect.h" +#include "vma/sock/fd_collection.h" +#include "vma/sock/sockinfo_tcp.h" +#include "vma/sock/sockinfo_udp.h" +#include "vma/iomux/io_mux_call.h" + +#include "vma/util/instrumentation.h" +#include "vma/util/agent.h" + +void check_netperf_flags(); + + +// Start of vma_version_str - used in "$ strings libvma.so | grep VMA_VERSION" +#define STR_EXPAND(x) #x +#define STR(x) STR_EXPAND(x) +const char *vma_version_str = "VMA_VERSION: " PACKAGE_VERSION "-" STR(VMA_LIBRARY_RELEASE) + +#if _BullseyeCoverage + " Bullseye" +#endif +#ifdef VMA_SVN_REVISION + " Release" +#else + " Development Snapshot" +#endif + + " built on " +#ifdef VMA_DATE_TIME + VMA_DATE_TIME +#else + __DATE__ " " __TIME__ +#endif + +#ifdef _DEBUG + " -*- DEBUG -*-" +#endif + ; // End of vma_version_str - used in "$ strings libvma.so | grep VMA_VERSION" + + +bool g_b_exit = false; +bool g_init_ibv_fork_done = false; +bool g_is_forked_child = false; +bool g_init_global_ctors_done = true; +static command_netlink *s_cmd_nl = NULL; +#define MAX_VERSION_STR_LEN 128 + +static int free_libvma_resources() +{ + vlog_printf(VLOG_DEBUG, "%s: Closing 
libvma resources\n", __FUNCTION__); + + g_b_exit = true; + + //Triggers connection close, relevant for TCP which may need some time to terminate the connection. + //and for any socket that may wait from another thread + if (g_p_fd_collection) { + g_p_fd_collection->prepare_to_close(); + } + + /* Probably this timeout is needless as far as all TCP connections + * are closed with shutdown option (tcp_abort()->tcp_abandon()) + */ + usleep(50000); + + //Handle pending received data, this is critical for proper TCP connection termination + if (g_p_net_device_table_mgr) { + g_p_net_device_table_mgr->global_ring_drain_and_procces(); + } + + if(g_p_igmp_mgr) { + igmp_mgr* g_p_igmp_mgr_tmp = g_p_igmp_mgr; + g_p_igmp_mgr = NULL; + delete g_p_igmp_mgr_tmp; + usleep(50000); + } + + if (g_p_event_handler_manager) + g_p_event_handler_manager->stop_thread(); + + if (g_tcp_timers_collection) g_tcp_timers_collection->clean_obj(); + g_tcp_timers_collection = NULL; + + // Block all sock-redicrt API calls into our offloading core + fd_collection* g_p_fd_collection_temp = g_p_fd_collection; + g_p_fd_collection = NULL; + if (g_p_fd_collection_temp) delete g_p_fd_collection_temp; + + if (g_p_lwip) delete g_p_lwip; + g_p_lwip = NULL; + + if (g_p_route_table_mgr) delete g_p_route_table_mgr; + g_p_route_table_mgr = NULL; + + if (g_p_rule_table_mgr) delete g_p_rule_table_mgr; + g_p_rule_table_mgr = NULL; + + if(g_p_net_device_table_mgr) delete g_p_net_device_table_mgr; + g_p_net_device_table_mgr = NULL; + + ip_frag_manager* g_p_ip_frag_manager_temp = g_p_ip_frag_manager; + g_p_ip_frag_manager = NULL; + if (g_p_ip_frag_manager_temp) delete g_p_ip_frag_manager_temp; + + if (g_p_neigh_table_mgr) delete g_p_neigh_table_mgr; + g_p_neigh_table_mgr = NULL; + + if (g_tcp_seg_pool) delete g_tcp_seg_pool; + g_tcp_seg_pool = NULL; + + if (g_buffer_pool_tx) delete g_buffer_pool_tx; + g_buffer_pool_tx = NULL; + + if (g_buffer_pool_rx) delete g_buffer_pool_rx; + g_buffer_pool_rx = NULL; + + if (s_cmd_nl) 
delete s_cmd_nl; + s_cmd_nl = NULL; + + if (g_p_netlink_handler) delete g_p_netlink_handler; + g_p_netlink_handler = NULL; + + if (g_p_ib_ctx_handler_collection) delete g_p_ib_ctx_handler_collection; + g_p_ib_ctx_handler_collection = NULL; + + if (g_p_vlogger_timer_handler) delete g_p_vlogger_timer_handler; + g_p_vlogger_timer_handler = NULL; + + if (g_p_event_handler_manager) delete g_p_event_handler_manager; + g_p_event_handler_manager = NULL; + + if (g_p_agent) delete g_p_agent; + g_p_agent = NULL; + + if (g_p_ring_profile) delete g_p_ring_profile; + g_p_ring_profile = NULL; + + if (safe_mce_sys().app_name) free(safe_mce_sys().app_name); + safe_mce_sys().app_name = NULL; + + vlog_printf(VLOG_DEBUG, "Stopping logger module\n"); + + sock_redirect_exit(); + + vlog_stop(); + + if (g_stats_file) { + //cosmetics - remove when adding iomux block + fprintf(g_stats_file, "======================================================\n"); + fclose (g_stats_file); + g_stats_file = NULL; + } + + return 0; +} + +static void handle_segfault(int) +{ + vlog_printf(VLOG_ERROR, "Segmentation Fault\n"); + printf_backtrace(); + + kill(getpid(), SIGKILL); +} + +void check_debug() +{ + if (safe_mce_sys().log_level >= VLOG_DEBUG) { + vlog_printf(VLOG_WARNING, "*************************************************************\n"); + vlog_printf(VLOG_WARNING, "* VMA is currently configured with high log level *\n"); + vlog_printf(VLOG_WARNING, "* Application performance will decrease in this log level! 
*\n"); + vlog_printf(VLOG_WARNING, "* This log level is recommended for debugging purposes only *\n"); + vlog_printf(VLOG_WARNING, "*************************************************************\n"); + } +} + +void check_cpu_speed() +{ + double hz_min = -1, hz_max = -1; + if (!get_cpu_hz(hz_min, hz_max)) { + vlog_printf(VLOG_DEBUG, "***************************************************************************\n"); + vlog_printf(VLOG_DEBUG, "Failure in reading CPU speeds\n"); + vlog_printf(VLOG_DEBUG, "Time measurements will not be accurate and Max Performance might not be achieved\n"); + vlog_printf(VLOG_DEBUG, "Verify with: cat /proc/cpuinfo | grep \"MHz\\|clock\"\n"); + vlog_printf(VLOG_DEBUG, "***************************************************************************\n"); + } + else if (!compare_double(hz_min, hz_max)) { + // CPU cores are running at different speed + // Machine is probably running not in high performance configuration + vlog_printf(VLOG_DEBUG, "***************************************************************************\n"); + vlog_printf(VLOG_DEBUG, "CPU cores are running at different speeds: min= %.3lf MHz, max= %.3lf MHz\n", hz_min/1e6, hz_max/1e6); + vlog_printf(VLOG_DEBUG, "Time measurements will not be accurate and Max Performance might not be achieved\n"); + vlog_printf(VLOG_DEBUG, "Verify with: cat /proc/cpuinfo | grep \"MHz\\|clock\"\n"); + vlog_printf(VLOG_DEBUG, "***************************************************************************\n"); + } + else { + // CPU cores are all running at identical speed + vlog_printf(VLOG_DEBUG, "CPU speed for all cores is: %.3lf MHz\n", hz_min/1e6); + } +} + +void check_locked_mem() +{ + struct rlimit rlim; + if (getrlimit(RLIMIT_MEMLOCK, &rlim) == 0 && rlim.rlim_max != RLIM_INFINITY) { + vlog_printf(VLOG_WARNING, "************************************************************************\n"); + vlog_printf(VLOG_WARNING, "Your current max locked memory is: %ld. 
Please change it to unlimited.\n", rlim.rlim_max); + vlog_printf(VLOG_WARNING, "Set this user's default to `ulimit -l unlimited`.\n"); + vlog_printf(VLOG_WARNING, "Read more about this topic in the VMA's User Manual.\n"); + vlog_printf(VLOG_WARNING, "************************************************************************\n"); + } +} + +const char* thread_mode_str(thread_mode_t thread_mode) +{ + switch (thread_mode) { + case THREAD_MODE_SINGLE: return "Single"; + case THREAD_MODE_MULTI: return "Multi spin lock"; + case THREAD_MODE_MUTEX: return "Multi mutex lock"; + case THREAD_MODE_PLENTY: return "Multi more threads than cores"; + default: break; + } + return ""; +} + +const char* buffer_batching_mode_str(buffer_batching_mode_t buffer_batching_mode) +{ + switch (buffer_batching_mode) { + case BUFFER_BATCHING_NONE: return "(No batching buffers)"; + case BUFFER_BATCHING_WITH_RECLAIM: return "(Batch and reclaim buffers)"; + case BUFFER_BATCHING_NO_RECLAIM: return "(Batch and don't reclaim buffers)"; + default: break; + } + return ""; +} + +#define FORMAT_NUMBER "%-30s %-26d [%s]\n" +#define FORMAT_STRING "%-30s %-26s [%s]\n" +#define FORMAT_NUMSTR "%-30s %-2d%-24s [%s]\n" + + +#define VLOG_STR_PARAM_DETAILS(param_val, param_def_val, args...) \ + do { \ + if (param_val && strcmp(param_val, param_def_val)) { \ + vlog_printf(VLOG_INFO, ##args); \ + } \ + else { \ + vlog_printf(VLOG_DETAILS, ##args); \ + } \ + } while (0); + +#define VLOG_NUM_PARAM_DETAILS(param_val, param_def_val, args...) 
\ + do { \ + if (param_val != param_def_val) { \ + vlog_printf(VLOG_INFO, ##args); \ + } \ + else { \ + vlog_printf(VLOG_DETAILS, ##args); \ + } \ + } while (0); + +#define VLOG_STR_PARAM_STRING(param_desc, param_val, param_def_val, param_name, val_desc_str) \ + VLOG_STR_PARAM_DETAILS (param_val, param_def_val, FORMAT_STRING, param_desc, val_desc_str, param_name) \ + +#define VLOG_PARAM_NUMBER(param_desc, param_val, param_def_val, param_name) \ + VLOG_NUM_PARAM_DETAILS (param_val, param_def_val, FORMAT_NUMBER, param_desc, param_val, param_name) + +#define VLOG_PARAM_STRING(param_desc, param_val, param_def_val, param_name, val_desc_str) \ + VLOG_NUM_PARAM_DETAILS (param_val, param_def_val, FORMAT_STRING, param_desc, val_desc_str, param_name) + +#define VLOG_PARAM_NUMSTR(param_desc, param_val, param_def_val, param_name, val_desc_str) \ + VLOG_NUM_PARAM_DETAILS (param_val, param_def_val, FORMAT_NUMSTR, param_desc, param_val, val_desc_str, param_name) + +int get_ofed_version_info(char* ofed_version_str, int len) +{ + return run_and_retreive_system_command("ofed_info -s 2>/dev/null | head -1 | tr -d '\n'", ofed_version_str, len); +} + +void print_vma_global_settings() +{ + struct utsname sys_info; + time_t clock = time(NULL); + char ofed_version_info[MAX_VERSION_STR_LEN]; + + vlog_printf(VLOG_INFO,"---------------------------------------------------------------------------\n"); + vlog_printf(VLOG_INFO,"%s\n", vma_version_str); + if (VMA_GIT_VERSION[0]) { + vlog_printf(VLOG_INFO,"%s\n", "Git: " VMA_GIT_VERSION); + } + vlog_printf(VLOG_INFO,"Cmd Line: %s\n", safe_mce_sys().app_name); + + // Use DEBUG level logging with more details in RPM release builds + vlog_levels_t log_level = VLOG_DEBUG; +#ifndef VMA_SVN_REVISION + // If non RPM (development builds) use more verbosity + log_level = VLOG_DEFAULT; +#endif + vlog_printf(log_level,"Current Time: %s", ctime(&clock)); + vlog_printf(log_level,"Pid: %5u\n", getpid()); + + ofed_version_info[0] = '\0'; + int ret = 
get_ofed_version_info(ofed_version_info, MAX_VERSION_STR_LEN); + if (!ret && strlen(ofed_version_info) > 0) { + vlog_printf(VLOG_INFO,"OFED Version: %s\n", ofed_version_info); + } + + if (!uname(&sys_info)) { + vlog_printf(VLOG_DEBUG,"System: %s\n", sys_info.release); + vlog_printf(log_level,"Architecture: %s\n", sys_info.machine); + vlog_printf(log_level,"Node: %s\n", sys_info.nodename); + } + + vlog_printf(VLOG_INFO,"---------------------------------------------------------------------------\n"); + + if (safe_mce_sys().mce_spec != MCE_SPEC_NONE) { + vlog_printf(VLOG_INFO, FORMAT_STRING, "VMA Spec", vma_spec::to_str((vma_spec_t)safe_mce_sys().mce_spec), SYS_VAR_SPEC); + + if (safe_mce_sys().mce_spec == MCE_SPEC_29WEST_LBM_29 || safe_mce_sys().mce_spec == MCE_SPEC_WOMBAT_FH_LBM_554) { + vlog_printf(VLOG_INFO, FORMAT_NUMBER, "Param 1:", safe_mce_sys().mce_spec_param1, SYS_VAR_SPEC_PARAM1); + vlog_printf(VLOG_INFO, FORMAT_NUMBER, "Param 2:", safe_mce_sys().mce_spec_param2, SYS_VAR_SPEC_PARAM2); + } + } + + VLOG_STR_PARAM_STRING("Log Level", log_level::to_str(safe_mce_sys().log_level), "", SYS_VAR_LOG_LEVEL, log_level::to_str(safe_mce_sys().log_level)); + VLOG_PARAM_NUMBER("Log Details", safe_mce_sys().log_details, MCE_DEFAULT_LOG_DETAILS, SYS_VAR_LOG_DETAILS); + VLOG_PARAM_STRING("Log Colors", safe_mce_sys().log_colors, MCE_DEFAULT_LOG_COLORS, SYS_VAR_LOG_COLORS, safe_mce_sys().log_colors ? 
"Enabled " : "Disabled"); + VLOG_STR_PARAM_STRING("Log File", safe_mce_sys().log_filename, MCE_DEFAULT_LOG_FILE, SYS_VAR_LOG_FILENAME, safe_mce_sys().log_filename); + VLOG_STR_PARAM_STRING("Stats File", safe_mce_sys().stats_filename, MCE_DEFAULT_STATS_FILE, SYS_VAR_STATS_FILENAME, safe_mce_sys().stats_filename); + VLOG_STR_PARAM_STRING("Stats shared memory directory", safe_mce_sys().stats_shmem_dirname, MCE_DEFAULT_STATS_SHMEM_DIR, SYS_VAR_STATS_SHMEM_DIRNAME, safe_mce_sys().stats_shmem_dirname); + VLOG_STR_PARAM_STRING("VMAD output directory", safe_mce_sys().vmad_notify_dir, MCE_DEFAULT_VMAD_FOLDER, SYS_VAR_VMAD_DIR, safe_mce_sys().vmad_notify_dir); + VLOG_PARAM_NUMBER("Stats FD Num (max)", safe_mce_sys().stats_fd_num_max, MCE_DEFAULT_STATS_FD_NUM, SYS_VAR_STATS_FD_NUM); + VLOG_STR_PARAM_STRING("Conf File", safe_mce_sys().conf_filename, MCE_DEFAULT_CONF_FILE, SYS_VAR_CONF_FILENAME, safe_mce_sys().conf_filename); + VLOG_STR_PARAM_STRING("Application ID", safe_mce_sys().app_id, MCE_DEFAULT_APP_ID, SYS_VAR_APPLICATION_ID, safe_mce_sys().app_id); + VLOG_PARAM_STRING("Polling CPU idle usage", safe_mce_sys().select_handle_cpu_usage_stats, MCE_DEFAULT_SELECT_CPU_USAGE_STATS, SYS_VAR_SELECT_CPU_USAGE_STATS, safe_mce_sys().select_handle_cpu_usage_stats ? "Enabled " : "Disabled"); + VLOG_PARAM_STRING("SigIntr Ctrl-C Handle", safe_mce_sys().handle_sigintr, MCE_DEFAULT_HANDLE_SIGINTR, SYS_VAR_HANDLE_SIGINTR, safe_mce_sys().handle_sigintr ? "Enabled " : "Disabled"); + VLOG_PARAM_STRING("SegFault Backtrace", safe_mce_sys().handle_segfault, MCE_DEFAULT_HANDLE_SIGFAULT, SYS_VAR_HANDLE_SIGSEGV, safe_mce_sys().handle_segfault ? 
"Enabled " : "Disabled"); + + + VLOG_PARAM_NUMSTR("Ring allocation logic TX", safe_mce_sys().ring_allocation_logic_tx, MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX, SYS_VAR_RING_ALLOCATION_LOGIC_TX, ring_logic_str(safe_mce_sys().ring_allocation_logic_tx)); + VLOG_PARAM_NUMSTR("Ring allocation logic RX", safe_mce_sys().ring_allocation_logic_rx, MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX, SYS_VAR_RING_ALLOCATION_LOGIC_RX, ring_logic_str(safe_mce_sys().ring_allocation_logic_rx)); + if (safe_mce_sys().ring_allocation_logic_rx == RING_LOGIC_PER_USER_ID) { + vlog_printf(VLOG_WARNING,"user_id is not supported using " + "environment variable , use etra_api, using default\n"); + safe_mce_sys().ring_allocation_logic_rx = MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX; + } + + if (safe_mce_sys().ring_allocation_logic_tx == RING_LOGIC_PER_USER_ID) { + vlog_printf(VLOG_WARNING,"user_id is not supported using " + "environment variable , use etra_api, using default\n"); + safe_mce_sys().ring_allocation_logic_tx = MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX; + } + + VLOG_PARAM_NUMBER("Ring migration ratio TX", safe_mce_sys().ring_migration_ratio_tx, MCE_DEFAULT_RING_MIGRATION_RATIO_TX, SYS_VAR_RING_MIGRATION_RATIO_TX); + VLOG_PARAM_NUMBER("Ring migration ratio RX", safe_mce_sys().ring_migration_ratio_rx, MCE_DEFAULT_RING_MIGRATION_RATIO_RX, SYS_VAR_RING_MIGRATION_RATIO_RX); + + if (safe_mce_sys().ring_limit_per_interface) { + VLOG_PARAM_NUMBER("Ring limit per interface", safe_mce_sys().ring_limit_per_interface, MCE_DEFAULT_RING_LIMIT_PER_INTERFACE, SYS_VAR_RING_LIMIT_PER_INTERFACE); + } else { + VLOG_PARAM_NUMSTR("Ring limit per interface", safe_mce_sys().ring_limit_per_interface, MCE_DEFAULT_RING_LIMIT_PER_INTERFACE, SYS_VAR_RING_LIMIT_PER_INTERFACE, "(no limit)"); + } + + VLOG_PARAM_NUMBER("Ring On Device Memory TX", safe_mce_sys().ring_dev_mem_tx, MCE_DEFAULT_RING_DEV_MEM_TX, SYS_VAR_RING_DEV_MEM_TX); + + if (safe_mce_sys().tcp_max_syn_rate) { + VLOG_PARAM_NUMSTR("TCP max syn rate", 
safe_mce_sys().tcp_max_syn_rate, MCE_DEFAULT_TCP_MAX_SYN_RATE, SYS_VAR_TCP_MAX_SYN_RATE, "(per sec)"); + } else { + VLOG_PARAM_NUMSTR("TCP max syn rate", safe_mce_sys().tcp_max_syn_rate, MCE_DEFAULT_TCP_MAX_SYN_RATE, SYS_VAR_TCP_MAX_SYN_RATE, "(no limit)"); + } + + VLOG_PARAM_NUMBER("Tx Mem Segs TCP", safe_mce_sys().tx_num_segs_tcp, MCE_DEFAULT_TX_NUM_SEGS_TCP, SYS_VAR_TX_NUM_SEGS_TCP); + VLOG_PARAM_NUMBER("Tx Mem Bufs", safe_mce_sys().tx_num_bufs, MCE_DEFAULT_TX_NUM_BUFS, SYS_VAR_TX_NUM_BUFS); +#ifdef DEFINED_TSO + VLOG_PARAM_NUMBER("Tx Mem Buf size", safe_mce_sys().tx_buf_size, MCE_DEFAULT_TX_BUF_SIZE, SYS_VAR_TX_BUF_SIZE); +#endif /* DEFINED_TSO */ + VLOG_PARAM_NUMBER("Tx QP WRE", safe_mce_sys().tx_num_wr, MCE_DEFAULT_TX_NUM_WRE, SYS_VAR_TX_NUM_WRE); + VLOG_PARAM_NUMBER("Tx QP WRE Batching", safe_mce_sys().tx_num_wr_to_signal, MCE_DEFAULT_TX_NUM_WRE_TO_SIGNAL, SYS_VAR_TX_NUM_WRE_TO_SIGNAL); + VLOG_PARAM_NUMBER("Tx Max QP INLINE", safe_mce_sys().tx_max_inline, MCE_DEFAULT_TX_MAX_INLINE, SYS_VAR_TX_MAX_INLINE); + VLOG_PARAM_STRING("Tx MC Loopback", safe_mce_sys().tx_mc_loopback_default, MCE_DEFAULT_TX_MC_LOOPBACK, SYS_VAR_TX_MC_LOOPBACK, safe_mce_sys().tx_mc_loopback_default ? "Enabled " : "Disabled"); + VLOG_PARAM_STRING("Tx non-blocked eagains", safe_mce_sys().tx_nonblocked_eagains, MCE_DEFAULT_TX_NONBLOCKED_EAGAINS, SYS_VAR_TX_NONBLOCKED_EAGAINS, safe_mce_sys().tx_nonblocked_eagains ? 
"Enabled " : "Disabled"); + VLOG_PARAM_NUMBER("Tx Prefetch Bytes", safe_mce_sys().tx_prefetch_bytes, MCE_DEFAULT_TX_PREFETCH_BYTES, SYS_VAR_TX_PREFETCH_BYTES); + + VLOG_PARAM_NUMBER("Rx Mem Bufs", safe_mce_sys().rx_num_bufs, MCE_DEFAULT_RX_NUM_BUFS, SYS_VAR_RX_NUM_BUFS); + VLOG_PARAM_NUMBER("Rx QP WRE", safe_mce_sys().rx_num_wr, MCE_DEFAULT_RX_NUM_WRE, SYS_VAR_RX_NUM_WRE); + VLOG_PARAM_NUMBER("Rx QP WRE Batching", safe_mce_sys().rx_num_wr_to_post_recv, MCE_DEFAULT_RX_NUM_WRE_TO_POST_RECV, SYS_VAR_RX_NUM_WRE_TO_POST_RECV); + VLOG_PARAM_NUMBER("Rx Byte Min Limit", safe_mce_sys().rx_ready_byte_min_limit, MCE_DEFAULT_RX_BYTE_MIN_LIMIT, SYS_VAR_RX_BYTE_MIN_LIMIT); + VLOG_PARAM_NUMBER("Rx Poll Loops", safe_mce_sys().rx_poll_num, MCE_DEFAULT_RX_NUM_POLLS, SYS_VAR_RX_NUM_POLLS); + VLOG_PARAM_NUMBER("Rx Poll Init Loops", safe_mce_sys().rx_poll_num_init, MCE_DEFAULT_RX_NUM_POLLS_INIT, SYS_VAR_RX_NUM_POLLS_INIT); + if (safe_mce_sys().rx_udp_poll_os_ratio) { + VLOG_PARAM_NUMBER("Rx UDP Poll OS Ratio", safe_mce_sys().rx_udp_poll_os_ratio, MCE_DEFAULT_RX_UDP_POLL_OS_RATIO, SYS_VAR_RX_UDP_POLL_OS_RATIO); + } else { + VLOG_PARAM_STRING("Rx UDP Poll OS Ratio", safe_mce_sys().rx_udp_poll_os_ratio, MCE_DEFAULT_RX_UDP_POLL_OS_RATIO, SYS_VAR_RX_UDP_POLL_OS_RATIO, "Disabled"); + } + + VLOG_PARAM_NUMBER("HW TS Conversion", safe_mce_sys().hw_ts_conversion_mode, MCE_DEFAULT_HW_TS_CONVERSION_MODE, SYS_VAR_HW_TS_CONVERSION_MODE); + + if (safe_mce_sys().rx_poll_yield_loops) { + VLOG_PARAM_NUMBER("Rx Poll Yield", safe_mce_sys().rx_poll_yield_loops, MCE_DEFAULT_RX_POLL_YIELD, SYS_VAR_RX_POLL_YIELD); + } + else { + VLOG_PARAM_STRING("Rx Poll Yield", safe_mce_sys().rx_poll_yield_loops, MCE_DEFAULT_RX_POLL_YIELD, SYS_VAR_RX_POLL_YIELD, "Disabled"); + } + VLOG_PARAM_NUMBER("Rx Prefetch Bytes", safe_mce_sys().rx_prefetch_bytes, MCE_DEFAULT_RX_PREFETCH_BYTES, SYS_VAR_RX_PREFETCH_BYTES); + + VLOG_PARAM_NUMBER("Rx Prefetch Bytes Before Poll", safe_mce_sys().rx_prefetch_bytes_before_poll, 
MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL, SYS_VAR_RX_PREFETCH_BYTES_BEFORE_POLL); + + if (safe_mce_sys().rx_cq_drain_rate_nsec == MCE_RX_CQ_DRAIN_RATE_DISABLED) { + VLOG_PARAM_STRING("Rx CQ Drain Rate", safe_mce_sys().rx_cq_drain_rate_nsec, MCE_DEFAULT_RX_CQ_DRAIN_RATE, SYS_VAR_RX_CQ_DRAIN_RATE_NSEC, "Disabled"); + } + else { + VLOG_PARAM_NUMBER("Rx CQ Drain Rate (nsec)", safe_mce_sys().rx_cq_drain_rate_nsec, MCE_DEFAULT_RX_CQ_DRAIN_RATE, SYS_VAR_RX_CQ_DRAIN_RATE_NSEC); + } + + VLOG_PARAM_NUMBER("GRO max streams", safe_mce_sys().gro_streams_max, MCE_DEFAULT_GRO_STREAMS_MAX, SYS_VAR_GRO_STREAMS_MAX); + + VLOG_PARAM_STRING("TCP 3T rules", safe_mce_sys().tcp_3t_rules, MCE_DEFAULT_TCP_3T_RULES, SYS_VAR_TCP_3T_RULES, safe_mce_sys().tcp_3t_rules ? "Enabled " : "Disabled"); + VLOG_PARAM_STRING("ETH MC L2 only rules", safe_mce_sys().eth_mc_l2_only_rules, MCE_DEFAULT_ETH_MC_L2_ONLY_RULES, SYS_VAR_ETH_MC_L2_ONLY_RULES, safe_mce_sys().eth_mc_l2_only_rules ? "Enabled " : "Disabled"); + VLOG_PARAM_STRING("Force Flowtag for MC", safe_mce_sys().mc_force_flowtag, MCE_DEFAULT_MC_FORCE_FLOWTAG, SYS_VAR_MC_FORCE_FLOWTAG, safe_mce_sys().mc_force_flowtag ? "Enabled " : "Disabled"); + + VLOG_PARAM_NUMBER("Select Poll (usec)", safe_mce_sys().select_poll_num, MCE_DEFAULT_SELECT_NUM_POLLS, SYS_VAR_SELECT_NUM_POLLS); + VLOG_PARAM_STRING("Select Poll OS Force", safe_mce_sys().select_poll_os_force, MCE_DEFAULT_SELECT_POLL_OS_FORCE, SYS_VAR_SELECT_POLL_OS_FORCE, safe_mce_sys().select_poll_os_force ? 
"Enabled " : "Disabled"); + + if (safe_mce_sys().select_poll_os_ratio) { + VLOG_PARAM_NUMBER("Select Poll OS Ratio", safe_mce_sys().select_poll_os_ratio, MCE_DEFAULT_SELECT_POLL_OS_RATIO, SYS_VAR_SELECT_POLL_OS_RATIO); + } + else { + VLOG_PARAM_STRING("Select Poll OS Ratio", safe_mce_sys().select_poll_os_ratio, MCE_DEFAULT_SELECT_POLL_OS_RATIO, SYS_VAR_SELECT_POLL_OS_RATIO, "Disabled"); + } + + if (safe_mce_sys().select_skip_os_fd_check) { + VLOG_PARAM_NUMBER("Select Skip OS", safe_mce_sys().select_skip_os_fd_check, MCE_DEFAULT_SELECT_SKIP_OS, SYS_VAR_SELECT_SKIP_OS); + } + else { + VLOG_PARAM_STRING("Select Skip OS", safe_mce_sys().select_skip_os_fd_check, MCE_DEFAULT_SELECT_SKIP_OS, SYS_VAR_SELECT_SKIP_OS, "Disabled"); + } + + if (safe_mce_sys().progress_engine_interval_msec == MCE_CQ_DRAIN_INTERVAL_DISABLED || safe_mce_sys().progress_engine_wce_max == 0) { + vlog_printf(VLOG_INFO, FORMAT_STRING, "CQ Drain Thread", "Disabled", SYS_VAR_PROGRESS_ENGINE_INTERVAL); + } + else { + VLOG_PARAM_NUMBER("CQ Drain Interval (msec)", safe_mce_sys().progress_engine_interval_msec, MCE_DEFAULT_PROGRESS_ENGINE_INTERVAL_MSEC, SYS_VAR_PROGRESS_ENGINE_INTERVAL); + VLOG_PARAM_NUMBER("CQ Drain WCE (max)", safe_mce_sys().progress_engine_wce_max, MCE_DEFAULT_PROGRESS_ENGINE_WCE_MAX, SYS_VAR_PROGRESS_ENGINE_WCE_MAX); + } + + VLOG_PARAM_STRING("CQ Interrupts Moderation", safe_mce_sys().cq_moderation_enable, MCE_DEFAULT_CQ_MODERATION_ENABLE, SYS_VAR_CQ_MODERATION_ENABLE, safe_mce_sys().cq_moderation_enable ? 
"Enabled " : "Disabled"); + VLOG_PARAM_NUMBER("CQ Moderation Count", safe_mce_sys().cq_moderation_count, MCE_DEFAULT_CQ_MODERATION_COUNT, SYS_VAR_CQ_MODERATION_COUNT); + VLOG_PARAM_NUMBER("CQ Moderation Period (usec)", safe_mce_sys().cq_moderation_period_usec, MCE_DEFAULT_CQ_MODERATION_PERIOD_USEC, SYS_VAR_CQ_MODERATION_PERIOD_USEC); + VLOG_PARAM_NUMBER("CQ AIM Max Count", safe_mce_sys().cq_aim_max_count, MCE_DEFAULT_CQ_AIM_MAX_COUNT, SYS_VAR_CQ_AIM_MAX_COUNT); + VLOG_PARAM_NUMBER("CQ AIM Max Period (usec)", safe_mce_sys().cq_aim_max_period_usec, MCE_DEFAULT_CQ_AIM_MAX_PERIOD_USEC, SYS_VAR_CQ_AIM_MAX_PERIOD_USEC); + if (safe_mce_sys().cq_aim_interval_msec == MCE_CQ_ADAPTIVE_MODERATION_DISABLED) { + vlog_printf(VLOG_INFO, FORMAT_STRING, "CQ Adaptive Moderation", "Disabled", SYS_VAR_CQ_AIM_INTERVAL_MSEC); + } else { + VLOG_PARAM_NUMBER("CQ AIM Interval (msec)", safe_mce_sys().cq_aim_interval_msec, MCE_DEFAULT_CQ_AIM_INTERVAL_MSEC, SYS_VAR_CQ_AIM_INTERVAL_MSEC); + } + VLOG_PARAM_NUMBER("CQ AIM Interrupts Rate (per sec)", safe_mce_sys().cq_aim_interrupts_rate_per_sec, MCE_DEFAULT_CQ_AIM_INTERRUPTS_RATE_PER_SEC, SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC); + + VLOG_PARAM_NUMBER("CQ Poll Batch (max)", safe_mce_sys().cq_poll_batch_max, MCE_DEFAULT_CQ_POLL_BATCH, SYS_VAR_CQ_POLL_BATCH_MAX); + VLOG_PARAM_STRING("CQ Keeps QP Full", safe_mce_sys().cq_keep_qp_full, MCE_DEFAULT_CQ_KEEP_QP_FULL, SYS_VAR_CQ_KEEP_QP_FULL, safe_mce_sys().cq_keep_qp_full ? "Enabled" : "Disabled"); + VLOG_PARAM_NUMBER("QP Compensation Level", safe_mce_sys().qp_compensation_level, MCE_DEFAULT_QP_COMPENSATION_LEVEL, SYS_VAR_QP_COMPENSATION_LEVEL); + VLOG_PARAM_STRING("Offloaded Sockets", safe_mce_sys().offloaded_sockets, MCE_DEFAULT_OFFLOADED_SOCKETS, SYS_VAR_OFFLOADED_SOCKETS, safe_mce_sys().offloaded_sockets ? 
"Enabled" : "Disabled"); + VLOG_PARAM_NUMBER("Timer Resolution (msec)", safe_mce_sys().timer_resolution_msec, MCE_DEFAULT_TIMER_RESOLUTION_MSEC, SYS_VAR_TIMER_RESOLUTION_MSEC); + VLOG_PARAM_NUMBER("TCP Timer Resolution (msec)", safe_mce_sys().tcp_timer_resolution_msec, MCE_DEFAULT_TCP_TIMER_RESOLUTION_MSEC, SYS_VAR_TCP_TIMER_RESOLUTION_MSEC); + VLOG_PARAM_NUMSTR("TCP control thread", safe_mce_sys().tcp_ctl_thread, MCE_DEFAULT_TCP_CTL_THREAD, SYS_VAR_TCP_CTL_THREAD, ctl_thread_str(safe_mce_sys().tcp_ctl_thread)); + VLOG_PARAM_NUMBER("TCP timestamp option", safe_mce_sys().tcp_ts_opt, MCE_DEFAULT_TCP_TIMESTAMP_OPTION, SYS_VAR_TCP_TIMESTAMP_OPTION); + VLOG_PARAM_NUMBER("TCP nodelay", safe_mce_sys().tcp_nodelay, MCE_DEFAULT_TCP_NODELAY, SYS_VAR_TCP_NODELAY); + VLOG_PARAM_NUMBER("TCP quickack", safe_mce_sys().tcp_quickack, MCE_DEFAULT_TCP_QUICKACK, SYS_VAR_TCP_QUICKACK); + VLOG_PARAM_NUMSTR(vma_exception_handling::getName(), (int)safe_mce_sys().exception_handling, vma_exception_handling::MODE_DEFAULT, vma_exception_handling::getSysVar(), safe_mce_sys().exception_handling.to_str()); + VLOG_PARAM_STRING("Avoid sys-calls on tcp fd", safe_mce_sys().avoid_sys_calls_on_tcp_fd, MCE_DEFAULT_AVOID_SYS_CALLS_ON_TCP_FD, SYS_VAR_AVOID_SYS_CALLS_ON_TCP_FD, safe_mce_sys().avoid_sys_calls_on_tcp_fd ? "Enabled" : "Disabled"); + VLOG_PARAM_STRING("Allow privileged sock opt", safe_mce_sys().allow_privileged_sock_opt, MCE_DEFAULT_ALLOW_PRIVILEGED_SOCK_OPT, SYS_VAR_ALLOW_PRIVILEGED_SOCK_OPT, safe_mce_sys().allow_privileged_sock_opt ? 
"Enabled" : "Disabled"); + VLOG_PARAM_NUMBER("Delay after join (msec)", safe_mce_sys().wait_after_join_msec, MCE_DEFAULT_WAIT_AFTER_JOIN_MSEC, SYS_VAR_WAIT_AFTER_JOIN_MSEC); + VLOG_STR_PARAM_STRING("Internal Thread Affinity", safe_mce_sys().internal_thread_affinity_str, MCE_DEFAULT_INTERNAL_THREAD_AFFINITY_STR, SYS_VAR_INTERNAL_THREAD_AFFINITY, safe_mce_sys().internal_thread_affinity_str); + VLOG_STR_PARAM_STRING("Internal Thread Cpuset", safe_mce_sys().internal_thread_cpuset, MCE_DEFAULT_INTERNAL_THREAD_CPUSET, SYS_VAR_INTERNAL_THREAD_CPUSET, safe_mce_sys().internal_thread_cpuset); + VLOG_PARAM_STRING("Internal Thread Arm CQ", safe_mce_sys().internal_thread_arm_cq_enabled, MCE_DEFAULT_INTERNAL_THREAD_ARM_CQ_ENABLED, SYS_VAR_INTERNAL_THREAD_ARM_CQ, safe_mce_sys().internal_thread_arm_cq_enabled ? "Enabled " : "Disabled"); + VLOG_PARAM_NUMSTR("Internal Thread TCP Handling", safe_mce_sys().internal_thread_tcp_timer_handling, MCE_DEFAULT_INTERNAL_THREAD_TCP_TIMER_HANDLING, SYS_VAR_INTERNAL_THREAD_TCP_TIMER_HANDLING, internal_thread_tcp_timer_handling_str(safe_mce_sys().internal_thread_tcp_timer_handling)); + VLOG_PARAM_STRING("Thread mode", safe_mce_sys().thread_mode, MCE_DEFAULT_THREAD_MODE, SYS_VAR_THREAD_MODE, thread_mode_str(safe_mce_sys().thread_mode)); + VLOG_PARAM_NUMSTR("Buffer batching mode", safe_mce_sys().buffer_batching_mode, MCE_DEFAULT_BUFFER_BATCHING_MODE, SYS_VAR_BUFFER_BATCHING_MODE, buffer_batching_mode_str(safe_mce_sys().buffer_batching_mode)); + switch (safe_mce_sys().mem_alloc_type) { + case ALLOC_TYPE_HUGEPAGES: + VLOG_PARAM_NUMSTR("Mem Allocate type", safe_mce_sys().mem_alloc_type, MCE_DEFAULT_MEM_ALLOC_TYPE, SYS_VAR_MEM_ALLOC_TYPE, "(Huge Pages)"); break; + case ALLOC_TYPE_ANON: + VLOG_PARAM_NUMSTR("Mem Allocate type", safe_mce_sys().mem_alloc_type, MCE_DEFAULT_MEM_ALLOC_TYPE, SYS_VAR_MEM_ALLOC_TYPE, "(Malloc)"); break; + case ALLOC_TYPE_CONTIG: + default: + VLOG_PARAM_NUMSTR("Mem Allocate type", safe_mce_sys().mem_alloc_type, 
MCE_DEFAULT_MEM_ALLOC_TYPE, SYS_VAR_MEM_ALLOC_TYPE, "(Contig Pages)"); break; + } + + VLOG_PARAM_NUMBER("Num of UC ARPs", safe_mce_sys().neigh_uc_arp_quata, MCE_DEFAULT_NEIGH_UC_ARP_QUATA, SYS_VAR_NEIGH_UC_ARP_QUATA); + VLOG_PARAM_NUMBER("UC ARP delay (msec)", safe_mce_sys().neigh_wait_till_send_arp_msec, MCE_DEFAULT_NEIGH_UC_ARP_DELAY_MSEC, SYS_VAR_NEIGH_UC_ARP_DELAY_MSEC); + VLOG_PARAM_NUMBER("Num of neigh restart retries", safe_mce_sys().neigh_num_err_retries, MCE_DEFAULT_NEIGH_NUM_ERR_RETRIES, SYS_VAR_NEIGH_NUM_ERR_RETRIES ); + + VLOG_PARAM_STRING("IPOIB support", safe_mce_sys().enable_ipoib, MCE_DEFAULT_IPOIB_FLAG, SYS_VAR_IPOIB, safe_mce_sys().enable_ipoib ? "Enabled " : "Disabled"); + VLOG_PARAM_STRING("SocketXtreme mode", safe_mce_sys().enable_socketxtreme, MCE_DEFAULT_SOCKETXTREME, SYS_VAR_SOCKETXTREME, safe_mce_sys().enable_socketxtreme ? "Enabled " : "Disabled"); +#ifdef DEFINED_TSO + VLOG_PARAM_STRING("TSO support", safe_mce_sys().enable_tso, MCE_DEFAULT_TSO, SYS_VAR_TSO, safe_mce_sys().enable_tso ? "Enabled " : "Disabled"); +#endif /* DEFINED_TSO */ + VLOG_PARAM_STRING("BF (Blue Flame)", safe_mce_sys().handle_bf, MCE_DEFAULT_BF_FLAG, SYS_VAR_BF, safe_mce_sys().handle_bf ? "Enabled " : "Disabled"); + VLOG_PARAM_STRING("fork() support", safe_mce_sys().handle_fork, MCE_DEFAULT_FORK_SUPPORT, SYS_VAR_FORK, safe_mce_sys().handle_fork ? "Enabled " : "Disabled"); + VLOG_PARAM_STRING("close on dup2()", safe_mce_sys().close_on_dup2, MCE_DEFAULT_CLOSE_ON_DUP2, SYS_VAR_CLOSE_ON_DUP2, safe_mce_sys().close_on_dup2 ? 
"Enabled " : "Disabled"); + switch (safe_mce_sys().mtu) { + case MTU_FOLLOW_INTERFACE: + VLOG_PARAM_NUMSTR("MTU", safe_mce_sys().mtu, MCE_DEFAULT_MTU, SYS_VAR_MTU, "(follow actual MTU)"); break; + default: + VLOG_PARAM_NUMBER("MTU", safe_mce_sys().mtu, MCE_DEFAULT_MTU, SYS_VAR_MTU); break; + } + switch (safe_mce_sys().lwip_mss) { + case MSS_FOLLOW_MTU: + VLOG_PARAM_NUMSTR("MSS", safe_mce_sys().lwip_mss, MCE_DEFAULT_MSS, SYS_VAR_MSS, "(follow VMA_MTU)"); break; + default: + VLOG_PARAM_NUMBER("MSS", safe_mce_sys().lwip_mss, MCE_DEFAULT_MSS, SYS_VAR_MSS); break; + } + VLOG_PARAM_NUMSTR("TCP CC Algorithm", safe_mce_sys().lwip_cc_algo_mod, MCE_DEFAULT_LWIP_CC_ALGO_MOD, SYS_VAR_TCP_CC_ALGO, lwip_cc_algo_str(safe_mce_sys().lwip_cc_algo_mod)); + VLOG_PARAM_STRING("Polling Rx on Tx TCP", safe_mce_sys().rx_poll_on_tx_tcp, MCE_DEFAULT_RX_POLL_ON_TX_TCP, SYS_VAR_VMA_RX_POLL_ON_TX_TCP, safe_mce_sys().rx_poll_on_tx_tcp ? "Enabled " : "Disabled"); + VLOG_PARAM_STRING("Trig dummy send getsockname()", safe_mce_sys().trigger_dummy_send_getsockname, MCE_DEFAULT_TRIGGER_DUMMY_SEND_GETSOCKNAME, SYS_VAR_VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME, safe_mce_sys().trigger_dummy_send_getsockname ? 
"Enabled " : "Disabled"); + +#ifdef VMA_TIME_MEASURE + VLOG_PARAM_NUMBER("Time Measure Num Samples", safe_mce_sys().vma_time_measure_num_samples, MCE_DEFAULT_TIME_MEASURE_NUM_SAMPLES, SYS_VAR_VMA_TIME_MEASURE_NUM_SAMPLES); + VLOG_STR_PARAM_STRING("Time Measure Dump File", safe_mce_sys().vma_time_measure_filename, MCE_DEFAULT_TIME_MEASURE_DUMP_FILE, SYS_VAR_VMA_TIME_MEASURE_DUMP_FILE, safe_mce_sys().vma_time_measure_filename); +#endif + + vlog_printf(VLOG_INFO,"---------------------------------------------------------------------------\n"); +} + +void prepare_fork() +{ + if (safe_mce_sys().handle_fork && !g_init_ibv_fork_done) { + IF_VERBS_FAILURE(ibv_fork_init()) { + vlog_printf(VLOG_DEBUG,"ibv_fork_init failed (errno=%d %m)\n", errno); + vlog_printf(VLOG_ERROR, "************************************************************************\n"); + vlog_printf(VLOG_ERROR, "ibv_fork_init() failed! The effect of the application calling 'fork()' is undefined!\n"); + vlog_printf(VLOG_ERROR, "Read the fork section in the VMA's User Manual for more information\n"); + vlog_printf(VLOG_ERROR, "************************************************************************\n"); + } + else { + g_init_ibv_fork_done = true; + vlog_printf(VLOG_DEBUG,"ibv_fork_init() succeeded, fork() may be used safely!!\n"); + } ENDIF_VERBS_FAILURE; + } +} + +void register_handler_segv() +{ + struct sigaction act; + memset(&act, 0, sizeof(act)); + act.sa_handler = handle_segfault; + act.sa_flags = 0; + sigemptyset(&act.sa_mask); + sigaction(SIGSEGV, &act, NULL); + vlog_printf(VLOG_INFO, "Registered a SIGSEGV handler\n"); +} + +extern "C" void sock_redirect_main(void) +{ + vlog_printf(VLOG_DEBUG, "%s()\n", __FUNCTION__); +// int ret = atexit(sock_redirect_exit); +// if (ret) +// vlog_printf(VLOG_ERROR, "%s() ERROR at atexit() (ret=%d %m)\n", __FUNCTION__, ret); + + tv_clear(&g_last_zero_polling_time); + + if (safe_mce_sys().handle_segfault) { + register_handler_segv(); + } + +#ifdef RDTSC_MEASURE + 
init_rdtsc(); +#endif + +#ifdef VMA_TIME_MEASURE + init_instrumentation(); +#endif +} + +extern "C" void sock_redirect_exit(void) +{ +#ifdef RDTSC_MEASURE + print_rdtsc_summary(); +#endif +#ifdef VMA_TIME_MEASURE + finit_instrumentation(safe_mce_sys().vma_time_measure_filename); +#endif + vlog_printf(VLOG_DEBUG, "%s()\n", __FUNCTION__); + vma_shmem_stats_close(); +} + +#define NEW_CTOR(ptr, ctor) \ +do { \ + if (!ptr) { \ + ptr = new ctor; \ + BULLSEYE_EXCLUDE_BLOCK_START \ + if (ptr == NULL) { \ + throw_vma_exception("Failed allocate " #ctor "\n"); \ + return; \ + } \ + BULLSEYE_EXCLUDE_BLOCK_END \ + } \ +} while (0); + +static void do_global_ctors_helper() +{ + static lock_spin_recursive g_globals_lock; + auto_unlocker lock(g_globals_lock); + + if (g_init_global_ctors_done) { + return; + } + g_init_global_ctors_done = true; + + set_env_params(); + prepare_fork(); + + if (g_is_forked_child == true) + g_is_forked_child = false; + + /* Open communication with daemon */ + NEW_CTOR(g_p_agent, agent()); + vlog_printf(VLOG_DEBUG,"Agent setup state: g_p_agent=%p active=%d\n", + g_p_agent, (g_p_agent ? g_p_agent->state() : -1)); + + // Create all global management objects + NEW_CTOR(g_p_event_handler_manager, event_handler_manager()); + + vma_shmem_stats_open(&g_p_vlogger_level, &g_p_vlogger_details); + *g_p_vlogger_level = g_vlogger_level; + *g_p_vlogger_details = g_vlogger_details; + + //Create new netlink listener + NEW_CTOR(g_p_netlink_handler, netlink_wrapper()); + + NEW_CTOR(g_p_ib_ctx_handler_collection, ib_ctx_handler_collection()); + + NEW_CTOR(g_p_neigh_table_mgr, neigh_table_mgr()); + + // net_device should be initialized after event_handler and before buffer pool and g_p_neigh_table_mgr. 
+ NEW_CTOR(g_p_net_device_table_mgr, net_device_table_mgr()); + + NEW_CTOR(g_p_rule_table_mgr, rule_table_mgr()); + + NEW_CTOR(g_p_route_table_mgr, route_table_mgr()); + + NEW_CTOR(g_p_igmp_mgr, igmp_mgr()); + + NEW_CTOR(g_buffer_pool_rx, buffer_pool(safe_mce_sys().rx_num_bufs, + RX_BUF_SIZE(g_p_net_device_table_mgr->get_max_mtu()), + buffer_pool::free_rx_lwip_pbuf_custom)); + g_buffer_pool_rx->set_RX_TX_for_stats(true); + + #ifdef DEFINED_TSO + safe_mce_sys().tx_buf_size = MIN((int)safe_mce_sys().tx_buf_size, (int)0xFF00); + if (safe_mce_sys().tx_buf_size <= get_lwip_tcp_mss(g_p_net_device_table_mgr->get_max_mtu(), safe_mce_sys().lwip_mss)) { + safe_mce_sys().tx_buf_size = 0; + } + NEW_CTOR(g_buffer_pool_tx, buffer_pool(safe_mce_sys().tx_num_bufs, + TX_BUF_SIZE(safe_mce_sys().tx_buf_size ? + safe_mce_sys().tx_buf_size : + get_lwip_tcp_mss(g_p_net_device_table_mgr->get_max_mtu(), safe_mce_sys().lwip_mss)), + buffer_pool::free_tx_lwip_pbuf_custom)); +#else + NEW_CTOR(g_buffer_pool_tx, buffer_pool(safe_mce_sys().tx_num_bufs, + TX_BUF_SIZE(get_lwip_tcp_mss(g_p_net_device_table_mgr->get_max_mtu(), safe_mce_sys().lwip_mss)), + buffer_pool::free_tx_lwip_pbuf_custom)); +#endif /* DEFINED_TSO */ + g_buffer_pool_tx->set_RX_TX_for_stats(false); + + NEW_CTOR(g_tcp_seg_pool, tcp_seg_pool(safe_mce_sys().tx_num_segs_tcp)); + + NEW_CTOR(g_tcp_timers_collection, tcp_timers_collection(safe_mce_sys().tcp_timer_resolution_msec, safe_mce_sys().timer_resolution_msec)); + + NEW_CTOR(g_p_vlogger_timer_handler, vlogger_timer_handler()); + + NEW_CTOR(g_p_ip_frag_manager, ip_frag_manager()); + + NEW_CTOR(g_p_fd_collection, fd_collection()); + + if (check_if_regular_file (safe_mce_sys().conf_filename)) + { + vlog_printf(VLOG_WARNING,"FAILED to read VMA configuration file. 
%s is not a regular file.\n", + safe_mce_sys().conf_filename); + if (strcmp (MCE_DEFAULT_CONF_FILE, safe_mce_sys().conf_filename)) + vlog_printf(VLOG_INFO,"Please see README.txt section regarding VMA_CONFIG_FILE\n"); + } + else if (__vma_parse_config_file(safe_mce_sys().conf_filename)) + vlog_printf(VLOG_DEBUG,"FAILED to read VMA configuration file: %s\n", safe_mce_sys().conf_filename); + + + // initialize LWIP tcp/ip stack + NEW_CTOR(g_p_lwip, vma_lwip()); + + if (g_p_netlink_handler) { + // Open netlink socket + BULLSEYE_EXCLUDE_BLOCK_START + if (g_p_netlink_handler->open_channel()) { + throw_vma_exception("Failed in netlink open_channel()\n"); + } + + int fd = g_p_netlink_handler->get_channel(); + if(fd == -1) { + throw_vma_exception("Netlink fd == -1\n"); + } + + // Register netlink fd to the event_manager + s_cmd_nl = new command_netlink(g_p_netlink_handler); + if (s_cmd_nl == NULL) { + throw_vma_exception("Failed allocating command_netlink\n"); + } + BULLSEYE_EXCLUDE_BLOCK_END + g_p_event_handler_manager->register_command_event(fd, s_cmd_nl); + g_p_event_handler_manager->register_timer_event( + safe_mce_sys().timer_netlink_update_msec, + s_cmd_nl, + PERIODIC_TIMER, + NULL); + } + +// neigh_test(); +// igmp_test(); + NEW_CTOR(g_p_ring_profile, ring_profiles_collection()); +} + +int do_global_ctors() +{ + int errno_backup = errno; + try { + do_global_ctors_helper(); + } + catch (const vma_exception& error) { + vlog_printf(VLOG_DETAILS, "Error: %s", error.what()); + return -1; + } + catch (const std::exception& error ) { + vlog_printf(VLOG_ERROR, "%s", error.what()); + return -1; + } + /* do not return internal errno in case constructor is executed successfully */ + errno = errno_backup; + return 0; +} + +void reset_globals() +{ + g_p_fd_collection = NULL; + g_p_igmp_mgr = NULL; + g_p_ip_frag_manager = NULL; + g_buffer_pool_rx = NULL; + g_buffer_pool_tx = NULL; + g_tcp_seg_pool = NULL; + g_tcp_timers_collection = NULL; + g_p_vlogger_timer_handler = NULL; + 
g_p_event_handler_manager = NULL; + g_p_agent = NULL; + g_p_route_table_mgr = NULL; + g_p_rule_table_mgr = NULL; + g_stats_file = NULL; + g_p_net_device_table_mgr = NULL; + g_p_neigh_table_mgr = NULL; + g_p_lwip = NULL; + g_p_netlink_handler = NULL; + g_p_ib_ctx_handler_collection = NULL; + g_p_ring_profile = NULL; + s_cmd_nl = NULL; + g_cpu_manager.reset(); +} + +// checks that netserver runs with flags: -D, -f. Otherwise, warn user for wrong usage +// this test is performed since vma does not support fork, and these flags make sure the netserver application will not use fork. +void check_netperf_flags() +{ + char cmd_line[FILENAME_MAX]; + char *pch, *command; + bool b_D_flag = false, b_f_flag = false; + char add_flags[4] = {0}; + + strncpy(cmd_line, safe_mce_sys().app_name, sizeof(cmd_line) - 1); + cmd_line[sizeof(cmd_line) - 1] = '\0'; + pch = strtok(cmd_line, " "); + + command = basename(pch); //extract only "netserver" from full path + if (strcmp(command, "netserver")) { + return; + } + pch = strtok(NULL, " "); + + while (pch != NULL) { + if (*pch == '-') { + if (strchr(pch, 'D')) + b_D_flag = true; + if (strchr(pch, 'f')) + b_f_flag = true; + } + if (b_f_flag && b_D_flag) + break; + pch = strtok(NULL, " "); + } + if (!b_D_flag || !b_f_flag) { + vlog_printf(VLOG_WARNING, + "Running netserver without flags: -D, -f can cause failure\n"); + add_flags[0] = '-'; // check which flags need to be added to the command + if (!b_D_flag) + add_flags[1] = 'D'; + if (!b_f_flag) + add_flags[1] == 0 ? 
add_flags[1] = 'f' : add_flags[2] = 'f'; + vlog_printf(VLOG_WARNING, "Recommended command line: %s %s\n", + safe_mce_sys().app_name, add_flags); + } +} + +//----------------------------------------------------------------------------- +// library init function +//----------------------------------------------------------------------------- +// __attribute__((constructor)) causes the function to be called when +// library is firsrt loaded +//extern "C" int __attribute__((constructor)) sock_redirect_lib_load_constructor(void) +extern "C" int main_init(void) +{ + +#ifndef VMA_SVN_REVISION + // Force GCC's malloc() to check the consistency of dynamic memory in development build (Non Release) + //mcheck(vma_mcheck_abort_cb); +#endif + get_orig_funcs(); + safe_mce_sys(); + + g_init_global_ctors_done = false; + + vlog_start("VMA", safe_mce_sys().log_level, safe_mce_sys().log_filename, safe_mce_sys().log_details, safe_mce_sys().log_colors); + + print_vma_global_settings(); + + check_debug(); + check_cpu_speed(); + check_locked_mem(); + check_netperf_flags(); + + if (*safe_mce_sys().stats_filename) { + if (check_if_regular_file (safe_mce_sys().stats_filename)) + vlog_printf(VLOG_WARNING,"FAILED to create VMA statistics file. %s is not a regular file.\n", safe_mce_sys().stats_filename); + else if (!(g_stats_file = fopen(safe_mce_sys().stats_filename, "w"))) + vlog_printf(VLOG_WARNING," Couldn't open statistics file: %s\n", safe_mce_sys().stats_filename); + } + + sock_redirect_main(); + + return 0; +} + +//extern "C" int __attribute__((destructor)) sock_redirect_lib_load_destructor(void) +extern "C" int main_destroy(void) +{ + return free_libvma_resources(); +} diff --git a/src/vma/main.h b/src/vma/main.h new file mode 100644 index 0000000..da3442e --- /dev/null +++ b/src/vma/main.h @@ -0,0 +1,52 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef MAIN_H +#define MAIN_H + +#include +#include +#include +#include + +void print_vma_global_settings(); +void check_locked_mem(); +void set_env_params(); +void prepare_fork(); + +extern "C" void sock_redirect_main(void); +extern "C" void sock_redirect_exit(void); + +extern bool g_init_ibv_fork_done; + +#endif //MAIN_H diff --git a/src/vma/netlink/Makefile.am b/src/vma/netlink/Makefile.am new file mode 100755 index 0000000..6de81d5 --- /dev/null +++ b/src/vma/netlink/Makefile.am @@ -0,0 +1,47 @@ +AM_CFLAGS = -Wall -g #-O3 + +AM_CPPFLAGS := \ + -I$(top_srcdir)/src \ + ${LIBNL_CFLAGS} + +noinst_LTLIBRARIES = libnetlink.la +libnetlink_la_LDFLAGS = -static +libnetlink_la_SOURCES = \ + neigh_info.cpp \ + route_info.cpp \ + link_info.cpp \ + netlink_compatibility.cpp \ + netlink_wrapper.cpp + +EXTRA_DIST = \ + test_main.cpp + +# This section is disabled +# (just keep one for future use) +#noinst_PROGRAMS = nl_test + +#nl_test_LDADD = -lrt -ldl -lpthread -libverbs -lrdmacm \ +# ${LIBNL_LIBS} \ +# libnetlink.la \ +# $(top_builddir)/src/vlogger/libvlogger.la + +#nl_test_SOURCES = \ +# neigh_info.cpp \ +# route_info.cpp \ +# link_info.cpp \ +# netlink_compatibility.cpp \ +# netlink_wrapper.cpp \ +# ../infra/subject_observer.cpp \ +# ../event/netlink_event.cpp \ +# test_main.cpp \ +# link_info.h \ +# neigh_info.h \ +# netlink_compatibility.h \ +# netlink_wrapper.h \ +# route_info.h + +#nl_test_CXXFLAGS = -g + +#nl_test_DEPENDENCIES = \ +# libnetlink.la \ +# $(top_builddir)/src/vlogger/libvlogger.la diff --git a/src/vma/netlink/link_info.cpp b/src/vma/netlink/link_info.cpp new file mode 100644 index 0000000..985f707 --- /dev/null +++ b/src/vma/netlink/link_info.cpp @@ -0,0 +1,83 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "link_info.h" +#include "vlogger/vlogger.h" + +#define MODULE_NAME "netlink_event" + +#define ADDR_MAX_STR_LEN (128) + + +netlink_link_info::netlink_link_info(struct rtnl_link* link): + /*arptype(0),*/ broadcast_str(""), addr_family(0), flags(0), ifindex( + 0), /*mode(0),*/ master_ifindex(0), mtu( + 0), name(""), operstate(0), txqlen( + 0) +{ + fill(link); +} + +void netlink_link_info::fill(struct rtnl_link* link) +{ + if (link) { + //arptype=rtnl_link_get_arptype(link); + addr_family=rtnl_link_get_family(link); + flags=rtnl_link_get_flags(link); + ifindex=rtnl_link_get_ifindex(link); + master_ifindex=rtnl_link_get_master(link); + mtu=rtnl_link_get_mtu(link); + txqlen=rtnl_link_get_txqlen(link); + operstate=rtnl_link_get_operstate(link); + //mode=rtnl_link_get_linkmode(link); + + nl_addr* addr; + char addr_str[ADDR_MAX_STR_LEN + 1]; + + const char* namestr=rtnl_link_get_name(link); + if (namestr) { + name = namestr; + } + + addr = rtnl_link_get_broadcast(link); + if (addr) { + broadcast_str = nl_addr2str(addr, addr_str, ADDR_MAX_STR_LEN); + } + + } +} + +const std::string netlink_link_info::get_operstate2str() const { + char operstate_str[256]; + return rtnl_link_operstate2str(operstate,operstate_str, 255); +} diff --git a/src/vma/netlink/link_info.h b/src/vma/netlink/link_info.h new file mode 100644 index 0000000..bdfcb6b --- /dev/null +++ b/src/vma/netlink/link_info.h @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef NETLINK_LINK_INFO_H_ +#define NETLINK_LINK_INFO_H_ + +#include +#include + +class netlink_link_info +{ +public: + netlink_link_info(struct rtnl_link* link); + virtual ~netlink_link_info() + { + } + + // fill all attributes using the provided netlink original link + void fill(struct rtnl_link* link); + + +// // Hardware type (eg. ARPHRD_ETHER or ARPHRD_VOID) +// uint32_t arptype; + + // the link layer broadcast address string + std::string broadcast_str; + + // Address family or AF_UNSPEC if not specified. 
+ int addr_family; + + /* return link flags: + * IFF_UP Link is up (administratively) + * IFF_RUNNING Link is up and carrier is OK (RFC2863 OPER_UP) + * IFF_LOWER_UP Link layer is operational + * IFF_DORMANT Driver signals dormant + * IFF_BROADCAST Link supports broadcasting + * IFF_MULTICAST Link supports multicasting + * IFF_ALLMULTI Link supports multicast routing + * IFF_DEBUG Tell driver to do debugging (currently unused) + * IFF_LOOPBACK Link loopback network + * IFF_POINTOPOINT Point-to-point link + * IFF_NOARP ARP is not supported + * IFF_PROMISC Status of promiscious mode + * IFF_MASTER Master of a load balancer (bonding) + * IFF_SLAVE Slave to a master link + * IFF_PORTSEL Driver supports setting media type (only used by ARM ethernet) + * IFF_AUTOMEDIA Link selects port automatically (only used by ARM ethernet) + * IFF_ECHO Echo sent packets (testing feature, CAN only) + * IFF_DYNAMIC Unused (BSD compatibility) + * IFF_NOTRAILERS Unused (BSD compatibility) + * + */ + uint32_t flags; + + // the interface index of the link + int ifindex; + +// /* the link mode +// * IF_LINK_MODE_DEFAULT Default link mode +// * IF_LINK_MODE_DORMANT Limit upward transition to dormant +// */ +// uint8_t mode; + + // interface index of master link or 0 if not specified + int master_ifindex; + + /* the maximum transmission unit + * specifies the maximum packet size a network device can transmit or receive + * */ + uint32_t mtu; + + /* a unique,human readable description of the link. + * by default, links are named based on their type and then enumerated, + * e.g. 
eth0, eth1, ethn but they may be renamed at any time + * */ + std::string name; + + /* extended information on the link status (from: RFC 2863 operational status linux/if.h) + * Unknown state IF_OPER_UNKNOWN + * Link not present IF_OPER_NOTPRESENT + * Link down IF_OPER_DOWN + * L1 down IF_OPER_LOWERLAYERDOWN + * Testing IF_OPER_TESTING + * Dormant IF_OPER_DORMANT + * Link up IF_OPER_UP + * + */ + uint8_t operstate; + + // transmission queue length + uint32_t txqlen; + + const std::string get_operstate2str() const; + +}; + +#endif /* NETLINK_LINK_INFO_H_ */ diff --git a/src/vma/netlink/neigh_info.cpp b/src/vma/netlink/neigh_info.cpp new file mode 100644 index 0000000..753b581 --- /dev/null +++ b/src/vma/netlink/neigh_info.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include "neigh_info.h" + +#define ADDR_MAX_STR_LEN (128) + +netlink_neigh_info::netlink_neigh_info(struct rtnl_neigh* neigh) : + dst_addr_str(""), dst_addr(NULL), dst_addr_len(0), flags(0), ifindex( + 0), lladdr_str(""), lladdr(NULL), lladdr_len(0), state( + 0), type(0) +{ + fill(neigh); +} + +void netlink_neigh_info::fill(struct rtnl_neigh* neigh) +{ + if (!neigh) + return; + + nl_addr* addr; + char addr_str[ADDR_MAX_STR_LEN + 1]; + + addr = rtnl_neigh_get_dst(neigh); + if (addr) { + dst_addr_str = nl_addr2str(addr, addr_str, ADDR_MAX_STR_LEN); + dst_addr = (unsigned char*)nl_addr_get_binary_addr(addr); + dst_addr_len = nl_addr_get_len(addr); + } + + addr = rtnl_neigh_get_lladdr(neigh); + if (addr) { + lladdr_str = nl_addr2str(addr, addr_str, ADDR_MAX_STR_LEN); + lladdr = (unsigned char*)nl_addr_get_binary_addr(addr); + lladdr_len = nl_addr_get_len(addr); + } + //addr_family = rtnl_neigh_get_family(neigh); + flags = rtnl_neigh_get_flags(neigh); + ifindex = rtnl_neigh_get_ifindex(neigh); + state = rtnl_neigh_get_state(neigh); + type = rtnl_neigh_get_type(neigh); +} + + diff --git a/src/vma/netlink/neigh_info.h b/src/vma/netlink/neigh_info.h new file mode 100644 index 0000000..1857afd --- /dev/null +++ b/src/vma/netlink/neigh_info.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef NETLINK_NEIGH_INFO_H +#define NETLINK_NEIGH_INFO_H + +#include +#include +#include + +class netlink_neigh_info +{ +public: + netlink_neigh_info() : + dst_addr_str(""), + dst_addr(NULL), + dst_addr_len(0), + flags(0), + ifindex(0), + lladdr_str(""), + lladdr(NULL), + lladdr_len(0), + state(0), + type(0) { } + + netlink_neigh_info(struct rtnl_neigh* neigh); + virtual ~netlink_neigh_info() {} + + // fill all attributes using the provided netlink original neigh + void fill(struct rtnl_neigh* neigh); + + // neigh's destination address as string + std::string dst_addr_str; // rtnl_neigh_get_dst() + + // neigh's destination address + unsigned char* dst_addr; // + + // neigh's destination address length + uint32_t dst_addr_len; + + +// // neigh addr family +// int neigh_addr_family; //rtnl_neigh_get_family(); + + /* return neigh flags: + * NTF_USE + * NTF_PROXY + * NTF_ROUTER + */ + uint32_t flags; + + // interface index OR RTNL_LINK_NOT_FOUND if not set + int ifindex; //rtnl_neigh_get_ifindex(); + + // link layer addr as string + std::string lladdr_str; // rtnl_neigh_get_lladdr() + + // link layer addr + unsigned char* lladdr; + + // link layer addr length + uint32_t lladdr_len; + + /* neigh state: + a bitmask of the following states: + + NUD_INCOMPLETE a currently resolving cache entry + NUD_REACHABLE a confirmed working cache entry + NUD_STALE an expired cache entry + NUD_DELAY an entry waiting for a timer + NUD_PROBE a cache entry that is currently reprobed + NUD_FAILED an invalid cache entry + NUD_NOARP a device with no destination cache + NUD_PERMANENT a static entry + + -1 if not set. + * */ + int state; // rtnl_neigh_get_state(); + + /* + * neigh type + * ?? not documented properly. 
+ * -1 if not set + * */ + int type; // rtnl_neigh_get_type(); + + std::string get_state2str() const { + if (state == -1) { + return "NOT SET"; + } + else if (state < 0) { + return "ILLEGAL STATE"; + } + else { + char state_str[256]; + return rtnl_neigh_state2str(state, state_str, 255); + } + } + +}; + +#endif /* NETLINK_NEIGH_INFO_H */ diff --git a/src/vma/netlink/netlink_compatibility.cpp b/src/vma/netlink/netlink_compatibility.cpp new file mode 100644 index 0000000..6bfdd3a --- /dev/null +++ b/src/vma/netlink/netlink_compatibility.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "netlink_compatibility.h" +#include "vma/util/if.h" + +#define MODULE_NAME "nl_wrapper:" +#define nl_logerr __log_err +#define nl_logwarn __log_warn +#define nl_logdbg __log_dbg + + +extern void link_event_callback(nl_object* obj); +extern void neigh_event_callback(nl_object* obj); +extern void route_event_callback(nl_object* obj); + +#ifdef HAVE_LIBNL3 + +nl_sock* nl_socket_handle_alloc() { + return nl_socket_alloc(); +} + +void nl_socket_handle_free(struct nl_sock * sock) { + nl_socket_free(sock); +} + +void neigh_callback(nl_cache* , nl_object* obj, int, void*) { + neigh_event_callback(obj); +} + +void link_callback(nl_cache* , nl_object* obj, int, void*) { + link_event_callback(obj); +} + +void route_callback(nl_cache* , nl_object* obj, int, void*) { + route_event_callback(obj); +} + +void nl_socket_handle_disable_seq_check(nl_socket_handle* handle) { + return nl_socket_disable_seq_check(handle); +} + +nl_cache_mngr* nl_cache_mngr_compatible_alloc(nl_socket_handle* handle, int protocol, int flags) { + nl_cache_mngr* cache_mngr; + + /* allocate temporary 10 nl_sockets for marking the first 10 bits of user_port_map[0] (@[libnl/lib/socket.c]) as workaround + * to avoid conflict between the cache manager's internal sync socket and other netlink sockets on same process + */ + struct nl_sock* tmp_socket_arr[10]; + for (int i=0; i<10; i++) { + tmp_socket_arr[i] = nl_socket_handle_alloc(); + } + + int err = nl_cache_mngr_alloc(handle, protocol, flags, &cache_mngr); + + // free the temporary sockets after cache manager was allocated and bounded the sync socket + for (int i=0; i<10; i++) { + nl_socket_free(tmp_socket_arr[i]); + } + + BULLSEYE_EXCLUDE_BLOCK_START + if (err) { + nl_logerr("Fail to allocate cache manager, error=%s", nl_geterror(err)); + return NULL; + } + int nl_socket_fd = nl_socket_get_fd(handle); + if (fcntl(nl_socket_fd, F_SETFD, FD_CLOEXEC) != 0) { + nl_logwarn("Fail in fctl, 
error = %d", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + + return cache_mngr; +} + +int nl_cache_mngr_compatible_add(struct nl_cache_mngr* mngr, const char* name, change_func_t cb, void* data, struct nl_cache** result){ + int err = nl_cache_mngr_add(mngr, name, cb, data, result); + BULLSEYE_EXCLUDE_BLOCK_START + if (err) { + errno = ELIBEXEC; + nl_logerr("Fail to add to cache manager, error=%s", nl_geterror(err)); + } + BULLSEYE_EXCLUDE_BLOCK_END + return err; +} + +in_addr_t nl_object_get_compatible_gateway(struct rtnl_route* nl_route_obj) { + struct rtnl_nexthop *nh; + nh = rtnl_route_nexthop_n(nl_route_obj, 0); + if (nh) { + struct nl_addr * addr; + addr = rtnl_route_nh_get_gateway(nh); + if (addr) { + return *(in_addr_t *) nl_addr_get_binary_addr(addr); + } + } + return INADDR_ANY; +} + +int nl_object_get_compatible_oif(struct rtnl_route* nl_route_obj) { + struct rtnl_nexthop *nh; + nh = rtnl_route_nexthop_n(nl_route_obj, 0); + if (nh) { + return rtnl_route_nh_get_ifindex(nh); + } + return -1; +} + +int nl_object_get_compatible_metric(struct rtnl_route* nl_route_obj, int attr) { + uint32_t val; + + int rc = rtnl_route_get_metric(nl_route_obj, attr, &val); + if (rc == 0) { + return val; + } + nl_logdbg("Fail parsing route metric %d error=%d\n", attr, rc); + return 0; +} + + +#else //HAVE_LIBNL1 + +nl_handle* nl_socket_handle_alloc() { + return nl_handle_alloc(); +} + +void nl_socket_handle_free(struct nl_handle* handle) { + nl_handle_destroy(handle); +} + +void neigh_callback(nl_cache* , nl_object* obj, int) { + neigh_event_callback(obj); +} + +void link_callback(nl_cache* , nl_object* obj, int) { + link_event_callback(obj); +} + +void route_callback(nl_cache* , nl_object* obj, int) { + route_event_callback(obj); +} + +void nl_socket_handle_disable_seq_check(nl_socket_handle* handle) { + return nl_disable_sequence_check(handle); +} + +nl_cache_mngr* nl_cache_mngr_compatible_alloc(nl_socket_handle* handle, int protocol, int flags) { + nl_cache_mngr* cache_mgr = 
nl_cache_mngr_alloc(handle, protocol, flags); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!cache_mgr) { + nl_logerr("Fail to allocate cache manager"); + } + + int nl_socket_fd = nl_socket_get_fd(handle); + if (fcntl(nl_socket_fd, F_SETFD, FD_CLOEXEC) != 0) { + nl_logwarn("Fail in fctl, error = %d", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + + return cache_mgr; +} + +int nl_cache_mngr_compatible_add(struct nl_cache_mngr* mngr, const char* name, change_func_t cb, void* , struct nl_cache** result){ + *result = nl_cache_mngr_add(mngr, name, cb); + if (*result == NULL) { + errno = ELIBEXEC; + nl_logerr("Fail adding to cache manager, error=%d %s\n", + nl_get_errno(), nl_geterror()); + return -1; + } + return 0; +} + +in_addr_t nl_object_get_compatible_gateway(struct rtnl_route* nl_route_obj) { + struct nl_addr * addr; + addr = rtnl_route_get_gateway(nl_route_obj); + if (addr) { + return *(in_addr_t *) nl_addr_get_binary_addr(addr); + } + return INADDR_ANY; +} + +int nl_object_get_compatible_oif(struct rtnl_route* nl_route_obj) { + return rtnl_route_get_oif(nl_route_obj); +} + +int nl_object_get_compatible_metric(struct rtnl_route* nl_route_obj, int attr) { + uint32_t val = rtnl_route_get_metric(nl_route_obj, attr); + if (val == UINT_MAX) { + nl_logdbg("Fail parsing route metric %d error=%d\n", attr, val); + return 0; + } + return val; +} +#endif diff --git a/src/vma/netlink/netlink_compatibility.h b/src/vma/netlink/netlink_compatibility.h new file mode 100644 index 0000000..8c96a95 --- /dev/null +++ b/src/vma/netlink/netlink_compatibility.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + + +#ifndef NETLINK_COMPATIBILITY_H_ +#define NETLINK_COMPATIBILITY_H_ + +#include +#include +#include +#include +#include "config.h" +#include +#include +#include +#include +#include +#include "vma/infra/subject_observer.h" +#include +using namespace std; + +extern "C" void link_event_callback(nl_object* obj); +extern "C" void neigh_event_callback(nl_object* obj); +extern "C" void route_event_callback(nl_object* obj); + +class netlink_wrapper; +enum e_netlink_event_type +{ + nlgrpNEIGH = 0, + nlgrpLINK = 1, + nlgrpROUTE = 2, + /* TODO: not supported yet + nlgrpADDRESS=3, + nlgrpPREFIX=4, + */ +}; + +#ifdef HAVE_LIBNL3 + +typedef struct nl_sock nl_socket_handle; +#define rtnl_compatible_route_get_priority rtnl_route_get_priority + +nl_sock* nl_socket_handle_alloc(); +void nl_socket_handle_free(struct nl_sock * sock); +void neigh_callback(nl_cache* , nl_object* obj, int, void*); +void link_callback(nl_cache* , nl_object* obj, int, void*); +void route_callback(nl_cache* , nl_object* obj, int, void*); + +#else //HAVE_LIBNL1 + +#define rtnl_compatible_route_get_priority rtnl_route_get_prio +typedef struct nl_handle nl_socket_handle; + +nl_handle* nl_socket_handle_alloc(); +void nl_socket_handle_free(struct nl_handle* handle); +void neigh_callback(nl_cache* , nl_object* obj, int); +void link_callback(nl_cache* , nl_object* obj, int); +void route_callback(nl_cache* , nl_object* obj, int); + +#endif + +void nl_socket_handle_disable_seq_check(nl_socket_handle* handle); +nl_cache_mngr* nl_cache_mngr_compatible_alloc(nl_socket_handle* handle, int protocol, int flags); +int nl_cache_mngr_compatible_add(struct nl_cache_mngr* mngr, const char* name, change_func_t cb, void* data, struct nl_cache** result); +in_addr_t nl_object_get_compatible_gateway(struct rtnl_route* nl_route_obj); +int nl_object_get_compatible_oif(struct rtnl_route* nl_route_obj); +int nl_object_get_compatible_metric(struct rtnl_route* nl_route_obj, int attr); + + +#endif /* NETLINK_COMPATIBILITY_H_ */ 
diff --git a/src/vma/netlink/netlink_wrapper.cpp b/src/vma/netlink/netlink_wrapper.cpp new file mode 100644 index 0000000..4cee1b9 --- /dev/null +++ b/src/vma/netlink/netlink_wrapper.cpp @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include + +#include "vlogger/vlogger.h" +#include "utils/bullseye.h" +#include "netlink_wrapper.h" +#include +#include +#include +#include +#include +#include + +#define MODULE_NAME "nl_wrapper:" + +#define nl_logpanic __log_panic +#define nl_logerr __log_err +#define nl_logwarn __log_warn +#define nl_loginfo __log_info +#define nl_logdbg __log_dbg +#define nl_logfunc __log_func + +netlink_wrapper* g_p_netlink_handler = NULL; + +// structure to pass arguments on internal netlink callbacks handling +typedef struct rcv_msg_arg +{ + netlink_wrapper* netlink; + nl_socket_handle* socket_handle; + map* subjects_map; + nlmsghdr* msghdr; +} rcv_msg_arg_t; + +static rcv_msg_arg_t g_nl_rcv_arg; + +int nl_msg_rcv_cb(struct nl_msg *msg, void *arg) { + nl_logfunc( "---> nl_msg_rcv_cb"); + NOT_IN_USE(arg); + g_nl_rcv_arg.msghdr = nlmsg_hdr(msg); + // NETLINK MESAGE DEBUG + //nl_msg_dump(msg, stdout); + nl_logfunc( "<--- nl_msg_rcv_cb"); + return 0; +} + +/* This function is called from internal thread only as neigh_timer_expired() + * so it is protected by m_cache_lock call + */ +void netlink_wrapper::notify_observers(netlink_event *p_new_event, e_netlink_event_type type) +{ + g_nl_rcv_arg.netlink->m_cache_lock.unlock(); + g_nl_rcv_arg.netlink->m_subj_map_lock.lock(); + + subject_map_iter iter = g_nl_rcv_arg.subjects_map->find(type); + if(iter != g_nl_rcv_arg.subjects_map->end()) + iter->second->notify_observers(p_new_event); + + g_nl_rcv_arg.netlink->m_subj_map_lock.unlock(); + /* coverity[missing_unlock] */ + g_nl_rcv_arg.netlink->m_cache_lock.lock(); +} + +extern void link_event_callback(nl_object* obj) { + netlink_wrapper::link_cache_callback(obj); +} +extern void neigh_event_callback(nl_object* obj) { + netlink_wrapper::neigh_cache_callback(obj); +} +extern void route_event_callback(nl_object* obj) { + netlink_wrapper::route_cache_callback(obj); +} + +void netlink_wrapper::neigh_cache_callback(nl_object* obj) +{ + nl_logdbg( "---> 
neigh_cache_callback"); + struct rtnl_neigh* neigh = (struct rtnl_neigh*)obj; + neigh_nl_event new_event(g_nl_rcv_arg.msghdr, neigh, g_nl_rcv_arg.netlink); + + netlink_wrapper::notify_observers(&new_event, nlgrpNEIGH); + + g_nl_rcv_arg.msghdr = NULL; + nl_logdbg( "<--- neigh_cache_callback"); + +} + +void netlink_wrapper::link_cache_callback(nl_object* obj) +{ + nl_logfunc( "---> link_cache_callback"); + struct rtnl_link* link = (struct rtnl_link*) obj; + link_nl_event new_event(g_nl_rcv_arg.msghdr, link, g_nl_rcv_arg.netlink); + + netlink_wrapper::notify_observers(&new_event, nlgrpLINK); + + g_nl_rcv_arg.msghdr = NULL; + nl_logfunc( "<--- link_cache_callback"); +} + +void netlink_wrapper::route_cache_callback(nl_object* obj) +{ + nl_logfunc( "---> route_cache_callback"); + struct rtnl_route* route = (struct rtnl_route*) obj; + if (route) { + int table_id = rtnl_route_get_table(route); + int family = rtnl_route_get_family(route); + if ((table_id > (int)RT_TABLE_UNSPEC) && (table_id != RT_TABLE_LOCAL) && (family == AF_INET)) { + route_nl_event new_event(g_nl_rcv_arg.msghdr, route, g_nl_rcv_arg.netlink); + netlink_wrapper::notify_observers(&new_event, nlgrpROUTE); + } + else { + nl_logdbg("Received event for not handled route entry: family=%d, table_id=%d", family, table_id); + } + } + else { + nl_logdbg("Received invalid route event"); + } + g_nl_rcv_arg.msghdr = NULL; + nl_logfunc( "<--- route_cache_callback"); +} + + +netlink_wrapper::netlink_wrapper() : + m_socket_handle(NULL), m_mngr(NULL), m_cache_link(NULL), m_cache_neigh( + NULL), m_cache_route(NULL) +{ + nl_logdbg( "---> netlink_route_listener CTOR"); + g_nl_rcv_arg.subjects_map = &m_subjects_map; + g_nl_rcv_arg.netlink = this; + g_nl_rcv_arg.msghdr = NULL; + nl_logdbg( "<--- netlink_route_listener CTOR"); +} + +netlink_wrapper::~netlink_wrapper() +{ + /* different handling under LIBNL1 versus LIBNL3 */ +#ifdef HAVE_LIBNL3 + nl_logdbg( "---> netlink_route_listener DTOR (LIBNL3)"); + /* should not call 
nl_cache_free() for link, neigh, route as nl_cach_mngr_free() does the freeing */ + // nl_cache_free(m_cache_link); + // nl_cache_free(m_cache_neigh); + // nl_cache_free(m_cache_route); + nl_cache_mngr_free(m_mngr); + nl_socket_handle_free(m_socket_handle); +#else // HAVE_LINBL1 + nl_logdbg( "---> netlink_route_listener DTOR (LIBNL1)"); + /* should not call nl_socket_handle_free(m_socket_handle) as nl_cache_mngr_free() does the freeing */ + /* nl_socket_handle_free(m_socket_handle); */ + nl_cache_free(m_cache_link); + nl_cache_free(m_cache_neigh); + nl_cache_free(m_cache_route); + nl_cache_mngr_free(m_mngr); +#endif // HAVE_LIBNL3 + + subject_map_iter iter = m_subjects_map.begin(); + while (iter != m_subjects_map.end()) { + delete iter->second; + iter++; + } + nl_logdbg( "<--- netlink_route_listener DTOR"); +} + +int netlink_wrapper::open_channel() +{ + auto_unlocker lock(m_cache_lock); + nl_logdbg("opening netlink channel"); + + /* + // build to subscriptions groups mask for indicating what type of events the kernel will send on channel + unsigned subscriptions = ~RTMGRP_TC; + if (netlink_route_group_mask & nlgrpLINK) { + subscriptions |= (1 << (RTNLGRP_LINK - 1)); + } + if (netlink_route_group_mask & nlgrpADDRESS) { + if (!m_preferred_family || m_preferred_family == AF_INET) + subscriptions |= (1 << (RTNLGRP_IPV4_IFADDR - 1)); + if (!m_preferred_family || m_preferred_family == AF_INET6) + subscriptions |= (1 << (RTNLGRP_IPV6_IFADDR - 1)); + } + if (netlink_route_group_mask & nlgrpROUTE) { + if (!m_preferred_family || m_preferred_family == AF_INET) + subscriptions |= (1 << (RTNLGRP_IPV4_ROUTE - 1)); + if (!m_preferred_family || m_preferred_family == AF_INET6) + subscriptions |= (1 << (RTNLGRP_IPV4_ROUTE - 1)); + } + if (netlink_route_group_mask & nlgrpPREFIX) { + if (!m_preferred_family || m_preferred_family == AF_INET6) + subscriptions |= (1 << (RTNLGRP_IPV6_PREFIX - 1)); + } + if (netlink_route_group_mask & nlgrpNEIGH) { + subscriptions |= (1 << (RTNLGRP_NEIGH - 
1)); + } + */ + + // Allocate a new netlink socket/handle + m_socket_handle = nl_socket_handle_alloc(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_socket_handle == NULL) { + nl_logerr("failed to allocate netlink handle"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + // set internal structure to pass the handle with callbacks from netlink + g_nl_rcv_arg.socket_handle = m_socket_handle; + + // if multiple handles being allocated then a unique netlink PID need to be provided + // If port is 0, a unique port identifier will be generated automatically as a unique PID + nl_socket_set_local_port(m_socket_handle, 0); + + + //Disables checking of sequence numbers on the netlink handle. + //This is required to allow messages to be processed which were not requested by a preceding request message, e.g. netlink events. + nl_socket_handle_disable_seq_check(m_socket_handle); + + //joining group + //nl_join_groups(m_handle, 0); + + // Allocate a new cache manager for RTNETLINK + // NL_AUTO_PROVIDE = automatically provide the caches added to the manager. 
+ m_mngr = nl_cache_mngr_compatible_alloc(m_socket_handle, NETLINK_ROUTE, NL_AUTO_PROVIDE); + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_mngr) { + nl_logerr("Fail to allocate cache manager"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + nl_logdbg("netlink socket is open"); + + if (nl_cache_mngr_compatible_add(m_mngr, "route/link", link_callback, NULL, &m_cache_link)) + return -1; + if (nl_cache_mngr_compatible_add(m_mngr, "route/route", route_callback, NULL, &m_cache_route)) + return -1; + if (nl_cache_mngr_compatible_add(m_mngr, "route/neigh", neigh_callback, NULL, &m_cache_neigh)) + return -1; + + // set custom callback for every message to update message + nl_socket_modify_cb(m_socket_handle, NL_CB_MSG_IN, NL_CB_CUSTOM, nl_msg_rcv_cb ,NULL); + + // set the socket non-blocking + BULLSEYE_EXCLUDE_BLOCK_START + if (nl_socket_set_nonblocking(m_socket_handle)) { + nl_logerr("Failed to set the socket non-blocking"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + return 0; + +} + +int netlink_wrapper::get_channel() +{ + auto_unlocker lock(m_cache_lock); + if (m_socket_handle) + return nl_socket_get_fd(m_socket_handle); + else + return -1; +} + +int netlink_wrapper::handle_events() +{ + m_cache_lock.lock(); + + nl_logfunc("--->handle_events"); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_socket_handle) { + nl_logerr("Cannot handle events before opening the channel. 
please call first open_channel()"); + m_cache_lock.unlock(); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + int n = nl_cache_mngr_data_ready(m_mngr); + + //int n = nl_recvmsgs_default(m_handle); + nl_logfunc("nl_recvmsgs=%d", n); + if (n < 0) + nl_logdbg("recvmsgs returned with error = %d", n); + + + nl_logfunc("<---handle_events"); + + m_cache_lock.unlock(); + + return n; +} + +bool netlink_wrapper::register_event(e_netlink_event_type type, + const observer* new_obs) +{ + auto_unlocker lock(m_subj_map_lock); + subject* sub; + subject_map_iter iter = m_subjects_map.find(type); + if (iter == m_subjects_map.end()) { + sub = new subject(); + m_subjects_map[type] = sub; + } + else { + sub = m_subjects_map[type]; + } + + return sub->register_observer(new_obs); +} + +bool netlink_wrapper::unregister(e_netlink_event_type type, + const observer* obs) +{ + auto_unlocker lock(m_subj_map_lock); + if (obs == NULL) + return false; + + subject_map_iter iter = m_subjects_map.find(type); + if (iter != m_subjects_map.end()) { + return m_subjects_map[type]->unregister_observer(obs); + } + + return true; +} + +int netlink_wrapper::get_neigh(const char* ipaddr, int ifindex, netlink_neigh_info* new_neigh_info) +{ + auto_unlocker lock(m_cache_lock); + nl_logfunc("--->netlink_listener::get_neigh"); + nl_object* obj; + rtnl_neigh* neigh; + char addr_str[256]; + + BULLSEYE_EXCLUDE_BLOCK_START + if (!new_neigh_info) { + nl_logerr("Illegal argument. user pass NULL neigh_info to fill"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + obj = nl_cache_get_first(m_cache_neigh); + while (obj) { + nl_object_get(obj); //Acquire a reference on a cache object. 
cache won't use/free it until calling to nl_object_put(obj) + neigh = (rtnl_neigh*) obj; + nl_addr* addr = rtnl_neigh_get_dst(neigh); + int index = rtnl_neigh_get_ifindex(neigh); + if ((addr) && (index > 0)) { + nl_addr2str(addr, addr_str, 255); + if (!strcmp(addr_str, ipaddr) && (ifindex == index)) { + new_neigh_info->fill(neigh); + nl_object_put(obj); + nl_logdbg("neigh - DST_IP:%s IF_INDEX:%d LLADDR:%s", addr_str, index, new_neigh_info->lladdr_str.c_str() ); + nl_logfunc("<---netlink_listener::get_neigh"); + return 1; + } + } + nl_object_put(obj); + obj = nl_cache_get_next(obj); + } + + nl_logfunc("<---netlink_listener::get_neigh"); + return 0; +} + +void netlink_wrapper::neigh_timer_expired() { + m_cache_lock.lock(); + + nl_logfunc("--->netlink_wrapper::neigh_timer_expired"); + nl_cache_refill(m_socket_handle, m_cache_neigh); + notify_neigh_cache_entries(); + nl_logfunc("<---netlink_wrapper::neigh_timer_expired"); + + m_cache_lock.unlock(); +} + +void netlink_wrapper::notify_neigh_cache_entries() { + nl_logfunc("--->netlink_wrapper::notify_cache_entries"); + g_nl_rcv_arg.msghdr = NULL; + nl_object* obj = nl_cache_get_first(m_cache_neigh); + while (obj) { + nl_object_get(obj); + neigh_event_callback(obj); + nl_object_put(obj); + obj = nl_cache_get_next(obj); + } + nl_logfunc("<---netlink_wrapper::notify_cache_entries"); + +} + + + + diff --git a/src/vma/netlink/netlink_wrapper.h b/src/vma/netlink/netlink_wrapper.h new file mode 100644 index 0000000..db02ad9 --- /dev/null +++ b/src/vma/netlink/netlink_wrapper.h @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + + +#ifndef NETLINKROUTELISTENER_H_ +#define NETLINKROUTELISTENER_H_ + +#include +#include +#include +#include "utils/lock_wrapper.h" +#include "neigh_info.h" +#include "vma/event/netlink_event.h" +#include "netlink_compatibility.h" + +#define subject_map_iter map::iterator + +/* + * the class provide simple API for registering observers to NETLINK ROUTE_FAMILY events from kernel. + * ROUTE_FAMILY: NEIGHBOURS, LINKS (interfaces), ROUTE TABLE, ADDRESSES + * the user can register/unregister to different type of events with his own implemented observer. + * netlink_listener doesn't manage an internal context for handling the events, + * it provides the user with a method to handle the events on his context. 
+ * + * the class uses LIBNL (netlink library) as an API to use netlink core functions + * LIBNL documentation: http://www.infradead.org/~tgr/libnl/ + * + * TODO: + * -thread-safe + * -currently supports only processing of NEIGH and LINK netlink kernel multicast groups + */ +class netlink_wrapper +{ +public: + netlink_wrapper(); + virtual ~netlink_wrapper(); + + static void neigh_cache_callback(nl_object* obj); + static void link_cache_callback(nl_object* obj); + static void route_cache_callback(nl_object* obj); + + /* return fd for the specific netlink instace's channel to kernel + * the channel is a NON_BLOCKING socket opened as socket(AF_NETLINK, SOCK_RAW, NETLINK_ROUTE) + * return <0 if channel is not open or failed to open. + */ + int get_channel(); + + /* + * register an observer to a subject specified by type of netlink events. + * the registered observer will be notified for every netlink event related to the provided type + * events will be notified only from hadnle_events(). there is no internal context to handles the events, + * user need to provide a context by calling hadnle_events(). + */ + bool register_event(e_netlink_event_type type, const observer* new_obs); + + /* + * unregister an observer from the subject specified by netlink event type + */ + bool unregister(e_netlink_event_type type, const observer* obs); + + /* + * Receive messages, parse, build relevant netlink_events and notify the registered observers. + * return the number of events or negative number on error + * **must first insure that opne_channel was called + */ + int handle_events(); + + /* open the netlink channel: + 1. Allocate a new netlink handle + 2. [TODO: for querying]: allocate cache + 3. join netlink multicast groups + 4. Connect to link netlink socket on kernel side + 5. set netlink callback + 6. 
set the socket non-blocking + ** the channel must be opned before calling handle_events() + */ + int open_channel(); + + // search for the first matching neigh using (ipaddr and ifindex) on the neigh cache + // if matching neigh was found, then it fills the provided new_neigh_info* and return 1 + // else if no matching neigh then return 0 + // on error return -1 + // ** neigh cache is keep being updated for every neigh netlink event + int get_neigh(const char* ipaddr, int ifindex, netlink_neigh_info* new_neigh_info); + + // periodic maintenance method for keeping caches updated with kernel. + // user of netlink wrapper should provide context to call this function periodically. + // ** Currently, it refills neigh's cache info from current kernel's table + // because when neigh state is changed from STALE to REACHABLE directly , kernel does not notifies netlink + void neigh_timer_expired(); + +private: + nl_socket_handle* m_socket_handle; + + struct nl_cache_mngr* m_mngr; + struct nl_cache* m_cache_link; + struct nl_cache* m_cache_neigh; + struct nl_cache* m_cache_route; + + map m_subjects_map; + lock_mutex_recursive m_cache_lock; + lock_mutex_recursive m_subj_map_lock; + + //This method should be called with m_cache_lock held! + static void notify_observers(netlink_event *p_new_event, e_netlink_event_type type); + + void notify_neigh_cache_entries(); +}; + +extern netlink_wrapper* g_p_netlink_handler; + +#endif /* NETLINKROUTELISTENER_H_ */ diff --git a/src/vma/netlink/route_info.cpp b/src/vma/netlink/route_info.cpp new file mode 100644 index 0000000..65ba587 --- /dev/null +++ b/src/vma/netlink/route_info.cpp @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "route_info.h" +#include "config.h" +#include "vma/util/if.h" +#include "vma/util/libvma.h" +#include "vlogger/vlogger.h" +#include "netlink_compatibility.h" + +#define MODULE_NAME "route_info:" +#define ADDR_MAX_STR_LEN (128) + +netlink_route_info::netlink_route_info(struct rtnl_route* nl_route_obj) : m_route_val(NULL) +{ + fill(nl_route_obj); +} + +netlink_route_info::~netlink_route_info() +{ + if (m_route_val) { + delete m_route_val; + } +} +void netlink_route_info::fill(struct rtnl_route* nl_route_obj) +{ + if (!nl_route_obj) { + return; + } + + m_route_val = new route_val(); + if (!m_route_val) { + __log_warn("Failed to allocate memory for new route object"); + return; + } + + int table = rtnl_route_get_table(nl_route_obj); + if (table > 0) { + m_route_val->set_table_id(table); + } + + int scope = rtnl_route_get_scope(nl_route_obj); + if (scope > 0) { + m_route_val->set_scope(scope); + } + int mtu = nl_object_get_compatible_metric(nl_route_obj, RTAX_MTU); + if (mtu > 0) { + m_route_val->set_mtu(mtu); + } + int protocol = rtnl_route_get_protocol(nl_route_obj); + if (protocol > 0) { + m_route_val->set_protocol(protocol); + } + + int type = rtnl_route_get_type(nl_route_obj); + if (type > 0) { + m_route_val->set_type(type); + } + + struct nl_addr* addr = rtnl_route_get_dst(nl_route_obj); + if (addr) { + unsigned int dst_prefixlen = nl_addr_get_prefixlen(addr); + m_route_val->set_dst_mask(htonl(VMA_NETMASK(dst_prefixlen))); + m_route_val->set_dst_pref_len(dst_prefixlen); + m_route_val->set_dst_addr(*(in_addr_t *) nl_addr_get_binary_addr(addr)); + } + + addr = rtnl_route_get_pref_src(nl_route_obj); + if (addr) { + m_route_val->set_src_addr(*(in_addr_t *) nl_addr_get_binary_addr(addr)); + } + + int oif = nl_object_get_compatible_oif(nl_route_obj); + if (oif > 0) { + m_route_val->set_if_index(oif); + char if_name[IFNAMSIZ]; + if_indextoname(oif, if_name); + m_route_val->set_if_name(if_name); + } + + in_addr_t gateway = 
nl_object_get_compatible_gateway(nl_route_obj); + if (gateway != INADDR_ANY) { + m_route_val->set_gw(gateway); + } +} + + diff --git a/src/vma/netlink/route_info.h b/src/vma/netlink/route_info.h new file mode 100644 index 0000000..2944076 --- /dev/null +++ b/src/vma/netlink/route_info.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef NETLINK_ROUTE_INFO_H_ +#define NETLINK_ROUTE_INFO_H_ + +#include +#include +#include + +#include "vma/proto/route_val.h" + +class netlink_route_info +{ +public: + + netlink_route_info(struct rtnl_route* nl_route_obj); + ~netlink_route_info(); + + route_val* get_route_val() { return m_route_val; }; + +private: + // fill all attributes using the provided netlink original route + void fill(struct rtnl_route* nl_route_obj); + + route_val* m_route_val; +}; + +#endif /* NETLINK_ROUTE_INFO_H_ */ diff --git a/src/vma/netlink/test_main.cpp b/src/vma/netlink/test_main.cpp new file mode 100644 index 0000000..f19f979 --- /dev/null +++ b/src/vma/netlink/test_main.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "vma/infra/subject_observer.h" +#include "netlink_wrapper.h" +#include "neigh_info.h" +#include +#include "errno.h" +#include +#include "vlogger/vlogger.h" +#include "vma/event/netlink_event.h" + +extern uint8_t g_vlogger_level; +#define MODULE_NAME "NETLINK_TEST" + +class neigh_observer : public observer { + virtual void notify_cb(event * ev) { + if (ev) { +/* + neigh_nl_event* net_ev = dynamic_cast (ev); + if (net_ev->neigh_dst_addr_str == "1.1.1.12") { + //__log_info("!!! IN neigh_observer !!!"); + __log_info("%s", ev->to_str().c_str()); + } +*/ + __log_info("!!! IN neigh_observer !!!"); + neigh_nl_event* nlev = dynamic_cast(ev); + __log_info("%s", ev->to_str().c_str()); + netlink_neigh_info info; + g_p_netlink_handler->get_neigh("1.1.1.1", 1, &info); + __log_info("AFTER get_neigh"); + __log_info("NEIGH STATE=%s", nlev->get_neigh_info()->get_state2str().c_str()); + } + } +}; + + +class route_observer : public observer { + virtual void notify_cb(event * ev) { + if (ev) { + __log_info("!!! IN route_observer !!!"); + //route_nl_event* nlev = dynamic_cast(ev); + __log_info("%s", ev->to_str().c_str()); + } + } +}; + +class link_observer : public observer { + virtual void notify_cb(event * ev) { + if (ev) { + __log_info("!!! 
IN link_observer !!!"); + __log_info("%s", ev->to_str().c_str()); + } + } +}; + + + +void netlink_test() +{ + g_vlogger_level=3; + netlink_wrapper* nl = new netlink_wrapper(); + g_p_netlink_handler=nl; + neigh_observer neigh_obs; + route_observer route_obs; + link_observer link_obs; + nl->register_event(nlgrpNEIGH, &neigh_obs); + //nl->register_event(nlgrpROUTE, &route_obs); + //nl->register_event(nlgrpLINK, &link_obs); + int nevents; + struct epoll_event events[32]; + + if (nl->open_channel()) { + printf("fail to open nl channel\n"); + exit(-1); + } + + int fd = nl->get_channel(); + + if (fd < 0) { + printf("netlink channel is illegal\n"); + exit(-1); + } + int epfd = epoll_create(10); + + + struct epoll_event* e = new struct epoll_event(); + e->data.fd=fd; + e->data.ptr=NULL; + e->events=EPOLLIN | EPOLLET; + epoll_ctl(epfd, EPOLL_CTL_ADD, fd, e); + +// netlink_neigh_info* neigh_info = new netlink_neigh_info(); +// printf("GOING TO NIEGH QUERY\n"); +// int rc = nl->get_neigh("172.30.20.111", 1, neigh_info); +// if (rc == 1) { +// printf("NIEGH QUERY is:\n"); +// printf("NEIGH: ip=%s, lladdr=%s, state=%s\n", neigh_info->dst_addr_str.c_str(), neigh_info->lladdr_str.c_str(), neigh_info->get_state2str().c_str()); +// printf("NIEGH QUERY done\n"); +// } +// else { +// printf("NO NIEGH QUERY, rc=%d\n", rc); +// } +// + while (1) { + + /* Poll events from both main threads and the event channel */ + + nevents = epoll_wait(epfd, events, + sizeof(events) / sizeof(events[0]), 2000); + + if (nevents < 0) { + if (errno != EINTR) { + printf("epoll_wait errno=%m\n"); + } + } else if (nevents) { + printf("*** --> going to handle events (n=%d)\n", nevents); + nl->handle_events(); + printf("*** <-- handle events\n"); + } + } + printf("-------->>>>> event_processor thread stopped <<<<<--------"); + exit(1); +} + + +int main(int argc, char* argv[]) +{ + g_vlogger_level = 3; + if (argv && argc > 1) { + int tracelevel = atoi(argv[1]); + if (tracelevel > 0 && tracelevel <= 5) + 
g_vlogger_level = tracelevel; + } + netlink_test(); + return 0; +} + diff --git a/src/vma/proto/L2_address.cpp b/src/vma/proto/L2_address.cpp new file mode 100644 index 0000000..0108cf7 --- /dev/null +++ b/src/vma/proto/L2_address.cpp @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "vma/proto/L2_address.h" + + +#define MODULE_NAME "L2_addr" + +#define L2_panic __log_panic +#define L2_logerr __log_info_err +#define L2_logwarn __log_info_warn +#define L2_loginfo __log_info_info +#define L2_logdbg __log_info_dbg +#define L2_logfunc __log_info_func +#define L2_logfuncall __log_info_funcall + +L2_address::L2_address(address_t const address, addrlen_t const len) +{ + set(address, len); +} + +void L2_address::set(address_t const address, addrlen_t const len) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (len <= 0 || len > L2_ADDR_MAX) + L2_panic("len = %d", len); + + if (address == NULL) + L2_panic("address == NULL"); + BULLSEYE_EXCLUDE_BLOCK_END + + // Copy the new address + m_len = len; + memcpy((void*)m_p_raw_address, (void*)address, m_len); +} + +bool L2_address::compare(L2_address const& other) const +{ + if (other.m_len != m_len) + return false; + return (!memcmp((void*)other.m_p_raw_address, (void*)m_p_raw_address, m_len)); +} + +const std::string ETH_addr::to_str() const +{ + char s[100]; + if (m_len > 0) + sprintf(s, ETH_HW_ADDR_PRINT_FMT, ETH_HW_ADDR_PRINT_ADDR(m_p_raw_address)); + return (std::string(s)); +} + +const std::string IPoIB_addr::to_str() const +{ + char s[100]; + if (m_len > 0) + sprintf(s, IPOIB_HW_ADDR_PRINT_FMT, IPOIB_HW_ADDR_PRINT_ADDR(m_p_raw_address)); + return (std::string(s)); +} + +void IPoIB_addr::extract_qpn() +{ + unsigned char rem_qpn[4]; + + rem_qpn[0] = m_p_raw_address[3]; + rem_qpn[1] = m_p_raw_address[2]; + rem_qpn[2] = m_p_raw_address[1]; + rem_qpn[3] = 0; + memcpy(&m_qpn, rem_qpn, 4); + L2_logdbg("qpn = %#x", m_qpn); +} + + diff --git a/src/vma/proto/L2_address.h b/src/vma/proto/L2_address.h new file mode 100644 index 0000000..774e820 --- /dev/null +++ b/src/vma/proto/L2_address.h @@ -0,0 +1,119 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef L2_ADDRESS_H +#define L2_ADDRESS_H + +#include +#include +#include + +#include "vma/util/to_str.h" +#include "vma/util/vtypes.h" + +typedef size_t addrlen_t; +typedef unsigned char* address_t; + +// 20 Bytes will +#define L2_ADDR_MAX 20 + +class L2_address : public tostr +{ +public: + L2_address(address_t const address, addrlen_t const len); + L2_address() : m_len(0) {}; + virtual ~L2_address() {}; + + virtual L2_address* clone() const = 0; + + void set(address_t const address, addrlen_t const len); + + addrlen_t get_addrlen() const { return m_len; }; + address_t get_address() const { return (address_t)m_p_raw_address; }; + + virtual bool compare(L2_address const& other) const; + +protected: + addrlen_t m_len; + unsigned char m_p_raw_address[L2_ADDR_MAX]; +}; + +class ETH_addr : public L2_address +{ +public: + ETH_addr(address_t const address) : L2_address(address, 6) {}; + ~ETH_addr() {}; + const std::string to_str() const; + + virtual L2_address* clone() const + { + return (new ETH_addr(get_address())); + } +}; + +class IPoIB_addr : public L2_address +{ +public: + + IPoIB_addr(): L2_address(), m_qpn(0) + { + + } + + //This constructor is for UC + IPoIB_addr(address_t const address) : L2_address(address, 20), m_qpn(0) + { + extract_qpn(); + }; + //This constructor is for MC + IPoIB_addr(uint32_t qpn, address_t const address) : L2_address(address, 20), m_qpn(qpn) {}; + ~IPoIB_addr() {}; + + virtual L2_address* clone() const + { + uint32_t qpn = ((IPoIB_addr*)this)->get_qpn(); + return (new IPoIB_addr(qpn, get_address())); + } + + void set_qpn(uint32_t qpn) { m_qpn = qpn; }; + uint32_t get_qpn() { return m_qpn; }; + + const std::string to_str() const; + +private: + uint32_t m_qpn; + + void extract_qpn(); +}; + +#endif /* L2_ADDRESS_H */ diff --git a/src/vma/proto/arp.cpp b/src/vma/proto/arp.cpp new file mode 100644 index 0000000..41230d5 --- /dev/null +++ b/src/vma/proto/arp.cpp @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2001-2020 Mellanox 
Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include +#include +#include "vma/proto/arp.h" +#include "vma/util/vtypes.h" +#include + +/* ARP message types (opcodes) */ +#define ARP_REQUEST 0x0001 + +#define HWTYPE_ETHERNET 0x0001 +#define HWTYPE_IB 0x0020 +#define IPv4_ALEN 0x04 +#define ETHADDR_COPY(dst, src) memcpy(dst, src, ETH_ALEN) +#define IBADDR_COPY(dst, src) memcpy(dst, src, IPOIB_HW_ADDR_LEN) + +void set_eth_arp_hdr(eth_arp_hdr *p_arph, in_addr_t ipsrc_addr, in_addr_t ipdst_addr, const uint8_t* hwsrc_addr, const uint8_t* hwdst_addr) +{ + p_arph->m_hwtype = htons(HWTYPE_ETHERNET); + p_arph->m_proto = htons(ETH_P_IP); + p_arph->m_hwlen = ETH_ALEN; + p_arph->m_protolen = IPv4_ALEN; + p_arph->m_opcode = htons(ARP_REQUEST); + ETHADDR_COPY(p_arph->m_shwaddr, hwsrc_addr); + p_arph->m_sipaddr = ipsrc_addr; + ETHADDR_COPY(p_arph->m_dhwaddr, hwdst_addr); + p_arph->m_dipaddr = ipdst_addr; +} + +void set_ib_arp_hdr(ib_arp_hdr* p_arph, in_addr_t ipsrc_addr, in_addr_t ipdst_addr, const uint8_t* hwsrc_addr, const uint8_t* hwdst_addr) +{ + p_arph->m_hwtype = htons(HWTYPE_IB); + p_arph->m_proto = htons(ETH_P_IP); + p_arph->m_hwlen = IPOIB_HW_ADDR_LEN; + p_arph->m_protolen = IPv4_ALEN; + p_arph->m_opcode = htons(ARP_REQUEST); + IBADDR_COPY(p_arph->m_shwaddr, hwsrc_addr); + p_arph->m_sipaddr = ipsrc_addr; + if(hwdst_addr) + IBADDR_COPY(p_arph->m_dhwaddr, hwdst_addr); + p_arph->m_dipaddr = ipdst_addr; +} diff --git a/src/vma/proto/arp.h b/src/vma/proto/arp.h new file mode 100644 index 0000000..3937528 --- /dev/null +++ b/src/vma/proto/arp.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef ARP_H +#define ARP_H + +#include +#include +#include "vma/util/vtypes.h" + +struct __attribute__ ((packed)) eth_arp_hdr +{ + uint16_t m_hwtype; + uint16_t m_proto; + uint8_t m_hwlen; + uint8_t m_protolen; + uint16_t m_opcode; + uint8_t m_shwaddr[ETH_ALEN]; + uint32_t m_sipaddr; + uint8_t m_dhwaddr[ETH_ALEN]; + uint32_t m_dipaddr; +}; + +void set_eth_arp_hdr(eth_arp_hdr* p_arph, in_addr_t ipsrc_addr, in_addr_t ipdst_addr, const uint8_t* hwsrc_addr, const uint8_t* hwdst_addr); + +struct __attribute__ ((packed)) ib_arp_hdr +{ + uint16_t m_hwtype; + uint16_t m_proto; + uint8_t m_hwlen; + uint8_t m_protolen; + uint16_t m_opcode; + uint8_t m_shwaddr[IPOIB_HW_ADDR_LEN]; + uint32_t m_sipaddr; + uint8_t m_dhwaddr[IPOIB_HW_ADDR_LEN]; + uint32_t m_dipaddr; +}; + +void set_ib_arp_hdr(ib_arp_hdr* p_arph, in_addr_t ipsrc_addr, in_addr_t ipdst_addr, const uint8_t* hwsrc_addr, const uint8_t* hwdst_addr); + + +#endif diff --git a/src/vma/proto/dst_entry.cpp b/src/vma/proto/dst_entry.cpp new file mode 100644 index 0000000..42a2a6f --- /dev/null +++ b/src/vma/proto/dst_entry.cpp @@ -0,0 +1,830 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "utils/bullseye.h" +#include "dst_entry.h" +#include "vma/proto/rule_table_mgr.h" +#include "vma/proto/route_table_mgr.h" +#include "vma/util/utils.h" + +#define MODULE_NAME "dst" + +#define dst_logpanic __log_panic +#define dst_logerr __log_err +#define dst_logwarn __log_warn +#define dst_loginfo __log_info +#define dst_logdbg __log_info_dbg +#define dst_logfunc __log_info_func +#define dst_logfuncall __log_info_funcall + + +dst_entry::dst_entry(in_addr_t dst_ip, uint16_t dst_port, uint16_t src_port, socket_data &sock_data, resource_allocation_key &ring_alloc_logic): + m_dst_ip(dst_ip), m_dst_port(dst_port), m_src_port(src_port), m_bound_ip(0), + m_so_bindtodevice_ip(0), m_route_src_ip(0), m_pkt_src_ip(0), + m_ring_alloc_logic(sock_data.fd, ring_alloc_logic, this), + m_p_tx_mem_buf_desc_list(NULL), m_b_tx_mem_buf_desc_list_pending(false), + m_ttl(sock_data.ttl), m_tos(sock_data.tos), m_pcp(sock_data.pcp), m_id(0) +{ + dst_logdbg("dst:%s:%d src: %d", m_dst_ip.to_str().c_str(), ntohs(m_dst_port), ntohs(m_src_port)); + init_members(); +} + +dst_entry::~dst_entry() +{ + dst_logdbg("%s", to_str().c_str()); + + if (m_p_neigh_entry) { + ip_address dst_addr = m_dst_ip; + if (m_p_rt_val && m_p_rt_val->get_gw_addr() != INADDR_ANY && 
!dst_addr.is_mc()) { + dst_addr = m_p_rt_val->get_gw_addr(); + } + g_p_neigh_table_mgr->unregister_observer(neigh_key(dst_addr, m_p_net_dev_val),this); + } + + if (m_p_rt_entry) { + g_p_route_table_mgr->unregister_observer(route_rule_table_key(m_dst_ip.get_in_addr(), m_route_src_ip, m_tos), this); + m_p_rt_entry = NULL; + } + + if (m_p_ring) { + if (m_sge) { + delete[] m_sge; + m_sge = NULL; + } + + if (m_p_tx_mem_buf_desc_list) { + m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true); + m_p_tx_mem_buf_desc_list = NULL; + } + + m_p_net_dev_val->release_ring(m_ring_alloc_logic.get_key()); + m_p_ring = NULL; + } + + if (m_p_net_dev_entry && m_p_net_dev_val) { + g_p_net_device_table_mgr->unregister_observer(m_p_net_dev_val->get_local_addr(), this); + } + + if (m_p_send_wqe_handler) { + delete m_p_send_wqe_handler; + m_p_send_wqe_handler = NULL; + } + + if (m_p_neigh_val) { + delete m_p_neigh_val; + m_p_neigh_val = NULL; + } + + dst_logdbg("Done %s", to_str().c_str()); +} + +void dst_entry::init_members() +{ + set_state(false); + m_p_rt_val = NULL; + m_p_net_dev_val = NULL; + m_p_ring = NULL; + m_p_net_dev_entry = NULL; + m_p_neigh_entry = NULL; + m_p_neigh_val = NULL; + m_p_rt_entry = NULL; + memset(&m_inline_send_wqe, 0, sizeof(m_inline_send_wqe)); + memset(&m_not_inline_send_wqe, 0, sizeof(m_not_inline_send_wqe)); + memset(&m_fragmented_send_wqe, 0, sizeof(m_not_inline_send_wqe)); + m_p_send_wqe_handler = NULL; + m_sge = NULL; + m_b_is_offloaded = true; + m_b_is_initialized = false; + m_p_send_wqe = NULL; + m_max_inline = 0; + m_max_ip_payload_size = 0; + m_max_udp_payload_size = 0; + m_b_force_os = false; +} + +void dst_entry::set_src_addr() +{ + m_pkt_src_ip = INADDR_ANY; + if (m_route_src_ip) { + m_pkt_src_ip = m_route_src_ip; + } + else if (m_p_rt_val && m_p_rt_val->get_src_addr()) { + m_pkt_src_ip = m_p_rt_val->get_src_addr(); + } + else if (m_p_net_dev_val && m_p_net_dev_val->get_local_addr()) { + m_pkt_src_ip = m_p_net_dev_val->get_local_addr(); + } 
+} + +bool dst_entry::update_net_dev_val() +{ + bool ret_val = false; + + net_device_val* new_nd_val = m_p_net_dev_val; + if (m_so_bindtodevice_ip && g_p_net_device_table_mgr) { + new_nd_val = g_p_net_device_table_mgr->get_net_device_val(m_so_bindtodevice_ip); + // TODO should we register to g_p_net_device_table_mgr with m_p_net_dev_entry? + // what should we do with an old one? + dst_logdbg("getting net_dev_val by bindtodevice ip"); + } else if (m_p_rt_entry) { + new_nd_val = m_p_rt_entry->get_net_dev_val(); + } + + if (m_p_net_dev_val != new_nd_val) { + dst_logdbg("updating net_device"); + + if (m_p_neigh_entry) { + ip_address dst_addr = m_dst_ip; + if (m_p_rt_val && m_p_rt_val->get_gw_addr() != INADDR_ANY && !dst_addr.is_mc()) { + dst_addr = m_p_rt_val->get_gw_addr(); + } + g_p_neigh_table_mgr->unregister_observer(neigh_key(dst_addr, m_p_net_dev_val),this); + m_p_neigh_entry = NULL; + } + + // Change the net_device, clean old resources... + release_ring(); + + // Save the new net_device + m_p_net_dev_val = new_nd_val; + + if (m_p_net_dev_val) { + // more resource clean and alloc... 
+ ret_val = alloc_transport_dep_res(); + } + else { + dst_logdbg("Netdev is not offloaded fallback to OS"); + } + } + else { + if (m_p_net_dev_val) { + // Only if we already had a valid net_device_val which did not change + dst_logdbg("no change in net_device"); + ret_val = true; + } + else { + dst_logdbg("Netdev is not offloaded fallback to OS"); + } + } + + return ret_val; +} + +bool dst_entry::update_rt_val() +{ + bool ret_val = true; + route_val* p_rt_val = NULL; + + if (m_p_rt_entry && m_p_rt_entry->get_val(p_rt_val)) { + if (m_p_rt_val == p_rt_val) { + dst_logdbg("no change in route_val"); + } + else { + dst_logdbg("updating route val"); + m_p_rt_val = p_rt_val; + } + } + else { + dst_logdbg("Route entry is not valid"); + ret_val = false; + } + + return ret_val; +} + +bool dst_entry::resolve_net_dev(bool is_connect) +{ + bool ret_val = false; + + cache_entry_subject* p_ces = NULL; + + if (ZERONET_N(m_dst_ip.get_in_addr())) { + dst_logdbg("VMA does not offload zero net IP address"); + return ret_val; + } + + if (LOOPBACK_N(m_dst_ip.get_in_addr())) { + dst_logdbg("VMA does not offload local loopback IP address"); + return ret_val; + } + + //When VMA will support routing with OIF, we need to check changing in outgoing interface + //Source address changes is not checked since multiple bind is not allowed on the same socket + if (!m_p_rt_entry) { + m_route_src_ip = m_bound_ip; + route_rule_table_key rtk(m_dst_ip.get_in_addr(), m_route_src_ip, m_tos); + if (g_p_route_table_mgr->register_observer(rtk, this, &p_ces)) { + // In case this is the first time we trying to resolve route entry, + // means that register_observer was run + m_p_rt_entry = dynamic_cast(p_ces); + if (is_connect && !m_route_src_ip) { + route_val* p_rt_val = NULL; + if (m_p_rt_entry && m_p_rt_entry->get_val(p_rt_val) && p_rt_val->get_src_addr()) { + g_p_route_table_mgr->unregister_observer(rtk, this); + m_route_src_ip = p_rt_val->get_src_addr(); + route_rule_table_key 
new_rtk(m_dst_ip.get_in_addr(), m_route_src_ip, m_tos); + if (g_p_route_table_mgr->register_observer(new_rtk, this, &p_ces)) { + m_p_rt_entry = dynamic_cast(p_ces); + } + else { + dst_logdbg("Error in route resolving logic"); + return ret_val; + } + } + } + } + else { + dst_logdbg("Error in registering route entry"); + return ret_val; + } + } + + if (update_rt_val()) { + ret_val = update_net_dev_val(); + } + return ret_val; +} + +bool dst_entry::resolve_neigh() +{ + dst_logdbg(""); + bool ret_val = false; + ip_address dst_addr = m_dst_ip; + + if (m_p_rt_val && m_p_rt_val->get_gw_addr() != INADDR_ANY && !dst_addr.is_mc()) { + dst_addr = m_p_rt_val->get_gw_addr(); + } + cache_entry_subject* p_ces = NULL; + if (m_p_neigh_entry || g_p_neigh_table_mgr->register_observer(neigh_key(dst_addr, m_p_net_dev_val), this, &p_ces)) { + if(m_p_neigh_entry == NULL) + m_p_neigh_entry = dynamic_cast(p_ces); + if (m_p_neigh_entry) { + if (m_p_neigh_entry->get_peer_info(m_p_neigh_val)) { + dst_logdbg("neigh is valid"); + ret_val = true; + } + else { + dst_logdbg("neigh is not valid"); + } + } + } + return ret_val; +} + +bool dst_entry::resolve_ring() +{ + bool ret_val = false; + + if (m_p_net_dev_val) { + if (!m_p_ring) { + dst_logdbg("getting a ring"); + m_p_ring = m_p_net_dev_val->reserve_ring(m_ring_alloc_logic.create_new_key(m_pkt_src_ip)); + } + if (m_p_ring) { + if (m_sge) { + delete[] m_sge; + m_sge = NULL; + } +#ifdef DEFINED_TSO + m_sge = new (nothrow) struct ibv_sge [m_p_ring->get_max_send_sge()]; +#else + m_sge = new (nothrow) struct ibv_sge [2]; +#endif /* DEFINED_TSO */ + if (!m_sge) { + dst_logpanic("%s Failed to allocate send SGE", to_str().c_str()); + } + m_max_inline = m_p_ring->get_max_inline_data(); + m_max_inline = std::min(m_max_inline, get_route_mtu() + (uint32_t)m_header.m_transport_header_len); + ret_val = true; + } + } + return ret_val; +} + +bool dst_entry::release_ring() +{ + bool ret_val = false; + if (m_p_net_dev_val) { + if (m_p_ring) { + if 
(m_p_tx_mem_buf_desc_list) { + m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true); + m_p_tx_mem_buf_desc_list = NULL; + } + dst_logdbg("releasing a ring"); + if (m_p_net_dev_val->release_ring(m_ring_alloc_logic.get_key())) { + dst_logerr("Failed to release ring for allocation key %s", + m_ring_alloc_logic.get_key()->to_str()); + } + m_p_ring = NULL; + } + ret_val = true; + } + return ret_val; +} + +void dst_entry::notify_cb() +{ + dst_logdbg(""); + set_state(false); +} + +void dst_entry::configure_ip_header(header *h, uint16_t packet_id) +{ + h->configure_ip_header(get_protocol_type(), m_pkt_src_ip, m_dst_ip.get_in_addr(), m_ttl, m_tos, packet_id); +} + +bool dst_entry::conf_l2_hdr_and_snd_wqe_eth() +{ + bool ret_val = false; + + //Maybe we after invalidation so we free the wqe_handler since we are going to build it from scratch + if (m_p_send_wqe_handler) { + delete m_p_send_wqe_handler; + m_p_send_wqe_handler = NULL; + } + + m_p_send_wqe_handler = new wqe_send_handler(); + if (!m_p_send_wqe_handler) { + dst_logpanic("%s Failed to allocate send WQE handler", to_str().c_str()); + } + m_p_send_wqe_handler->init_inline_wqe(m_inline_send_wqe, get_sge_lst_4_inline_send(), get_inline_sge_num()); + m_p_send_wqe_handler->init_not_inline_wqe(m_not_inline_send_wqe, get_sge_lst_4_not_inline_send(), 1); + m_p_send_wqe_handler->init_wqe(m_fragmented_send_wqe, get_sge_lst_4_not_inline_send(), 1); + + net_device_val_eth *netdevice_eth = dynamic_cast(m_p_net_dev_val); + BULLSEYE_EXCLUDE_BLOCK_START + if (netdevice_eth) { + BULLSEYE_EXCLUDE_BLOCK_END + const L2_address *src = m_p_net_dev_val->get_l2_address(); + const L2_address *dst = m_p_neigh_val->get_l2_address(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (src && dst) { + BULLSEYE_EXCLUDE_BLOCK_END + if (netdevice_eth->get_vlan()) { //vlan interface + uint32_t prio = get_priority_by_tc_class(m_pcp); + uint16_t vlan_tci = (prio << NET_ETH_VLAN_PCP_OFFSET) | + netdevice_eth->get_vlan(); + 
m_header.configure_vlan_eth_headers(*src, *dst, vlan_tci); + } + else { + m_header.configure_eth_headers(*src, *dst); + } + init_sge(); + ret_val = true; + } + else { + dst_logerr("Can't build proper L2 header, L2 address is not available"); + } + } + else { + dst_logerr("Dynamic cast failed, can't build proper L2 header"); + } + + return ret_val; +} + + +bool dst_entry::conf_l2_hdr_and_snd_wqe_ib() +{ + bool ret_val = false; + neigh_ib_val *neigh_ib = dynamic_cast(m_p_neigh_val); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!neigh_ib) { + dst_logerr("Dynamic cast to neigh_ib failed, can't build proper ibv_send_wqe: header"); + BULLSEYE_EXCLUDE_BLOCK_END + } + else { + uint32_t qpn = neigh_ib->get_qpn(); + uint32_t qkey = neigh_ib->get_qkey(); + struct ibv_ah *ah = (struct ibv_ah *)neigh_ib->get_ah(); + + //Maybe we after invalidation so we free the wqe_handler since we are going to build it from scratch + if (m_p_send_wqe_handler) { + delete m_p_send_wqe_handler; + m_p_send_wqe_handler = NULL; + } + m_p_send_wqe_handler = new wqe_send_ib_handler(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_send_wqe_handler) { + dst_logpanic("%s Failed to allocate send WQE handler", to_str().c_str()); + } + BULLSEYE_EXCLUDE_BLOCK_END + ((wqe_send_ib_handler *)(m_p_send_wqe_handler))->init_inline_ib_wqe(m_inline_send_wqe, get_sge_lst_4_inline_send(), get_inline_sge_num(), ah, qpn, qkey); + ((wqe_send_ib_handler*)(m_p_send_wqe_handler))->init_not_inline_ib_wqe(m_not_inline_send_wqe, get_sge_lst_4_not_inline_send(), 1, ah, qpn, qkey); + ((wqe_send_ib_handler*)(m_p_send_wqe_handler))->init_ib_wqe(m_fragmented_send_wqe, get_sge_lst_4_not_inline_send(), 1, ah, qpn, qkey); + m_header.configure_ipoib_headers(); + init_sge(); + + ret_val = true; + } + return ret_val; +} + +bool dst_entry::conf_hdrs_and_snd_wqe() +{ + transport_type_t tranposrt = VMA_TRANSPORT_IB; + bool ret_val = true; + + dst_logdbg("dst_entry %s configuring the header template", to_str().c_str()); + + 
configure_ip_header(&m_header); + + if (m_p_net_dev_val) { + tranposrt = m_p_net_dev_val->get_transport_type(); + } + + switch (tranposrt) { + case VMA_TRANSPORT_ETH: + ret_val = conf_l2_hdr_and_snd_wqe_eth(); + break; + case VMA_TRANSPORT_IB: + default: + ret_val = conf_l2_hdr_and_snd_wqe_ib(); + break; + } + return ret_val; +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + +bool dst_entry::get_net_dev_val() +{ + bool ret_val = false; + + if (m_p_rt_entry) { + m_p_rt_entry->get_val(m_p_rt_val); + ret_val = true; + } + else { + dst_logdbg("%s doesn't use route table to resolve netdev", to_str().c_str()); + } + return ret_val; +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + +//Implementation of pure virtual function of neigh_observer +transport_type_t dst_entry::get_obs_transport_type() const +{ + if(m_p_net_dev_val) + return(m_p_net_dev_val->get_transport_type()); + return VMA_TRANSPORT_UNKNOWN; +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + +flow_tuple dst_entry::get_flow_tuple() const +{ + in_addr_t dst_ip = 0; + in_protocol_t protocol = PROTO_UNDEFINED; + + dst_ip = m_dst_ip.get_in_addr(); + protocol = (in_protocol_t)get_protocol_type(); + + return flow_tuple(dst_ip, m_dst_port, m_pkt_src_ip, m_src_port, protocol); +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + +bool dst_entry::offloaded_according_to_rules() +{ + bool ret_val = true; + transport_t target_transport; + + sockaddr_in to; + memset(&to, 0, sizeof(to)); + to.sin_family = AF_INET; + to.sin_addr.s_addr = m_dst_ip.get_in_addr(); + to.sin_port = m_dst_port; + + + target_transport = get_transport(to); + + if (target_transport == TRANS_OS) { + ret_val = false; + } + return ret_val; +} + +bool dst_entry::prepare_to_send(struct vma_rate_limit_t &rate_limit, bool skip_rules, bool is_connect) +{ + bool resolved = false; + m_slow_path_lock.lock(); + if (!m_b_is_initialized) { + if((!skip_rules) && (!offloaded_according_to_rules())) 
{ + dst_logdbg("dst_entry in BLACK LIST!"); + m_b_is_offloaded = false; + m_b_force_os = true; + } + m_b_is_initialized = true; + } + dst_logdbg("%s", to_str().c_str()); + if (!m_b_force_os && !is_valid()) { + bool is_ofloaded = false; + set_state(true); + if (resolve_net_dev(is_connect)) { + set_src_addr(); + // overwrite mtu from route if exists + m_max_udp_payload_size = get_route_mtu() - sizeof(struct iphdr); + m_max_ip_payload_size = m_max_udp_payload_size & ~0x7; + if (resolve_ring()) { + is_ofloaded = true; + modify_ratelimit(rate_limit); + if (resolve_neigh()) { + if (get_obs_transport_type() == VMA_TRANSPORT_ETH) { + dst_logdbg("local mac: %s peer mac: %s", m_p_net_dev_val->get_l2_address()->to_str().c_str(), m_p_neigh_val->get_l2_address()->to_str().c_str()); + } else { + dst_logdbg("peer L2 address: %s", m_p_neigh_val->get_l2_address()->to_str().c_str()); + } + configure_headers(); + m_id = m_p_ring->generate_id(m_p_net_dev_val->get_l2_address()->get_address(), + m_p_neigh_val->get_l2_address()->get_address(), + ((ethhdr*)(m_header.m_actual_hdr_addr))->h_proto /* if vlan, use vlan proto */, + htons(ETH_P_IP), + m_pkt_src_ip, + m_dst_ip.get_in_addr(), + m_src_port, + m_dst_port); + if (m_p_tx_mem_buf_desc_list) { + m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true); + m_p_tx_mem_buf_desc_list = NULL; + } + resolved = true; + } + } + } + m_b_is_offloaded = is_ofloaded; + if (m_b_is_offloaded) { + dst_logdbg("dst_entry is offloaded!"); + } + else { + dst_logdbg("dst_entry is NOT offloaded!"); + } + if (!resolved) { + set_state(false); + } + } + m_slow_path_lock.unlock(); + + return m_b_is_offloaded; +} + +bool dst_entry::try_migrate_ring(lock_base& socket_lock) +{ + bool ret = false; + if (m_ring_alloc_logic.is_logic_support_migration()) { + if (!m_tx_migration_lock.trylock()) { + if (m_ring_alloc_logic.should_migrate_ring()) { + resource_allocation_key old_key(*m_ring_alloc_logic.get_key()); + do_ring_migration(socket_lock, old_key); + ret = 
true; + } + m_tx_migration_lock.unlock(); + } + } + return ret; +} + +int dst_entry::get_route_mtu() +{ + if (m_p_rt_val && m_p_rt_val->get_mtu() > 0 ) { + return m_p_rt_val->get_mtu(); + } + return m_p_net_dev_val->get_mtu(); +} + +void dst_entry::do_ring_migration(lock_base& socket_lock, resource_allocation_key &old_key) +{ + m_slow_path_lock.lock(); + + if (!m_p_net_dev_val || !m_p_ring) { + m_slow_path_lock.unlock(); + return; + } + + uint64_t new_calc_id = m_ring_alloc_logic.calc_res_key_by_logic(); + resource_allocation_key *new_key = m_ring_alloc_logic.get_key(); + // Check again if migration is needed before migration + if (old_key.get_user_id_key() == new_calc_id && + old_key.get_ring_alloc_logic() == new_key->get_ring_alloc_logic()) { + m_slow_path_lock.unlock(); + return; + } + // Update key to new ID + new_key->set_user_id_key(new_calc_id); + m_slow_path_lock.unlock(); + socket_lock.unlock(); + + ring* new_ring = m_p_net_dev_val->reserve_ring(new_key); + if (!new_ring) { + socket_lock.lock(); + return; + } + if (new_ring == m_p_ring) { + if (!m_p_net_dev_val->release_ring(&old_key)) { + dst_logerr("Failed to release ring for allocation key %s", + old_key.to_str()); + } + socket_lock.lock(); + return; + } + + dst_logdbg("migrating from key=%s and ring=%p to key=%s and ring=%p", + old_key.to_str(), m_p_ring, new_key->to_str(), new_ring); + + socket_lock.lock(); + m_slow_path_lock.lock(); + + set_state(false); + + ring* old_ring = m_p_ring; + m_p_ring = new_ring; + if (m_sge) { + delete[] m_sge; + m_sge = NULL; + } +#ifdef DEFINED_TSO + m_sge = new (nothrow) struct ibv_sge [m_p_ring->get_max_send_sge()]; +#else + m_sge = new (nothrow) struct ibv_sge [2]; +#endif /* DEFINED_TSO */ + if (!m_sge) { + dst_logpanic("%s Failed to allocate send SGE", to_str().c_str()); + } + m_max_inline = m_p_ring->get_max_inline_data(); + m_max_inline = std::min(m_max_inline, get_route_mtu() + (uint32_t)m_header.m_transport_header_len); + + mem_buf_desc_t* tmp_list = 
m_p_tx_mem_buf_desc_list; + m_p_tx_mem_buf_desc_list = NULL; + + m_slow_path_lock.unlock(); + socket_lock.unlock(); + + if (tmp_list) { + old_ring->mem_buf_tx_release(tmp_list, true); + } + + m_p_net_dev_val->release_ring(&old_key); + + socket_lock.lock(); +} + +void dst_entry::set_bound_addr(in_addr_t addr) +{ + dst_logdbg(""); + m_bound_ip = addr; + set_state(false); +} + +void dst_entry::set_so_bindtodevice_addr(in_addr_t addr) +{ + dst_logdbg(""); + m_so_bindtodevice_ip = addr; + set_state(false); +} + +in_addr_t dst_entry::get_dst_addr() +{ + return m_dst_ip.get_in_addr(); +} + +uint16_t dst_entry::get_dst_port() +{ + return m_dst_port; +} + +ssize_t dst_entry::pass_buff_to_neigh(const iovec * p_iov, size_t sz_iov, uint16_t packet_id) +{ + ssize_t ret_val = 0; + + dst_logdbg(""); + + configure_ip_header(&m_header_neigh, packet_id); + + if (m_p_neigh_entry) { + neigh_send_info n_send_info(const_cast(p_iov), + sz_iov, &m_header_neigh, + get_protocol_type(), get_route_mtu(), + m_tos); + ret_val = m_p_neigh_entry->send(n_send_info); + } + + return ret_val; +} + +bool dst_entry::alloc_transport_dep_res() +{ + return alloc_neigh_val(get_obs_transport_type()); +} + +bool dst_entry::alloc_neigh_val(transport_type_t tranport) +{ + bool ret_val = false; + + if (m_p_neigh_val) { + delete m_p_neigh_val; + m_p_neigh_val = NULL; + } + + switch (tranport) { + case VMA_TRANSPORT_IB: + m_p_neigh_val = new neigh_ib_val; + break; + case VMA_TRANSPORT_ETH: + default: + m_p_neigh_val = new neigh_eth_val; + break; + } + if (m_p_neigh_val) { + ret_val = true; + } + return ret_val; +} + +void dst_entry::return_buffers_pool() +{ + if (m_p_tx_mem_buf_desc_list == NULL) { + return; + } + + if (m_b_tx_mem_buf_desc_list_pending && m_p_ring && + m_p_ring->mem_buf_tx_release(m_p_tx_mem_buf_desc_list, true, true)) { + m_p_tx_mem_buf_desc_list = NULL; + set_tx_buff_list_pending(false); + } else { + set_tx_buff_list_pending(true); + } +} + +int dst_entry::modify_ratelimit(struct 
vma_rate_limit_t &rate_limit) +{ + if (m_p_ring) { + return m_p_ring->modify_ratelimit(rate_limit); + } + return 0; +} + +int dst_entry::get_priority_by_tc_class(uint32_t pcp) +{ + // translate class to priority + if (m_p_net_dev_val) { + return m_p_net_dev_val->get_priority_by_tc_class(pcp); + } + return VMA_DEFAULT_ENGRESS_MAP_PRIO; +} + +bool dst_entry::update_ring_alloc_logic(int fd, lock_base& socket_lock, resource_allocation_key &ring_alloc_logic) +{ + resource_allocation_key old_key(*m_ring_alloc_logic.get_key()); + + m_ring_alloc_logic = ring_allocation_logic_tx(fd, ring_alloc_logic, this); + + if (*m_ring_alloc_logic.get_key() != old_key) { + auto_unlocker locker(m_tx_migration_lock); + do_ring_migration(socket_lock, old_key); + return true; + } + + return false; +} diff --git a/src/vma/proto/dst_entry.h b/src/vma/proto/dst_entry.h new file mode 100644 index 0000000..22eb310 --- /dev/null +++ b/src/vma/proto/dst_entry.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef DST_ENTRY_H +#define DST_ENTRY_H + +#include +#include +#include "vma/util/if.h" +#include + +#include "vlogger/vlogger.h" +#include "utils/lock_wrapper.h" +#include "vma/sock/socket_fd_api.h" +#include "vma/proto/route_entry.h" +#include "vma/proto/route_val.h" +#include "vma/proto/neighbour_table_mgr.h" +#include "vma/dev/net_device_val.h" +#include "vma/dev/net_device_table_mgr.h" +#include "vma/dev/wqe_send_handler.h" +#include "vma/dev/wqe_send_ib_handler.h" +#include "vma/dev/ring.h" +#include "vma/dev/ring_allocation_logic.h" +#include "vma/infra/sender.h" +#include "header.h" +#include "ip_address.h" + +struct socket_data { + int fd; + uint8_t ttl; + uint8_t tos; + uint32_t pcp; +}; + +typedef struct { + vma_wr_tx_packet_attr flags; + uint16_t mss; +} vma_send_attr; + +class dst_entry : public cache_observer, public tostr, public neigh_observer +{ + +public: + dst_entry(in_addr_t dst_ip, uint16_t dst_port, uint16_t src_port, socket_data &sock_data, resource_allocation_key &ring_alloc_logic); + virtual ~dst_entry(); + + virtual void notify_cb(); + + virtual bool prepare_to_send(struct vma_rate_limit_t &rate_limit, bool skip_rules=false, bool is_connect=false); +#ifdef DEFINED_TSO + virtual ssize_t fast_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr) = 0; + virtual ssize_t slow_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr, + struct vma_rate_limit_t &rate_limit, int flags = 0, + socket_fd_api* sock = 0, tx_call_t 
call_type = TX_UNDEF) = 0; +#else + virtual ssize_t slow_send(const iovec* p_iov, size_t sz_iov, bool is_dummy, struct vma_rate_limit_t &rate_limit, bool b_blocked = true, bool is_rexmit = false, int flags = 0, socket_fd_api* sock = 0, tx_call_t call_type = TX_UNDEF) = 0 ; + virtual ssize_t fast_send(const iovec* p_iov, const ssize_t sz_iov, bool is_dummy, bool b_blocked = true, bool is_rexmit = false) = 0; +#endif /* DEFINED_TSO */ + + bool try_migrate_ring(lock_base& socket_lock); + + bool is_offloaded() { return m_b_is_offloaded; } + void set_bound_addr(in_addr_t addr); + void set_so_bindtodevice_addr(in_addr_t addr); + in_addr_t get_dst_addr(); + uint16_t get_dst_port(); + inline in_addr_t get_src_addr() const { + return m_pkt_src_ip; + } + int modify_ratelimit(struct vma_rate_limit_t &rate_limit); + bool update_ring_alloc_logic(int fd, lock_base & socket_lock, resource_allocation_key & ring_alloc_logic); + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + net_device_val* get_net_dev() + { + return m_p_net_dev_val; + } +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + + virtual transport_type_t get_obs_transport_type() const; + virtual flow_tuple get_flow_tuple() const; + + void return_buffers_pool(); + int get_route_mtu(); + inline void set_ip_ttl(uint8_t ttl) { m_header.set_ip_ttl(ttl); } + inline void set_ip_tos(uint8_t tos) { m_header.set_ip_tos(tos); } + inline bool set_pcp(uint32_t pcp) { + return m_header.set_vlan_pcp(get_priority_by_tc_class(pcp)); } + inline header* get_network_header() { return &m_header;} + inline ring* get_ring() { return m_p_ring;} +protected: + ip_address m_dst_ip; + uint16_t m_dst_port; + uint16_t m_src_port; + + in_addr_t m_bound_ip; + in_addr_t m_so_bindtodevice_ip; + in_addr_t m_route_src_ip; // source IP used to register in route manager + in_addr_t m_pkt_src_ip; // source IP address copied into IP header + lock_mutex_recursive m_slow_path_lock; + lock_mutex m_tx_migration_lock; + vma_ibv_send_wr 
m_inline_send_wqe; + vma_ibv_send_wr m_not_inline_send_wqe; + vma_ibv_send_wr m_fragmented_send_wqe; + wqe_send_handler* m_p_send_wqe_handler; + ibv_sge *m_sge; + route_entry* m_p_rt_entry; + route_val* m_p_rt_val; + net_device_entry* m_p_net_dev_entry; + net_device_val* m_p_net_dev_val; + neigh_entry* m_p_neigh_entry; + neigh_val* m_p_neigh_val; + bool m_b_is_offloaded; + bool m_b_force_os; + ring* m_p_ring; + ring_allocation_logic_tx m_ring_alloc_logic; + mem_buf_desc_t* m_p_tx_mem_buf_desc_list; + int m_b_tx_mem_buf_desc_list_pending; + header m_header; + header m_header_neigh; + uint8_t m_ttl; + uint8_t m_tos; + uint8_t m_pcp; + bool m_b_is_initialized; + + vma_ibv_send_wr* m_p_send_wqe; + uint32_t m_max_inline; + ring_user_id_t m_id; + uint16_t m_max_ip_payload_size; + uint16_t m_max_udp_payload_size; + + virtual transport_t get_transport(sockaddr_in to) = 0; + virtual uint8_t get_protocol_type() const = 0; + virtual bool get_net_dev_val(); + virtual uint32_t get_inline_sge_num() = 0; + virtual ibv_sge* get_sge_lst_4_inline_send() = 0; + virtual ibv_sge* get_sge_lst_4_not_inline_send() = 0; + + virtual bool offloaded_according_to_rules(); + virtual void init_members(); + virtual bool resolve_net_dev(bool is_connect=false); + virtual void set_src_addr(); + bool update_net_dev_val(); + bool update_rt_val(); + virtual bool resolve_neigh(); + virtual bool resolve_ring(); + virtual bool release_ring(); + virtual ssize_t pass_buff_to_neigh(const iovec *p_iov, size_t sz_iov, uint16_t packet_id = 0); + virtual void configure_ip_header(header *h, uint16_t packet_id = 0); + virtual void configure_headers() { conf_hdrs_and_snd_wqe();}; + bool conf_hdrs_and_snd_wqe(); + virtual bool conf_l2_hdr_and_snd_wqe_eth(); + virtual bool conf_l2_hdr_and_snd_wqe_ib(); + virtual void init_sge() {}; + bool alloc_transport_dep_res(); + bool alloc_neigh_val(transport_type_t tranport); + + void do_ring_migration(lock_base& socket_lock, resource_allocation_key &old_key); + inline void 
set_tx_buff_list_pending(bool is_pending = true) {m_b_tx_mem_buf_desc_list_pending = is_pending;} + int get_priority_by_tc_class(uint32_t tc_clas); + inline void send_ring_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) + { + if (unlikely(is_set(attr, VMA_TX_PACKET_DUMMY))) { + if (m_p_ring->get_hw_dummy_send_support(id, p_send_wqe)) { + vma_ibv_wr_opcode last_opcode = m_p_send_wqe_handler->set_opcode(*p_send_wqe, VMA_IBV_WR_NOP); + m_p_ring->send_ring_buffer(id, p_send_wqe, attr); + m_p_send_wqe_handler->set_opcode(*p_send_wqe, last_opcode); + } else { + /* free the buffer if dummy send is not supported */ + mem_buf_desc_t* p_mem_buf_desc = (mem_buf_desc_t*)(p_send_wqe->wr_id); + m_p_ring->mem_buf_tx_release(p_mem_buf_desc, true); + } + } else { + m_p_ring->send_ring_buffer(id, p_send_wqe, attr); + } + } +}; + + +#endif /* DST_ENTRY_H */ diff --git a/src/vma/proto/dst_entry_tcp.cpp b/src/vma/proto/dst_entry_tcp.cpp new file mode 100644 index 0000000..7582e0d --- /dev/null +++ b/src/vma/proto/dst_entry_tcp.cpp @@ -0,0 +1,464 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + + +#include "dst_entry_tcp.h" +#include + +#define MODULE_NAME "dst_tcp" + +#define dst_tcp_logpanic __log_panic +#define dst_tcp_logerr __log_err +#define dst_tcp_logwarn __log_warn +#define dst_tcp_loginfo __log_info +#define dst_tcp_logdbg __log_info_dbg +#define dst_tcp_logfunc __log_info_fine +#define dst_tcp_logfine __log_info_fine +#define dst_tcp_logfuncall __log_info_finer + + +dst_entry_tcp::dst_entry_tcp(in_addr_t dst_ip, uint16_t dst_port, uint16_t src_port, + socket_data &sock_data , resource_allocation_key &ring_alloc_logic): + dst_entry(dst_ip, dst_port, src_port, sock_data, ring_alloc_logic), + m_n_sysvar_tx_bufs_batch_tcp(safe_mce_sys().tx_bufs_batch_tcp) +{ + +} + +dst_entry_tcp::~dst_entry_tcp() +{ + +} + +transport_t dst_entry_tcp::get_transport(sockaddr_in to) +{ + NOT_IN_USE(to); + return TRANS_VMA; +} + +#ifdef DEFINED_TSO +ssize_t dst_entry_tcp::fast_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr) +{ + int ret = 0; + tx_packet_template_t* p_pkt; + tcp_iovec* p_tcp_iov = NULL; + size_t hdr_alignment_diff = 0; + + /* The header is aligned for fast copy but we need to maintain this diff + * in order to get the real header pointer easily + */ + hdr_alignment_diff = m_header.m_aligned_l2_l3_len - m_header.m_total_hdr_len; + + p_tcp_iov = (tcp_iovec*)p_iov; + + attr.flags = (vma_wr_tx_packet_attr)(attr.flags | VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM); + + /* Supported scenarios: + * 1. 
Standard: + * Use lwip memory buffer (zero copy) in case iov consists of single buffer with single TCP packet. + * 2. Large send offload: + * Use lwip sequence of memory buffers (zero copy) in case attribute is set as TSO and no retransmission. + * Size of iov can be one or more. + * 3. Simple: + * Use intermediate buffers for data send + */ + if (likely(m_p_ring->is_active_member(p_tcp_iov->p_desc->p_desc_owner, m_id) && + (is_set(attr.flags, (vma_wr_tx_packet_attr)(VMA_TX_PACKET_TSO)) || + (sz_iov == 1 && !is_set(attr.flags, (vma_wr_tx_packet_attr)(VMA_TX_PACKET_REXMIT)))))) { + size_t total_packet_len = 0; + vma_ibv_send_wr send_wqe; + wqe_send_handler send_wqe_h; + + /* iov_base is a pointer to TCP header and data + * so p_pkt should point to L2 + */ + p_pkt = (tx_packet_template_t*)((uint8_t*)p_tcp_iov[0].iovec.iov_base - m_header.m_aligned_l2_l3_len); + + /* iov_len is a size of TCP header and data + * m_total_hdr_len is a size of L2/L3 header + */ + total_packet_len = p_tcp_iov[0].iovec.iov_len + m_header.m_total_hdr_len; + + /* copy just L2/L3 headers to p_pkt */ + m_header.copy_l2_ip_hdr(p_pkt); + + /* L3(Total Length) field means nothing in case TSO usage and can be set as zero but + * setting this field to actual value allows to do valid call for scenario + * when payload size less or equal to mss + */ + p_pkt->hdr.m_ip_hdr.tot_len = (htons)(p_tcp_iov[0].iovec.iov_len + m_header.m_ip_header_len); + + if ((total_packet_len < m_max_inline) && (1 == sz_iov)) { + m_p_send_wqe = &m_inline_send_wqe; + m_sge[0].addr = (uintptr_t)((uint8_t*)p_pkt + hdr_alignment_diff); + m_sge[0].length = total_packet_len; + } else if (is_set(attr.flags, (vma_wr_tx_packet_attr)(VMA_TX_PACKET_TSO))) { + /* update send work request. 
do not expect noninlined scenario */ + send_wqe_h.init_not_inline_wqe(send_wqe, m_sge, sz_iov); + send_wqe_h.enable_tso(send_wqe, + (void *)((uint8_t*)p_pkt + hdr_alignment_diff), + m_header.m_total_hdr_len + p_pkt->hdr.m_tcp_hdr.doff * 4, + attr.mss); + m_p_send_wqe = &send_wqe; + m_sge[0].addr = (uintptr_t)((uint8_t *)&p_pkt->hdr.m_tcp_hdr + p_pkt->hdr.m_tcp_hdr.doff * 4); + m_sge[0].length = p_tcp_iov[0].iovec.iov_len - p_pkt->hdr.m_tcp_hdr.doff * 4; + } else { + m_p_send_wqe = &m_not_inline_send_wqe; + m_sge[0].addr = (uintptr_t)((uint8_t*)p_pkt + hdr_alignment_diff); + m_sge[0].length = total_packet_len; + } + + /* save pointers to ip and tcp headers for software checksum calculation */ + p_tcp_iov[0].p_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr; + p_tcp_iov[0].p_desc->tx.p_tcp_h =(struct tcphdr*)((uint8_t*)(&(p_pkt->hdr.m_ip_hdr)) + sizeof(p_pkt->hdr.m_ip_hdr)); + p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.ref++; + + /* set wr_id as a pointer to memory descriptor */ + m_p_send_wqe->wr_id = (uintptr_t)p_tcp_iov[0].p_desc; + + /* Update scatter gather element list + * ref counter is incremented for the first memory descriptor only because it is needed + * for processing send wr completion (tx batching mode) + */ + m_sge[0].lkey = m_p_ring->get_tx_lkey(m_id); + for (int i = 1; i < sz_iov; ++i) { + m_sge[i].addr = (uintptr_t)p_tcp_iov[i].iovec.iov_base; + m_sge[i].length = p_tcp_iov[i].iovec.iov_len; + m_sge[i].lkey = m_sge[0].lkey; + } + + send_lwip_buffer(m_id, m_p_send_wqe, attr.flags); + + } else { // We don'nt support inline in this case, since we believe that this a very rare case + mem_buf_desc_t *p_mem_buf_desc; + size_t total_packet_len = 0; + + p_mem_buf_desc = get_buffer(is_set(attr.flags, VMA_TX_PACKET_BLOCK)); + if (p_mem_buf_desc == NULL) { + ret = -1; + goto out; + } + + m_header.copy_l2_ip_hdr((tx_packet_template_t*)p_mem_buf_desc->p_buffer); + + // Actually this is not the real packet len we will subtract the alignment diff at the end of the copy + 
total_packet_len = m_header.m_aligned_l2_l3_len; + + for (int i = 0; i < sz_iov; ++i) { + memcpy(p_mem_buf_desc->p_buffer + total_packet_len, p_tcp_iov[i].iovec.iov_base, p_tcp_iov[i].iovec.iov_len); + total_packet_len += p_tcp_iov[i].iovec.iov_len; + } + + m_sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer + hdr_alignment_diff); + m_sge[0].length = total_packet_len - hdr_alignment_diff; + m_sge[0].lkey = m_p_ring->get_tx_lkey(m_id); + + p_pkt = (tx_packet_template_t*)((uint8_t*)p_mem_buf_desc->p_buffer); + p_pkt->hdr.m_ip_hdr.tot_len = (htons)(m_sge[0].length - m_header.m_transport_header_len); + + p_mem_buf_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr; + p_mem_buf_desc->tx.p_tcp_h = (struct tcphdr*)((uint8_t*)(&(p_pkt->hdr.m_ip_hdr))+sizeof(p_pkt->hdr.m_ip_hdr)); + + m_p_send_wqe = &m_not_inline_send_wqe; + m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; + + send_ring_buffer(m_id, m_p_send_wqe, attr.flags); + + } + + if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, + is_set(attr.flags, VMA_TX_PACKET_BLOCK), m_n_sysvar_tx_bufs_batch_tcp); + } + +out: + if (unlikely(is_set(attr.flags, VMA_TX_PACKET_REXMIT))) { + m_p_ring->inc_tx_retransmissions_stats(m_id); + } + + return ret; +} + +ssize_t dst_entry_tcp::slow_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr, + struct vma_rate_limit_t &rate_limit, int flags /*= 0*/, + socket_fd_api* sock /*= 0*/, tx_call_t call_type /*= 0*/) +{ + ssize_t ret_val = -1; + + NOT_IN_USE(sock); + NOT_IN_USE(call_type); + NOT_IN_USE(flags); + + m_slow_path_lock.lock(); + + prepare_to_send(rate_limit, true); + + if (m_b_is_offloaded) { + if (!is_valid()) { // That means that the neigh is not resolved yet + //there is a copy inside so we should not update any ref-counts + ret_val = pass_buff_to_neigh(p_iov, sz_iov); + } + else { + ret_val = fast_send(p_iov, sz_iov, attr); + } + } + else { + dst_tcp_logdbg("Dst_entry is not offloaded, bug?"); + } + 
m_slow_path_lock.unlock(); + return ret_val; +} +#else +ssize_t dst_entry_tcp::fast_send(const iovec* p_iov, const ssize_t sz_iov, bool is_dummy, bool b_blocked /*= true*/, bool is_rexmit /*= false*/) +{ + int ret = 0; + tx_packet_template_t* p_pkt; + mem_buf_desc_t *p_mem_buf_desc; + size_t total_packet_len = 0; + // The header is aligned for fast copy but we need to maintain this diff in order to get the real header pointer easily + size_t hdr_alignment_diff = m_header.m_aligned_l2_l3_len - m_header.m_total_hdr_len; + + tcp_iovec* p_tcp_iov = NULL; + bool no_copy = true; + if (likely(sz_iov == 1 && !is_rexmit)) { + p_tcp_iov = (tcp_iovec*)p_iov; + if (unlikely(!m_p_ring->is_active_member(p_tcp_iov->p_desc->p_desc_owner, m_id))) { + no_copy = false; + dst_tcp_logdbg("p_desc=%p wrong desc_owner=%p, this ring=%p. did migration occurred?", p_tcp_iov->p_desc, p_tcp_iov->p_desc->p_desc_owner, m_p_ring); + //todo can we handle this in migration (by going over all buffers lwip hold) instead for every send? 
+ } + } else { + no_copy = false; + } + + vma_wr_tx_packet_attr attr = (vma_wr_tx_packet_attr)((VMA_TX_PACKET_BLOCK * b_blocked) | (VMA_TX_PACKET_DUMMY * is_dummy) | VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM); + + if (likely(no_copy)) { + p_pkt = (tx_packet_template_t*)((uint8_t*)p_tcp_iov[0].iovec.iov_base - m_header.m_aligned_l2_l3_len); + total_packet_len = p_tcp_iov[0].iovec.iov_len + m_header.m_total_hdr_len; + m_header.copy_l2_ip_hdr(p_pkt); + // We've copied to aligned address, and now we must update p_pkt to point to real + // L2 header + //p_pkt = (tx_packet_template_t*)((uint8_t*)p_pkt + hdr_alignment_diff); + p_pkt->hdr.m_ip_hdr.tot_len = (htons)(p_tcp_iov[0].iovec.iov_len + m_header.m_ip_header_len); + + m_sge[0].addr = (uintptr_t)((uint8_t*)p_pkt + hdr_alignment_diff); + m_sge[0].length = total_packet_len; + + if (total_packet_len < m_max_inline) { // inline send + m_p_send_wqe = &m_inline_send_wqe; + } else { + m_p_send_wqe = &m_not_inline_send_wqe; + } + + m_p_send_wqe->wr_id = (uintptr_t)p_tcp_iov[0].p_desc; + p_tcp_iov[0].p_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr; + p_tcp_iov[0].p_desc->tx.p_tcp_h =(struct tcphdr*)((uint8_t*)(&(p_pkt->hdr.m_ip_hdr))+sizeof(p_pkt->hdr.m_ip_hdr)); + + send_lwip_buffer(m_id, m_p_send_wqe, attr); + + /* for DEBUG */ + if ((uint8_t*)m_sge[0].addr < p_tcp_iov[0].p_desc->p_buffer || (uint8_t*)p_pkt < p_tcp_iov[0].p_desc->p_buffer) { + dst_tcp_logerr("p_buffer - addr=%d, m_total_hdr_len=%zd, p_buffer=%p, type=%d, len=%d, tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n", + (int)(p_tcp_iov[0].p_desc->p_buffer - (uint8_t*)m_sge[0].addr), m_header.m_total_hdr_len, + p_tcp_iov[0].p_desc->p_buffer, p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.type, + p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.len, p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.tot_len, + p_tcp_iov[0].p_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff); + } + } + else { // We don'nt support inline in this case, since we believe that this a very rare case + p_mem_buf_desc = 
get_buffer(b_blocked); + if (p_mem_buf_desc == NULL) { + ret = -1; + goto out; + } + + m_header.copy_l2_ip_hdr((tx_packet_template_t*)p_mem_buf_desc->p_buffer); + + // Actually this is not the real packet len we will subtract the alignment diff at the end of the copy + total_packet_len = m_header.m_aligned_l2_l3_len; + + for (int i = 0; i < sz_iov; ++i) { + memcpy(p_mem_buf_desc->p_buffer + total_packet_len, p_iov[i].iov_base, p_iov[i].iov_len); + total_packet_len += p_iov[i].iov_len; + } + + m_sge[0].addr = (uintptr_t)(p_mem_buf_desc->p_buffer + hdr_alignment_diff); + m_sge[0].length = total_packet_len - hdr_alignment_diff; + // LKey will be updated in ring->send() // m_sge[0].lkey = p_mem_buf_desc->lkey; + + p_pkt = (tx_packet_template_t*)((uint8_t*)p_mem_buf_desc->p_buffer); + p_pkt->hdr.m_ip_hdr.tot_len = (htons)(m_sge[0].length - m_header.m_transport_header_len); + + p_mem_buf_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr; + p_mem_buf_desc->tx.p_tcp_h = (struct tcphdr*)((uint8_t*)(&(p_pkt->hdr.m_ip_hdr))+sizeof(p_pkt->hdr.m_ip_hdr)); + + m_p_send_wqe = &m_not_inline_send_wqe; + m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; + send_ring_buffer(m_id, m_p_send_wqe, attr); + + /* for DEBUG */ + if ((uint8_t*)m_sge[0].addr < p_mem_buf_desc->p_buffer) { + dst_tcp_logerr("p_buffer - addr=%d, m_total_hdr_len=%zd, p_buffer=%p, type=%d, len=%d, tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n", + (int)(p_mem_buf_desc->p_buffer - (uint8_t*)m_sge[0].addr), m_header.m_total_hdr_len, + p_mem_buf_desc->p_buffer, p_mem_buf_desc->lwip_pbuf.pbuf.type, + p_mem_buf_desc->lwip_pbuf.pbuf.len, p_mem_buf_desc->lwip_pbuf.pbuf.tot_len, + p_mem_buf_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff); + } + } + + if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, m_n_sysvar_tx_bufs_batch_tcp); + } + +out: + if (unlikely(is_rexmit)) { + m_p_ring->inc_tx_retransmissions_stats(m_id); + } + + return ret; +} + +ssize_t 
dst_entry_tcp::slow_send(const iovec* p_iov, size_t sz_iov, bool is_dummy, struct vma_rate_limit_t &rate_limit, bool b_blocked /*= true*/, bool is_rexmit /*= false*/, int flags /*= 0*/, socket_fd_api* sock /*= 0*/, tx_call_t call_type /*= 0*/) +{ + ssize_t ret_val = -1; + + NOT_IN_USE(sock); + NOT_IN_USE(call_type); + NOT_IN_USE(flags); + + m_slow_path_lock.lock(); + + prepare_to_send(rate_limit, true); + + if (m_b_is_offloaded) { + if (!is_valid()) { // That means that the neigh is not resolved yet + //there is a copy inside so we should not update any ref-counts + ret_val = pass_buff_to_neigh(p_iov, sz_iov); + } + else { + ret_val = fast_send(p_iov, sz_iov, is_dummy, b_blocked, is_rexmit); + } + } + else { + dst_tcp_logdbg("Dst_entry is not offloaded, bug?"); + } + m_slow_path_lock.unlock(); + return ret_val; +} +#endif /* DEFINED_TSO */ + +ssize_t dst_entry_tcp::slow_send_neigh( const iovec* p_iov, size_t sz_iov, struct vma_rate_limit_t &rate_limit) +{ + ssize_t ret_val = -1; + + m_slow_path_lock.lock(); + + prepare_to_send(rate_limit, true); + + if (m_b_is_offloaded) { + ret_val = pass_buff_to_neigh(p_iov, sz_iov); + } + else { + dst_tcp_logdbg("Dst_entry is not offloaded, bug?"); + } + + m_slow_path_lock.unlock(); + return ret_val; +} + +//The following function supposed to be called under m_lock +void dst_entry_tcp::configure_headers() +{ + m_header.init(); + dst_entry::configure_headers(); +} + +ssize_t dst_entry_tcp::pass_buff_to_neigh(const iovec * p_iov, size_t sz_iov, uint16_t packet_id) +{ + NOT_IN_USE(packet_id); + m_header_neigh.init(); + m_header_neigh.configure_tcp_ports(m_dst_port, m_src_port); + return(dst_entry::pass_buff_to_neigh(p_iov, sz_iov)); +} + +mem_buf_desc_t* dst_entry_tcp::get_buffer(bool b_blocked /*=false*/) +{ + set_tx_buff_list_pending(false); + + // Get a bunch of tx buf descriptor and data buffers + if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, 
m_n_sysvar_tx_bufs_batch_tcp); + } + + mem_buf_desc_t* p_mem_buf_desc = m_p_tx_mem_buf_desc_list; + if (unlikely(p_mem_buf_desc == NULL)) { + dst_tcp_logfunc("silent packet drop, no buffers!"); + } + else { + m_p_tx_mem_buf_desc_list = m_p_tx_mem_buf_desc_list->p_next_desc; + p_mem_buf_desc->p_next_desc = NULL; + // for TX, set lwip payload to the data segment. + // lwip will send it with payload pointing to the tcp header. + p_mem_buf_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + m_header.m_aligned_l2_l3_len + sizeof(struct tcphdr); + } + + return p_mem_buf_desc; +} + +//called from lwip under sockinfo_tcp lock +//handle un-chained pbuf +// only single p_desc +void dst_entry_tcp::put_buffer(mem_buf_desc_t * p_desc) +{ + //todo accumulate buffers? + + if (unlikely(p_desc == NULL)) + return; + + if (likely(m_p_ring->is_member(p_desc->p_desc_owner))) { + m_p_ring->mem_buf_desc_return_single_to_owner_tx(p_desc); + } else { + + //potential race, ref is protected here by tcp lock, and in ring by ring_tx lock + if (likely(p_desc->lwip_pbuf.pbuf.ref)) + p_desc->lwip_pbuf.pbuf.ref--; + else + dst_tcp_logerr("ref count of %p is already zero, double free??", p_desc); + + if (p_desc->lwip_pbuf.pbuf.ref == 0) { + p_desc->p_next_desc = NULL; + g_buffer_pool_tx->put_buffers_thread_safe(p_desc); + } + } +} diff --git a/src/vma/proto/dst_entry_tcp.h b/src/vma/proto/dst_entry_tcp.h new file mode 100644 index 0000000..0273e04 --- /dev/null +++ b/src/vma/proto/dst_entry_tcp.h @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef DST_ENTRY_TCP_H +#define DST_ENTRY_TCP_H + +#include "vma/proto/dst_entry.h" + +/* Structure for TCP scatter/gather I/O. 
*/ +typedef struct tcp_iovec +{ + struct iovec iovec; + mem_buf_desc_t* p_desc; +} tcp_iovec; + +class dst_entry_tcp : public dst_entry +{ +public: + dst_entry_tcp(in_addr_t dst_ip, uint16_t dst_port, uint16_t src_port, + socket_data &data, resource_allocation_key &ring_alloc_logic); + virtual ~dst_entry_tcp(); + +#ifdef DEFINED_TSO + ssize_t fast_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr); + ssize_t slow_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr, + struct vma_rate_limit_t &rate_limit, int flags = 0, + socket_fd_api* sock = 0, tx_call_t call_type = TX_UNDEF); +#else + virtual ssize_t fast_send(const iovec* p_iov, const ssize_t sz_iov, bool is_dummy, bool b_blocked = true, bool is_rexmit = false); + ssize_t slow_send(const iovec* p_iov, size_t sz_iov, bool is_dummy, struct vma_rate_limit_t &rate_limit, bool b_blocked = true, bool is_rexmit = false, int flags = 0, socket_fd_api* sock = 0, tx_call_t call_type = TX_UNDEF); +#endif /* DEFINED_TSO */ + ssize_t slow_send_neigh(const iovec* p_iov, size_t sz_iov, struct vma_rate_limit_t &rate_limit); + + mem_buf_desc_t* get_buffer(bool b_blocked = false); + void put_buffer(mem_buf_desc_t * p_desc); + +protected: + transport_t get_transport(sockaddr_in to); + virtual uint8_t get_protocol_type() const { return IPPROTO_TCP; }; + virtual uint32_t get_inline_sge_num() { return 1; }; + virtual ibv_sge* get_sge_lst_4_inline_send() { return m_sge; }; + virtual ibv_sge* get_sge_lst_4_not_inline_send() { return m_sge; }; + + virtual void configure_headers(); + virtual ssize_t pass_buff_to_neigh(const iovec *p_iov, size_t sz_iov, uint16_t packet_id = 0); + +private: + const uint32_t m_n_sysvar_tx_bufs_batch_tcp; + + inline void send_lwip_buffer(ring_user_id_t id, vma_ibv_send_wr* p_send_wqe, vma_wr_tx_packet_attr attr) + { + if (unlikely(is_set(attr, VMA_TX_PACKET_DUMMY))) { + if (m_p_ring->get_hw_dummy_send_support(id, p_send_wqe)) { + vma_ibv_wr_opcode last_opcode = 
m_p_send_wqe_handler->set_opcode(*p_send_wqe, VMA_IBV_WR_NOP); + m_p_ring->send_lwip_buffer(id, p_send_wqe, attr); + m_p_send_wqe_handler->set_opcode(*p_send_wqe, last_opcode); + } + /* no need to free the buffer if dummy send is not supported, as for lwip buffers we have 2 ref counts, */ + /* one for caller, and one for completion. for completion, we ref count in */ + /* send_lwip_buffer(). Since we are not going in, the caller will free the */ + /* buffer. */ + } else { + m_p_ring->send_lwip_buffer(id, p_send_wqe, attr); + } + } + +}; + +#endif /* DST_ENTRY_TCP_H */ diff --git a/src/vma/proto/dst_entry_udp.cpp b/src/vma/proto/dst_entry_udp.cpp new file mode 100644 index 0000000..bff9bb2 --- /dev/null +++ b/src/vma/proto/dst_entry_udp.cpp @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "utils/bullseye.h" +#include "vma/util/utils.h" +#include "dst_entry_udp.h" + +#define MODULE_NAME "dst_udp" + +#define dst_udp_logpanic __log_panic +#define dst_udp_logerr __log_err +#define dst_udp_logwarn __log_warn +#define dst_udp_loginfo __log_info +#define dst_udp_logdbg __log_info_dbg +#define dst_udp_logfunc __log_info_func +#define dst_udp_logfuncall __log_info_funcall + + +dst_entry_udp::dst_entry_udp(in_addr_t dst_ip, uint16_t dst_port, uint16_t src_port, + socket_data &sock_data, resource_allocation_key &ring_alloc_logic): + dst_entry(dst_ip, dst_port, src_port, sock_data, ring_alloc_logic), + m_n_sysvar_tx_bufs_batch_udp(safe_mce_sys().tx_bufs_batch_udp), + m_b_sysvar_tx_nonblocked_eagains(safe_mce_sys().tx_nonblocked_eagains), + m_sysvar_thread_mode(safe_mce_sys().thread_mode), + m_n_sysvar_tx_prefetch_bytes(safe_mce_sys().tx_prefetch_bytes) +{ + dst_udp_logdbg("%s", to_str().c_str()); + atomic_set(&m_a_tx_ip_id, 0); + m_n_tx_ip_id = 0; +} + +dst_entry_udp::~dst_entry_udp() +{ + dst_udp_logdbg("%s", to_str().c_str()); +} + +transport_t dst_entry_udp::get_transport(sockaddr_in to) +{ + return __vma_match_udp_sender(TRANS_VMA, safe_mce_sys().app_id, (sockaddr *)(&to), sizeof to); +} + +//The following function supposed to be called under m_lock +void dst_entry_udp::configure_headers() +{ + m_header.init(); + m_header.configure_udp_header(m_dst_port, m_src_port); + dst_entry::configure_headers(); +} + +inline ssize_t dst_entry_udp::fast_send_not_fragmented(const iovec* p_iov, const ssize_t sz_iov, vma_wr_tx_packet_attr attr, size_t sz_udp_payload, ssize_t sz_data_payload) +{ + mem_buf_desc_t* p_mem_buf_desc; + bool b_blocked = is_set(attr, VMA_TX_PACKET_BLOCK); + + 
// Get a bunch of tx buf descriptor and data buffers + if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, m_n_sysvar_tx_bufs_batch_udp); + + if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + if (b_blocked) { + dst_udp_logdbg("Error when blocking for next tx buffer (errno=%d %m)", errno); + } + else { + dst_udp_logfunc("Packet dropped. NonBlocked call but not enough tx buffers. Returning OK"); + if (!m_b_sysvar_tx_nonblocked_eagains) return sz_data_payload; + } + errno = EAGAIN; + return -1; + } + } + // Disconnect the first buffer from the list + p_mem_buf_desc = m_p_tx_mem_buf_desc_list; + m_p_tx_mem_buf_desc_list = m_p_tx_mem_buf_desc_list->p_next_desc; + p_mem_buf_desc->p_next_desc = NULL; + + set_tx_buff_list_pending(false); + + // Check if inline is possible + if (sz_iov == 1 && (sz_data_payload + m_header.m_total_hdr_len) < m_max_inline) { + m_p_send_wqe = &m_inline_send_wqe; + + m_header.m_header.hdr.m_udp_hdr.len = htons((uint16_t)sz_udp_payload); + m_header.m_header.hdr.m_ip_hdr.tot_len = htons(m_header.m_ip_header_len + sz_udp_payload); + + p_mem_buf_desc->tx.p_ip_h = &m_header.m_header.hdr.m_ip_hdr; + p_mem_buf_desc->tx.p_udp_h = &m_header.m_header.hdr.m_udp_hdr; + + //m_sge[0].addr already points to the header + //so we just need to update the payload addr + len + m_sge[1].length = p_iov[0].iov_len; + m_sge[1].addr = (uintptr_t)p_iov[0].iov_base; +#ifdef DEFINED_TSO + m_sge[1].lkey = m_p_ring->get_tx_lkey(m_id); +#endif /* DEFINED_TSO */ + } else { + m_p_send_wqe = &m_not_inline_send_wqe; + + tx_packet_template_t *p_pkt = (tx_packet_template_t*)p_mem_buf_desc->p_buffer; + size_t hdr_len = m_header.m_transport_header_len + m_header.m_ip_header_len + sizeof(udphdr); // Add count of L2 (ipoib or mac) header length and udp header + + if (m_n_sysvar_tx_prefetch_bytes) { + prefetch_range(p_mem_buf_desc->p_buffer + m_header.m_transport_header_tx_offset, + min(sz_udp_payload, 
(size_t)m_n_sysvar_tx_prefetch_bytes)); + } + + m_header.copy_l2_ip_udp_hdr(p_pkt); + p_pkt->hdr.m_udp_hdr.len = htons((uint16_t)sz_udp_payload); + p_pkt->hdr.m_ip_hdr.frag_off = htons(0); + + // Update ip header specific values + p_pkt->hdr.m_ip_hdr.id = 0; + p_pkt->hdr.m_ip_hdr.tot_len = htons(m_header.m_ip_header_len + sz_udp_payload); + + p_mem_buf_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr; + p_mem_buf_desc->tx.p_udp_h = &p_pkt->hdr.m_udp_hdr; + + // Update the payload addr + len + m_sge[1].length = sz_data_payload + hdr_len; + m_sge[1].addr = (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)m_header.m_transport_header_tx_offset); +#ifdef DEFINED_TSO + m_sge[1].lkey = m_p_ring->get_tx_lkey(m_id); +#endif /* DEFINED_TSO */ + + // Calc payload start point (after the udp header if present else just after ip header) + uint8_t* p_payload = p_mem_buf_desc->p_buffer + m_header.m_transport_header_tx_offset + hdr_len; + + // Copy user data to our tx buffers + int ret = memcpy_fromiovec(p_payload, p_iov, sz_iov, 0, sz_data_payload); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret != (int)sz_data_payload) { + dst_udp_logerr("memcpy_fromiovec error (sz_user_data_to_copy=%d, ret=%d)", sz_data_payload, ret); + m_p_ring->mem_buf_tx_release(p_mem_buf_desc, true); + errno = EINVAL; + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; + send_ring_buffer(m_id, m_p_send_wqe, attr); + + // request tx buffers for the next packets + if (unlikely(m_p_tx_mem_buf_desc_list == NULL)) { + m_p_tx_mem_buf_desc_list = m_p_ring->mem_buf_tx_get(m_id, b_blocked, m_n_sysvar_tx_bufs_batch_udp); + } + + // If all went well :) then return the user data count transmitted + return sz_data_payload; +} + +ssize_t dst_entry_udp::fast_send_fragmented(const iovec* p_iov, const ssize_t sz_iov, vma_wr_tx_packet_attr attr, size_t sz_udp_payload, ssize_t sz_data_payload) +{ + tx_packet_template_t *p_pkt; + mem_buf_desc_t* p_mem_buf_desc = NULL, *tmp; + + m_p_send_wqe = 
&m_fragmented_send_wqe; + + // Find number of ip fragments (-> packets, buffers, buffer descs...) + int n_num_frags = (sz_udp_payload + m_max_ip_payload_size - 1) / m_max_ip_payload_size; + uint16_t packet_id = (m_sysvar_thread_mode > THREAD_MODE_SINGLE) ? + atomic_fetch_and_inc(&m_a_tx_ip_id) : + m_n_tx_ip_id++; + packet_id = htons(packet_id); + + bool b_blocked = is_set(attr, VMA_TX_PACKET_BLOCK); + + dst_udp_logfunc("udp info: payload_sz=%d, frags=%d, scr_port=%d, dst_port=%d, blocked=%s, ", sz_data_payload, n_num_frags, ntohs(m_header.m_header.hdr.m_udp_hdr.source), ntohs(m_dst_port), b_blocked?"true":"false"); + + // Get all needed tx buf descriptor and data buffers + p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, b_blocked, n_num_frags); + + if (unlikely(p_mem_buf_desc == NULL)) { + if (b_blocked) { + dst_udp_logdbg("Error when blocking for next tx buffer (errno=%d %m)", errno); + } + else { + dst_udp_logfunc("Packet dropped. NonBlocked call but not enough tx buffers. Returning OK"); + if (!m_b_sysvar_tx_nonblocked_eagains) return sz_data_payload; + } + errno = EAGAIN; + return -1; + } + + // Int for counting offset inside the ip datagram payload + uint32_t n_ip_frag_offset = 0; + size_t sz_user_data_offset = 0; + + while (n_num_frags--) { + // Calc this ip datagram fragment size (include any udp header) + size_t sz_ip_frag = min((size_t)m_max_ip_payload_size, (sz_udp_payload - n_ip_frag_offset)); + size_t sz_user_data_to_copy = sz_ip_frag; + size_t hdr_len = m_header.m_transport_header_len + m_header.m_ip_header_len; // Add count of L2 (ipoib or mac) header length + + if (m_n_sysvar_tx_prefetch_bytes) { + prefetch_range(p_mem_buf_desc->p_buffer + m_header.m_transport_header_tx_offset, + min(sz_ip_frag, (size_t)m_n_sysvar_tx_prefetch_bytes)); + } + + p_pkt = (tx_packet_template_t*)p_mem_buf_desc->p_buffer; + + uint16_t frag_off = 0; + if (n_num_frags) { + frag_off |= MORE_FRAGMENTS_FLAG; + } + + if (n_ip_frag_offset == 0) { + 
m_header.copy_l2_ip_udp_hdr(p_pkt); + // Add count of udp header length + hdr_len += sizeof(udphdr); + + // Copy less from user data + sz_user_data_to_copy -= sizeof(udphdr); + + // Only for first fragment add the udp header + p_pkt->hdr.m_udp_hdr.len = htons((uint16_t)sz_udp_payload); + } + else { + m_header.copy_l2_ip_hdr(p_pkt); + frag_off |= FRAGMENT_OFFSET & (n_ip_frag_offset / 8); + } + + p_pkt->hdr.m_ip_hdr.frag_off = htons(frag_off); + // Update ip header specific values + p_pkt->hdr.m_ip_hdr.id = packet_id; + p_pkt->hdr.m_ip_hdr.tot_len = htons(m_header.m_ip_header_len + sz_ip_frag); + + // Calc payload start point (after the udp header if present else just after ip header) + uint8_t* p_payload = p_mem_buf_desc->p_buffer + m_header.m_transport_header_tx_offset + hdr_len; + + // Copy user data to our tx buffers + int ret = memcpy_fromiovec(p_payload, p_iov, sz_iov, sz_user_data_offset, sz_user_data_to_copy); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret != (int)sz_user_data_to_copy) { + dst_udp_logerr("memcpy_fromiovec error (sz_user_data_to_copy=%d, ret=%d)", sz_user_data_to_copy, ret); + m_p_ring->mem_buf_tx_release(p_mem_buf_desc, true); + errno = EINVAL; + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + attr = (vma_wr_tx_packet_attr)(attr|VMA_TX_SW_CSUM); + p_mem_buf_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr; + p_mem_buf_desc->tx.p_udp_h = &p_pkt->hdr.m_udp_hdr; + + m_sge[1].addr = (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)m_header.m_transport_header_tx_offset); + m_sge[1].length = sz_user_data_to_copy + hdr_len; +#ifdef DEFINED_TSO + m_sge[1].lkey = m_p_ring->get_tx_lkey(m_id); +#endif /* DEFINED_TSO */ + m_p_send_wqe->wr_id = (uintptr_t)p_mem_buf_desc; + + dst_udp_logfunc("%s packet_sz=%d, payload_sz=%d, ip_offset=%d id=%d", m_header.to_str().c_str(), + m_sge[1].length - m_header.m_transport_header_len, sz_user_data_to_copy, + n_ip_frag_offset, ntohs(packet_id)); + + tmp = p_mem_buf_desc->p_next_desc; + p_mem_buf_desc->p_next_desc = NULL; + + // We 
don't check the return value of post send when we reach the HW we consider that we completed our job
+ send_ring_buffer(m_id, m_p_send_wqe, attr);
+
+ p_mem_buf_desc = tmp;
+
+ // Update ip frag offset position
+ n_ip_frag_offset += sz_ip_frag;
+
+ // Update user data start offset copy location
+ sz_user_data_offset += sz_user_data_to_copy;
+
+ } // while(n_num_frags)
+
+ // If all went well :) then return the user data count transmitted
+ return sz_data_payload;
+}
+
+#ifdef DEFINED_TSO
+ssize_t dst_entry_udp::fast_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr)
+{
+ // Calc user data payload size
+ ssize_t sz_data_payload = 0;
+ for (ssize_t i = 0; i < sz_iov; i++)
+ sz_data_payload += p_iov[i].iov_len;
+
+ if (unlikely(sz_data_payload > 65536)) {
+ dst_udp_logfunc("sz_data_payload=%d, to_port=%d, local_port=%d, b_blocked=%s", sz_data_payload, ntohs(m_dst_port), ntohs(m_src_port), (is_set(attr.flags, VMA_TX_PACKET_BLOCK) ? "true" : "false"));
+ dst_udp_logfunc("sz_data_payload=%d exceeds max of 64KB", sz_data_payload);
+ errno = EMSGSIZE;
+ return -1;
+ }
+
+ // Calc udp payload size
+ size_t sz_udp_payload = sz_data_payload + sizeof(struct udphdr);
+ if (sz_udp_payload <= (size_t)m_max_udp_payload_size) {
+ attr.flags = (vma_wr_tx_packet_attr)(attr.flags | VMA_TX_PACKET_L3_CSUM | VMA_TX_PACKET_L4_CSUM);
+ return fast_send_not_fragmented(p_iov, sz_iov, attr.flags, sz_udp_payload, sz_data_payload);
+ } else {
+ attr.flags = (vma_wr_tx_packet_attr)(attr.flags | VMA_TX_PACKET_L3_CSUM);
+ return fast_send_fragmented(p_iov, sz_iov, attr.flags, sz_udp_payload, sz_data_payload);
+ }
+}
+
+ssize_t dst_entry_udp::slow_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr,
+ struct vma_rate_limit_t &rate_limit, int flags /*= 0*/,
+ socket_fd_api* sock /*= 0*/, tx_call_t call_type /*= 0*/)
+{
+ ssize_t ret_val = 0;
+
+ dst_udp_logdbg("In slow send");
+
+ prepare_to_send(rate_limit, false);
+
+ if (m_b_force_os || !m_b_is_offloaded) {
+ struct 
sockaddr_in to_saddr; + to_saddr.sin_port = m_dst_port; + to_saddr.sin_addr.s_addr = m_dst_ip.get_in_addr(); + to_saddr.sin_family = AF_INET; + dst_udp_logdbg("Calling to tx_os"); + ret_val = sock->tx_os(call_type, p_iov, sz_iov, flags, (const struct sockaddr*)&to_saddr, sizeof(struct sockaddr_in)); + } + else { + if (!is_valid()) { // That means that the neigh is not resolved yet + ret_val = pass_buff_to_neigh(p_iov, sz_iov); + } + else { + ret_val = fast_send(p_iov, sz_iov, attr); + } + } + + return ret_val; +} +#else +ssize_t dst_entry_udp::fast_send(const iovec* p_iov, const ssize_t sz_iov, + bool is_dummy, bool b_blocked /*=true*/, bool is_rexmit /*=false*/) +{ + NOT_IN_USE(is_rexmit); + + // Calc user data payload size + ssize_t sz_data_payload = 0; + for (ssize_t i = 0; i < sz_iov; i++) + sz_data_payload += p_iov[i].iov_len; + + if (unlikely(sz_data_payload > 65536)) { + dst_udp_logfunc("sz_data_payload=%d, to_port=%d, local_port=%d, b_blocked=%s", sz_data_payload, ntohs(m_dst_port), ntohs(m_src_port), b_blocked?"true":"false"); + dst_udp_logfunc("sz_data_payload=%d exceeds max of 64KB", sz_data_payload); + errno = EMSGSIZE; + return -1; + } + + // Calc udp payload size + size_t sz_udp_payload = sz_data_payload + sizeof(struct udphdr); + vma_wr_tx_packet_attr attr = (vma_wr_tx_packet_attr)((VMA_TX_PACKET_BLOCK * b_blocked) | (VMA_TX_PACKET_DUMMY * is_dummy) | VMA_TX_PACKET_L3_CSUM); + if (sz_udp_payload <= (size_t)m_max_udp_payload_size) { + return fast_send_not_fragmented(p_iov, sz_iov, (vma_wr_tx_packet_attr) (attr | VMA_TX_PACKET_L4_CSUM), sz_udp_payload, sz_data_payload); + } else { + return fast_send_fragmented(p_iov, sz_iov, attr, sz_udp_payload, sz_data_payload); + } +} + +ssize_t dst_entry_udp::slow_send(const iovec* p_iov, size_t sz_iov, bool is_dummy, + struct vma_rate_limit_t &rate_limit, bool b_blocked /*= true*/, + bool is_rexmit /*= false*/, int flags /*= 0*/, + socket_fd_api* sock /*= 0*/, tx_call_t call_type /*= 0*/) +{ + 
NOT_IN_USE(is_rexmit); + + ssize_t ret_val = 0; + + dst_udp_logdbg("In slow send"); + + prepare_to_send(rate_limit, false); + + if (m_b_force_os || !m_b_is_offloaded) { + struct sockaddr_in to_saddr; + to_saddr.sin_port = m_dst_port; + to_saddr.sin_addr.s_addr = m_dst_ip.get_in_addr(); + to_saddr.sin_family = AF_INET; + dst_udp_logdbg("Calling to tx_os"); + ret_val = sock->tx_os(call_type, p_iov, sz_iov, flags, (const struct sockaddr*)&to_saddr, sizeof(struct sockaddr_in)); + } + else { + if (!is_valid()) { // That means that the neigh is not resolved yet + ret_val = pass_buff_to_neigh(p_iov, sz_iov); + } + else { + ret_val = fast_send(p_iov, sz_iov, is_dummy, b_blocked); + } + } + + return ret_val; +} +#endif /* DEFINED_TSO */ + +void dst_entry_udp::init_sge() +{ + m_sge[0].length = m_header.m_total_hdr_len; + m_sge[0].addr = m_header.m_actual_hdr_addr; +#ifdef DEFINED_TSO + m_sge[0].lkey = m_p_ring->get_tx_lkey(m_id); +#endif /* DEFINED_TSO */ +} + +ssize_t dst_entry_udp::pass_buff_to_neigh(const iovec *p_iov, size_t sz_iov, uint16_t packet_id) +{ + m_header_neigh.init(); + m_header_neigh.configure_udp_header(m_dst_port, m_src_port); + + packet_id = (m_sysvar_thread_mode > THREAD_MODE_SINGLE) ? + atomic_fetch_and_inc(&m_a_tx_ip_id) : + m_n_tx_ip_id++; + packet_id = htons(packet_id); + + return(dst_entry::pass_buff_to_neigh(p_iov, sz_iov, packet_id)); +} diff --git a/src/vma/proto/dst_entry_udp.h b/src/vma/proto/dst_entry_udp.h new file mode 100644 index 0000000..31cc4cd --- /dev/null +++ b/src/vma/proto/dst_entry_udp.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef DST_ENTRY_UDP_H +#define DST_ENTRY_UDP_H + +#include "vma/proto/dst_entry.h" + +class dst_entry_udp : public dst_entry +{ +public: + dst_entry_udp(in_addr_t dst_ip, uint16_t dst_port, uint16_t src_port, + socket_data &sock_data, resource_allocation_key &ring_alloc_logic); + virtual ~dst_entry_udp(); + +#ifdef DEFINED_TSO + ssize_t fast_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr); + ssize_t slow_send(const iovec* p_iov, const ssize_t sz_iov, vma_send_attr attr, + struct vma_rate_limit_t &rate_limit, int flags = 0, + socket_fd_api* sock = 0, tx_call_t call_type = TX_UNDEF); +#else + virtual ssize_t slow_send(const iovec* p_iov, size_t sz_iov, bool is_dummy, struct vma_rate_limit_t &rate_limit, bool b_blocked = true, bool is_rexmit = false, int flags = 0, socket_fd_api* sock = 0, tx_call_t call_type = TX_UNDEF); + virtual ssize_t fast_send(const iovec* p_iov, const ssize_t sz_iov, bool is_dummy, bool b_blocked = true, bool is_rexmit = false); +#endif /* DEFINED_TSO */ + +protected: + virtual transport_t get_transport(sockaddr_in to); + virtual uint8_t get_protocol_type() const { return IPPROTO_UDP; }; + virtual uint32_t get_inline_sge_num() { return 2; }; + virtual ibv_sge* get_sge_lst_4_inline_send() { return m_sge; }; + virtual ibv_sge* get_sge_lst_4_not_inline_send() { return &m_sge[1]; }; + virtual void configure_headers(); + virtual void init_sge(); + virtual ssize_t pass_buff_to_neigh(const iovec *p_iov, size_t sz_iov, uint16_t packet_id = 0); + atomic_t m_a_tx_ip_id; + size_t m_n_tx_ip_id; + +private: + + inline ssize_t fast_send_not_fragmented(const iovec* p_iov, const ssize_t sz_iov, vma_wr_tx_packet_attr attr, size_t sz_udp_payload, ssize_t sz_data_payload); + ssize_t fast_send_fragmented(const iovec* p_iov, const ssize_t sz_iov, vma_wr_tx_packet_attr attr, size_t sz_udp_payload, ssize_t sz_data_payload); + + const uint32_t m_n_sysvar_tx_bufs_batch_udp; + const bool m_b_sysvar_tx_nonblocked_eagains; + const 
thread_mode_t m_sysvar_thread_mode; + const uint32_t m_n_sysvar_tx_prefetch_bytes; +}; + +#endif /* DST_ENTRY_UDP_H */ diff --git a/src/vma/proto/dst_entry_udp_mc.cpp b/src/vma/proto/dst_entry_udp_mc.cpp new file mode 100644 index 0000000..8f32ae7 --- /dev/null +++ b/src/vma/proto/dst_entry_udp_mc.cpp @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "dst_entry_udp_mc.h" + +#define MODULE_NAME "dst_mc" + +#define dst_udp_mc_logpanic __log_panic +#define dst_udp_mc_logerr __log_err +#define dst_udp_mc_logwarn __log_warn +#define dst_udp_mc_loginfo __log_info +#define dst_udp_mc_logdbg __log_info_dbg +#define dst_udp_mc_logfunc __log_info_func +#define dst_udp_mc_logfuncall __log_info_funcall + + +dst_entry_udp_mc::dst_entry_udp_mc(in_addr_t dst_ip, uint16_t dst_port, + uint16_t src_port, in_addr_t tx_if_ip, + bool mc_b_loopback ,socket_data &sock_data, + resource_allocation_key &ring_alloc_logic): + dst_entry_udp(dst_ip, dst_port, src_port, sock_data, ring_alloc_logic), + m_mc_tx_if_ip(tx_if_ip), m_b_mc_loopback_enabled(mc_b_loopback) +{ + dst_udp_mc_logdbg("%s", to_str().c_str()); +} + +dst_entry_udp_mc::~dst_entry_udp_mc() +{ + dst_udp_mc_logdbg("%s", to_str().c_str()); +} + +//The following function supposed to be called under m_lock +bool dst_entry_udp_mc::conf_l2_hdr_and_snd_wqe_ib() +{ + bool ret_val = false; + + dst_udp_mc_logfunc("%s", to_str().c_str()); + + ret_val = dst_entry_udp::conf_l2_hdr_and_snd_wqe_ib(); + + if (ret_val && !m_b_mc_loopback_enabled && m_p_send_wqe_handler) { + wqe_send_ib_handler *wqe_ib = dynamic_cast(m_p_send_wqe_handler); + if (wqe_ib) { + //Since checksum fails when packet contains an immediate header we don't enable an immediate header + //So MC loopback disable is NOT SUPPORTED! 
+ //wqe_ib->enable_imm_data(m_inline_send_wqe); + //wqe_ib->enable_imm_data(m_not_inline_send_wqe); + } + else { + ret_val = false; + } + } + return ret_val; +} + +void dst_entry_udp_mc::set_src_addr() +{ + m_pkt_src_ip = INADDR_ANY; + + if (m_bound_ip) { + m_pkt_src_ip = m_bound_ip; + } + else if (m_mc_tx_if_ip.get_in_addr() && !m_mc_tx_if_ip.is_mc()) { + m_pkt_src_ip = m_mc_tx_if_ip.get_in_addr(); + } + else if (m_p_rt_val && m_p_rt_val->get_src_addr()) { + m_pkt_src_ip = m_p_rt_val->get_src_addr(); + } + else if (m_p_net_dev_val && m_p_net_dev_val->get_local_addr()) { + m_pkt_src_ip = m_p_net_dev_val->get_local_addr(); + } +} + +//The following function supposed to be called under m_lock +bool dst_entry_udp_mc::resolve_net_dev(bool is_connect) +{ + NOT_IN_USE(is_connect); + bool ret_val = false; + cache_entry_subject* p_ces = NULL; + + if (m_mc_tx_if_ip.get_in_addr() != INADDR_ANY && !m_mc_tx_if_ip.is_mc()) { + if(m_p_net_dev_entry == NULL && g_p_net_device_table_mgr->register_observer(m_mc_tx_if_ip.get_in_addr(), this, &p_ces)) { + m_p_net_dev_entry = dynamic_cast(p_ces); + } + if (m_p_net_dev_entry) { + m_p_net_dev_entry->get_val(m_p_net_dev_val); + if (m_p_net_dev_val) { + ret_val = alloc_transport_dep_res(); + } + else { + dst_udp_mc_logdbg("Valid netdev value not found"); + } + } + else { + m_b_is_offloaded = false; + dst_udp_mc_logdbg("Netdev is not offloaded fallback to OS"); + } + } + else { + ret_val = dst_entry::resolve_net_dev(); + } + return ret_val; +} diff --git a/src/vma/proto/dst_entry_udp_mc.h b/src/vma/proto/dst_entry_udp_mc.h new file mode 100644 index 0000000..2aad57f --- /dev/null +++ b/src/vma/proto/dst_entry_udp_mc.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef DST_ENTRY_UDP_MC_H +#define DST_ENTRY_UDP_MC_H + +#include "vma/proto/dst_entry_udp.h" + +class dst_entry_udp_mc : public dst_entry_udp +{ +public: + dst_entry_udp_mc(in_addr_t dst_ip, uint16_t dst_port, uint16_t src_port, + in_addr_t mc_tx_if_ip, bool mc_b_loopback, socket_data &sock_data, + resource_allocation_key &ring_alloc_logic); + virtual ~dst_entry_udp_mc(); + + virtual bool conf_l2_hdr_and_snd_wqe_ib(); + +protected: + ip_address m_mc_tx_if_ip; + bool m_b_mc_loopback_enabled; + + virtual void set_src_addr(); + virtual bool resolve_net_dev(bool is_connect=false); +}; + +#endif /* DST_ENTRY_UDP_MC_H */ diff --git a/src/vma/proto/flow_tuple.cpp b/src/vma/proto/flow_tuple.cpp new file mode 100644 index 0000000..9117514 --- /dev/null +++ b/src/vma/proto/flow_tuple.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + + +#include "flow_tuple.h" +#include +#include + + +#define MODULE_NAME "flow_tuple" + + +flow_tuple::flow_tuple() : + m_dst_ip(INADDR_ANY), m_src_ip(INADDR_ANY), m_dst_port(INPORT_ANY), m_src_port(INPORT_ANY), m_protocol(PROTO_UNDEFINED) +{ + m_str[0] = '\0'; +} + +flow_tuple::flow_tuple(sock_addr& dst, sock_addr& src, in_protocol_t protocol) +{ + m_protocol = protocol; + m_dst_ip = dst.get_in_addr(); + m_dst_port = dst.get_in_port(); + m_src_ip = src.get_in_addr(); + m_src_port = src.get_in_port(); + set_str(); +} + +flow_tuple::flow_tuple(in_addr_t dst_ip, in_port_t dst_port, in_addr_t src_ip, in_port_t src_port, in_protocol_t protocol) +{ + m_protocol = protocol; + m_dst_ip = dst_ip; + m_dst_port = dst_port; + m_src_ip = src_ip; + m_src_port = src_port; + set_str(); +} + +flow_tuple::flow_tuple(const flow_tuple &ft) +{ + m_protocol = ft.m_protocol; + m_dst_ip = ft.m_dst_ip; + m_dst_port = ft.m_dst_port; + m_src_ip = ft.m_src_ip; + m_src_port = ft.m_src_port; + set_str(); +} + +flow_tuple& flow_tuple::operator=(const flow_tuple &ft) +{ + m_protocol = ft.m_protocol; + m_dst_ip = ft.m_dst_ip; + m_dst_port = ft.m_dst_port; + m_src_ip = ft.m_src_ip; + m_src_port = ft.m_src_port; + strncpy(m_str, ft.m_str, sizeof(m_str)); + + return *this; +} + +bool flow_tuple::is_tcp() +{ + return (m_protocol == PROTO_TCP); +} + +bool flow_tuple::is_udp_uc() +{ + return ((m_protocol == PROTO_UDP) && !(IN_MULTICAST_N(m_dst_ip))); +} + +bool flow_tuple::is_udp_mc() +{ + return ((m_protocol == PROTO_UDP) && (IN_MULTICAST_N(m_dst_ip))); +} + +bool flow_tuple::is_local_loopback() +{ + return (LOOPBACK_N(m_dst_ip)); +} + +bool flow_tuple::is_5_tuple() +{ + return (m_src_ip != INADDR_ANY && m_src_port != 
INPORT_ANY); +} + +bool flow_tuple::is_3_tuple() +{ + return (m_src_ip == INADDR_ANY && m_src_port == INPORT_ANY); +} + +void flow_tuple::set_str() +{ + /* cppcheck-suppress wrongPrintfScanfArgNum */ + snprintf(m_str, sizeof(m_str), "dst:%hhu.%hhu.%hhu.%hhu:%hu, src:%hhu.%hhu.%hhu.%hhu:%hu, proto:%s", + NIPQUAD(m_dst_ip), ntohs(m_dst_port), + NIPQUAD(m_src_ip), ntohs(m_src_port), + __vma_get_protocol_str(m_protocol)); +} + +void flow_tuple_with_local_if::set_str() +{ + char addr_str[32] = { 0 }; + + /* cppcheck-suppress wrongPrintfScanfArgNum */ + snprintf(addr_str, sizeof(addr_str), ", if:%hhu.%hhu.%hhu.%hhu", + NIPQUAD(m_local_if)); + strcat(m_str, addr_str); +}; diff --git a/src/vma/proto/flow_tuple.h b/src/vma/proto/flow_tuple.h new file mode 100644 index 0000000..c95a3d2 --- /dev/null +++ b/src/vma/proto/flow_tuple.h @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef FLOW_TUPLE_H +#define FLOW_TUPLE_H + + +#include +#include +#include "vma/util/libvma.h" +#include "vma/util/sock_addr.h" + +#define STR_MAX_LENGTH 100 + +// Looks at the packet in the ingress flow (in regards to dst and src) +// Practically a 'five tuple' key +class flow_tuple +{ +public: + flow_tuple(); + flow_tuple(sock_addr& dst, sock_addr& src, in_protocol_t protocol); + flow_tuple(in_addr_t dst_ip, in_port_t dst_port, in_addr_t src_ip, in_port_t src_port, in_protocol_t protocol); + flow_tuple(const flow_tuple &ft); // Copy Constructor + virtual ~flow_tuple() { }; + + in_addr_t get_dst_ip() { return m_dst_ip; } + in_addr_t get_src_ip() { return m_src_ip; } + in_port_t get_dst_port() { return m_dst_port; } + in_port_t get_src_port() { return m_src_port; } + in_protocol_t get_protocol() { return m_protocol; } + + bool is_tcp(); + bool is_udp_uc(); + bool is_udp_mc(); + bool is_local_loopback(); + bool is_5_tuple(); + bool is_3_tuple(); + + flow_tuple& operator=(const flow_tuple &ft); + + virtual bool operator==(flow_tuple const& other) const + { + return (m_dst_port == other.m_dst_port) && + (m_dst_ip == other.m_dst_ip) && + (m_src_port == other.m_src_port) && + (m_src_ip == other.m_src_ip) && + (m_protocol == other.m_protocol); + } + + virtual bool operator <(flow_tuple const& other) const + { + if (m_dst_port != other.m_dst_port) + return m_dst_port < other.m_dst_port; + if (m_dst_ip != other.m_dst_ip) + return m_dst_ip < other.m_dst_ip; + if (m_src_port != other.m_src_port) + return m_src_port < other.m_src_port; + if (m_src_ip != other.m_src_ip) + return m_src_ip < other.m_src_ip; + return m_protocol < other.m_protocol; + } + + virtual size_t hash(void) + { + uint8_t 
csum = 0; + uint8_t* pval = (uint8_t*)this; + for (size_t i = 0; i < (sizeof(flow_tuple) - sizeof(m_str)); ++i, ++pval) { csum ^= *pval; } + return csum; + } + + const char* to_str() { return m_str; }; + +protected: + in_addr_t m_dst_ip; + in_addr_t m_src_ip; + in_port_t m_dst_port; + in_port_t m_src_port; + in_protocol_t m_protocol; + + char m_str[STR_MAX_LENGTH]; + virtual void set_str(); +}; + +typedef std::list flow_tuple_list_t; + + +// Adding the 'six tuple' element of local_if +// Required by sockinfo when handling MC groups attach/detach +class flow_tuple_with_local_if : public flow_tuple +{ +public: + flow_tuple_with_local_if(sock_addr& dst, sock_addr& src, in_protocol_t protocol, in_addr_t local_if) : + flow_tuple(dst, src, protocol), m_local_if(local_if) { set_str(); }; + flow_tuple_with_local_if(in_addr_t dst_ip, in_port_t dst_port, in_addr_t src_ip, in_port_t src_port, in_protocol_t protocol, in_addr_t local_if) : + flow_tuple(dst_ip, dst_port, src_ip, src_port, protocol), m_local_if(local_if) { set_str(); }; + + in_addr_t get_local_if() { return m_local_if; } + + virtual bool operator==(flow_tuple_with_local_if const& other) const + { + return ((m_local_if == other.m_local_if) && + (*((flow_tuple*)this) == ((flow_tuple)other))); + } + + virtual bool operator <(flow_tuple_with_local_if const& other) const + { + if (m_local_if != other.m_local_if) + return m_local_if < other.m_local_if; + return (*((flow_tuple*)this) < ((flow_tuple)other)); + } + + virtual size_t hash(void) + { + uint8_t csum = 0; + uint8_t* pval = (uint8_t*)this; + for (size_t i = 0; i < (sizeof(flow_tuple_with_local_if) - sizeof(m_str)); ++i, ++pval) { csum ^= *pval; } + return csum; + } + +protected: + in_addr_t m_local_if; + virtual void set_str(); +}; + + +#endif /* FLOW_TUPLE_H */ diff --git a/src/vma/proto/header.cpp b/src/vma/proto/header.cpp new file mode 100644 index 0000000..483f5a1 --- /dev/null +++ b/src/vma/proto/header.cpp @@ -0,0 +1,193 @@ +/* + * Copyright (c) 2001-2020 
Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "header.h" + + +void header::init() +{ + memset(&m_header, 0, sizeof(m_header)); + m_ip_header_len = 0; + m_transport_header_len = 0; + m_total_hdr_len = 0; + m_aligned_l2_l3_len = 40; + m_is_vlan_enabled = false; +} + +header::header() : + m_actual_hdr_addr(0), + m_transport_header_tx_offset(0), + m_is_vlan_enabled(false), + m_transport_type(VMA_TRANSPORT_UNKNOWN) +{ + init(); +} + +header::header(const header &h): tostr() +{ + m_header = h.m_header; + m_ip_header_len = h.m_ip_header_len; + m_transport_header_len = h.m_transport_header_len; + m_total_hdr_len = h.m_total_hdr_len; + m_aligned_l2_l3_len = h.m_aligned_l2_l3_len; + m_transport_header_tx_offset = h.m_transport_header_tx_offset; + m_is_vlan_enabled = h.m_is_vlan_enabled; + m_transport_type = h.m_transport_type; + update_actual_hdr_addr(); +} + +void header::configure_udp_header(uint16_t dest_port, uint16_t src_port) +{ + udphdr *p_udp_hdr = &m_header.hdr.m_udp_hdr; + + memset(p_udp_hdr, 0 , (sizeof(*p_udp_hdr))); + + p_udp_hdr->dest = dest_port; + p_udp_hdr->source = src_port; + p_udp_hdr->check = 0; + + m_total_hdr_len += sizeof(udphdr); +} + +void header::configure_tcp_ports(uint16_t dest_port, uint16_t src_port) +{ + tcphdr *p_tcp_hdr = &m_header.hdr.m_tcp_hdr; + + /* memset(p_tcp_hdr, 0 , (sizeof(*p_tcp_hdr))); */ + + p_tcp_hdr->dest = dest_port; + p_tcp_hdr->source = src_port; + + /* don't increase header len, as the tcp stack is not using these ports */ +} + +void header::configure_ip_header(uint8_t protocol, in_addr_t src_addr, in_addr_t dest_addr, uint8_t ttl, uint8_t tos, uint16_t packet_id) +{ + iphdr* p_hdr = &m_header.hdr.m_ip_hdr; + + memset(p_hdr, 0 , (sizeof(*p_hdr))); + + // build ip header + p_hdr->ihl = IPV4_HDR_LEN_WITHOUT_OPTIONS / sizeof(uint32_t); // 5 * 4 bytes (32 bit words) = 20 bytes = regular iph length with out any optionals + p_hdr->version = IPV4_VERSION; + p_hdr->protocol = protocol; + p_hdr->saddr = src_addr; + p_hdr->daddr = dest_addr; + p_hdr->tos = 
tos; + p_hdr->ttl = ttl; + p_hdr->id = packet_id; + + m_ip_header_len = IPV4_HDR_LEN_WITHOUT_OPTIONS; + m_total_hdr_len += m_ip_header_len; +} + +void header::configure_ipoib_headers(uint32_t ipoib_header /*=IPOIB_HEADER*/) +{ + ib_hdr_template_t *p_hdr = &m_header.hdr.m_l2_hdr.ib_hdr; + m_transport_header_tx_offset = sizeof(p_hdr->m_alignment); + m_transport_header_len = sizeof(p_hdr->m_ipoib_hdr); + m_total_hdr_len += m_transport_header_len; + p_hdr->m_ipoib_hdr.ipoib_header = htonl(ipoib_header); + update_actual_hdr_addr(); +} + +void header::set_mac_to_eth_header(const L2_address &src, const L2_address &dst, ethhdr ð_header) +{ + // copy source and destination mac address to eth header + memcpy(eth_header.h_source, src.get_address(), src.get_addrlen()); + memcpy(eth_header.h_dest, dst.get_address(), dst.get_addrlen()); + // sets the size of 'm_eth_hdr' in the 'eth_hdr_template' struct + m_transport_header_len = sizeof(eth_header); +} + +void header::set_ip_ttl(uint8_t ttl) +{ + iphdr* p_hdr = &m_header.hdr.m_ip_hdr; + + p_hdr->ttl = ttl; +} + +void header::set_ip_tos(uint8_t tos) +{ + iphdr* p_hdr = &m_header.hdr.m_ip_hdr; + + p_hdr->tos = tos; +} + +void header::configure_eth_headers(const L2_address &src, const L2_address &dst, uint16_t encapsulated_proto/*=ETH_P_IP*/) +{ + eth_hdr_template_t *p_eth_hdr = &m_header.hdr.m_l2_hdr.eth_hdr; + p_eth_hdr->m_eth_hdr.h_proto = htons(encapsulated_proto); + m_is_vlan_enabled = false; + set_mac_to_eth_header(src, dst, p_eth_hdr->m_eth_hdr); + m_transport_header_tx_offset = sizeof(p_eth_hdr->m_alignment); + m_total_hdr_len += m_transport_header_len; + + update_actual_hdr_addr(); +} + +void header::update_actual_hdr_addr() +{ + m_actual_hdr_addr = (uintptr_t)((((uint8_t*)(&m_header)) + (uint8_t)(m_transport_header_tx_offset))); +} + +void header::configure_vlan_eth_headers(const L2_address &src, const L2_address &dst, uint16_t tos, uint16_t encapsulated_proto/*=ETH_P_IP*/) +{ + vlan_eth_hdr_template_t* p_vlan_eth_hdr = 
&m_header.hdr.m_l2_hdr.vlan_eth_hdr; + set_mac_to_eth_header(src, dst, p_vlan_eth_hdr->m_eth_hdr); + + p_vlan_eth_hdr->m_vlan_hdr.h_vlan_TCI = htons(tos); + p_vlan_eth_hdr->m_eth_hdr.h_proto = htons(ETH_P_8021Q); + p_vlan_eth_hdr->m_vlan_hdr.h_vlan_encapsulated_proto = htons(encapsulated_proto); + m_is_vlan_enabled = true; + m_transport_header_tx_offset = sizeof(p_vlan_eth_hdr->m_alignment); + m_transport_header_len += sizeof(p_vlan_eth_hdr->m_vlan_hdr); + m_total_hdr_len += m_transport_header_len; + update_actual_hdr_addr(); +} + + +bool header::set_vlan_pcp(uint8_t pcp) +{ + if (!m_is_vlan_enabled) { + return false; + } + vlan_eth_hdr_template_t* p_vlan_eth_hdr = + &m_header.hdr.m_l2_hdr.vlan_eth_hdr; + // zero old pcp and set new one + uint16_t vlan_pcp = ((uint16_t)pcp << NET_ETH_VLAN_PCP_OFFSET) | + (htons(p_vlan_eth_hdr->m_vlan_hdr.h_vlan_TCI) & 0x1fff); + p_vlan_eth_hdr->m_vlan_hdr.h_vlan_TCI = htons(vlan_pcp); + + return true; +} diff --git a/src/vma/proto/header.h b/src/vma/proto/header.h new file mode 100644 index 0000000..111b5c8 --- /dev/null +++ b/src/vma/proto/header.h @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + + +#ifndef HEADER_H +#define HEADER_H + +#include +#include +#include +#include +#include +#include +#include + +#include "vma/util/vtypes.h" +#include "vma/util/to_str.h" +#include "L2_address.h" +#include "vma/util/sys_vars.h" + +// We align the frame so IP header will be 4 bytes align +// And we align the L2 headers so IP header on both transport +// types will be at the same offset from buffer start +#define NET_IB_IP_ALIGN_SZ 16 +#define NET_ETH_IP_ALIGN_SZ 6 +#define NET_ETH_VLAN_IP_ALIGN_SZ 2 +#define NET_ETH_VLAN_PCP_OFFSET 13 + +struct __attribute__ ((packed)) ib_hdr_template_t { // Offeset Size + char m_alignment[NET_IB_IP_ALIGN_SZ]; // 0 16 = 16 + ipoibhdr m_ipoib_hdr; // 16 4 = 20 +// iphdr m_ip_hdr; // 20 20 = 40 +}; + +struct __attribute__ ((packed)) eth_hdr_template_t { // Offeset Size + char m_alignment[NET_ETH_IP_ALIGN_SZ]; // 0 6 = 6 + ethhdr m_eth_hdr; // 6 14 = 20 +// iphdr m_ip_hdr; // 20 20 = 40 +}; + +struct __attribute__ ((packed)) vlan_eth_hdr_template_t { // Offeset Size + char m_alignment[NET_ETH_VLAN_IP_ALIGN_SZ]; // 0 2 = 2 + ethhdr m_eth_hdr; // 2 14 = 16 + vlanhdr m_vlan_hdr; // 16 4 = 20 +// iphdr m_ip_hdr; // 20 20 = 40 +}; + +union l2_hdr_template_t { + ib_hdr_template_t ib_hdr; + eth_hdr_template_t eth_hdr; + 
vlan_eth_hdr_template_t vlan_eth_hdr; +}; + +struct __attribute__ ((packed, aligned)) tx_hdr_template_t { // Offeset Size + l2_hdr_template_t m_l2_hdr; // 0 20 + iphdr m_ip_hdr; // 20 20 + union { + udphdr m_udp_hdr; // 40 8 + tcphdr m_tcp_hdr; // 40 20 + }; +}; + +union tx_packet_template_t { + tx_hdr_template_t hdr; + uint32_t words[15]; //change in tx_hdr_template_t size may require to modify this array size +}; + + +class header: public tostr +{ +public: + header(); + header(const header &h); + virtual ~header() {}; + + + void init(); + void configure_udp_header(uint16_t dest_port, uint16_t src_port); + void configure_tcp_ports(uint16_t dest_port, uint16_t src_port); + void configure_ip_header(uint8_t protocol, in_addr_t src_addr, in_addr_t dest_addr, uint8_t ttl = 64, uint8_t tos = 0, uint16_t packet_id = 0); + void configure_ipoib_headers(uint32_t ipoib_header = IPOIB_HEADER); + void set_mac_to_eth_header(const L2_address &src, const L2_address &dst, ethhdr ð_header); + void set_ip_ttl(uint8_t ttl); + void set_ip_tos(uint8_t tos); + void configure_eth_headers(const L2_address &src, const L2_address &dst, uint16_t encapsulated_proto = ETH_P_IP); + void configure_vlan_eth_headers(const L2_address &src, const L2_address &dst, uint16_t tci, uint16_t encapsulated_proto = ETH_P_IP); + bool set_vlan_pcp(uint8_t pcp); + void update_actual_hdr_addr(); + + inline void copy_l2_ip_hdr(tx_packet_template_t *p_hdr) + { + // copy words every time, to optimize for speed + p_hdr->words[0] = m_header.words[0]; // dummy(16) + l2(16) (mac / dummy) + p_hdr->words[1] = m_header.words[1]; // l2 (32) (mac / dummy) + p_hdr->words[2] = m_header.words[2]; // l2 (32) (mac / dummy) + p_hdr->words[3] = m_header.words[3]; // l2 (32) (mac / dummy) + p_hdr->words[4] = m_header.words[4]; // l2 (32) (mac / vlan / ipoib) + p_hdr->words[5] = m_header.words[5]; // IP-> ver(4) + hdrlen(4) + tos(8) + totlen(16) + p_hdr->words[6] = m_header.words[6]; // IP-> id(16) + frag(16) + p_hdr->words[7] = 
m_header.words[7]; // IP-> ttl(8) + protocol(8) + checksum(16) + p_hdr->words[8] = m_header.words[8]; // IP-> saddr(32) + p_hdr->words[9] = m_header.words[9]; // IP-> daddr(32) + } + + inline void copy_l2_ip_udp_hdr(tx_packet_template_t *p_hdr) + { + copy_l2_ip_hdr(p_hdr); + p_hdr->words[10] = m_header.words[10]; // UDP-> sport(16) + dst_port(16) + p_hdr->words[11] = m_header.words[11]; // UDP-> len(16) + check(16) + } + + inline void copy_l2_hdr(tx_packet_template_t *p_hdr) + { + uint32_t *to_words = p_hdr->words; + uint32_t *from_words = m_header.words; + to_words[0] = from_words[0]; // dummy(16) + l2(16) (mac / dummy) + to_words[1] = from_words[1]; // l2 (32) (mac / dummy) + to_words[2] = from_words[2]; // l2 (32) (mac / dummy) + to_words[3] = from_words[3]; // l2 (32) (mac / dummy) + to_words[4] = from_words[4]; // l2 (32) (mac / vlan / ipoib) + } + + uintptr_t m_actual_hdr_addr; + tx_packet_template_t m_header; + uint16_t m_ip_header_len; + uint16_t m_transport_header_len; + uint16_t m_total_hdr_len; + uint16_t m_aligned_l2_l3_len; + uint16_t m_transport_header_tx_offset; + bool m_is_vlan_enabled; + transport_type_t m_transport_type; +}; + +#endif /* HEADER_H */ diff --git a/src/vma/proto/igmp_handler.cpp b/src/vma/proto/igmp_handler.cpp new file mode 100644 index 0000000..d6b534d --- /dev/null +++ b/src/vma/proto/igmp_handler.cpp @@ -0,0 +1,244 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "vma/proto/neighbour_table_mgr.h" +#include "vma/dev/wqe_send_handler.h" +#include "vma/dev/wqe_send_ib_handler.h" +#include "vma/util/utils.h" +#include "igmp_handler.h" + + + +#define MODULE_NAME "igmp_hdlr" +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[%s]:%d:%s() " + +#undef __INFO__ +#define __INFO__ this->to_str().c_str() + +#define igmp_hdlr_logpanic __log_info_panic +#define igmp_hdlr_logerr __log_info_err +#define igmp_hdlr_logwarn __log_info_warn +#define igmp_hdlr_loginfo __log_info_info +#define igmp_hdlr_logdbg __log_info_dbg +#define igmp_hdlr_logfunc __log_info_func +#define igmp_hdlr_logfuncall __log_info_funcall + +#define IGMPV1_MAX_RESPONSE_TIME 100 + +igmp_handler::igmp_handler(const igmp_key &key, uint8_t igmp_code) : m_mc_addr (key.get_in_addr()), m_p_ndvl(key.get_net_device_val()), + m_ignore_timer(false), m_timer_handle(NULL), m_p_neigh_entry(NULL), m_p_neigh_val(NULL), + m_p_ring(NULL), m_igmp_code(igmp_code ? 
igmp_code : IGMPV1_MAX_RESPONSE_TIME), m_id(0) +{ + ring_alloc_logic_attr ring_attr(safe_mce_sys().ring_allocation_logic_tx); + m_ring_allocation_logic = ring_allocation_logic_tx(m_p_ndvl->get_local_addr(), ring_attr, this); + + memset(&m_sge, 0, sizeof(m_sge)); + memset(&m_p_send_igmp_wqe, 0, sizeof(m_p_send_igmp_wqe)); +} + +igmp_handler::~igmp_handler() +{ + if (m_p_neigh_entry) { + g_p_neigh_table_mgr->unregister_observer(igmp_key(m_mc_addr, m_p_ndvl),this); + m_p_neigh_entry = NULL; + } + + if (m_p_ring) { + m_p_ndvl->release_ring(m_ring_allocation_logic.get_key()); + m_p_ring = NULL; + } + + if (m_p_neigh_val) { + delete m_p_neigh_val; + m_p_neigh_val = NULL; + } +} + +bool igmp_handler::init(const igmp_key &key) +{ + igmp_hdlr_logfunc(""); + cache_entry_subject* p_ces = NULL; + g_p_neigh_table_mgr->register_observer(key, this, &p_ces); + m_p_neigh_entry = dynamic_cast(p_ces); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_neigh_entry) { + igmp_hdlr_logerr("Dynamic casting to neigh_entry has failed"); + return false; + } + + m_p_neigh_val = new neigh_ib_val; + if (!m_p_neigh_val) { + igmp_hdlr_logerr("Failed allocating neigh_val"); + return false; + } + + m_p_ring = m_p_ndvl->reserve_ring(m_ring_allocation_logic.get_key()); + if (!m_p_ring) { + igmp_hdlr_logerr("Ring was not reserved"); + return false; + } + m_id = m_p_ring->generate_id(); + BULLSEYE_EXCLUDE_BLOCK_END + + return true; +} + + // will register timer and later do 'tx_igmp_report(mc_group, ndvl)' +void igmp_handler::handle_query(uint8_t igmp_code) +{ + igmp_hdlr_logdbg("Received igmp query, preparing to send report"); + + m_igmp_code = igmp_code ? 
igmp_code : IGMPV1_MAX_RESPONSE_TIME; + + m_ignore_timer = false; + + priv_register_timer_event(this, ONE_SHOT_TIMER, (void*)IGMP_TIMER_ID); +} + +void igmp_handler::priv_register_timer_event(timer_handler* handler, timer_req_type_t req_type, void* user_data) +{ + int duration = 0 ; + srand(time(NULL)); + /* coverity[dont_call] */ + duration = (rand() % (m_igmp_code * 100)); // igmp_code (1-255) is in 1/10 sec units + + lock(); + if (!m_timer_handle && g_p_event_handler_manager) { + igmp_hdlr_logdbg("Register timer (%d msec) for sending igmp report after seen an igmp query for this group", duration); + m_timer_handle = g_p_event_handler_manager->register_timer_event(duration, handler, req_type, user_data); + } + unlock(); +} + +void igmp_handler::handle_report() +{ + igmp_hdlr_logdbg("Ignoring self timer (%p) after seen an igmp report for this group", m_timer_handle); + m_ignore_timer = true; // check if was not ignored before ? +} + +void igmp_handler::clean_obj() +{ + if (is_cleaned()) { + return ; + } + + set_cleaned(); + m_timer_handle = NULL; + if (g_p_event_handler_manager->is_running()) { + g_p_event_handler_manager->unregister_timers_event_and_delete(this); + } else { + cleanable_obj::clean_obj(); + } +} + +void igmp_handler::handle_timer_expired(void* user_data) +{ + NOT_IN_USE(user_data); + igmp_hdlr_logdbg("Timeout expired"); + m_timer_handle = NULL; + + if (m_ignore_timer) { + igmp_hdlr_logdbg("Ignoring timeout handling due to captured IGMP report"); + return; + } + igmp_hdlr_logdbg("Sending igmp report"); + + if (!tx_igmp_report()) { + igmp_hdlr_logdbg("Send igmp report failed, registering new timer"); + priv_register_timer_event(this, ONE_SHOT_TIMER, (void*)IGMP_TIMER_ID); + } +} + +bool igmp_handler::tx_igmp_report() +{ + + if (m_p_neigh_entry->get_peer_info(m_p_neigh_val)) { + igmp_hdlr_logdbg("neigh is valid"); + } + else { + igmp_hdlr_logdbg("neigh is not valid"); + return false; + } + + mem_buf_desc_t* p_mem_buf_desc = 
m_p_ring->mem_buf_tx_get(m_id, false, 1); + if (unlikely(p_mem_buf_desc == NULL)) { + igmp_hdlr_logdbg("No free TX buffer, not sending igmp report"); + return false; + } + + wqe_send_ib_handler wqe_sh; + wqe_sh.init_ib_wqe(m_p_send_igmp_wqe, &m_sge, 1, ((neigh_ib_val *)m_p_neigh_val)->get_ah(), + ((neigh_ib_val *)m_p_neigh_val)->get_qpn(), ((neigh_ib_val *)m_p_neigh_val)->get_qkey()); + m_header.init(); + m_header.configure_ipoib_headers(); + size_t m_total_l2_hdr_len = m_header.m_total_hdr_len; + m_header.configure_ip_header(IPPROTO_IGMP, m_p_ndvl->get_local_addr(), m_mc_addr.get_in_addr(),/*ttl for IGMP*/1); + m_header.copy_l2_ip_hdr((tx_packet_template_t*)p_mem_buf_desc->p_buffer); + + // Override IP header with IGMPV2 specific info + ip_igmp_tx_hdr_template_t* p_ip_pkt = (ip_igmp_tx_hdr_template_t*)(p_mem_buf_desc->p_buffer + m_header.m_transport_header_tx_offset + m_total_l2_hdr_len); + set_ip_igmp_hdr(p_ip_pkt); + + m_sge.addr = (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)m_header.m_transport_header_tx_offset); + m_sge.length = m_header.m_total_hdr_len + sizeof(uint32_t /*m_ip_hdr_ext*/) + sizeof (igmphdr /*m_igmp_hdr*/); + m_sge.lkey = p_mem_buf_desc->lkey; + p_mem_buf_desc->p_next_desc = NULL; + m_p_send_igmp_wqe.wr_id = (uintptr_t)p_mem_buf_desc; + + igmp_hdlr_logdbg("Sending igmp report"); + m_p_ring->send_ring_buffer(m_id, &m_p_send_igmp_wqe, (vma_wr_tx_packet_attr)0); + return true; +} + +void igmp_handler::set_ip_igmp_hdr(ip_igmp_tx_hdr_template_t* ip_igmp_hdr) +{ + ip_igmp_hdr->m_ip_hdr.ihl = IPV4_IGMP_HDR_LEN_WORDS; + ip_igmp_hdr->m_ip_hdr.tot_len = htons(IPV4_IGMP_HDR_LEN + sizeof(igmphdr)); + ip_igmp_hdr->m_ip_hdr_ext = htonl(IGMP_IP_HEADER_EXT); + ip_igmp_hdr->m_ip_hdr.check = 0; + ip_igmp_hdr->m_ip_hdr.check = compute_ip_checksum((unsigned short*)&ip_igmp_hdr->m_ip_hdr, (IPV4_IGMP_HDR_LEN_WORDS) * 2); + + // Create the IGMP header + ip_igmp_hdr->m_igmp_hdr.type = IGMPV2_HOST_MEMBERSHIP_REPORT; + ip_igmp_hdr->m_igmp_hdr.code = 0; + 
ip_igmp_hdr->m_igmp_hdr.group = m_mc_addr.get_in_addr(); + ip_igmp_hdr->m_igmp_hdr.csum = 0; + ip_igmp_hdr->m_igmp_hdr.csum = compute_ip_checksum((unsigned short*)&ip_igmp_hdr->m_igmp_hdr, IGMP_HDR_LEN_WORDS * 2); +} diff --git a/src/vma/proto/igmp_handler.h b/src/vma/proto/igmp_handler.h new file mode 100644 index 0000000..b465e6f --- /dev/null +++ b/src/vma/proto/igmp_handler.h @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "vma/proto/neighbour.h" +#include "vma/event/event_handler_manager.h" +#include "vma/event/timer_handler.h" +#include + + +#ifndef IGMP_HANDLER_H_ +#define IGMP_HANDLER_H_ + +#define igmp_key neigh_key + +#define IGMP_TIMER_ID 0 + +struct __attribute__ ((packed, aligned)) ip_igmp_tx_hdr_template_t { + iphdr m_ip_hdr; + uint32_t m_ip_hdr_ext; + igmphdr m_igmp_hdr; +}; + +#define IGMP_IP_HEADER_EXT 0x94040000 // IP header options field: Router alert + +class igmp_handler : public timer_handler, public lock_mutex, public cleanable_obj, public cache_observer, public neigh_observer +{ +public: + igmp_handler(const igmp_key &key, uint8_t igmp_code); + bool init(const igmp_key &key); + ~igmp_handler(); + + const std::string to_str() const + { + return(m_mc_addr.to_str() + " " + m_p_ndvl->to_str()); + } + + virtual transport_type_t get_obs_transport_type() const + { + return m_p_ndvl->get_transport_type(); + } + + void handle_query(uint8_t igmp_code); // handle queries coming from router + void handle_report(); // handle reports coming from other hosts + + virtual void clean_obj(); +private: + + ip_address m_mc_addr; + net_device_val* m_p_ndvl; + ring_allocation_logic_tx m_ring_allocation_logic; + bool m_ignore_timer; + void* m_timer_handle; + neigh_entry* m_p_neigh_entry; + neigh_val* m_p_neigh_val; + ring* m_p_ring; + header m_header; + ibv_sge m_sge; + vma_ibv_send_wr m_p_send_igmp_wqe; + uint8_t m_igmp_code; + ring_user_id_t m_id; + + void set_timer(); //called by tx_igmp_report + void unset_timer(); // called if igmp packet is report and not query + virtual void handle_timer_expired(void* user_data); + void priv_register_timer_event(timer_handler* handler, timer_req_type_t req_type, void* user_data); + bool tx_igmp_report(); + void set_ip_igmp_hdr(ip_igmp_tx_hdr_template_t* igmp_hdr); + +}; + +#endif diff --git a/src/vma/proto/igmp_mgr.cpp b/src/vma/proto/igmp_mgr.cpp new file mode 100644 index 0000000..caea574 --- /dev/null +++ 
b/src/vma/proto/igmp_mgr.cpp @@ -0,0 +1,149 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "igmp_mgr.h" +#include "vma/dev/net_device_table_mgr.h" +#include "vma/dev/net_device_val.h" + + + +#define MODULE_NAME "igmp_mgr" +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[%s]:%d:%s() " + +#undef __INFO__ +#define __INFO__ "" + +#define igmp_mgr_logpanic __log_info_panic +#define igmp_mgr_logerr __log_info_err +#define igmp_mgr_logwarn __log_info_warn +#define igmp_mgr_loginfo __log_info_info +#define igmp_mgr_logdbg __log_info_dbg +#define igmp_mgr_logfunc __log_info_func +#define igmp_mgr_logfuncall __log_info_funcall + + +igmp_mgr *g_p_igmp_mgr = NULL; + +igmp_mgr::~igmp_mgr() +{ + igmp_handler* p_igmp_hdlr = NULL; + igmp_hdlr_map_t::iterator iter = m_igmp_hash.begin(); + while (iter != m_igmp_hash.end()) { + p_igmp_hdlr = iter->second; + igmp_mgr_logdbg("Delete existing igmp handler '%s'", p_igmp_hdlr->to_str().c_str()); + m_igmp_hash.erase(iter); + p_igmp_hdlr->clean_obj(); + p_igmp_hdlr = NULL; + iter = m_igmp_hash.begin(); + } +} + +void igmp_mgr::process_igmp_packet(struct iphdr* p_ip_h, in_addr_t local_if) +{ + igmp_mgr_logfunc(""); + igmp_handler* p_igmp_hdlr = NULL; + uint16_t ip_h_hdr_len = (int)(p_ip_h->ihl)*4; + struct igmphdr* p_igmp_h = (struct igmphdr*)(((uint8_t*)p_ip_h) + ip_h_hdr_len); + + net_device_val* p_ndvl = g_p_net_device_table_mgr->get_net_device_val(local_if); + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_ndvl){ + igmp_mgr_logerr("Failed getting relevant net device"); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + igmp_key key(ip_address(p_igmp_h->group), p_ndvl); + p_igmp_hdlr = get_igmp_handler(key, p_igmp_h->code); + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_igmp_hdlr){ + igmp_mgr_logerr("Failed getting relevant igmp_handler"); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + switch (p_igmp_h->type) { + case IGMP_HOST_MEMBERSHIP_QUERY: + p_igmp_hdlr->handle_query(p_igmp_h->code); + break; + + case IGMP_HOST_MEMBERSHIP_REPORT: 
+ case IGMPV2_HOST_MEMBERSHIP_REPORT: + p_igmp_hdlr->handle_report(); + break; + + default: + break; + } +} + +igmp_handler* igmp_mgr::get_igmp_handler(const igmp_key &key, uint8_t igmp_code) +{ + igmp_handler *p_igmp_hdlr = NULL; + + lock(); + igmp_hdlr_map_t::iterator iter = m_igmp_hash.find(key); + if (iter != m_igmp_hash.end()) { + p_igmp_hdlr = iter->second; + igmp_mgr_logdbg("Found existing igmp handler '%s'", p_igmp_hdlr->to_str().c_str()); + } + else { + p_igmp_hdlr = new igmp_handler(key, igmp_code); + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_igmp_hdlr) { + igmp_mgr_logerr("Failed allocating new igmp handler for mc_address = %d.%d.%d.%d, local_if= %d.%d.%d.%d", + NIPQUAD(key.get_in_addr()), NIPQUAD(key.get_net_device_val()->get_local_addr())); + unlock(); + return p_igmp_hdlr; + } + if (!p_igmp_hdlr->init(key)) { + igmp_mgr_logerr("Failed to initialize new igmp handler '%s'", p_igmp_hdlr->to_str().c_str()); + delete(p_igmp_hdlr); + p_igmp_hdlr = NULL; + unlock(); + return p_igmp_hdlr; + } + BULLSEYE_EXCLUDE_BLOCK_END + m_igmp_hash.insert(igmp_hdlr_map_t::value_type(key, p_igmp_hdlr)); + igmp_mgr_logdbg("Created new igmp handler '%s'", p_igmp_hdlr->to_str().c_str()); + } + unlock(); + return p_igmp_hdlr; +} + diff --git a/src/vma/proto/igmp_mgr.h b/src/vma/proto/igmp_mgr.h new file mode 100644 index 0000000..48f34ed --- /dev/null +++ b/src/vma/proto/igmp_mgr.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "vma/proto/igmp_handler.h" +#include + +#ifndef IGMP_MANAGER_H +#define IGMP_MANAGER_H + + +typedef std::tr1::unordered_map igmp_hdlr_map_t; + +class igmp_mgr : public lock_mutex +{ +public: + igmp_mgr() {}; + ~igmp_mgr(); + void process_igmp_packet(struct iphdr* p_ip_h, in_addr_t local_if); + +private: + igmp_hdlr_map_t m_igmp_hash; + igmp_handler* get_igmp_handler(const igmp_key &key, uint8_t igmp_code); +}; + +extern igmp_mgr *g_p_igmp_mgr; + +#endif + diff --git a/src/vma/proto/ip_address.h b/src/vma/proto/ip_address.h new file mode 100644 index 0000000..ebce3ce --- /dev/null +++ b/src/vma/proto/ip_address.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef IP_ADDRESS_H +#define IP_ADDRESS_H + +#include +#include "vma/util/to_str.h" +#include "vma/util/vtypes.h" +#include + +class ip_address : public tostr +{ +public: + ip_address(in_addr_t ip): m_ip(ip){}; + ~ip_address(){}; + + const std::string to_str() const + { + char s[20]; + /* cppcheck-suppress wrongPrintfScanfArgNum */ + sprintf(s, "%d.%d.%d.%d", NIPQUAD(m_ip)); + return(std::string(s)); + } + + in_addr_t get_in_addr() const { return m_ip; }; +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + in_addr_t get_actual_key() { return get_in_addr(); }; + bool is_anyaddr() { return (INADDR_ANY == m_ip); }; + bool is_mc() { return (IN_MULTICAST_N(m_ip)); }; + bool is_local_loopback() { return (LOOPBACK_N(m_ip)); }; +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + + bool operator==(const ip_address &ip) const { return (m_ip == ip.get_in_addr()); }; + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + bool operator<(const ip_address &ip) const { return (m_ip < ip.get_in_addr()); }; +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + +private: + in_addr_t m_ip; +}; + +namespace std { namespace tr1 { +template<> +class hash +{ +public: + size_t operator()(const ip_address &key) const + { + hash_hash; + return _hash(key.get_in_addr()); + } +}; +}} + + +#endif /* IP_ADDRESS_H */ diff --git a/src/vma/proto/ip_frag.cpp b/src/vma/proto/ip_frag.cpp new file mode 100644 index 0000000..c59f72a --- /dev/null +++ b/src/vma/proto/ip_frag.cpp @@ -0,0 +1,539 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "ip_frag.h" + +#include +#include +#include "utils/bullseye.h" +#include "vma/event/event_handler_manager.h" +#include "mem_buf_desc.h" + +//#define IP_FRAG_DEBUG 1 + +#ifdef IP_FRAG_DEBUG +#define frag_dbg(fmt, args...) \ + vlog_printf(VLOG_WARNING, "%s:%d : " fmt "\n", __FUNCTION__, __LINE__, ##args) +#else +#define frag_dbg(fmt, args...) +#endif + +#define frag_err(fmt, args...) \ + vlog_printf(VLOG_ERROR, "%s:%d : " fmt "\n", __FUNCTION__, __LINE__, ##args) + +#define frag_panic(fmt, args...) 
\ + {vlog_printf(VLOG_PANIC, "%s:%d : " fmt "\n", __FUNCTION__, __LINE__, ##args); throw;} + + +#ifdef IP_FRAG_DEBUG +static int debug_drop_every_n_pkt=0; // 0 - Disabled, 1/N is the number of packet dropped +static int debug_drop_index=0; // counter + +static int g_ip_frag_count_check = 0; + #define MEMBUF_DEBUG_REF_INC(__p_desc__) {g_ip_frag_count_check++; if (__p_desc__->n_ref_count!=0) frag_panic("REF_INC: p=%p\n", __p_desc__); __p_desc__->n_ref_count++;} + #define MEMBUF_DEBUG_REF_DEC(__p_desc__) {mem_buf_desc_t* frag_list = __p_desc__; while (frag_list) { MEMBUF_DEBUG_REF_DEC_1(frag_list); frag_list = frag_list->p_next_desc; }} + #define MEMBUF_DEBUG_REF_DEC_1(__p_desc__) {g_ip_frag_count_check--; __p_desc__->n_ref_count--; if (__p_desc__->n_ref_count!=0) frag_panic("REF_DEC: p=%p\n", __p_desc__);} + #define PRINT_STATISTICS() {print_statistics();} +#else + #define MEMBUF_DEBUG_REF_INC(__p_desc__) + #define MEMBUF_DEBUG_REF_DEC(__p_desc__) + #define PRINT_STATISTICS() +#endif + + +ip_frag_manager * g_p_ip_frag_manager = NULL; + +ip_frag_hole_desc *hole_base = NULL; +ip_frag_hole_desc *hole_free_list_head = NULL; +int hole_free_list_count = 0; + +ip_frag_desc *desc_base = NULL; +ip_frag_desc *desc_free_list_head = NULL; +int desc_free_list_count = 0; + + +ip_frag_manager::ip_frag_manager() : lock_spin("ip_frag_manager") +{ + frag_dbg(""); + m_frag_counter = 0; + int i; + + + frag_dbg("NOTE: ip frag periodic timer is disabled until HW supports ip frag offload"); + // g_p_event_handler_manager->register_timer_event(IP_FRAG_CLEANUP_INT, this, PERIODIC_TIMER, 0); + + frag_dbg("Created new IPFRAG MANAGER instance"); + /* allocate hole list */ + desc_base = new ip_frag_desc_t [IP_FRAG_MAX_DESC]; + BULLSEYE_EXCLUDE_BLOCK_START + if (!desc_base) { + frag_dbg("Failed to allocate descriptor"); + free_frag_resources(); + throw_vma_exception("Failed to allocate descriptor"); + } + hole_base = new ip_frag_hole_desc [IP_FRAG_MAX_HOLES]; + if (!hole_base) { + 
frag_dbg("Failed to allocate hole descriptor"); + free_frag_resources(); + throw_vma_exception("Failed to allocate hole descriptor"); + } + BULLSEYE_EXCLUDE_BLOCK_END + for (i = 0; i < IP_FRAG_MAX_DESC; i++) { + free_frag_desc(&desc_base[i]); + } + for (i = 0; i < IP_FRAG_MAX_HOLES; i++) { + free_hole_desc(&hole_base[i]); + } +} + +void ip_frag_manager::free_frag_resources(void) +{ + + ip_frags_list_t::iterator i; + ip_frag_desc_t *desc; + + frag_dbg("NOTE: ip frag periodic timer is disabled until HW supports ip frag offload"); + // g_p_event_handler_manager->unregister_timer_event(this, NULL); + + lock(); + + while (m_frags.size() > 0) { + i = m_frags.begin(); + desc = i->second; + destroy_frag_desc(desc); + free_frag_desc(desc); + m_frags.erase(i); + } + + owner_desc_map_t temp_buff_map = m_return_descs; + m_return_descs.clear(); + + unlock(); + + // Must call cq_mgr outside the lock to avoid ABBA deadlock + return_buffers_to_owners(temp_buff_map); + + delete [] desc_base; + delete [] hole_base; + frag_dbg("Deleted IPFRAG MANAGER instance"); +} + +ip_frag_manager::~ip_frag_manager() +{ + free_frag_resources(); +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + +void ip_frag_manager::print_statistics() +{ + frag_dbg("free desc=%d, free holes=%d, map size=%d, frags=%d", desc_free_list_count, hole_free_list_count, m_frags.size(), g_ip_frag_count_check); +} + +void ip_frag_manager::free_frag(mem_buf_desc_t *frag) +{ + mem_buf_desc_t *tail; + + // There are cases that we might not have a frag list at all to release + // This is instead of checking the pointer before all calls to free_frag() + if (!frag) + return; + + // Change packet size - it will force packet to be discarded + frag->sz_data = IP_FRAG_FREED; + + // Return to owner does post_recv() which deals with linked buffers automatically + MEMBUF_DEBUG_REF_DEC(frag); + + tail = frag; + while (tail->p_next_desc) { + tail = tail->p_next_desc; + } + tail->p_next_desc = 
m_return_descs[frag->p_desc_owner]; + m_return_descs[frag->p_desc_owner] = frag; + +} + + +//FIXME: use preallocated descriptors!!! instead of malloc +ip_frag_hole_desc* ip_frag_manager::alloc_hole_desc() +{ + struct ip_frag_hole_desc *ret; + ret = hole_free_list_head; + if (!ret) + return NULL; + + // unlink from hole's free list + hole_free_list_head = ret->next; + hole_free_list_count--; + + // clear hole struct + ret->data_first = 0; + ret->data_last = 0; + ret->next = 0; + return ret; +} + +void ip_frag_manager::free_hole_desc(struct ip_frag_hole_desc *p) +{ + // link in head of free list + p->next = hole_free_list_head; + hole_free_list_head = p; + ++hole_free_list_count; +} + +ip_frag_desc_t *ip_frag_manager::alloc_frag_desc() +{ + ip_frag_desc_t *ret; + ret = desc_free_list_head; + if (!ret) + return NULL; + + // unlink from hole's free list + desc_free_list_head = ret->next; + --desc_free_list_count; + + ret->next = 0; + return ret; +} + +void ip_frag_manager::free_frag_desc(ip_frag_desc_t *p) +{ + // link in head of free list + p->next = desc_free_list_head; + desc_free_list_head = p; + desc_free_list_count++; +} + +void ip_frag_manager::destroy_frag_desc(ip_frag_desc_t *desc) +{ + struct ip_frag_hole_desc *phole, *pphole; + + // free holes + phole = desc->hole_list; + while (phole) { + pphole = phole; + phole = phole->next; + free_hole_desc(pphole); + } + + // free frags + free_frag(desc->frag_list); +} + + +/** + * first fragment for given address is detected - setup + */ +ip_frag_desc_t *ip_frag_manager::new_frag_desc(ip_frag_key_t &key) +{ + ip_frag_desc_t *desc = NULL; + struct ip_frag_hole_desc *hole = NULL; + + hole = alloc_hole_desc(); + if (!hole){ + frag_dbg("NULL hole"); + return NULL; + } + hole->first = IP_FRAG_NINF; + hole->last = IP_FRAG_INF; + + desc = alloc_frag_desc(); + if (!desc) { + frag_dbg("NULL desc"); + free_hole_desc(hole); + return NULL; + } + desc->ttl = IP_FRAG_TTL; + desc->frag_list = 0; + desc->hole_list = hole; + 
desc->frag_counter = m_frag_counter; + + m_frags[key] = desc; + return desc; +} + +/** + * Complexity of the algorithm: + * O(1) if packets are coming in order or reverse order + * O(n^2) for random fragments, where n is number of fragments + * returns: 0 if finished OK (if the packet is complete - put it in ret) + * -1 if finished not OK and this packet needs to be droped + */ +int ip_frag_manager::add_frag(iphdr *hdr, mem_buf_desc_t *frag, mem_buf_desc_t **ret) +{ + ip_frag_key_t key; + ip_frags_list_t::iterator i; + ip_frag_desc_t *desc; + struct ip_frag_hole_desc *phole, *phole_prev; + struct ip_frag_hole_desc *new_hole; + uint16_t frag_off, frag_first, frag_last; + bool more_frags; + + assert(hdr); + assert(frag); + + key.ip_id = hdr->id; //id is in network order! + key.src_ip = hdr->saddr; + key.dst_ip = hdr->daddr; + key.ipproto = hdr->protocol; + + frag_dbg("Fragment: %d.%d.%d.%d->%d.%d.%d.%d id=%x size=%d", + NIPQUAD(key.src_ip), + NIPQUAD(key.dst_ip), + (int)key.ip_id, (int)ntohs(hdr->tot_len)); + +#ifdef IP_FRAG_DEBUG + if (debug_drop_every_n_pkt && ((++debug_drop_index) % debug_drop_every_n_pkt == 0)) { + frag_dbg("XXX debug force dropped XXX"); + return -1; + } +#endif + + lock(); + + MEMBUF_DEBUG_REF_INC(frag); + PRINT_STATISTICS(); + + frag_off = ntohs(hdr->frag_off); + more_frags = frag_off & MORE_FRAGMENTS_FLAG; + frag_first = (frag_off & FRAGMENT_OFFSET) * 8; + frag_last = frag_first + ntohs(hdr->tot_len) - (hdr->ihl<<2) - 1; // frag starts from 0!!! 
+ frag_dbg("> fragment: %d-%d, %s more frags", frag_first, frag_last, more_frags?"pending":"no"); + + m_frag_counter++; + + i = m_frags.find(key); + + if (i == m_frags.end()) { + /* new fragment */ + frag_dbg("> new fragmented packet"); + desc = new_frag_desc(key); + } + else { + desc = i->second; + if ((m_frag_counter - desc->frag_counter) > IP_FRAG_SPACE) { + // discard this packet + frag_dbg("expiring packet fragments id=%x", i->first); + destroy_frag_desc(desc); + free_frag_desc(desc); + m_frags.erase(i); + i = m_frags.end(); + // Add new fregment + frag_dbg("> new fragmented packet"); + desc = new_frag_desc(key); + } + else { + frag_dbg("> old fragmented packet"); + } + } + if (desc==NULL) { + MEMBUF_DEBUG_REF_DEC(frag); + PRINT_STATISTICS(); + unlock(); + return -1; + } + + //desc->last_frag_counter = m_frag_counter; + + /* 8 step reassembly algorithm as described in RFC 815 */ + //step 1 + phole_prev = 0; phole = desc->hole_list; + while (phole) { + //step 2 and step 3 + if (frag_first >= phole->first && frag_last <= phole->last) { + break; + } + phole_prev = phole; + phole = phole->next; + } + if (!phole) { // the right hole wasn't found + MEMBUF_DEBUG_REF_DEC(frag); + PRINT_STATISTICS(); + unlock(); + return -1; + } + + frag_dbg("> found hole: %d-%d", phole->first, phole->last); + + // step 4 - remove hole from list + if (phole_prev) + phole_prev->next = phole->next; + else + desc->hole_list = phole->next; + + // step 5 + if (frag_first > phole->first) { + new_hole = alloc_hole_desc(); + if (!new_hole) { + free_hole_desc(phole); // phole was removed from the list in step 4! 
+ MEMBUF_DEBUG_REF_DEC(frag); + PRINT_STATISTICS(); + unlock(); + return -1; + } + new_hole->first = phole->first; + new_hole->last = frag_first-1; + new_hole->data_first = phole->data_first; + new_hole->data_last = frag; + + new_hole->next = phole->next; + if (phole_prev) + phole_prev->next = new_hole; + else + desc->hole_list = new_hole; + + phole_prev = new_hole; + } + + //step 6 + if (frag_last < phole->last && more_frags) { + new_hole = alloc_hole_desc(); + if (!new_hole) { + free_hole_desc(phole); // phole was removed from the list in step 4! + MEMBUF_DEBUG_REF_DEC(frag); + PRINT_STATISTICS(); + unlock(); + return -1; + } + + new_hole->first = frag_last + 1; + new_hole->last = phole->last; + new_hole->data_first = frag; + new_hole->data_last = phole->data_last; + + new_hole->next = phole->next; + if (phole_prev) + phole_prev->next = new_hole; + else + desc->hole_list = new_hole; + } + + // link frag + if (phole->data_first) + phole->data_first->p_next_desc = frag; + else + desc->frag_list = frag; + frag->p_next_desc = phole->data_last; + + free_hole_desc(phole); + + if (!desc->hole_list) { + //step 8 - datagram assembly completed + if (i == m_frags.end()) + i = m_frags.find(key); + if (i == m_frags.end()){ + MEMBUF_DEBUG_REF_DEC(frag); + frag_panic("frag desc lost from map???"); + //coverity unreachable + /*unlock(); + return -1;*/ + } + MEMBUF_DEBUG_REF_DEC(desc->frag_list); + m_frags.erase(i); + *ret = desc->frag_list; + free_frag_desc(desc); + frag_dbg("> PACKET ASSEMBLED"); + PRINT_STATISTICS(); + unlock(); + return 0; + } + frag_dbg("> need more packets"); + + *ret = NULL; + PRINT_STATISTICS(); + unlock(); + return 0; +} + +void ip_frag_manager::return_buffers_to_owners(const owner_desc_map_t &buff_map) +{ + // Assume locked !!! 
+ owner_desc_map_t::const_iterator iter; + + for (iter = buff_map.begin(); iter != buff_map.end(); ++iter) { + if(g_buffer_pool_rx) + g_buffer_pool_rx->put_buffers_thread_safe(iter->second); + } +} + + +void ip_frag_manager::handle_timer_expired(void* user_data) +{ + NOT_IN_USE(user_data); + ip_frags_list_t::iterator iter, iter_temp; + ip_frag_desc_t *desc; + uint64_t delta =0; + + lock(); + if (m_frag_counter > IP_FRAG_SPACE) { + delta = m_frag_counter - IP_FRAG_SPACE; + m_frag_counter -= delta; + } + + frag_dbg("calling handle_timer_expired, m_frag_counter=%ld, delta=%ld", m_frag_counter, delta); + PRINT_STATISTICS(); + + iter = m_frags.begin(); + while (iter != m_frags.end()) { + desc = iter->second; + desc->frag_counter -= delta; + if (desc->frag_counter<0 || (desc->ttl <= 0)) { //discard this packet + frag_dbg("expiring packet fragments desc=%p (frag_counter=%d, ttl=%d)", desc, desc->frag_counter, desc->ttl); + destroy_frag_desc(desc); + free_frag_desc(desc); + iter_temp = iter++; + m_frags.erase(iter_temp); + } + else { + iter++; + } + + --desc->ttl; + } + + owner_desc_map_t temp_buff_map = m_return_descs; + m_return_descs.clear(); + + PRINT_STATISTICS(); + unlock(); + + // Must call cq_mgr outside the lock to avoid ABBA deadlock + return_buffers_to_owners(temp_buff_map); +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif diff --git a/src/vma/proto/ip_frag.h b/src/vma/proto/ip_frag.h new file mode 100644 index 0000000..ade99fd --- /dev/null +++ b/src/vma/proto/ip_frag.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef _IP_FRAG_H +#define _IP_FRAG_H + +/** + * IP reassembly is based on algorithm described in RFC815 + */ + +#include +#include +#include +#include + +#include "vlogger/vlogger.h" +#include "utils/lock_wrapper.h" +#include +#include +#include +#include + +class mem_buf_desc_t; +class event_handler_manager; +class mem_buf_desc_owner; + +#define IP_FRAG_FREED ((size_t)-1) + +#define IP_FRAG_MAX_DESC 1024 /* maximum number of preallocated descriptors */ +#define IP_FRAG_MAX_HOLES 16000 /* maximum number of preallocated holes */ +#define IP_FRAG_TTL 2 /* default unassembled fragment time to live in ticks */ +#define IP_FRAG_INF 0xFFFF +#define IP_FRAG_NINF 0x0 +#define IP_FRAG_SPACE 60000 +#define IP_FRAG_CLEANUP_INT 10 + +struct ip_frag_key_t { + uint16_t ip_id; + in_addr_t src_ip; + in_addr_t dst_ip; + uint8_t ipproto; +}; + + +inline bool +operator<( ip_frag_key_t const& a, ip_frag_key_t const& b) +{ + if (a.ip_id < b.ip_id) + return true; + + if (a.ip_id > b.ip_id) + return false; + + if (a.src_ip < b.src_ip) + return true; + + if (a.src_ip > b.src_ip) + return false; + + if (a.dst_ip < b.dst_ip) + return true; + + if (a.dst_ip > b.dst_ip) + return false; + + if (a.ipproto < b.ipproto) + return true; + + if (a.ipproto > b.ipproto) + return false; + + return false; +} + +struct ip_frag_hole_desc { + uint16_t first; + uint16_t last; + mem_buf_desc_t *data_first; + mem_buf_desc_t *data_last; + struct ip_frag_hole_desc *next; +}; + +typedef struct ip_frag_desc { + uint16_t ttl; + uint16_t pkt_size; + struct ip_frag_hole_desc *hole_list; + mem_buf_desc_t *frag_list; + int64_t frag_counter; + struct ip_frag_desc *next; +} ip_frag_desc_t; + +typedef std::map > ip_frags_list_t; +typedef std::map owner_desc_map_t; + +class ip_frag_manager : private lock_spin, public timer_handler +{ +public: + ip_frag_manager(); + ~ip_frag_manager(); + /** + * add fragment to the list. 
+ * Return: + * 0 if finished OK + * - if the packet is complete - put the pointer to the first fragment of the packet in ret. + * Rest of the packet fragments are linked in order. + * - if we need more fragments - put NULL in ret. + * -1 if finished with error and this packet needs to be droped + */ + int add_frag(iphdr *hdr, mem_buf_desc_t *frag, mem_buf_desc_t **ret); + + uint64_t m_frag_counter; + +private: + ip_frags_list_t m_frags; + + // Map of buffers to return, by owner + owner_desc_map_t m_return_descs; + + + /** + * first fragment for given address is detected - setup + */ + ip_frag_desc_t* new_frag_desc(ip_frag_key_t &key); + void print_statistics(); + void return_buffers_to_owners(const owner_desc_map_t &buff_map); + void free_frag(mem_buf_desc_t *frag); + ip_frag_hole_desc* alloc_hole_desc(); + void free_hole_desc(struct ip_frag_hole_desc *p); + ip_frag_desc_t* alloc_frag_desc(); + void free_frag_desc(ip_frag_desc_t *p); + void destroy_frag_desc(ip_frag_desc_t *desc); + + virtual void handle_timer_expired(void* user_data); + + void free_frag_resources(void); +}; + +extern ip_frag_manager * g_p_ip_frag_manager; + +#endif diff --git a/src/vma/proto/mem_buf_desc.h b/src/vma/proto/mem_buf_desc.h new file mode 100644 index 0000000..8bf4cef --- /dev/null +++ b/src/vma/proto/mem_buf_desc.h @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef MEM_BUF_DESC_H +#define MEM_BUF_DESC_H + +#include +#include "utils/atomic.h" +#include "vma/util/vma_list.h" +#include "vma/lwip/pbuf.h" + +class ring_slave; + +struct timestamps_t +{ + struct timespec sw; + struct timespec hw; +}; + +/** + * mem_buf_desc_t struct is used as the mapping of the wr_id in the wce to: + * (1) p_desc_owner - to notify the owner of this mem_buf_desc of a completion of this WR + * Transmitting object (sockinfo) - reference counting for TX limit logic on TX completion + * Receiving object (ib_conn_mgr) - processing of the incoming ip packet on RX completion + * (2) p_next_desc is used to link a few mem_buf_desc_t object on a list (free list, + * TX fragment list, TX waiting completion signal list) + * (3) p_buffer is the data buffer pointer (to be reused for TX or the ready + * received data in TX) + */ +class mem_buf_desc_t { +public: + mem_buf_desc_t(uint8_t *buffer, size_t size, pbuf_free_custom_fn custom_free_function) : + p_buffer(buffer), lkey(0), p_next_desc(0), + p_prev_desc(0), sz_buffer(size), sz_data(0), + p_desc_owner(0) { + memset(&lwip_pbuf, 0, sizeof(lwip_pbuf)); + memset(&rx, 0, sizeof(rx)); + memset(&tx, 0, sizeof(tx)); + reset_ref_count(); + + lwip_pbuf.custom_free_function = custom_free_function; + } + + struct pbuf_custom lwip_pbuf; //Do not change the location of this field. + uint8_t* const p_buffer; + + static inline size_t buffer_node_offset(void) {return NODE_OFFSET(mem_buf_desc_t, buffer_node);} + list_node buffer_node; + + union { + struct { + sockaddr_in src; // L3 info + sockaddr_in dst; // L3 info + + iovec frag; // Datagram part base address and length + size_t sz_payload; // This is the total amount of data of the packet, if (sz_payload>sz_data) means fragmented packet. 
+ uint64_t hw_raw_timestamp; + timestamps_t timestamps; + void* context; + uint32_t flow_tag_id; // Flow Tag ID of this received packet + + union { + struct { + struct iphdr* p_ip_h; + struct tcphdr* p_tcp_h; + size_t n_transport_header_len; + bool gro; + } tcp; + struct { + in_addr_t local_if; // L3 info + } udp; + }; + + int8_t n_frags; //number of fragments + bool is_vma_thr; // specify whether packet drained from VMA internal thread or from user app thread + bool is_sw_csum_need; // specify if software checksum is need for this packet + bool socketxtreme_polled; + } rx; + struct { + size_t dev_mem_length; // Total data aligned to 4 bytes. + struct iphdr* p_ip_h; + union { + struct udphdr* p_udp_h; + struct tcphdr* p_tcp_h; + }; + } tx; + }; + +private: + atomic_t n_ref_count; // number of interested receivers (sockinfo) [can be modified only in cq_mgr context] +public: + + uint32_t lkey; // Buffers lkey for QP access + mem_buf_desc_t* p_next_desc; // A general purpose linked list of mem_buf_desc + mem_buf_desc_t* p_prev_desc; + size_t const sz_buffer; // this is the size of the buffer + size_t sz_data; // this is the amount of data inside the buffer (sz_data <= sz_buffer) + + // Tx: qp_mgr owns the mem_buf_desc and the associated data buffer + // Rx: cq_mgr owns the mem_buf_desc and the associated data buffer + ring_slave* p_desc_owner; + + inline int get_ref_count() const {return atomic_read(&n_ref_count);} + inline void reset_ref_count() {atomic_set(&n_ref_count, 0);} + inline int inc_ref_count() {return atomic_fetch_and_inc(&n_ref_count);} + inline int dec_ref_count() {return atomic_fetch_and_dec(&n_ref_count);} + + inline unsigned int lwip_pbuf_inc_ref_count() {return ++lwip_pbuf.pbuf.ref;} + inline unsigned int lwip_pbuf_dec_ref_count() {if (likely(lwip_pbuf.pbuf.ref)) --lwip_pbuf.pbuf.ref; return lwip_pbuf.pbuf.ref;} + inline unsigned int lwip_pbuf_get_ref_count() const {return lwip_pbuf.pbuf.ref;} +}; + +typedef vma_list_t descq_t; + +#endif + diff --git 
a/src/vma/proto/neighbour.cpp b/src/vma/proto/neighbour.cpp new file mode 100644 index 0000000..a8d5c1b --- /dev/null +++ b/src/vma/proto/neighbour.cpp @@ -0,0 +1,2184 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "vma/util/vtypes.h" +#include "vma/util/utils.h" +#include "vma/dev/ib_ctx_handler_collection.h" +#include "vma/proto/neighbour.h" +#include "vma/proto/neighbour_table_mgr.h" +#include "vma/proto/route_rule_table_key.h" +#include "vma/proto/route_table_mgr.h" +#include "vma/dev/wqe_send_handler.h" +#include "vma/dev/wqe_send_ib_handler.h" + +//This include should be after vma includes +#include + +#define MODULE_NAME "ne" +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[%s]:%d:%s() " +#undef __INFO__ +#define __INFO__ m_to_str.c_str() + +#define neigh_logpanic __log_info_panic +#define neigh_logerr __log_info_err +#define neigh_logwarn __log_info_warn +#define neigh_loginfo __log_info_info +#define neigh_logdbg __log_info_dbg +#define neigh_logfunc __log_info_func +#define neigh_logfuncall __log_info_funcall + +#define run_helper_func(func, event) \ + {if (my_neigh->func) { \ + my_neigh->priv_event_handler_no_locks((event)); \ + return; \ + }} + +#define RDMA_CM_TIMEOUT 3500 + + +/**/ +/** inlining functions can only help if they are implemented before their usage **/ +/**/ + +// This function create new val and initiate it with Multicast MAC +inline int neigh_eth::build_mc_neigh_val() +{ + neigh_logdbg(""); + + //We need lock in any case that we change entry + auto_unlocker lock(m_lock); + + m_state = false; + + if (m_val == NULL) + //This is the first time we are trying to allocate new val or it failed last time + m_val = new neigh_eth_val; + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_val == NULL) { + neigh_logdbg("m_val allocation has failed"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + address_t address = new unsigned char[ETH_ALEN]; + create_multicast_mac_from_ip(address, get_key().get_in_addr()); + m_val->m_l2_address = new ETH_addr(address); + BULLSEYE_EXCLUDE_BLOCK_START + if (m_val->m_l2_address == NULL) { + neigh_logdbg("m_val->m_l2_address 
allocation has failed"); + delete [] address; + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_state = true; + neigh_logdbg("Peer MAC = %s", m_val->m_l2_address->to_str().c_str()); + delete [] address; + return 0; + +} + +inline int neigh_eth::build_uc_neigh_val() +{ + neigh_logdbg(""); + + // We need lock in any case that we change entry + auto_unlocker lock(m_lock); + + if (m_val == NULL) { + // This is the first time we are trying to allocate new val or it failed last time + m_val = new neigh_eth_val; + } + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_val == NULL) + return -1; + BULLSEYE_EXCLUDE_BLOCK_END + + unsigned char tmp[ETH_ALEN]; + address_t address = (address_t)tmp; + + BULLSEYE_EXCLUDE_BLOCK_START + if (!priv_get_neigh_l2(address)) { + neigh_logdbg("Failed in priv_get_neigh_l2()"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_val->m_l2_address = new ETH_addr(address); + BULLSEYE_EXCLUDE_BLOCK_START + if (m_val->m_l2_address == NULL) { + neigh_logdbg("m_val->m_l2_address allocation has failed"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + neigh_logdbg("Peer MAC = %s", m_val->m_l2_address->to_str().c_str()); + return 0; +} + +neigh_val & neigh_ib_val::operator=(const neigh_val & val) +{ + IPoIB_addr* l2_addr = NULL; + neigh_val* tmp_val = const_cast(&val); + const neigh_ib_val* ib_val = dynamic_cast(tmp_val); + BULLSEYE_EXCLUDE_BLOCK_START + if (ib_val == NULL) { + __log_panic("neigh_ib_val is NULL"); + } + BULLSEYE_EXCLUDE_BLOCK_END + m_l2_address = new IPoIB_addr((ib_val->get_l2_address())->get_address()); + l2_addr = (IPoIB_addr *)m_l2_address; //no need to do dynamic casting here + m_ah = ib_val->get_ah(); //TODO: we need to handle this - in case ah is used in post_send we cannot destroy it + m_qkey = ib_val->get_qkey(); + l2_addr->set_qpn(ib_val->get_qpn()); + m_ah_attr = ib_val->get_ah_attr(); + return *this; +} + +neigh_entry::neigh_entry(neigh_key key, transport_type_t _type, bool is_init_resources): + cache_entry_subject(key), + 
m_cma_id(NULL), + m_rdma_port_space((enum rdma_port_space)0), + m_state_machine(NULL), + m_type(UNKNOWN), + m_trans_type(_type), + m_state(false), + m_err_counter(0), + m_timer_handle(NULL), + m_arp_counter(0), + m_p_dev(key.get_net_device_val()), + m_p_ring(NULL), + m_is_loopback(false), + m_to_str(std::string(priv_vma_transport_type_str(m_trans_type)) + ":" + get_key().to_str()), m_id(0), + m_is_first_send_arp(true), m_n_sysvar_neigh_wait_till_send_arp_msec(safe_mce_sys().neigh_wait_till_send_arp_msec), + m_n_sysvar_neigh_uc_arp_quata(safe_mce_sys().neigh_uc_arp_quata), + m_n_sysvar_neigh_num_err_retries(safe_mce_sys().neigh_num_err_retries) +{ + m_val = NULL; + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_p_dev == NULL) { + neigh_logpanic("get_net_dev return NULL"); + } + + ring_alloc_logic_attr ring_attr(safe_mce_sys().ring_allocation_logic_tx); + m_ring_allocation_logic = ring_allocation_logic_tx(m_p_dev->get_local_addr(), ring_attr, this); + + if(is_init_resources) { + m_p_ring = m_p_dev->reserve_ring(m_ring_allocation_logic.get_key()); + if (m_p_ring == NULL) { + neigh_logpanic("reserve_ring return NULL"); + } + m_id = m_p_ring->generate_id(); + } + BULLSEYE_EXCLUDE_BLOCK_END + + memset(&m_dst_addr, 0, sizeof(m_dst_addr)); + memset(&m_src_addr, 0, sizeof(m_src_addr)); + m_dst_addr.sin_addr.s_addr = get_key().get_in_addr(); /*(peer_ip)*/ + m_dst_addr.sin_family = AF_INET; + + m_src_addr.sin_addr.s_addr = m_p_dev->get_local_addr(); + m_src_addr.sin_family = AF_INET; + + memset(&m_send_wqe, 0, sizeof(m_send_wqe)); + memset(&m_sge, 0, sizeof(m_sge)); + + /* Verify if neigh is local (loopback) checking into account + * primary and secondary ip-addresses + */ + { + const ip_data_vector_t& ip = m_p_dev->get_ip_array(); + for (size_t i = 0; i < ip.size(); i++) { + if (ip[i]->local_addr == m_dst_addr.sin_addr.s_addr) { + neigh_logdbg("This is loopback neigh"); + m_is_loopback = true; + break; + } + } + } + + neigh_logdbg("Created new neigh_entry"); +} + 
+neigh_entry::~neigh_entry() +{ + neigh_logdbg(""); + + if (m_state_machine) { + delete m_state_machine; + m_state_machine = NULL; + } + if (m_p_dev && m_p_ring) { + m_p_dev->release_ring(m_ring_allocation_logic.get_key()); + m_p_ring = NULL; + } + if (m_val) { + delete m_val; + m_val = NULL; + } + //TODO:Do we want to check here that unsent queue is empty and if not to send everything? + + neigh_logdbg("Done"); +} + +bool neigh_entry::is_deletable() +{ + if(m_state_machine == NULL) { + return true; + } + + int state = m_state_machine->get_curr_state(); + + //Wait for steady state in which unsent_queue is empty + if(state == ST_NOT_ACTIVE || state == ST_READY) { + return true; + } + return false; +} + +void neigh_entry::clean_obj() +{ + if (is_cleaned()) { + return ; + } + + m_lock.lock(); + set_cleaned(); + m_timer_handle = NULL; + if (g_p_event_handler_manager->is_running()) { + g_p_event_handler_manager->unregister_timers_event_and_delete(this); + m_lock.unlock(); + } else { + m_lock.unlock(); + cleanable_obj::clean_obj(); + } +} + +int neigh_entry::send(neigh_send_info &s_info) +{ + neigh_logdbg(""); + auto_unlocker lock(m_lock); + //Need to copy send info + neigh_send_data *ns_data = new neigh_send_data(&s_info); + + m_unsent_queue.push_back(ns_data); + int ret = ns_data->m_iov.iov_len; + if (m_state) + empty_unsent_queue(); + // coverity[leaked_storage] + return ret; +} + +void neigh_entry::empty_unsent_queue() +{ + neigh_logdbg(""); + auto_unlocker lock(m_lock); + + while (!m_unsent_queue.empty()) + { + neigh_send_data * n_send_data = m_unsent_queue.front(); + if (prepare_to_send_packet(n_send_data->m_header)) { + if (post_send_packet(n_send_data)) { + neigh_logdbg("sent one packet"); + } + else { + neigh_logdbg("Failed in post_send_packet(). Dropping the packet"); + } + } + else { + neigh_logdbg("Failed in prepare_to_send_packet(). 
Dropping the packet"); + } + m_unsent_queue.pop_front(); + delete n_send_data; + } +} + +void neigh_entry::handle_timer_expired(void* ctx) +{ + NOT_IN_USE(ctx); + neigh_logdbg("Timeout expired!"); + + // Clear Timer Handler + m_timer_handle = NULL; + + m_sm_lock.lock(); + int sm_state = m_state_machine->get_curr_state(); + m_sm_lock.unlock(); + + if(sm_state == ST_INIT) { + event_handler(EV_START_RESOLUTION); + return; + } + + // Check if neigh_entry state is reachable + int state; + if(!priv_get_neigh_state(state)) { + neigh_logdbg("neigh state not valid!\n"); + return; + } + + if(!priv_is_failed(state)) { + //We want to verify that L2 address wasn't changed + unsigned char tmp[IPOIB_HW_ADDR_LEN]; + address_t l2_addr = (address_t)tmp; + if(!priv_get_neigh_l2(l2_addr)) { + return; + } + if(priv_handle_neigh_is_l2_changed(l2_addr)) { + return; + } + } + + if (!priv_is_reachable(state)) { + neigh_logdbg("State (%d) is not reachable and L2 address wasn't changed. Sending ARP", state); + send_arp(); + m_timer_handle = priv_register_timer_event(m_n_sysvar_neigh_wait_till_send_arp_msec, this, ONE_SHOT_TIMER, NULL); + } + else { + neigh_logdbg("State is reachable (%s %d) and L2 address wasn't changed. Stop sending ARP", (state == NUD_REACHABLE) ? 
"NUD_REACHABLE" : "NUD_PERMANENT", state); + } +} + +void neigh_entry::send_arp() +{ + // In case we already sent the quota number of unicast ARPs, start sending broadcast ARPs + // or we want to send broadcast ARP for the first time + // or m_val is not valid + bool is_broadcast = (m_arp_counter >= m_n_sysvar_neigh_uc_arp_quata) || m_is_first_send_arp || !m_val; + if (post_send_arp(is_broadcast)) { + m_is_first_send_arp = false; + m_arp_counter++; + } +} + +bool neigh_entry::post_send_packet(neigh_send_data *p_n_send_data) +{ + neigh_logdbg("ENTER post_send_packet protocol = %d", p_n_send_data->m_protocol); + m_id = generate_ring_user_id(p_n_send_data->m_header); + switch(p_n_send_data->m_protocol) + { + case IPPROTO_UDP: + return (post_send_udp(p_n_send_data)); + case IPPROTO_TCP: + return (post_send_tcp(p_n_send_data)); + default: + neigh_logdbg("Unsupported protocol"); + return false; + + } +} + +bool neigh_entry::post_send_udp(neigh_send_data *n_send_data) +{ + // Find number of ip fragments (-> packets, buffers, buffer descs...) + neigh_logdbg("ENTER post_send_udp"); + int n_num_frags = 1; + bool b_need_sw_csum = false; +#ifdef DEFINED_SW_CSUM + b_need_sw_csum = true; +#endif + mem_buf_desc_t* p_mem_buf_desc, *tmp = NULL; + tx_packet_template_t *p_pkt; + size_t sz_data_payload = n_send_data->m_iov.iov_len; + header *h = n_send_data->m_header; + + size_t max_ip_payload_size = ((n_send_data->m_mtu - sizeof(struct iphdr)) & ~0x7); + + if (sz_data_payload > 65536) { + neigh_logdbg("sz_data_payload=%d exceeds max of 64KB", sz_data_payload); + errno = EMSGSIZE; + return false; + } + + size_t sz_udp_payload = sz_data_payload + sizeof(struct udphdr); + + // Usually max inline < MTU! 
+ if (sz_udp_payload > max_ip_payload_size) { + b_need_sw_csum = true; + n_num_frags = (sz_udp_payload + max_ip_payload_size - 1) / max_ip_payload_size; + } + + neigh_logdbg("udp info: payload_sz=%d, frags=%d, scr_port=%d, dst_port=%d", sz_data_payload, n_num_frags, ntohs(h->m_header.hdr.m_udp_hdr.source), ntohs(h->m_header.hdr.m_udp_hdr.dest)); + + // Get all needed tx buf descriptor and data buffers + p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, n_num_frags); + + if (unlikely(p_mem_buf_desc == NULL)) { + neigh_logdbg("Packet dropped. not enough tx buffers"); + return false; + } + + // Int for counting offset inside the ip datagram payload + uint32_t n_ip_frag_offset = 0; + size_t sz_user_data_offset = 0; + + while (n_num_frags--) { + // Calc this ip datagram fragment size (include any udp header) + size_t sz_ip_frag = min(max_ip_payload_size, (sz_udp_payload - n_ip_frag_offset)); + size_t sz_user_data_to_copy = sz_ip_frag; + size_t hdr_len = h->m_transport_header_len + h->m_ip_header_len; // Add count of L2 (ipoib or mac) header length + + p_pkt = (tx_packet_template_t*)p_mem_buf_desc->p_buffer; + + uint16_t frag_off = 0; + if (n_num_frags) { + frag_off |= MORE_FRAGMENTS_FLAG; + } + + if (n_ip_frag_offset == 0) { + h->copy_l2_ip_udp_hdr(p_pkt); + // Add count of udp header length + hdr_len += sizeof(udphdr); + + // Copy less from user data + sz_user_data_to_copy -= sizeof(udphdr); + + // Only for first fragment add the udp header + p_pkt->hdr.m_udp_hdr.len = htons((uint16_t)sz_udp_payload); + } + else { + h->copy_l2_ip_hdr(p_pkt); + frag_off |= FRAGMENT_OFFSET & (n_ip_frag_offset / 8); + } + + p_pkt->hdr.m_ip_hdr.frag_off = htons(frag_off); + // Update ip header specific values + p_pkt->hdr.m_ip_hdr.tot_len = htons(h->m_ip_header_len + sz_ip_frag); + + // Calc payload start point (after the udp header if present else just after ip header) + uint8_t* p_payload = p_mem_buf_desc->p_buffer + h->m_transport_header_tx_offset + hdr_len; + + // Copy user data 
to our tx buffers + int ret = memcpy_fromiovec(p_payload, &n_send_data->m_iov, 1, sz_user_data_offset, sz_user_data_to_copy); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret != (int)sz_user_data_to_copy) { + neigh_logerr("memcpy_fromiovec error (sz_user_data_to_copy=%d, ret=%d)", sz_user_data_to_copy, ret); + m_p_ring->mem_buf_tx_release(p_mem_buf_desc, true); + errno = EINVAL; + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + wqe_send_handler wqe_sh; + vma_wr_tx_packet_attr attr = (vma_wr_tx_packet_attr)(VMA_TX_PACKET_L3_CSUM); + if (b_need_sw_csum) { + attr = (vma_wr_tx_packet_attr)(attr|VMA_TX_SW_CSUM); + wqe_sh.disable_hw_csum(m_send_wqe); + } else { + neigh_logdbg("using HW checksum calculation"); + wqe_sh.enable_hw_csum(m_send_wqe); + } + + p_mem_buf_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr; + p_mem_buf_desc->tx.p_udp_h = &p_pkt->hdr.m_udp_hdr; + + m_sge.addr = (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)h->m_transport_header_tx_offset); + m_sge.length = sz_user_data_to_copy + hdr_len; +#ifdef DEFINED_TSO + m_sge.lkey = m_p_ring->get_tx_lkey(m_id); +#endif /* DEFINED_TSO */ + m_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc; + + neigh_logdbg("%s packet_sz=%d, payload_sz=%d, ip_offset=%d id=%d", h->to_str().c_str(), + m_sge.length - h->m_transport_header_len, sz_user_data_to_copy, + n_ip_frag_offset, ntohs(p_pkt->hdr.m_ip_hdr.id)); + + tmp = p_mem_buf_desc->p_next_desc; + p_mem_buf_desc->p_next_desc = NULL; + + // We don't check the return value of post send when we reach the HW we consider that we completed our job + m_p_ring->send_ring_buffer(m_id, &m_send_wqe, attr); + + p_mem_buf_desc = tmp; + + // Update ip frag offset position + n_ip_frag_offset += sz_ip_frag; + + // Update user data start offset copy location + sz_user_data_offset += sz_user_data_to_copy; + + } // while(n_num_frags) + + return true; +} + + +bool neigh_entry::post_send_tcp(neigh_send_data *p_data) +{ + tx_packet_template_t* p_pkt; + mem_buf_desc_t *p_mem_buf_desc; + size_t total_packet_len = 
0; + header *h = p_data->m_header; + + wqe_send_handler wqe_sh; + wqe_sh.enable_hw_csum(m_send_wqe); + + p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, 1); + + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely(p_mem_buf_desc == NULL)) { + neigh_logdbg("Packet dropped. not enough tx buffers"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + p_mem_buf_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_mem_buf_desc->p_buffer + h->m_total_hdr_len; + + p_mem_buf_desc->p_next_desc = NULL; + + //copy L4 neigh buffer to tx buffer + memcpy((void*)(p_mem_buf_desc->p_buffer + h->m_aligned_l2_l3_len), + p_data->m_iov.iov_base, p_data->m_iov.iov_len); + + p_pkt = (tx_packet_template_t*)(p_mem_buf_desc->p_buffer); + total_packet_len = p_data->m_iov.iov_len + h->m_total_hdr_len; + h->copy_l2_ip_hdr(p_pkt); + // We've copied to aligned address, and now we must update p_pkt to point to real + // L2 header + + p_pkt->hdr.m_ip_hdr.tot_len = (htons)(p_data->m_iov.iov_len + h->m_ip_header_len); + + // The header is aligned for fast copy but we need to maintain this diff in order to get the real header pointer easily + size_t hdr_alignment_diff = h->m_aligned_l2_l3_len - h->m_total_hdr_len; + m_sge.addr = (uintptr_t)((uint8_t*)p_pkt + hdr_alignment_diff); + m_sge.length = total_packet_len; +#ifdef DEFINED_TSO + m_sge.lkey = m_p_ring->get_tx_lkey(m_id); +#endif /* DEFINED_TSO */ + + /* for DEBUG */ + if ((uint8_t*)m_sge.addr < p_mem_buf_desc->p_buffer) { + neigh_logerr("p_buffer - addr=%d, m_total_hdr_len=%zd, p_buffer=%p, type=%d, len=%d, tot_len=%d, payload=%p, hdr_alignment_diff=%zd\n", + (int)(p_mem_buf_desc->p_buffer - (uint8_t*)m_sge.addr), h->m_total_hdr_len, + p_mem_buf_desc->p_buffer, p_mem_buf_desc->lwip_pbuf.pbuf.type, + p_mem_buf_desc->lwip_pbuf.pbuf.len, p_mem_buf_desc->lwip_pbuf.pbuf.tot_len, + p_mem_buf_desc->lwip_pbuf.pbuf.payload, hdr_alignment_diff); + } + + m_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc; + vma_wr_tx_packet_attr attr = 
(vma_wr_tx_packet_attr)(VMA_TX_PACKET_L3_CSUM|VMA_TX_PACKET_L4_CSUM); + p_mem_buf_desc->tx.p_ip_h = &p_pkt->hdr.m_ip_hdr; + p_mem_buf_desc->tx.p_tcp_h = (struct tcphdr*)(((uint8_t*)(&(p_pkt->hdr.m_ip_hdr))+sizeof(p_pkt->hdr.m_ip_hdr))); + + m_p_ring->send_ring_buffer(m_id, &m_send_wqe, attr); +#ifndef __COVERITY__ + struct tcphdr* p_tcp_h = (struct tcphdr*)(((uint8_t*)(&(p_pkt->hdr.m_ip_hdr))+sizeof(p_pkt->hdr.m_ip_hdr))); + NOT_IN_USE(p_tcp_h); /* to supress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + neigh_logdbg("Tx TCP segment info: src_port=%d, dst_port=%d, flags='%s%s%s%s%s%s' seq=%u, ack=%u, win=%u, payload_sz=%u", + ntohs(p_tcp_h->source), ntohs(p_tcp_h->dest), + p_tcp_h->urg?"U":"", p_tcp_h->ack?"A":"", p_tcp_h->psh?"P":"", + p_tcp_h->rst?"R":"", p_tcp_h->syn?"S":"", p_tcp_h->fin?"F":"", + ntohl(p_tcp_h->seq), ntohl(p_tcp_h->ack_seq), ntohs(p_tcp_h->window), + total_packet_len- p_tcp_h->doff*4 -34); +#endif + return true; +} + +void neigh_entry::priv_handle_neigh_reachable_event() +{ + //In case this is reachable event we should set ARP counter to 0 and stop the timer + //(we don't want to continue sending ARPs) + m_arp_counter = 0; + priv_unregister_timer(); +} + +//========================================== cache_observer functions implementation ============================ + + +bool neigh_entry::get_peer_info(neigh_val * p_val) +{ + neigh_logfunc("calling neigh_entry get_peer_info. 
state = %d", m_state); + BULLSEYE_EXCLUDE_BLOCK_START + if (p_val == NULL) { + neigh_logdbg("p_val is NULL, return false"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + auto_unlocker lock(m_lock); + if (m_state) { + neigh_logdbg("There is a valid val"); + *p_val = *m_val; + return m_state; + } + + /* If state is NOT_ACTIVE need to kick start state machine, + otherwise it means that it was already started*/ + if ((state_t)m_state_machine->get_curr_state() == ST_NOT_ACTIVE) + priv_kick_start_sm(); + + if (m_state) { + neigh_logdbg("There is a valid val"); + *p_val = *m_val; + return m_state; + } + + return false; +} + +// Overriding subject's register_observer +bool neigh_entry::register_observer(const observer* const new_observer) +{ + /* register_observer should kick start neigh state machine in case m_state is not valid + * and state of State Machine is NOT_ACTIVE + */ + neigh_logdbg("Observer = %p ", new_observer); + + if (subject::register_observer(new_observer)) + { + if (!m_state && ((state_t) m_state_machine->get_curr_state()== ST_NOT_ACTIVE)) + { + neigh_logdbg("SM state is ST_NOT_ACTIVE Kicking SM start"); + priv_kick_start_sm(); + } + return true; + } + return false; +} + +const std::string neigh_entry::to_str() const +{ + return m_to_str; +} + +void neigh_entry::handle_neigh_event(neigh_nl_event* nl_ev) +{ + const netlink_neigh_info* nl_info = nl_ev->get_neigh_info(); + + int neigh_state = nl_info->state; + switch (neigh_state) + { + + case NUD_REACHABLE: + case NUD_PERMANENT: + { + BULLSEYE_EXCLUDE_BLOCK_START + if(m_state_machine == NULL) { + neigh_logerr("m_state_machine: not a valid case"); + break; + } + BULLSEYE_EXCLUDE_BLOCK_END + + neigh_logdbg("state = '%s' (%d) L2 address = %s", nl_info->get_state2str().c_str(), neigh_state, nl_info->lladdr_str.c_str()); + priv_handle_neigh_reachable_event(); + /* In case we got REACHABLE event need to do the following + * Check that neigh has L2 address + * if not send event to neigh + * else need to 
check that the new l2 address is equal to the old one + * if not equal this is a remote bonding event - issue an EV_ERROR + */ + auto_unlocker lock(m_lock); + // This if and priv_handle_neigh_ha_event should be done under lock + if (m_state_machine->get_curr_state() != ST_READY) { + // This is new entry + event_handler(EV_ARP_RESOLVED); + break; + } + + // Check if neigh L2 address changed (HA event) and restart the state machine + priv_handle_neigh_is_l2_changed(nl_info->lladdr); + break; + } + + case NUD_STALE: + { + BULLSEYE_EXCLUDE_BLOCK_START + if(m_state_machine == NULL) { + neigh_logerr("m_state_machine: not a valid case"); + break; + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_lock.lock(); + if (m_state_machine->get_curr_state() != ST_READY) { + // This is new entry, neigh entry state != READY + neigh_logdbg("state = '%s' m_state_machine != ST_READY - Doing nothing", nl_info->get_state2str().c_str()); + m_lock.unlock(); + break; + } + // Check if neigh L2 address changed (HA event) and restart the state machine + neigh_logdbg("state = '%s' (%d) L2 address = %s", nl_info->get_state2str().c_str(), neigh_state, nl_info->lladdr_str.c_str()); + bool ret = priv_handle_neigh_is_l2_changed(nl_info->lladdr); + m_lock.unlock(); + + if(! 
ret ) { + //If L2 address wasn't changed we need to send ARP + send_arp(); + m_timer_handle = priv_register_timer_event(m_n_sysvar_neigh_wait_till_send_arp_msec, this, ONE_SHOT_TIMER, NULL); + } + break; + } + + case NUD_INCOMPLETE: + { + neigh_logdbg("state = INCOMPLETE"); + break; + } + + case NUD_FAILED: + { + neigh_logdbg("state = FAILED"); + event_handler(EV_ERROR); + break; + } + + default: + { + neigh_logdbg("Unhandled state = '%s' (%d)", nl_info->get_state2str().c_str(), neigh_state); + break; + } + } +} + +//============================ Functions that handling events for state machine =================================== + +const char* neigh_entry::event_to_str(event_t event) const +{ + switch (event) + { + case EV_KICK_START: + return "EV_KICK_START"; + case EV_START_RESOLUTION: + return "EV_START_RESOLUTION"; + case EV_ARP_RESOLVED: + return "EV_ARP_RESOLVED"; + case EV_ADDR_RESOLVED: + return "EV_ADDR_RESOLVED"; + case EV_PATH_RESOLVED: + return "EV_PATH_RESOLVED"; + case EV_ERROR: + return "EV_ERROR"; + case EV_TIMEOUT_EXPIRED: + return "EV_TIMEOUT_EXPIRED"; + case EV_UNHANDLED: + return "EV_UNHANDELED"; + BULLSEYE_EXCLUDE_BLOCK_START + default: + return "Undefined"; + BULLSEYE_EXCLUDE_BLOCK_END + } + +} + +const char* neigh_entry::state_to_str(state_t state) const +{ + switch (state) + { + case ST_NOT_ACTIVE: + return "NEIGH_NOT_ACTIVE"; + case ST_ERROR: + return "NEIGH_ERROR"; + case ST_INIT: + return "NEIGH_INIT"; + case ST_INIT_RESOLUTION: + return "NEIGH_INIT_RESOLUTION"; + case ST_ADDR_RESOLVED: + return "NEIGH_ADDR_RESOLVED"; + case ST_ARP_RESOLVED: + return "NEIGH_ARP_RESOLVED"; + case ST_PATH_RESOLVED: + return "NEIGH_PATH_RESOLVED"; + case ST_READY: + return "NEIGH_READY"; + BULLSEYE_EXCLUDE_BLOCK_START + default: + return "Undefined"; + BULLSEYE_EXCLUDE_BLOCK_END + } +} + +/* + * RDMA_CM_EVENT_ADDR_RESOLVED will be mapped to neigh_entry:event_t::ADDRESS_RESOLVED + * RDMA_CM_EVENT_ADDR_ERROR, RDMA_CM_EVENT_ROUTE_ERROR, 
RDMA_CM_EVENT_MULTICAST_ERROR will be mapped to neigh_entry:event_t::RESTART + * RDMA_CM_EVENT_MULTICAST_JOIN and RDMA_CM_EVENT_ROUTE_RESOLVED will be mapped to neigh_entry:event_t::PATH_RESOLVED + * We are not going to handle local errors events, what is interesting is remote error events or fabric events in case of IB. + * For local errors we will have netlink event that entry is deleted - need to think where it will be handled in neigh_tbl_mgr or neigh_entry + */ +neigh_entry::event_t neigh_entry::rdma_event_mapping(struct rdma_cm_event* p_rdma_cm_event) +{ + // General check of cma_id + BULLSEYE_EXCLUDE_BLOCK_START + if (m_cma_id != NULL && m_cma_id != p_rdma_cm_event->id) { + neigh_logerr("cma_id %p != event->cma_id %p", m_cma_id, p_rdma_cm_event->id); + return EV_UNHANDLED; + } + BULLSEYE_EXCLUDE_BLOCK_END + neigh_logdbg("Got event %s (%d)", priv_rdma_cm_event_type_str(p_rdma_cm_event->event), p_rdma_cm_event->event); + + switch (p_rdma_cm_event->event) + { + case RDMA_CM_EVENT_ADDR_RESOLVED: + return EV_ADDR_RESOLVED; + + case RDMA_CM_EVENT_MULTICAST_JOIN: + case RDMA_CM_EVENT_ROUTE_RESOLVED: + return EV_PATH_RESOLVED; + + case RDMA_CM_EVENT_ADDR_ERROR: + case RDMA_CM_EVENT_MULTICAST_ERROR: + case RDMA_CM_EVENT_ROUTE_ERROR: + case RDMA_CM_EVENT_TIMEWAIT_EXIT: + return EV_ERROR; + BULLSEYE_EXCLUDE_BLOCK_START + default: + neigh_logdbg("Un-handled rdma_cm event %d", p_rdma_cm_event->event); + return EV_UNHANDLED; + BULLSEYE_EXCLUDE_BLOCK_END + } +} + +// call this function from the transition functions only (instead of using recursive lock) +void neigh_entry::priv_event_handler_no_locks(event_t event, void* p_event_info) +{ + neigh_logfunc("Enter: event %s", event_to_str(event)); + m_state_machine->process_event(event, p_event_info); +} + +void neigh_entry::event_handler(event_t event, void* p_event_info) +{ + neigh_logfunc("Enter: event %s", event_to_str(event)); + BULLSEYE_EXCLUDE_BLOCK_START + if (event == EV_UNHANDLED) { + neigh_logdbg("Enter: event %s. 
UNHANDLED event - Ignored!", event_to_str(event)); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + m_sm_lock.lock(); + priv_event_handler_no_locks(event, p_event_info); + m_sm_lock.unlock(); +} + +void neigh_entry::handle_event_rdma_cm_cb(struct rdma_cm_event* p_event) +{ + event_t event = rdma_event_mapping(p_event); + event_handler(event, p_event); +} + +//================================== Static functions for state machine dofunc =========================== +//General entry dofunc +void neigh_entry::general_st_entry(const sm_info_t& func_info) +{ + neigh_entry* my_neigh = (neigh_entry *) func_info.app_hndl; + my_neigh->priv_general_st_entry(func_info); +} + +//General leave dofunc +void neigh_entry::general_st_leave(const sm_info_t& func_info) +{ + neigh_entry* my_neigh = (neigh_entry *) func_info.app_hndl; + my_neigh->priv_general_st_leave(func_info); + + /* + if (my_conn_mgr->m_timer_handle) { + g_p_event_handler_manager->unregister_timer_event(my_conn_mgr, my_conn_mgr->m_timer_handle); + my_conn_mgr->m_timer_handle = NULL; + } + */ +} + +void neigh_entry::print_event_info(int state, int event, void* app_data) +{ + neigh_entry * my_neigh = (neigh_entry *) app_data; + my_neigh->priv_print_event_info((state_t) state, (event_t) event); +} + +//Static enter function for NOT_ACTIVE state +void neigh_entry::dofunc_enter_not_active(const sm_info_t& func_info) +{ + //Need to change entry state to false + neigh_entry * my_neigh = (neigh_entry *) func_info.app_hndl; + general_st_entry(func_info); + my_neigh->priv_enter_not_active(); +} + +//Static enter function for ERROR state +void neigh_entry::dofunc_enter_error(const sm_info_t& func_info) +{ + //Need to change entry state to false + neigh_entry * my_neigh = (neigh_entry *) func_info.app_hndl; + general_st_entry(func_info); + my_neigh->priv_enter_error(); +} + +//Static enter function for INIT state +void neigh_entry::dofunc_enter_init(const sm_info_t& func_info) +{ + neigh_entry * my_neigh = (neigh_entry *) 
func_info.app_hndl; + general_st_entry(func_info); + run_helper_func(priv_enter_init(), EV_ERROR); +} + +//Static enter function for INIT_RESOLUTION state +void neigh_entry::dofunc_enter_init_resolution(const sm_info_t& func_info) +{ + neigh_entry * my_neigh = (neigh_entry *) func_info.app_hndl; + general_st_entry(func_info); + run_helper_func(priv_enter_init_resolution(), EV_ERROR); +} + +//Static enter function for ADDR_RESOLVED state +void neigh_entry::dofunc_enter_addr_resolved(const sm_info_t& func_info) +{ + neigh_entry * my_neigh = (neigh_entry *) func_info.app_hndl; + general_st_entry(func_info); + run_helper_func(priv_enter_addr_resolved(), EV_ERROR); +} + +//Static enter function for READY state +void neigh_entry::dofunc_enter_ready(const sm_info_t& func_info) +{ + neigh_entry * my_neigh = (neigh_entry *) func_info.app_hndl; + general_st_entry(func_info); + run_helper_func(priv_enter_ready(), EV_ERROR); +} + +// ================================== private functions for sate machine ============================================ + +void neigh_entry::priv_general_st_entry(const sm_info_t& func_info) +{ + NOT_IN_USE(func_info); /* to supress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + neigh_logdbg("State change: %s (%d) => %s (%d) with event %s (%d)", + state_to_str((state_t) func_info.old_state), func_info.old_state, + state_to_str((state_t) func_info.new_state), func_info.new_state, + event_to_str((event_t) func_info.event), func_info.event); +} + +void neigh_entry::priv_general_st_leave(const sm_info_t& func_info) +{ + NOT_IN_USE(func_info); +} + +void neigh_entry::priv_print_event_info(state_t state, event_t event) +{ + NOT_IN_USE(state); /* to supress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(event); /* to supress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + neigh_logdbg("Got event '%s' (%d) in state '%s' (%d)", + event_to_str(event), event, state_to_str(state), state); +} + +//Function that start neigh State Machine (SM) +void 
neigh_entry::priv_kick_start_sm() +{ + neigh_logdbg("Kicking connection start"); + event_handler(EV_KICK_START); +} + +//Private enter function for INIT state +int neigh_entry::priv_enter_init() +{ + m_timer_handle = priv_register_timer_event(0, this, ONE_SHOT_TIMER, NULL); + return 0; +} + +//Private enter function for INIT_RESOLUTION state +int neigh_entry::priv_enter_init_resolution() +{ + if (NULL == g_p_neigh_table_mgr->m_neigh_cma_event_channel) { + return 0; + } + + // 1. Delete old cma_id + priv_destroy_cma_id(); + + // 2. Create cma_id + neigh_logdbg("Calling rdma_create_id"); + IF_RDMACM_FAILURE(rdma_create_id(g_p_neigh_table_mgr->m_neigh_cma_event_channel, &m_cma_id, (void *)this, m_rdma_port_space)) + { + neigh_logerr("Failed in rdma_create_id (errno=%d %m)", errno); + return -1; + } ENDIF_RDMACM_FAILURE; + + + // 3. Register our handler on internal channel event listener thread + g_p_event_handler_manager->register_rdma_cm_event + (g_p_neigh_table_mgr->m_neigh_cma_event_channel->fd, + (void*) m_cma_id, + (void*) g_p_neigh_table_mgr->m_neigh_cma_event_channel, + this); + + // 4. Start RDMA address resolution + neigh_logdbg("Calling rdma_resolve_addr, src=%d.%d.%d.%d, dst=%d.%d.%d.%d", NIPQUAD(m_src_addr.sin_addr.s_addr), NIPQUAD(m_dst_addr.sin_addr.s_addr)); + + /* we had issues passing unicast src addr, let it find the correct one itself */ + struct sockaddr* src = IN_MULTICAST_N(m_dst_addr.sin_addr.s_addr) ? 
(struct sockaddr*)&m_src_addr : NULL; + + IF_RDMACM_FAILURE(rdma_resolve_addr(m_cma_id, src, (struct sockaddr*)&m_dst_addr, 2000)) + { + neigh_logdbg("Failed in rdma_resolve_addr m_cma_id = %p (errno=%d %m)", m_cma_id, errno); + return -1; + } ENDIF_RDMACM_FAILURE; + + return 0; +} + +//Private enter function for ADDR_RESOLVED state +int neigh_entry::priv_enter_addr_resolved() +{ + neigh_logfunc(""); + + m_lock.lock(); + + int state; + + if (!priv_get_neigh_state(state) || !priv_is_reachable(state)) { + neigh_logdbg("got addr_resolved but state=%d", state); + send_arp(); + m_timer_handle = priv_register_timer_event(m_n_sysvar_neigh_wait_till_send_arp_msec, this, ONE_SHOT_TIMER, NULL); + m_lock.unlock(); + return 0; + } else { + event_handler(EV_ARP_RESOLVED); + } + + m_lock.unlock(); + return 0; +} + +//Private enter function for NOT_ACTIVE state +void neigh_entry::priv_enter_not_active() +{ + neigh_logfunc(""); + + auto_unlocker lock(m_lock); + + m_state = false; + + priv_destroy_cma_id(); + priv_unregister_timer(); + m_is_first_send_arp = true; // force send boardcast next cycle + m_arp_counter = 0; + + // Flush unsent_queue in case that neigh entry is in error state + + if (!m_unsent_queue.empty()) { + neigh_logdbg("Flushing unsent queue"); + + while (!m_unsent_queue.empty()) + { + neigh_send_data * packet = m_unsent_queue.front(); + m_unsent_queue.pop_front(); + delete packet; + } + } + + if (m_val) { + neigh_logdbg("calling to zero_all_members()"); + m_val->zero_all_members(); + } + + return; +} + +//Private enter function for NOT_ERROR state +void neigh_entry::priv_enter_error() +{ + neigh_logfunc(""); + + m_lock.lock(); + + m_state = false; + + priv_destroy_cma_id(); + priv_unregister_timer(); + m_is_first_send_arp = true; // force send boardcast next cycle + m_arp_counter = 0; + + if (m_val) { + neigh_logdbg("calling to zero_all_members()"); + m_val->zero_all_members(); + } + + m_lock.unlock(); + + //Need to notify observers that now this entry is not valid 
+ //We don't want to do it under neigh lock - can cause dead lock with prepare_to_send() of dst + notify_observers(NULL); + + m_lock.lock(); + //If unsent queue is not empty we will try to KICK START the connection, but only once + if (!m_unsent_queue.empty() && (m_err_counter < m_n_sysvar_neigh_num_err_retries)) { + neigh_logdbg("unsent_queue is not empty calling KICK_START"); + m_err_counter++; + event_handler(EV_KICK_START); + } + else { + neigh_logdbg("unsent_queue is empty or this is the #%d retry", m_err_counter + 1); + m_err_counter = 0; + event_handler(EV_ERROR); + } + m_lock.unlock(); + +} + +//Private enter function for READY state +int neigh_entry::priv_enter_ready() +{ + neigh_logfunc(""); + auto_unlocker lock(m_lock); + + m_state = true; + empty_unsent_queue(); + + int state; + // Need to send ARP in case neigh state is not REACHABLE and this is not MC neigh + // This is the case when VMA was started with neigh in STALE state and + // rdma_adress_resolve() in this case will not initiate ARP + if (m_type == UC && ! 
m_is_loopback) { + if (priv_get_neigh_state(state) && !priv_is_reachable(state)) { + send_arp(); + m_timer_handle = priv_register_timer_event(m_n_sysvar_neigh_wait_till_send_arp_msec, this, ONE_SHOT_TIMER, NULL); + } + } + return 0; +} + +bool neigh_entry::priv_get_neigh_state(int & state) +{ + netlink_neigh_info info; + char str_addr[INET_ADDRSTRLEN]; + + if (m_is_loopback) { + state = NUD_REACHABLE; + return true; + } + + if (inet_ntop(AF_INET, &(m_dst_addr.sin_addr), str_addr, sizeof(str_addr)) && + g_p_netlink_handler->get_neigh(str_addr, m_p_dev->get_if_idx(), &info)) { + state = info.state; + neigh_logdbg("state = %s", info.get_state2str().c_str()); + return true; + } + + neigh_logdbg("Entry doesn't exist in netlink cache"); + return false; +} + +bool neigh_entry::priv_get_neigh_l2(address_t & l2_addr) +{ + netlink_neigh_info info; + char str_addr[INET_ADDRSTRLEN]; + + if (m_is_loopback) { + memcpy(l2_addr, m_p_dev->get_l2_address()->get_address(), m_p_dev->get_l2_address()->get_addrlen()); + return true; + } + + if (inet_ntop(AF_INET, &(m_dst_addr.sin_addr), str_addr, sizeof(str_addr)) && + g_p_netlink_handler->get_neigh(str_addr, m_p_dev->get_if_idx(), &info)){ + if (!priv_is_failed(info.state)) { + memcpy(l2_addr, info.lladdr, info.lladdr_len); + return true; + } + neigh_logdbg("Entry exists in netlink cache but state = %s", info.get_state2str().c_str()); + } + + neigh_logdbg("Entry doesn't exist in netlink cache"); + return false; + +} + +void neigh_entry::priv_destroy_cma_id() +{ + if (m_cma_id) { + g_p_event_handler_manager->unregister_rdma_cm_event( + g_p_neigh_table_mgr->m_neigh_cma_event_channel->fd, + (void*) m_cma_id); + neigh_logdbg("Calling rdma_destroy_id"); + IF_RDMACM_FAILURE(rdma_destroy_id(m_cma_id)) + { + neigh_logdbg("Failed in rdma_destroy_id (errno=%d %m)", errno); + } ENDIF_RDMACM_FAILURE; + m_cma_id = NULL; + } +} + +void* neigh_entry::priv_register_timer_event(int timeout_msec, timer_handler* handler, timer_req_type_t req_type, void* 
user_data){ + void* _timer_handler = NULL; + m_lock.lock(); + if(!is_cleaned()){ + _timer_handler = g_p_event_handler_manager->register_timer_event(timeout_msec, handler, req_type, user_data); + } + m_lock.unlock(); + return _timer_handler; +} + +void neigh_entry::priv_unregister_timer() +{ + if (m_timer_handle) { + // All timers in neigh are currently ONESHOT timers. + // Unregister of ONESHOT timer can lead to double free of timer, + // as ONESHOT timer free itself after it run. + // TODO: unregister all timers? is there just one or more? + //g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = NULL; + } +} +//============================================================== neigh_eth ================================================== + +neigh_eth::neigh_eth(neigh_key key) : + neigh_entry(key, VMA_TRANSPORT_ETH) +{ + neigh_logdbg(""); + m_rdma_port_space = RDMA_PS_UDP; + + if (IN_MULTICAST_N(key.get_in_addr())) { + //This is Multicast neigh + m_type = MC; + build_mc_neigh_val(); + return; + } + // This is Unicast neigh + m_type = UC; + + sm_short_table_line_t short_sm_table[] = + { + // {curr state, event, next state, action func } + + { ST_NOT_ACTIVE, EV_KICK_START, ST_INIT, NULL }, + { ST_ERROR, EV_KICK_START, ST_INIT, NULL }, + { ST_INIT, EV_ARP_RESOLVED, ST_READY, NULL }, + { ST_INIT, EV_START_RESOLUTION, ST_INIT_RESOLUTION, NULL }, + { ST_INIT_RESOLUTION, EV_ADDR_RESOLVED, ST_ADDR_RESOLVED, NULL }, + { ST_INIT_RESOLUTION, EV_ARP_RESOLVED, ST_READY, NULL }, + { ST_ADDR_RESOLVED, EV_ARP_RESOLVED, ST_READY, NULL }, + { ST_READY, EV_ERROR, ST_ERROR, NULL }, + { ST_INIT, EV_ERROR, ST_ERROR, NULL }, + { ST_INIT_RESOLUTION, EV_ERROR, ST_ERROR, NULL }, + { ST_ERROR, EV_ERROR, ST_NOT_ACTIVE, NULL }, + //Entry functions + { ST_INIT, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_init }, + { ST_INIT_RESOLUTION, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_init_resolution }, + { ST_ERROR, SM_STATE_ENTRY, SM_NO_ST, 
neigh_entry::dofunc_enter_error }, + { ST_NOT_ACTIVE, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_not_active }, + { ST_ADDR_RESOLVED, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_addr_resolved}, + { ST_READY, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_ready }, + SM_TABLE_END }; + + // Create state_nachine + m_state_machine = new state_machine(this, // app hndl + ST_NOT_ACTIVE, // start state_t + ST_LAST, // max states + EV_LAST, // max events + short_sm_table, // short table + general_st_entry, // default entry function + NULL, // default leave function + NULL, // default func + print_event_info // debug function + ); + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_state_machine == NULL) + neigh_logpanic("Failed allocating state_machine"); + BULLSEYE_EXCLUDE_BLOCK_END + + priv_kick_start_sm(); +} + +neigh_eth::~neigh_eth() +{ + neigh_logdbg(""); + priv_enter_not_active(); +} + +bool neigh_eth::is_deletable() +{ + if(m_type == MC) + return true; + return(neigh_entry::is_deletable()); +} + +bool neigh_eth::get_peer_info(neigh_val * p_val) +{ + neigh_logfunc("calling neigh_eth get_peer_info"); + if (m_type == MC) { + auto_unlocker lock(m_lock); + if (m_state) { + *p_val = *m_val; + return true; + } + else { + if (build_mc_neigh_val()) + return false; + else { + *p_val = *m_val; + return true; + } + } + } + + return (neigh_entry::get_peer_info(p_val)); +} + +bool neigh_eth::register_observer(const observer* const new_observer) +{ + neigh_logdbg("neigh_eth register_observer"); + // In case of ETH Multicast we should change neigh_entry register_observer behavior + if (m_type == MC) { + if (subject::register_observer(new_observer)) { + auto_unlocker lock(m_lock); + if (!m_state) + // Try to build it again + build_mc_neigh_val(); + return true; + } + return false; + } + + return (neigh_entry::register_observer(new_observer)); +} + +int neigh_eth::priv_enter_init() +{ + int state; + + if (priv_get_neigh_state(state) && !priv_is_failed(state)) { + 
event_handler(EV_ARP_RESOLVED); + return 0; + } + + return neigh_entry::priv_enter_init(); +} + +int neigh_eth::priv_enter_init_resolution() +{ + int state; + + if (!(neigh_entry::priv_enter_init_resolution())) { + // query netlink - if this entry already exist and REACHABLE we can use it + if (priv_get_neigh_state(state) && !priv_is_failed(state)) { + event_handler(EV_ARP_RESOLVED); + } + return 0; + } + + return -1; +} + +bool neigh_eth::priv_handle_neigh_is_l2_changed(address_t new_l2_address_str) +{ + auto_unlocker lock(m_lock); + ETH_addr new_l2_address(new_l2_address_str); + if(m_val) { + if(m_val->get_l2_address()) { + if (!((m_val->get_l2_address())->compare(new_l2_address))) { + neigh_logdbg("l2 address was changed (%s => %s)", (m_val->get_l2_address())->to_str().c_str(), new_l2_address.to_str().c_str()); + event_handler(EV_ERROR); + return true; + } + else + { + neigh_logdbg("No change in l2 address"); + return false; + } + } + else { + neigh_logdbg("l2 address is NULL"); + } + } + else { + neigh_logerr("m_val is NULL"); + } + + event_handler(EV_ERROR); + return true; +} + +int neigh_eth::priv_enter_ready() +{ + neigh_logfunc(""); + + // In case of ETH, we want to unregister from events and destroy rdma cm handle + priv_destroy_cma_id(); + if (!build_uc_neigh_val()) + return (neigh_entry::priv_enter_ready()); + + return -1; +} + +bool neigh_eth::post_send_arp(bool is_broadcast) +{ + header h; + neigh_logdbg("Sending %s ARP", is_broadcast?"BC":"UC"); + + net_device_val_eth *netdevice_eth = dynamic_cast(m_p_dev); + if (netdevice_eth == NULL) { + neigh_logdbg("Net dev is NULL not sending ARP"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + const L2_address *src = m_p_dev->get_l2_address(); + const L2_address *dst; + if (!is_broadcast) { + dst = m_val->get_l2_address(); + } + else { + dst = m_p_dev->get_br_address(); + } + + + const unsigned char* peer_mac = dst->get_address(); + BULLSEYE_EXCLUDE_BLOCK_START + if (src == NULL || dst == NULL) { + 
neigh_logdbg("src or dst is NULL not sending ARP"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_id = m_p_ring->generate_id(src->get_address(), dst->get_address(), netdevice_eth->get_vlan() ? htons(ETH_P_8021Q) : htons(ETH_P_ARP), htons(ETH_P_ARP), 0, 0, 0, 0); + mem_buf_desc_t* p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, 1); + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely(p_mem_buf_desc == NULL)) { + neigh_logdbg("No free TX buffer, not sending ARP"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + wqe_send_handler wqe_sh; + wqe_sh.init_wqe(m_send_wqe, &m_sge, 1); + + h.init(); + if (netdevice_eth->get_vlan()) { //vlan interface + h.configure_vlan_eth_headers(*src, *dst, netdevice_eth->get_vlan(), ETH_P_ARP); + } + else { + h.configure_eth_headers(*src, *dst, ETH_P_ARP); + } + + tx_packet_template_t *p_pkt = (tx_packet_template_t*)p_mem_buf_desc->p_buffer; + h.copy_l2_hdr(p_pkt); + + eth_arp_hdr* p_arphdr = (eth_arp_hdr*) (p_mem_buf_desc->p_buffer + h.m_transport_header_tx_offset + h.m_total_hdr_len); + set_eth_arp_hdr(p_arphdr, m_p_dev->get_local_addr(), get_key().get_in_addr(), m_p_dev->get_l2_address()->get_address(), peer_mac); + + m_sge.addr = (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)h.m_transport_header_tx_offset); + m_sge.length = sizeof(eth_arp_hdr) + h.m_total_hdr_len; + m_sge.lkey = p_mem_buf_desc->lkey; + p_mem_buf_desc->p_next_desc = NULL; + m_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc; + + m_p_ring->send_ring_buffer(m_id, &m_send_wqe, (vma_wr_tx_packet_attr)0); + + neigh_logdbg("ARP Sent"); + return true; +} + +bool neigh_eth::prepare_to_send_packet(header * h) +{ + neigh_logdbg(""); + + net_device_val_eth *netdevice_eth = dynamic_cast(m_p_dev); + BULLSEYE_EXCLUDE_BLOCK_START + if (netdevice_eth == NULL) { + neigh_logerr("Net dev is NULL dropping the packet"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + const L2_address *src = m_p_dev->get_l2_address(); + const L2_address *dst = m_val->get_l2_address(); + + 
BULLSEYE_EXCLUDE_BLOCK_START + if (src == NULL || dst == NULL) { + neigh_logdbg("src or dst is NULL not sending ARP"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + wqe_send_handler wqe_sh; + wqe_sh.init_wqe(m_send_wqe, &m_sge, 1); + + if (netdevice_eth->get_vlan()) { //vlan interface + h->configure_vlan_eth_headers(*src, *dst, netdevice_eth->get_vlan()); + } + else { + h->configure_eth_headers(*src, *dst); + } + + return(true); +} + +ring_user_id_t neigh_eth::generate_ring_user_id(header * h /* = NULL */) +{ + if (!h) + return m_p_ring->generate_id(); + + ethhdr *actual_header = (ethhdr*)h->m_actual_hdr_addr; + return m_p_ring->generate_id(actual_header->h_source, + actual_header->h_dest, + actual_header->h_proto, + htons(ETH_P_IP), + h->m_header.hdr.m_ip_hdr.saddr, + h->m_header.hdr.m_ip_hdr.daddr, + h->m_header.hdr.m_udp_hdr.source, + h->m_header.hdr.m_udp_hdr.dest); +} + +//============================================================== neigh_ib ================================================== + +neigh_ib::neigh_ib(neigh_key key, bool is_init_resources) : + neigh_entry(key, VMA_TRANSPORT_IB, is_init_resources), m_pd(NULL), m_n_sysvar_wait_after_join_msec(safe_mce_sys().wait_after_join_msec) +{ + neigh_logdbg(""); + + m_rdma_port_space = RDMA_PS_IPOIB; + + if(IS_BROADCAST_N(key.get_in_addr())) { + //In case of broadcast neigh we don't want to have state machine + m_type = MC; + return; + } + + if (IN_MULTICAST_N(key.get_in_addr())) { + //This is Multicast neigh + m_type = MC; + } + else { + // This is Unicast neigh + m_type = UC; + } + //Do we need to handle case when we get EV_ERROR but in case this error is not related to the state + //Like Address Resolve Error when we at ST_ARP_RESOLVED or ST_PATH_RESOLVED .... 
+ + sm_short_table_line_t short_sm_table[] = + { + // {curr state, event, next state, action func } + { ST_NOT_ACTIVE, EV_KICK_START, ST_INIT, NULL }, + { ST_ERROR, EV_KICK_START, ST_INIT, NULL }, + { ST_INIT, EV_START_RESOLUTION, ST_INIT_RESOLUTION, NULL }, + { ST_INIT_RESOLUTION, EV_ADDR_RESOLVED, ST_ARP_RESOLVED, NULL }, + { ST_ARP_RESOLVED, EV_PATH_RESOLVED, ST_PATH_RESOLVED, NULL }, + { ST_PATH_RESOLVED, EV_TIMEOUT_EXPIRED, ST_READY, NULL }, + { ST_PATH_RESOLVED, EV_ERROR, ST_ERROR, NULL }, + { ST_ARP_RESOLVED, EV_ERROR, ST_ERROR, NULL }, + { ST_READY, EV_ERROR, ST_ERROR, NULL }, + { ST_INIT, EV_ERROR, ST_ERROR, NULL }, + { ST_INIT_RESOLUTION, EV_ERROR, ST_ERROR, NULL }, + { ST_ERROR, EV_ERROR, ST_NOT_ACTIVE, NULL }, + //Entry functions + { ST_INIT, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_init }, + { ST_INIT_RESOLUTION, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_init_resolution }, + { ST_ARP_RESOLVED, SM_STATE_ENTRY, SM_NO_ST, neigh_ib::dofunc_enter_arp_resolved }, + { ST_PATH_RESOLVED, SM_STATE_ENTRY, SM_NO_ST, neigh_ib::dofunc_enter_path_resolved }, + { ST_READY, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_ready }, + { ST_NOT_ACTIVE, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_not_active }, + { ST_ERROR, SM_STATE_ENTRY, SM_NO_ST, neigh_entry::dofunc_enter_error }, + SM_TABLE_END }; + + // Create state_nachine + m_state_machine = new state_machine(this, // app hndl + ST_NOT_ACTIVE, // start state_t + ST_LAST, // max states + EV_LAST, // max events + short_sm_table, // short table + general_st_entry, // default entry function + general_st_leave, // default leave function + NULL, // default func + print_event_info // debug function + ); + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_state_machine == NULL) + neigh_logpanic("Failed allocating state_machine"); + BULLSEYE_EXCLUDE_BLOCK_END + + priv_kick_start_sm(); +} + +neigh_ib::~neigh_ib() +{ + priv_enter_not_active(); +} + +void neigh_ib::handle_event_ibverbs_cb(void* ev_data, 
void* ctx) +{ + NOT_IN_USE(ctx); + event_t event = ibverbs_event_mapping(ev_data); + event_handler(event, ev_data); +} + +// called when timer expired +void neigh_ib::handle_timer_expired(void* ctx) +{ + neigh_logdbg("general timeout expired!"); + + m_sm_lock.lock(); + int state = m_state_machine->get_curr_state(); + m_sm_lock.unlock(); + + if(state == ST_PATH_RESOLVED) { + // Clear Timer Handler + m_timer_handle = NULL; + event_handler(EV_TIMEOUT_EXPIRED); + } + else if(state == ST_READY) { + neigh_entry::handle_timer_expired(ctx); + } + else if(state == ST_INIT) { + // Clear Timer Handler + m_timer_handle = NULL; + event_handler(EV_START_RESOLUTION); + } +} + +bool neigh_ib::priv_handle_neigh_is_l2_changed(address_t new_l2_addr) +{ + auto_unlocker lock(m_lock); + IPoIB_addr new_l2_address(new_l2_addr); + if (m_val) { + if(m_val->get_l2_address()) { + if (!(m_val->get_l2_address()->compare(new_l2_address))) { + neigh_logdbg("l2 address was changed (%s => %s)", (m_val->get_l2_address())->to_str().c_str(), new_l2_address.to_str().c_str()); + event_handler(EV_ERROR); + return true; + } + else { + neigh_logdbg("No change in l2 address"); + return false; + } + } + else { + neigh_logdbg("l2 address is NULL\n"); + } + } + else { + neigh_logerr("m_val is NULL"); + } + + event_handler(EV_ERROR); + return true; +} + +bool neigh_ib::post_send_arp(bool is_broadcast) +{ + neigh_logdbg("Sending %s ARP", is_broadcast?"BC":"UC"); + + mem_buf_desc_t* p_mem_buf_desc = m_p_ring->mem_buf_tx_get(m_id, false, 1); + if (unlikely(p_mem_buf_desc == NULL)) { + neigh_logdbg("No free TX buffer, not sending ARP"); + return false; + } + + net_device_val_ib *netdevice_ib = dynamic_cast(m_p_dev); + if (netdevice_ib == NULL) { + m_p_ring->mem_buf_tx_release(p_mem_buf_desc, true); + neigh_logdbg("Net dev is NULL not sending ARP"); + return false; + } + + const L2_address *src = netdevice_ib->get_l2_address(); + const L2_address *dst; + neigh_ib_val br_neigh_val; + ibv_ah* ah = NULL; + uint32_t 
qpn; + uint32_t qkey; + const unsigned char* peer_mac = NULL; + if (!is_broadcast) { + dst = m_val->get_l2_address(); + peer_mac = dst->get_address(); + ah = ((neigh_ib_val *)m_val)->get_ah(); + qpn = ((neigh_ib_val *)m_val)->get_qpn(); + qkey = ((neigh_ib_val *)m_val)->get_qkey(); + } + else { + dst = m_p_dev->get_br_address(); + neigh_ib_broadcast * br_neigh = const_cast(((net_device_val_ib*)m_p_dev)->get_br_neigh()); + bool ret = br_neigh->get_peer_info(&br_neigh_val); + if (ret) { + ah = br_neigh_val.get_ah(); + qpn = br_neigh_val.get_qpn(); + qkey = br_neigh_val.get_qkey(); + } + else { + m_p_ring->mem_buf_tx_release(p_mem_buf_desc, true); + neigh_logdbg("BR Neigh is not valid, not sending BR ARP"); + return false; + } + } + + if (src == NULL || dst == NULL) { + m_p_ring->mem_buf_tx_release(p_mem_buf_desc, true); + neigh_logdbg("src or dst is NULL not sending ARP"); + return false; + } + + wqe_send_ib_handler wqe_sh; + wqe_sh.init_ib_wqe(m_send_wqe, &m_sge, 1, ah, qpn, qkey); + neigh_logdbg("ARP: ah=%#x, qkey=%#x, qpn=%#x", ah ,qkey, qpn); + header h; + h.init(); + h.configure_ipoib_headers(IPOIB_ARP_HEADER); + + + tx_packet_template_t *p_pkt = (tx_packet_template_t*)p_mem_buf_desc->p_buffer; + h.copy_l2_hdr(p_pkt); + + ib_arp_hdr* p_arphdr = (ib_arp_hdr*) (p_mem_buf_desc->p_buffer + h.m_transport_header_tx_offset + h.m_total_hdr_len); + set_ib_arp_hdr(p_arphdr, m_p_dev->get_local_addr(), get_key().get_in_addr(), m_p_dev->get_l2_address()->get_address(), peer_mac); + + m_sge.addr = (uintptr_t)(p_mem_buf_desc->p_buffer + (uint8_t)h.m_transport_header_tx_offset); + m_sge.length = sizeof(ib_arp_hdr) + h.m_total_hdr_len; + m_sge.lkey = p_mem_buf_desc->lkey; + p_mem_buf_desc->p_next_desc = NULL; + m_send_wqe.wr_id = (uintptr_t)p_mem_buf_desc; + + m_p_ring->send_ring_buffer(m_id, &m_send_wqe, (vma_wr_tx_packet_attr)0); + + neigh_logdbg("ARP Sent"); + return true; +} + +bool neigh_ib::prepare_to_send_packet(header * h) +{ + neigh_logdbg(""); + wqe_send_ib_handler 
wqe_sh; + wqe_sh.init_ib_wqe(m_send_wqe, &m_sge , 1, ((neigh_ib_val *)m_val)->get_ah(), ((neigh_ib_val *)m_val)->get_qpn(), ((neigh_ib_val *)m_val)->get_qkey()); + h->configure_ipoib_headers(); + + return true; +} + +neigh_entry::event_t neigh_ib::ibverbs_event_mapping(void* p_event_info) +{ + struct ibv_async_event *ev = (struct ibv_async_event *) p_event_info; + neigh_logdbg("Got event %s (%d) ", priv_ibv_event_desc_str(ev->event_type), ev->event_type); + + switch (ev->event_type) + { + case IBV_EVENT_SM_CHANGE: + case IBV_EVENT_CLIENT_REREGISTER: + return EV_ERROR; + default: + return EV_UNHANDLED; + } +} + +void neigh_ib::dofunc_enter_arp_resolved(const sm_info_t& func_info) +{ + neigh_ib * my_neigh = (neigh_ib *) func_info.app_hndl; + neigh_entry::general_st_entry(func_info); + + run_helper_func(priv_enter_arp_resolved(), EV_ERROR); +} + +void neigh_ib::dofunc_enter_path_resolved(const sm_info_t& func_info) +{ + neigh_ib * my_neigh = (neigh_ib *) func_info.app_hndl; + neigh_entry::general_st_entry(func_info); + + uint32_t wait_after_join_msec; + + run_helper_func(priv_enter_path_resolved((struct rdma_cm_event*)func_info.ev_data, wait_after_join_msec), + EV_ERROR); + my_neigh->m_timer_handle = my_neigh->priv_register_timer_event(wait_after_join_msec, my_neigh, ONE_SHOT_TIMER, NULL); +} + +int neigh_ib::priv_enter_arp_resolved() +{ + neigh_logfunc(""); + + if (m_cma_id->verbs == NULL) { + neigh_logdbg("m_cma_id->verbs is NULL"); + return -1; + } + + if (find_pd()) + return -1; + + //Register Verbs event in case there was Fabric change + if (m_cma_id->verbs) { + g_p_event_handler_manager->register_ibverbs_event( + m_cma_id->verbs->async_fd, this, + m_cma_id->verbs, 0); + } + + if (m_type == UC) + return (handle_enter_arp_resolved_uc()); + else + // MC + return (handle_enter_arp_resolved_mc()); +} + +int neigh_ib::priv_enter_path_resolved(struct rdma_cm_event* event_data, + uint32_t & wait_after_join_msec) +{ + neigh_logfunc(""); + + if (m_val == NULL) + //This is 
the first time we are trying to allocate new val or it failed last time + m_val = new neigh_ib_val; + + BULLSEYE_EXCLUDE_BLOCK_START + if (m_val == NULL) + return -1; + BULLSEYE_EXCLUDE_BLOCK_END + + if (m_type == UC) + return (build_uc_neigh_val(event_data, wait_after_join_msec)); + else + //MC + return (build_mc_neigh_val(event_data, wait_after_join_msec)); +} + +void neigh_ib::priv_enter_error() +{ + auto_unlocker lock(m_lock); + + m_state = false; + m_pd = NULL; + + destroy_ah(); + priv_unregister_timer(); + + if (m_cma_id && m_cma_id->verbs) { + neigh_logdbg("Unregister Verbs event"); + g_p_event_handler_manager->unregister_ibverbs_event(m_cma_id->verbs->async_fd, this); + } + + neigh_entry::priv_enter_error(); +} + +void neigh_ib::priv_enter_not_active() +{ + neigh_logfunc(""); + + auto_unlocker lock(m_lock); + + m_state = false; + m_pd = NULL; + + destroy_ah(); + + if (m_cma_id && m_cma_id->verbs) { + neigh_logdbg("Unregister Verbs event"); + g_p_event_handler_manager->unregister_ibverbs_event(m_cma_id->verbs->async_fd, this); + } + + neigh_entry::priv_enter_not_active(); +} + +int neigh_ib::priv_enter_ready() +{ + neigh_logfunc(""); + priv_unregister_timer(); + return (neigh_entry::priv_enter_ready()); +} + +int neigh_ib::handle_enter_arp_resolved_mc() +{ + neigh_logdbg(""); + + IF_RDMACM_FAILURE(rdma_join_multicast( m_cma_id, (struct sockaddr*)&m_dst_addr, (void *)this)) + { + neigh_logdbg("Failed in rdma_join_multicast (errno=%d %m)", errno); + return -1; + } ENDIF_RDMACM_FAILURE; + + return 0; +} + +int neigh_ib::handle_enter_arp_resolved_uc() +{ + neigh_logdbg(""); + + IF_RDMACM_FAILURE(rdma_resolve_route(m_cma_id, RDMA_CM_TIMEOUT)) + { + neigh_logdbg("Resolve address error (errno=%d %m)", errno); + return -1; + } ENDIF_RDMACM_FAILURE; + + return 0; +} + +int neigh_ib::build_mc_neigh_val(struct rdma_cm_event* event_data, + uint32_t & wait_after_join_msec) +{ + neigh_logdbg(""); + + m_val->m_l2_address = new IPoIB_addr(event_data->param.ud.qp_num, 
(address_t)event_data->param.ud.ah_attr.grh.dgid.raw); + BULLSEYE_EXCLUDE_BLOCK_START + if (m_val->m_l2_address == NULL) { + neigh_logdbg("Failed allocating m_val->m_l2_address"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + ((neigh_ib_val *) m_val)->m_qkey = event_data->param.ud.qkey; + + memcpy(&((neigh_ib_val *) m_val)->m_ah_attr, + &event_data->param.ud.ah_attr, + sizeof(((neigh_ib_val *) m_val)->m_ah_attr)); + + BULLSEYE_EXCLUDE_BLOCK_START + if (create_ah()) + return -1; + BULLSEYE_EXCLUDE_BLOCK_END + + neigh_logdbg("IB multicast neigh params are : ah=%#x, qkey=%#x, sl=%#x, rate=%#x, port_num = %#x, qpn=%#x dlid=%#x dgid = " IPOIB_HW_ADDR_PRINT_FMT_16, + ((neigh_ib_val *) m_val)->m_ah, ((neigh_ib_val *) m_val)->m_qkey, ((neigh_ib_val *) m_val)->m_ah_attr.sl, ((neigh_ib_val *) m_val)->m_ah_attr.static_rate, + ((neigh_ib_val *) m_val)->m_ah_attr.port_num, ((neigh_ib_val *) m_val)->get_qpn(), ((neigh_ib_val *) m_val)->m_ah_attr.dlid, + IPOIB_HW_ADDR_PRINT_ADDR_16(((neigh_ib_val *) m_val)->m_ah_attr.grh.dgid.raw)); + /*neigh_logerr("flow_label = %#x, sgid_index=%#x, hop_limit=%#x, traffic_class=%#x", ((neigh_ib_val *) m_val)->m_ah_attr.grh.flow_label, ((neigh_ib_val *) m_val)->m_ah_attr.grh.sgid_index, + ((neigh_ib_val *) m_val)->m_ah_attr.grh.hop_limit, ((neigh_ib_val *) m_val)->m_ah_attr.grh.traffic_class); + */ + wait_after_join_msec = m_n_sysvar_wait_after_join_msec; + + return 0; +} + +int neigh_ib::build_uc_neigh_val(struct rdma_cm_event* event_data, + uint32_t & wait_after_join_msec) +{ + NOT_IN_USE(event_data); + neigh_logdbg(""); + + // Find peer's IPoIB row address + unsigned char tmp[IPOIB_HW_ADDR_LEN]; + address_t address = (address_t) tmp; + BULLSEYE_EXCLUDE_BLOCK_START + if (!priv_get_neigh_l2(address)) { + neigh_logdbg("Failed in priv_get_neigh_l2()"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_val->m_l2_address = new IPoIB_addr(address); + BULLSEYE_EXCLUDE_BLOCK_START + if (m_val->m_l2_address == NULL) { + neigh_logdbg("Failed 
creating m_val->m_l2_address"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + neigh_logdbg("IPoIB MAC = %s", m_val->m_l2_address->to_str().c_str()); + // IPoIB qkey is hard coded in SM . Do we want to take it from event or leave it hard coded + //((neigh_ib_val *) m_val)->m_qkey = event_data->param.ud.qkey; //0x0b1b; + ((neigh_ib_val *) m_val)->m_qkey = IPOIB_QKEY; + + //memcpy(&m_val.ib_addr.m_ah_attr, &event_data->param.ud.ah_attr, sizeof(struct ibv_ah_attr)); + + if (!m_cma_id || m_cma_id->route.num_paths <= 0) { + neigh_logdbg("Can't prepare AH attr (cma_id=%p, num_paths=%d)", m_cma_id, m_cma_id ? m_cma_id->route.num_paths : 0); + return -1; + } + + memset(&((neigh_ib_val *) m_val)->m_ah_attr, 0, sizeof(((neigh_ib_val *) m_val)->m_ah_attr)); + ((neigh_ib_val *) m_val)->m_ah_attr.dlid = ntohs(m_cma_id->route.path_rec->dlid); + ((neigh_ib_val *) m_val)->m_ah_attr.sl = m_cma_id->route.path_rec->sl; + ((neigh_ib_val *) m_val)->m_ah_attr.src_path_bits = 0; + ((neigh_ib_val *) m_val)->m_ah_attr.static_rate = m_cma_id->route.path_rec->rate; + ((neigh_ib_val *) m_val)->m_ah_attr.is_global = 0; + ((neigh_ib_val *) m_val)->m_ah_attr.port_num = m_cma_id->port_num; + + BULLSEYE_EXCLUDE_BLOCK_START + if (create_ah()) + return -1; + BULLSEYE_EXCLUDE_BLOCK_END + + neigh_logdbg("IB unicast neigh params ah=%#x, qkey=%#x, qpn=%#x, dlid=%#x", ((neigh_ib_val *) m_val)->m_ah, + ((neigh_ib_val *) m_val)->m_qkey, ((neigh_ib_val *) m_val)->get_qpn(), ((neigh_ib_val *) m_val)->m_ah_attr.dlid); + + wait_after_join_msec = 0; + + return 0; +} + +int neigh_ib::find_pd() +{ + neigh_logdbg(""); + + ib_ctx_handler* ib_ctx_h = g_p_ib_ctx_handler_collection->get_ib_ctx(m_p_dev->get_ifname_link()); + + if (ib_ctx_h) { + m_pd = ib_ctx_h->get_ibv_pd(); + return 0; + } + + return -1; +} + +int neigh_ib::create_ah() +{ + neigh_logdbg(""); + + /* if (((neigh_ib_val *) m_val)->m_ah) { + // if there's ah we want to destroy it - shouldn't happen + neigh_logerr("destroy ah %p (shouldn't happen)", 
((neigh_ib_val *) m_val)->m_ah); + if (destroy_ah()) + return -1; + } + */ + ((neigh_ib_val *) m_val)->m_ah = ibv_create_ah(m_pd, &((neigh_ib_val *) m_val)->m_ah_attr); + BULLSEYE_EXCLUDE_BLOCK_START + if (!((neigh_ib_val *) m_val)->m_ah) { + neigh_logdbg("failed creating address handler (errno=%d %m)", errno); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + return 0; +} + +int neigh_ib::destroy_ah() +{ + neigh_logdbg(""); + //For now we shouldn't destroy it + //We cannot destroy ah till each post_send with this ah has ended + //TODO: Need to think how to handle this - for now there will be ah leak + return 0; +#if 0 //unreachable code +#ifndef __COVERITY__ + if (m_val && ((neigh_ib_val *) m_val)->m_ah) { + IF_VERBS_FAILURE(ibv_destroy_ah(((neigh_ib_val *) m_val)->m_ah)) + { + neigh_logdbg("failed destroying address handle (errno=%d %m)", errno); + return -1; + }ENDIF_VERBS_FAILURE; + } + return 0; +#endif +#endif +} + +//================================================================================================================== + +neigh_ib_broadcast::neigh_ib_broadcast(neigh_key key) : neigh_ib(key, false) +{ + neigh_logdbg("Calling rdma_create_id"); + IF_RDMACM_FAILURE(rdma_create_id(g_p_neigh_table_mgr->m_neigh_cma_event_channel, &m_cma_id, (void *)this, m_rdma_port_space)) + { + neigh_logerr("Failed in rdma_create_id (errno=%d %m)", errno); + return; + } ENDIF_RDMACM_FAILURE; + + + neigh_logdbg("Calling rdma_bind_addr"); + struct sockaddr_in local_sockaddr; + local_sockaddr.sin_family = AF_INET; + local_sockaddr.sin_port = INPORT_ANY; + local_sockaddr.sin_addr.s_addr = m_p_dev->get_local_addr(); + + IF_RDMACM_FAILURE(rdma_bind_addr(m_cma_id, (struct sockaddr*)&local_sockaddr)) { + neigh_logerr("Failed in rdma_bind_addr (src=%d.%d.%d.%d) (errno=%d %m)", NIPQUAD(m_p_dev->get_local_addr()), errno); + return; + } ENDIF_RDMACM_FAILURE; + + build_mc_neigh_val(); + + m_state = true; +} + +void neigh_ib_broadcast::build_mc_neigh_val() +{ + m_val = new 
neigh_ib_val; + if(m_val == NULL) { + neigh_logerr("Failed allocating m_val"); + return; + } + + if (m_cma_id->verbs == NULL) { + neigh_logdbg("m_cma_id->verbs is NULL"); + return; + } + + m_val->m_l2_address = new IPoIB_addr(((m_p_dev->get_br_address())->get_address())); + if (m_val->m_l2_address == NULL) { + neigh_logerr("Failed allocating m_val->m_l2_address"); + return; + } + + ((neigh_ib_val *) m_val)->m_qkey = IPOIB_QKEY; + + memset(&((neigh_ib_val *) m_val)->m_ah_attr, 0, sizeof(((neigh_ib_val *) m_val)->m_ah_attr)); + memcpy( ((neigh_ib_val *) m_val)->m_ah_attr.grh.dgid.raw , &((m_val->m_l2_address->get_address())[4]), 16*sizeof(char)); + + ((neigh_ib_val *) m_val)->m_ah_attr.dlid = 0xc000; + ((neigh_ib_val *) m_val)->m_ah_attr.static_rate = 0x3; + ((neigh_ib_val *) m_val)->m_ah_attr.port_num = m_cma_id->port_num; + ((neigh_ib_val *) m_val)->m_ah_attr.is_global = 0x1; + + if(find_pd()) { + neigh_logerr("Failed find_pd()"); + return; + } + + /*neigh_logerr("m_pd = %p, flow_label = %#x, sgid_index=%#x, hop_limit=%#x, traffic_class=%#x", + m_pd, ((neigh_ib_val *) m_val)->m_ah_attr.grh.flow_label, ((neigh_ib_val *) m_val)->m_ah_attr.grh.sgid_index, + ((neigh_ib_val *) m_val)->m_ah_attr.grh.hop_limit, ((neigh_ib_val *) m_val)->m_ah_attr.grh.traffic_class); + */ + if (create_ah()) + return; + + neigh_logdbg("IB broadcast neigh params are : ah=%#x, qkey=%#x, sl=%#x, rate=%#x, port_num = %#x, qpn=%#x, dlid=%#x dgid = " IPOIB_HW_ADDR_PRINT_FMT_16, + ((neigh_ib_val *) m_val)->m_ah, ((neigh_ib_val *) m_val)->m_qkey, ((neigh_ib_val *) m_val)->m_ah_attr.sl, + ((neigh_ib_val *) m_val)->m_ah_attr.static_rate,((neigh_ib_val *) m_val)->m_ah_attr.port_num, + ((neigh_ib_val *) m_val)->get_qpn(), ((neigh_ib_val *) m_val)->m_ah_attr.dlid, IPOIB_HW_ADDR_PRINT_ADDR_16(((neigh_ib_val *) m_val)->m_ah_attr.grh.dgid.raw) ); + + +} + +bool neigh_ib_broadcast::get_peer_info(neigh_val * p_val) +{ + neigh_logfunc("calling neigh_entry get_peer_info. 
state = %d", m_state); + if (p_val == NULL) { + neigh_logdbg("p_val is NULL, return false"); + return false; + } + + auto_unlocker lock(m_lock); + if (m_state) { + neigh_logdbg("There is a valid val"); + *p_val = *m_val; + return m_state; + } + + return false; +} + +int neigh_ib_broadcast::send(neigh_send_info &s_info) +{ + NOT_IN_USE(s_info); + neigh_logerr("We should not call for this function, something is wrong"); + return false; +} + +void neigh_ib_broadcast::send_arp() +{ + neigh_logerr("We should not call for this function, something is wrong"); +} diff --git a/src/vma/proto/neighbour.h b/src/vma/proto/neighbour.h new file mode 100644 index 0000000..5642978 --- /dev/null +++ b/src/vma/proto/neighbour.h @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef NEIGHBOUR_H +#define NEIGHBOUR_H + +#include +#include + +#include "state_machine/sm.h" +#include "vma/util/sys_vars.h" +#include "vma/util/to_str.h" +#include "vma/infra/cache_subject_observer.h" +#include "vma/infra/sender.h" +#include "vma/event/event_handler_ibverbs.h" +#include "vma/event/event_handler_rdma_cm.h" +#include "vma/event/event_handler_manager.h" +#include "vma/event/timer_handler.h" +#include "vma/event/netlink_event.h" +#include "vma/proto/ip_address.h" +#include "vma/proto/L2_address.h" + +#include "vma/proto/header.h" +#include "vma/dev/ring_allocation_logic.h" +#include "vma/dev/net_device_val.h" +#include "vma/dev/ring.h" +#include "vma/proto/arp.h" + +class neigh_key : public tostr +{ +public: + neigh_key(ip_address addr, net_device_val* p_ndvl): m_ip_addrs(addr), m_p_net_dev_val(p_ndvl) {}; + virtual ~neigh_key() {}; + + const std::string to_str() const + { + return(m_ip_addrs.to_str() + " " + m_p_net_dev_val->to_str()); + } + in_addr_t get_in_addr() const { return m_ip_addrs.get_in_addr(); }; + net_device_val* get_net_device_val() const { return m_p_net_dev_val; }; + + virtual size_t hash(void) + { + uint8_t csum = 0; + uint8_t* pval = (uint8_t*)this; + for (size_t i = 0; i < sizeof(ip_address); ++i, ++pval) { csum ^= *pval; } + return csum; + } + + virtual bool operator==(neigh_key const& other) const + { + return ((m_ip_addrs == other.m_ip_addrs) && (m_p_net_dev_val == other.m_p_net_dev_val)); + } + +private: + ip_address m_ip_addrs; + net_device_val* m_p_net_dev_val; +}; + +namespace std { namespace tr1 { +template<> +class hash +{ +public: + size_t operator()(const neigh_key &key) const + { + neigh_key* tmp_key = (neigh_key*)&key; + return 
tmp_key->hash(); + } +}; +}} + +class neigh_val : public tostr +{ +public: + neigh_val(): m_trans_type(VMA_TRANSPORT_UNKNOWN), m_l2_address(NULL){}; + virtual ~neigh_val(){}; + + virtual void zero_all_members() + { if(m_l2_address) + delete m_l2_address; + m_l2_address = NULL; + }; + const L2_address* get_l2_address() const { return m_l2_address; }; + + virtual neigh_val & operator=(const neigh_val & val) + { + if (this != &val) { + m_l2_address = val.m_l2_address; + m_trans_type = val.m_trans_type; + } + return *this; + } + +protected: + friend class neigh_entry; + friend class neigh_ib; + friend class neigh_eth; + friend class neigh_ib_broadcast; + transport_type_t m_trans_type; + L2_address* m_l2_address; +}; + +class neigh_eth_val : public neigh_val +{ +public: + neigh_eth_val() + { + m_trans_type = VMA_TRANSPORT_ETH; + zero_all_members(); + } + + neigh_val & operator=(const neigh_val & val) + { + return neigh_val::operator=(val); + } + +private: + friend class neigh_eth; +}; + +class neigh_ib_val : public neigh_val +{ +public: + neigh_ib_val() : m_ah(NULL) { zero_all_members(); }; + + ibv_ah* get_ah()const { return m_ah; }; + ibv_ah_attr get_ah_attr() const { return m_ah_attr; }; + uint32_t get_qkey() const { return m_qkey; }; + uint32_t get_qpn() const + { + if (m_l2_address) + return(((IPoIB_addr *) m_l2_address)->get_qpn()); + else + return 0; + } + + neigh_val & operator=(const neigh_val & val); + +private: + friend class neigh_ib; + friend class neigh_ib_broadcast; + + ibv_ah_attr m_ah_attr; + ibv_ah* m_ah; + uint32_t m_qkey; + + void zero_all_members() + { + memset(&m_ah_attr, 0, sizeof(m_ah_attr)); + //m_ah = NULL; + m_qkey = 0; + neigh_val::zero_all_members(); + } +}; + +/* neigh_entry inherits from cache_entry_subject where + * Key = address (peer IP) + * Val = class neigh_val + */ +typedef std::deque unsent_queue_t; + +class neigh_entry : public cache_entry_subject, public event_handler_rdma_cm, public timer_handler +{ +public: + enum type + { + 
UNKNOWN, + MC, + UC + }; + + enum state_t + { + ST_NOT_ACTIVE = 0, + ST_INIT = 1, + ST_INIT_RESOLUTION, + ST_ADDR_RESOLVED, + ST_ARP_RESOLVED, + ST_PATH_RESOLVED, + ST_READY, + ST_ERROR, + ST_LAST + }; + + enum event_t + { + EV_KICK_START = 0, + EV_START_RESOLUTION, + EV_ARP_RESOLVED, + EV_ADDR_RESOLVED, + EV_PATH_RESOLVED, + EV_ERROR, + EV_TIMEOUT_EXPIRED, // For IB MC join + EV_UNHANDLED, + EV_LAST + }; + + friend class neighbour_table_mgr; + + neigh_entry (neigh_key key, transport_type_t type, bool is_init_resources = true); + virtual ~neigh_entry(); + + //Overwrite cach_entry virtual function + virtual bool is_deletable(); + virtual void clean_obj(); + + //Implementation of pure virtual function: Don't use get_val function, instead use get_peer_info + virtual bool get_val(INOUT neigh_val * & val){ NOT_IN_USE(val); return false;}; + + virtual bool get_peer_info(neigh_val * val); + // Overriding subject's register_observer + virtual bool register_observer(const observer* const new_observer); + //Overriding tostr to_str() + virtual const std::string to_str() const; + + const char* event_to_str(event_t event) const; + const char* state_to_str(state_t state) const; + + void handle_event_rdma_cm_cb(struct rdma_cm_event* p_event); + void handle_neigh_event(neigh_nl_event* nl_ev); + + static void general_st_entry(const sm_info_t& func_info); + static void general_st_leave(const sm_info_t& func_info); + static void print_event_info(int state, int event, void* app_data); + static void dofunc_enter_init(const sm_info_t& func_info); + static void dofunc_enter_init_resolution(const sm_info_t& func_info); + static void dofunc_enter_addr_resolved(const sm_info_t& func_info); + static void dofunc_enter_error(const sm_info_t& func_info); + static void dofunc_enter_not_active(const sm_info_t& func_info); + static void dofunc_enter_ready(const sm_info_t& func_info); + + //Implementing pure virtual function of sender + virtual int send(neigh_send_info &s_info); + +protected: + 
rdma_cm_id* m_cma_id; + sockaddr_in m_dst_addr; + sockaddr_in m_src_addr; + enum rdma_port_space m_rdma_port_space; + state_machine* m_state_machine; + type m_type; // UC / MC + transport_type_t m_trans_type; + bool m_state; + unsent_queue_t m_unsent_queue; + //Counter to sign that KickStart was already generated in ERROR_ST + uint32_t m_err_counter; + + void* m_timer_handle; + // members for sending arp + uint32_t m_arp_counter; + net_device_val* m_p_dev; + ring* m_p_ring; + vma_ibv_send_wr m_send_wqe; + ibv_sge m_sge; + bool m_is_loopback; + + const std::string m_to_str; + ring_user_id_t m_id; + + virtual void priv_general_st_entry(const sm_info_t& func_info); + virtual void priv_general_st_leave(const sm_info_t& func_info); + virtual void priv_print_event_info(state_t state, event_t event); + virtual void priv_kick_start_sm(); + virtual void priv_enter_not_active(); + virtual void priv_enter_error(); + virtual int priv_enter_init(); + virtual int priv_enter_init_resolution(); + virtual int priv_enter_addr_resolved(); + virtual int priv_enter_ready(); + + bool priv_get_neigh_state(int & state); + bool priv_get_neigh_l2(address_t & l2_addr); + bool priv_is_reachable(int state) { return state & (NUD_REACHABLE | NUD_PERMANENT); } + bool priv_is_failed(int state) { return state & (NUD_FAILED | NUD_INCOMPLETE); } + + void event_handler(event_t event, void* p_event_info = NULL); + void priv_event_handler_no_locks(event_t event, void* p_event_info = NULL); + + virtual bool priv_handle_neigh_is_l2_changed(address_t) { return false; }; + void priv_handle_neigh_reachable_event(); + void priv_destroy_cma_id(); + virtual void* priv_register_timer_event(int timeout_msec, timer_handler* handler, timer_req_type_t req_type, void* user_data); + void priv_unregister_timer(); + + virtual void send_arp(); + virtual bool post_send_arp(bool) { return true;}; + virtual bool prepare_to_send_packet(header *) {return true;}; + void handle_timer_expired(void* user_data); + + virtual 
ring_user_id_t generate_ring_user_id(header *h = NULL) { NOT_IN_USE(h); return m_p_ring->generate_id(); }; + + lock_mutex_recursive m_sm_lock; + +private: + bool m_is_first_send_arp; + const uint32_t m_n_sysvar_neigh_wait_till_send_arp_msec; + const uint32_t m_n_sysvar_neigh_uc_arp_quata; + const uint32_t m_n_sysvar_neigh_num_err_retries; + ring_allocation_logic_tx m_ring_allocation_logic; + event_t rdma_event_mapping(struct rdma_cm_event* p_event); + void empty_unsent_queue(); + bool post_send_packet(neigh_send_data *n_send_data); + bool post_send_udp(neigh_send_data *n_send_data); + bool post_send_tcp(neigh_send_data *n_send_data); +}; + +class neigh_ib : public neigh_entry, public event_handler_ibverbs +{ +public: + friend class neighbour_table_mgr; + neigh_ib(neigh_key key, bool is_init_resources = true); + ~neigh_ib(); + + static void dofunc_enter_arp_resolved(const sm_info_t& func_info); + static void dofunc_enter_path_resolved(const sm_info_t& func_info); + +protected: + ibv_pd* m_pd; + + int find_pd(); + int create_ah(); + int destroy_ah(); + virtual int build_mc_neigh_val(struct rdma_cm_event* event_data, uint32_t & wait_after_join_msec); + +private: + + //Implementation of pure virtual functions + void handle_event_ibverbs_cb(void* ev_data, void* ctx); + void handle_timer_expired(void* user_data); + + // Overriding neigh_entry priv_enter_not_active + void priv_enter_not_active(); + void priv_enter_error(); + int priv_enter_arp_resolved(); + int priv_enter_path_resolved(struct rdma_cm_event* event_data, uint32_t & wait_after_join_msec); + virtual bool priv_handle_neigh_is_l2_changed(address_t); + // Overriding neigh_entry priv_enter_ready + int priv_enter_ready(); + + int handle_enter_arp_resolved_uc(); + int handle_enter_arp_resolved_mc(); + int build_uc_neigh_val(struct rdma_cm_event* event_data, uint32_t & wait_after_join_msec); + + event_t ibverbs_event_mapping(void* p_event_info); + virtual bool post_send_arp(bool); + virtual bool 
prepare_to_send_packet(header *); + + const uint32_t m_n_sysvar_wait_after_join_msec; +}; + +class neigh_ib_broadcast : public neigh_ib +{ +public: + neigh_ib_broadcast(neigh_key key); + virtual int send(neigh_send_info & s_info); + virtual bool get_peer_info(neigh_val * p_val); + virtual bool is_deletable() { return false; }; + +private: + void build_mc_neigh_val(); + virtual void send_arp(); +}; + +class neigh_eth : public neigh_entry +{ +public: + friend class neighbour_table_mgr; + neigh_eth(neigh_key key); + ~neigh_eth(); + virtual bool get_peer_info(neigh_val * val); + //Overriding neigh_entry register_observer + bool register_observer(const observer* const new_observer); + //Overriding neigh_entry is_deletable + virtual bool is_deletable(); + +protected: + virtual ring_user_id_t generate_ring_user_id(header * h = NULL); + +private: + + int build_mc_neigh_val(); + int build_uc_neigh_val(); + //Overriding neigh_entry priv_enter_ready + virtual int priv_enter_ready(); + virtual int priv_enter_init(); + virtual int priv_enter_init_resolution(); + virtual bool priv_handle_neigh_is_l2_changed(address_t); + virtual bool post_send_arp(bool is_broadcast); + virtual bool prepare_to_send_packet(header *); +}; + +#endif /* NEIGHBOUR_H */ diff --git a/src/vma/proto/neighbour_observer.h b/src/vma/proto/neighbour_observer.h new file mode 100644 index 0000000..244c82d --- /dev/null +++ b/src/vma/proto/neighbour_observer.h @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + + +#ifndef NEIGHBOUR_OBSERVER_H +#define NEIGHBOUR_OBSERVER_H + +#include "vma/util/sys_vars.h" +#include "vma/infra/subject_observer.h" + +class neigh_observer : public observer +{ +public: + virtual transport_type_t get_obs_transport_type() const = 0; +}; + +#endif /* NEIGHBOUR_OBSERVER_H */ diff --git a/src/vma/proto/neighbour_table_mgr.cpp b/src/vma/proto/neighbour_table_mgr.cpp new file mode 100644 index 0000000..2a97163 --- /dev/null +++ b/src/vma/proto/neighbour_table_mgr.cpp @@ -0,0 +1,169 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include + +#include "utils/bullseye.h" +#include "vma/netlink/netlink_wrapper.h" +#include "vma/event/netlink_event.h" +#include "vma/proto/neighbour_table_mgr.h" + +#include "vma/proto/neighbour_observer.h" +#include "vma/dev/net_device_table_mgr.h" + +#define MODULE_NAME "ntm:" + +#define neigh_mgr_logpanic __log_panic +#define neigh_mgr_logerr __log_err +#define neigh_mgr_logwarn __log_warn +#define neigh_mgr_loginfo __log_info +#define neigh_mgr_logdbg __log_dbg +#define neigh_mgr_logfunc __log_func +#define neigh_mgr_logfuncall __log_funcall + +neigh_table_mgr * g_p_neigh_table_mgr = NULL; + +#define DEFAULT_GARBAGE_COLLECTOR_TIME 100000 + +neigh_table_mgr::neigh_table_mgr():m_neigh_cma_event_channel(NULL) +{ + // Creating cma_event_channel + + m_neigh_cma_event_channel = rdma_create_event_channel(); + BULLSEYE_EXCLUDE_BLOCK_START + if (m_neigh_cma_event_channel == NULL) { + neigh_mgr_logdbg("Failed to create neigh_cma_event_channel (errno=%d %m)", errno); + } else { + neigh_mgr_logdbg("Creation of neigh_cma_event_channel on fd=%d", m_neigh_cma_event_channel->fd); + } + BULLSEYE_EXCLUDE_BLOCK_END + + start_garbage_collector(DEFAULT_GARBAGE_COLLECTOR_TIME); +} + +neigh_table_mgr::~neigh_table_mgr() +{ + stop_garbage_collector(); + if (m_neigh_cma_event_channel) { + rdma_destroy_event_channel(m_neigh_cma_event_channel); + } +} + +bool neigh_table_mgr::register_observer(neigh_key key, + const cache_observer *new_observer, + cache_entry_subject **cache_entry) +{ + //Register to netlink event handler only if this is the first entry + if (get_cache_tbl_size() == 0) { + g_p_netlink_handler->register_event(nlgrpNEIGH, this); + neigh_mgr_logdbg("Registered to g_p_netlink_handler"); + } + return cache_table_mgr::register_observer(key, new_observer, cache_entry); +} + +neigh_entry* neigh_table_mgr::create_new_entry(neigh_key neigh_key, const observer* new_observer) +{ + observer * tmp = const_cast(new_observer); + const neigh_observer * dst = 
dynamic_cast(tmp) ; + + BULLSEYE_EXCLUDE_BLOCK_START + if (dst == NULL) { + //TODO: Need to add handling of this case + neigh_mgr_logpanic("dynamic_casr failed, new_observer type is not neigh_observer"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + + transport_type_t transport = dst->get_obs_transport_type(); + + if (transport == VMA_TRANSPORT_IB) { + if(IS_BROADCAST_N(neigh_key.get_in_addr())){ + neigh_mgr_logdbg("Creating new neigh_ib_broadcast"); + return (new neigh_ib_broadcast(neigh_key)); + } + neigh_mgr_logdbg("Creating new neigh_ib"); + return (new neigh_ib(neigh_key)); + } + else if (transport == VMA_TRANSPORT_ETH) { + neigh_mgr_logdbg("Creating new neigh_eth"); + return (new neigh_eth(neigh_key)); + } + else { + neigh_mgr_logdbg("Cannot create new entry, transport type is UNKNOWN"); + return NULL; + } +} + +void neigh_table_mgr::notify_cb(event *ev) +{ + neigh_mgr_logdbg(""); + // Got event from netlink + + neigh_nl_event* nl_ev = dynamic_cast (ev); + BULLSEYE_EXCLUDE_BLOCK_START + if (nl_ev == NULL) { + neigh_mgr_logdbg("Non neigh_nl_event type"); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + const netlink_neigh_info* nl_info = nl_ev->get_neigh_info(); + struct in_addr in; + if (1 != inet_pton(AF_INET, (const char *)(nl_info->dst_addr_str.c_str()), &in)) { + neigh_mgr_logdbg("Ignoring netlink neigh event neigh for IP = %s, not a valid IP", nl_info->dst_addr_str.c_str()); + return; + } + + in_addr_t neigh_ip = in.s_addr; + + // Search for this neigh ip in cache_table + m_lock.lock(); + net_device_val* p_ndev = g_p_net_device_table_mgr->get_net_device_val(nl_info->ifindex); + + //find all neigh entries with an appropriate peer_ip and net_device + if (p_ndev) { + neigh_entry *p_ne = dynamic_cast (get_entry(neigh_key(ip_address(neigh_ip), p_ndev))); + if (p_ne) { + // Call the relevant neigh_entry to handle the event + p_ne->handle_neigh_event(nl_ev); + } else { + neigh_mgr_logdbg("Ignoring netlink neigh event for IP = %s if:%s, index=%d, p_ndev=%p", 
nl_info->dst_addr_str.c_str(), p_ndev->to_str().c_str(), nl_info->ifindex, p_ndev); + } + } else { + neigh_mgr_logdbg("could not find ndv_val for ifindex=%d", nl_info->ifindex); + } + m_lock.unlock(); + + return; +} diff --git a/src/vma/proto/neighbour_table_mgr.h b/src/vma/proto/neighbour_table_mgr.h new file mode 100644 index 0000000..7ac0f61 --- /dev/null +++ b/src/vma/proto/neighbour_table_mgr.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef NEIGHBOUR_TABLE_MGR_H +#define NEIGHBOUR_TABLE_MGR_H + +#include "vma/proto/neighbour.h" +#include "vma/infra/cache_subject_observer.h" + +class neigh_table_mgr : public cache_table_mgr, public observer +{ +public: + neigh_table_mgr(); + ~neigh_table_mgr(); + virtual void notify_cb(event * event); + rdma_event_channel* m_neigh_cma_event_channel; + bool register_observer(neigh_key, + const cache_observer *, + cache_entry_subject **); + +private: + /* This function will retrieve neigh transport type by the following actions: + * 1. go to route manager table and get route entry according to the peer ip + * 2. get netdev from route entry + * 3. get transport type from netdev + */ + neigh_entry* create_new_entry(neigh_key neigh_key, const observer* dst); +}; + +extern neigh_table_mgr *g_p_neigh_table_mgr; + + +#endif /* NEIGHBOUR_TABLE_MGR_H */ diff --git a/src/vma/proto/netlink_socket_mgr.h b/src/vma/proto/netlink_socket_mgr.h new file mode 100644 index 0000000..3600caf --- /dev/null +++ b/src/vma/proto/netlink_socket_mgr.h @@ -0,0 +1,323 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef NETLINK_SOCKET_MGR_H +#define NETLINK_SOCKET_MGR_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils/bullseye.h" +#include "utils/lock_wrapper.h" +#include "vlogger/vlogger.h" +#include "vma/util/if.h" +#include "vma/netlink/netlink_wrapper.h" +#include "vma/event/netlink_event.h" +#include "vma/util/vtypes.h" +#include "vma/util/utils.h" +#include "vma/sock/socket_fd_api.h" +#include "vma/sock/sock-redirect.h" + + +#ifndef MODULE_NAME +#define MODULE_NAME "netlink_socket_mgr:" +#endif + +#define NLMSG_TAIL(nmsg) ((struct rtattr *) (((uint8_t *) (nmsg)) + NLMSG_ALIGN((nmsg)->nlmsg_len))) + +#define MAX_TABLE_SIZE 4096 +#define MSG_BUFF_SIZE 81920 + +// This enum specify the type of data to be retrieve using netlink socket. +enum nl_data_t +{ + RULE_DATA_TYPE, + ROUTE_DATA_TYPE +}; + +/* +* This class manage retrieving data (Rule, Route) from kernel using netlink socket. 
+*/ +template +class netlink_socket_mgr +{ +public: + netlink_socket_mgr(nl_data_t data_type); + virtual ~netlink_socket_mgr(); + +protected: + typedef struct + { + Type value[MAX_TABLE_SIZE]; + uint16_t entries_num; + } table_t; + + table_t m_tab; + + virtual bool parse_enrty(nlmsghdr *nl_header, Type *p_val) = 0; + virtual void update_tbl(); + virtual void print_val_tbl(); + + void build_request(struct nlmsghdr **nl_msg); + bool query(struct nlmsghdr *&nl_msg, int &len); + int recv_info(); + void parse_tbl(int len, int *p_ent_num = NULL); + +private: + nl_data_t m_data_type; + + int m_fd; // netlink socket to communicate with the kernel + uint32_t m_pid; // process pid + uint32_t m_seq_num; // seq num of the netlink messages + char m_msg_buf[MSG_BUFF_SIZE]; // we use this buffer for sending/receiving netlink messages + uint32_t m_buff_size; +}; + +/*********************************Implementation ********************************/ + +template +netlink_socket_mgr ::netlink_socket_mgr(nl_data_t data_type) +{ + __log_dbg(""); + + m_data_type = data_type; + m_pid = getpid(); + m_buff_size = MSG_BUFF_SIZE; + m_seq_num = 0; + + memset(m_msg_buf, 0, m_buff_size); + + // Create Socket + BULLSEYE_EXCLUDE_BLOCK_START + if ((m_fd = orig_os_api.socket(PF_NETLINK, SOCK_DGRAM, NETLINK_ROUTE)) < 0) { + __log_err("NL socket Creation: "); + return; + } + + if (orig_os_api.fcntl(m_fd, F_SETFD, FD_CLOEXEC) != 0) { + __log_warn("Fail in fctl, error = %d", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + + __log_dbg("Done"); +} + +template +netlink_socket_mgr ::~netlink_socket_mgr() +{ + __log_dbg(""); + if (m_fd) { + orig_os_api.close(m_fd); + m_fd = -1; + } + + __log_dbg("Done"); +} + +// This function build Netlink request to retrieve data (Rule, Route) from kernel. 
+// Parameters : +// nl_msg : request to be returned +template +void netlink_socket_mgr ::build_request(struct nlmsghdr **nl_msg) +{ + struct rtmsg *rt_msg; + + memset(m_msg_buf, 0, m_buff_size); + + // point the header and the msg structure pointers into the buffer + *nl_msg = (struct nlmsghdr *)m_msg_buf; + rt_msg = (struct rtmsg *)NLMSG_DATA(*nl_msg); + + //Fill in the nlmsg header + (*nl_msg)->nlmsg_len = NLMSG_LENGTH(sizeof(struct rtmsg)); + (*nl_msg)->nlmsg_seq = m_seq_num++; + (*nl_msg)->nlmsg_pid = m_pid; + rt_msg->rtm_family = AF_INET; + + if (m_data_type == RULE_DATA_TYPE) + { + (*nl_msg)->nlmsg_type = RTM_GETRULE; + } + else if (m_data_type == ROUTE_DATA_TYPE) + { + (*nl_msg)->nlmsg_type = RTM_GETROUTE; + } + + (*nl_msg)->nlmsg_flags = NLM_F_DUMP | NLM_F_REQUEST; + +} + +// Query built request and receive requested data (Rule, Route) +// Parameters: +// nl_msg : request that is built previously. +// len : length of received data. +template +bool netlink_socket_mgr ::query(struct nlmsghdr *&nl_msg, int &len) +{ + if(m_fd < 0) + return false; + + BULLSEYE_EXCLUDE_BLOCK_START + if(orig_os_api.send(m_fd, nl_msg, nl_msg->nlmsg_len, 0) < 0){ + __log_err("Write To Socket Failed...\n"); + return false; + } + if((len = recv_info()) < 0) { + __log_err("Read From Socket Failed...\n"); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + return true; +} + +// Receive requested data and save it locally. +// Return length of received data. 
+template +int netlink_socket_mgr ::recv_info() +{ + struct nlmsghdr *nlHdr; + int readLen = 0, msgLen = 0; + + char *buf_ptr = m_msg_buf; + + do{ + //Receive response from the kernel + BULLSEYE_EXCLUDE_BLOCK_START + if((readLen = orig_os_api.recv(m_fd, buf_ptr, MSG_BUFF_SIZE - msgLen, 0)) < 0){ + __log_err("SOCK READ: "); + return -1; + } + + nlHdr = (struct nlmsghdr *)buf_ptr; + + //Check if the header is valid + if((NLMSG_OK(nlHdr, (u_int)readLen) == 0) || (nlHdr->nlmsg_type == NLMSG_ERROR)) + { + __log_err("Error in received packet, readLen = %d, msgLen = %d, type=%d, bufLen = %d", readLen, nlHdr->nlmsg_len, nlHdr->nlmsg_type, MSG_BUFF_SIZE); + if (nlHdr->nlmsg_len == MSG_BUFF_SIZE) { + __log_err("The buffer we pass to netlink is too small for reading the whole table"); + } + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + buf_ptr += readLen; + msgLen += readLen; + + //Check if the its the last message + if(nlHdr->nlmsg_type == NLMSG_DONE || + (nlHdr->nlmsg_flags & NLM_F_MULTI) == 0) { + break; + } + + } while((nlHdr->nlmsg_seq != m_seq_num) || (nlHdr->nlmsg_pid != m_pid)); + return msgLen; +} + +// Update data in a table +template +void netlink_socket_mgr ::update_tbl() +{ + struct nlmsghdr *nl_msg = NULL; + int counter = 0; + int len = 0; + + m_tab.entries_num = 0; + + // Build Netlink request to get route entry + build_request(&nl_msg); + + // Query built request and receive requested data + if (!query(nl_msg, len)) + return; + + // Parse received data in custom object (route_val) + parse_tbl(len, &counter); + + m_tab.entries_num = counter; + + if (counter >= MAX_TABLE_SIZE) { + __log_warn("reached the maximum route table size"); + } +} + +// Parse received data in a table +// Parameters: +// len : length of received data. +// p_ent_num : number of rows in received data. 
+template +void netlink_socket_mgr ::parse_tbl(int len, int *p_ent_num) +{ + struct nlmsghdr *nl_header; + int entry_cnt = 0; + + nl_header = (struct nlmsghdr *) m_msg_buf; + for(;NLMSG_OK(nl_header, (u_int)len) && entry_cnt < MAX_TABLE_SIZE; nl_header = NLMSG_NEXT(nl_header, len)) + { + if (parse_enrty(nl_header, &m_tab.value[entry_cnt])) { + entry_cnt++; + } + } + if (p_ent_num) + *p_ent_num = entry_cnt; +} + +//print the table +template +void netlink_socket_mgr ::print_val_tbl() +{ + Type *p_val; + for (int i = 0; i < m_tab.entries_num; i++) + { + p_val = &m_tab.value[i]; + p_val->print_val(); + } +} + +#undef MODULE_NAME + +#endif /* NETLINK_SOCKET_MGR_H */ diff --git a/src/vma/proto/peer_key.h b/src/vma/proto/peer_key.h new file mode 100644 index 0000000..e348b3c --- /dev/null +++ b/src/vma/proto/peer_key.h @@ -0,0 +1,54 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __PEER_KEY_H__ +#define __PEER_KEY_H__ + +/** + * Use union for representing ip:port as one uint64_t primitive, + * + * NOTE: this type provides implicit cast to uint64_t. Hence, it natively supports containers such as map and hash. + */ +union peer_key { +public: + peer_key(uint32_t _ip, uint16_t _port) : ip(_ip), port(_port){} + operator uint64_t() const {return key;} // this saves the need for operator< and for operator== and for operator size_t() with map/hash + +private: + uint64_t key; + + struct { + uint32_t ip; + uint32_t port; // 32 bits for making sure all bits of key are initialized + }; +}; + +#endif /* ! __PEER_KEY_H__ */ diff --git a/src/vma/proto/route_entry.cpp b/src/vma/proto/route_entry.cpp new file mode 100644 index 0000000..dcac1e3 --- /dev/null +++ b/src/vma/proto/route_entry.cpp @@ -0,0 +1,145 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "vma/proto/ip_address.h" +#include "route_entry.h" +#include "route_table_mgr.h" +#include "vma/infra/cache_subject_observer.h" +#include "vma/dev/net_device_table_mgr.h" + +// debugging macros +#define MODULE_NAME "rte" +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[%s]:%d:%s() " +#undef __INFO__ +#define __INFO__ m_str.c_str() + +#define rt_entry_logdbg __log_info_dbg + +route_entry::route_entry(route_rule_table_key rtk) : + cache_entry_subject(rtk), cache_observer(), + m_p_net_dev_entry(NULL), + m_p_net_dev_val(NULL), + m_b_offloaded_net_dev(false), + m_is_valid(false) +{ + m_val = NULL; + m_p_rr_entry = NULL; + cache_entry_subject*>* rr_entry = NULL; + g_p_rule_table_mgr->register_observer(rtk, this, &rr_entry); + m_p_rr_entry = dynamic_cast(rr_entry); +} + +route_entry::~route_entry() +{ + unregister_to_net_device(); + if (m_p_rr_entry) { + g_p_rule_table_mgr->unregister_observer(get_key(), this); + m_p_rr_entry = NULL; + } +} + +bool route_entry::get_val(INOUT route_val* &val) +{ + rt_entry_logdbg(""); + val = m_val; + return is_valid(); +} + +void route_entry::set_str() +{ + m_str = get_key().to_str() + "->" + m_val->get_if_name(); +} + +void route_entry::set_val(IN route_val* &val) +{ + 
cache_entry_subject::set_val(val); + set_str(); +} + +void route_entry::register_to_net_device() +{ + local_ip_list_t lip_offloaded_list = g_p_net_device_table_mgr->get_ip_list(m_val->get_if_index()); + if (lip_offloaded_list.empty()) { + rt_entry_logdbg("No matched net device for %s interface", m_val->get_if_name()); + m_b_offloaded_net_dev = false; + } else { + ip_address src_addr = lip_offloaded_list.front().local_addr; + rt_entry_logdbg("register to net device with src_addr %s", src_addr.to_str().c_str()); + + cache_entry_subject *net_dev_entry = (cache_entry_subject *)m_p_net_dev_entry; + if (g_p_net_device_table_mgr->register_observer(src_addr, this, &net_dev_entry)) { + rt_entry_logdbg("route_entry [%p] is registered to an offloaded device", this); + m_p_net_dev_entry = (net_device_entry *) net_dev_entry; + m_p_net_dev_entry->get_val(m_p_net_dev_val); + m_b_offloaded_net_dev = true; + } + else { + rt_entry_logdbg("route_entry [%p] tried to register to non-offloaded device ---> registration failed", this); + m_b_offloaded_net_dev = false; + } + } +} + +void route_entry::unregister_to_net_device() +{ + if (!m_val) { + rt_entry_logdbg("ERROR: failed to find route val"); + return; + } + + if (m_p_net_dev_val) { + ip_address src_addr = m_p_net_dev_val->get_local_addr(); + rt_entry_logdbg("unregister from net device with src_addr %s", src_addr.to_str().c_str()); + if (! g_p_net_device_table_mgr->unregister_observer(src_addr, this)) { + rt_entry_logdbg("ERROR: failed to unregister from net_device_entry"); + } + } + + m_p_net_dev_entry = NULL; + m_p_net_dev_val = NULL; + +} + +void route_entry::notify_cb() +{ + // got addr_change event from net_device_entry --> does not change the validity of route_entry! 
+ rt_entry_logdbg(""); + if (m_p_net_dev_entry->is_valid()) { + m_p_net_dev_entry->get_val(m_p_net_dev_val); + } + else { + m_p_net_dev_val = NULL; + } + notify_observers(); +} diff --git a/src/vma/proto/route_entry.h b/src/vma/proto/route_entry.h new file mode 100644 index 0000000..01cc3dd --- /dev/null +++ b/src/vma/proto/route_entry.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef ROUTE_ENTRY_H +#define ROUTE_ENTRY_H + +#include "vma/util/if.h" +#include + +#include "vma/proto/route_rule_table_key.h" +#include "vma/infra/cache_subject_observer.h" +#include "vma/dev/net_device_entry.h" +#include "route_val.h" +#include "rule_entry.h" + +class route_entry : public cache_entry_subject, public cache_observer +{ +public: + friend class route_table_mgr; + + route_entry(route_rule_table_key rtk); + virtual ~route_entry(); + + bool get_val(INOUT route_val* &val); + void set_val(IN route_val* &val); + + net_device_val* get_net_dev_val() { return m_p_net_dev_val; } + + inline void set_entry_valid() { m_is_valid = true; } + inline bool is_valid() { return m_is_valid && m_val && m_val->is_valid(); }; //m_val is NULL at first + + virtual void notify_cb(); + + void set_str(); + const string to_str() const { return m_str; }; + + inline rule_entry* get_rule_entry() const { return m_p_rr_entry; }; + +private: + net_device_entry* m_p_net_dev_entry; + net_device_val* m_p_net_dev_val; + bool m_b_offloaded_net_dev; + bool m_is_valid; + string m_str; + rule_entry* m_p_rr_entry; + + void register_to_net_device(); + void unregister_to_net_device(); +}; + +#endif /* ROUTE_ENTRY_H */ diff --git a/src/vma/proto/route_rule_table_key.h b/src/vma/proto/route_rule_table_key.h new file mode 100644 index 0000000..103183f --- /dev/null +++ b/src/vma/proto/route_rule_table_key.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef ROUTE_RULE_TABLE_KEY_H +#define ROUTE_RULE_TABLE_KEY_H + +#include +#include +#include + +#include "vma/util/to_str.h" +#include "vma/util/vtypes.h" +#include + +/* +* This class is used as key for route and rule table cashed history +* and its consist from destination IP, source IP and TOS. 
+*/ +class route_rule_table_key : public tostr +{ +public: + route_rule_table_key(in_addr_t dst_ip, in_addr_t src_ip, uint8_t tos): m_dst_ip(dst_ip), m_src_ip(src_ip), m_tos(tos){}; + ~route_rule_table_key(){}; + + const std::string to_str() const + { + char s[100] = {0}; + /* cppcheck-suppress wrongPrintfScanfArgNum */ + sprintf(s, "Destination IP:%d.%d.%d.%d", NIPQUAD(m_dst_ip)); + if (m_src_ip) { + char sx[40] = {0}; + /* cppcheck-suppress wrongPrintfScanfArgNum */ + sprintf(sx, " Source IP:%d.%d.%d.%d", NIPQUAD(m_src_ip)); + strcat(s, sx); + } + if (m_tos) { + char sx[20] = {0}; + sprintf(sx, " TOS:%u", m_tos); + strcat(s, sx); + } + + return(std::string(s)); + } + + in_addr_t get_dst_ip() const { return m_dst_ip; }; + in_addr_t get_src_ip() const { return m_src_ip; }; + uint8_t get_tos() const { return m_tos; }; + + bool operator==(const route_rule_table_key &rrk) const { + return (m_dst_ip == rrk.get_dst_ip() && m_src_ip == rrk.get_src_ip() && m_tos == rrk.get_tos()); + }; + +private: + in_addr_t m_dst_ip; + in_addr_t m_src_ip; + uint8_t m_tos; +}; + +namespace std { namespace tr1 { +template<> +class hash +{ +public: + size_t operator()(const route_rule_table_key &key) const + { + hash_hash; + char s[40] = {0}; + /* + Build string from exist parameter (destination IP, source IP, TOS) + which is unique for different route-rule entries. + */ + /* cppcheck-suppress wrongPrintfScanfArgNum */ + sprintf(s, "%d.%d.%d.%d", NIPQUAD(key.get_dst_ip())); + if (key.get_src_ip()) { + char sx[20] = {0}; + /* cppcheck-suppress wrongPrintfScanfArgNum */ + sprintf(sx, " %d.%d.%d.%d", NIPQUAD(key.get_src_ip())); + strcat(s, sx); + } + if (key.get_tos()) { + char sx[20] = {0}; + sprintf(sx, " %u", key.get_tos()); + strcat(s, sx); + } + return _hash(std::string(s));// Use built in hash function for string input. 
+ } +}; +}} + + +#endif /* ROUTE_RULE_TABLE_KEY_H */ diff --git a/src/vma/proto/route_table_mgr.cpp b/src/vma/proto/route_table_mgr.cpp new file mode 100644 index 0000000..0be304d --- /dev/null +++ b/src/vma/proto/route_table_mgr.cpp @@ -0,0 +1,508 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils/bullseye.h" +#include "utils/lock_wrapper.h" +#include "vlogger/vlogger.h" +#include "vma/util/vtypes.h" +#include "vma/util/utils.h" +#include "route_table_mgr.h" +#include "vma/sock/socket_fd_api.h" +#include "vma/sock/sock-redirect.h" +#include "vma/dev/net_device_table_mgr.h" +#include "ip_address.h" + +// debugging macros +#define MODULE_NAME "rtm:" +#define rt_mgr_if_logpanic __log_panic +#define rt_mgr_logerr __log_err +#define rt_mgr_logwarn __log_warn +#define rt_mgr_loginfo __log_info +#define rt_mgr_logdbg __log_dbg +#define rt_mgr_logfunc __log_func +#define rt_mgr_logfuncall __log_funcall + +route_table_mgr* g_p_route_table_mgr = NULL; + +route_table_mgr::route_table_mgr() : netlink_socket_mgr(ROUTE_DATA_TYPE), cache_table_mgr("route_table_mgr") +{ + rt_mgr_logdbg(""); + + //Read Route table from kernel and save it in local variable. + update_tbl(); + + // create route_entry for each net_dev- needed for receiving port up/down events for net_dev_entry + route_val *p_val; + for (int i = 0; i < m_tab.entries_num; i++) + { + p_val = &m_tab.value[i]; + in_addr_t src_addr = p_val->get_src_addr(); + in_addr_route_entry_map_t::iterator iter = m_rte_list_for_each_net_dev.find(src_addr); + // if src_addr of interface exists in the map, no need to create another route_entry + if (iter == m_rte_list_for_each_net_dev.end()) { + in_addr_t dst_ip = src_addr; + in_addr_t src_ip = 0; + uint8_t tos = 0; + m_rte_list_for_each_net_dev.insert(pair (src_addr, create_new_entry(route_rule_table_key(dst_ip, src_ip, tos), NULL))); + } + } + + //Print table + print_val_tbl(); + + // register to netlink event + g_p_netlink_handler->register_event(nlgrpROUTE, this); + rt_mgr_logdbg("Registered to g_p_netlink_handler"); + + rt_mgr_logdbg("Done"); +} + +route_table_mgr::~route_table_mgr() +{ + rt_mgr_logdbg(""); + + // clear all route_entrys 
created in the constructor + in_addr_route_entry_map_t::iterator iter; + + while ((iter = m_rte_list_for_each_net_dev.begin()) != m_rte_list_for_each_net_dev.end()) { + delete(iter->second); + m_rte_list_for_each_net_dev.erase(iter); + } + + rt_tbl_cach_entry_map_t::iterator cache_itr; + while ((cache_itr = m_cache_tbl.begin()) != m_cache_tbl.end()) { + delete(cache_itr->second); + m_cache_tbl.erase(cache_itr); + } + rt_mgr_logdbg("Done"); +} + +void route_table_mgr::update_tbl() +{ + auto_unlocker lock(m_lock); + + netlink_socket_mgr::update_tbl(); + + rt_mgr_update_source_ip(); + + return; +} + +void route_table_mgr::rt_mgr_update_source_ip() +{ + route_val *p_val; + //for route entries which still have no src ip and no gw + for (int i = 0; i < m_tab.entries_num; i++) { + p_val = &m_tab.value[i]; + if (p_val->get_src_addr() || p_val->get_gw_addr()) continue; + if (g_p_net_device_table_mgr) { //try to get src ip from net_dev list of the interface + in_addr_t longest_prefix = 0; + in_addr_t correct_src = 0; + local_ip_list_t::iterator lip_iter; + local_ip_list_t lip_offloaded_list = g_p_net_device_table_mgr->get_ip_list(p_val->get_if_index()); + if (!lip_offloaded_list.empty()) { + for (lip_iter = lip_offloaded_list.begin(); lip_offloaded_list.end() != lip_iter; lip_iter++) + { + ip_data_t ip = *lip_iter; + if((p_val->get_dst_addr() & ip.netmask) == (ip.local_addr & ip.netmask)) { //found a match in routing table + if((ip.netmask | longest_prefix) != longest_prefix){ + longest_prefix = ip.netmask; // this is the longest prefix match + correct_src = ip.local_addr; + } + } + } + if (correct_src) { + p_val->set_src_addr(correct_src); + continue; + } + } + } + // if still no src ip, get it from ioctl + struct sockaddr_in src_addr; + char *if_name = (char *)p_val->get_if_name(); + if (!get_ipv4_from_ifname(if_name, &src_addr)) { + p_val->set_src_addr(src_addr.sin_addr.s_addr); + } + else { + // Failed mapping if_name to IPv4 address + rt_mgr_logwarn("could not figure 
out source ip for rtv = %s", p_val->to_str()); + } + } + + //for route entries with gateway, do recursive search for src ip + int num_unresolved_src = m_tab.entries_num; + int prev_num_unresolved_src = 0; + do { + prev_num_unresolved_src = num_unresolved_src; + num_unresolved_src = 0; + for (int i = 0; i < m_tab.entries_num; i++) { + p_val = &m_tab.value[i]; + if (p_val->get_gw_addr() && !p_val->get_src_addr()) { + route_val* p_val_dst; + in_addr_t in_addr = p_val->get_gw_addr(); + unsigned char table_id = p_val->get_table_id(); + if (find_route_val(in_addr, table_id, p_val_dst)) { + if (p_val_dst->get_src_addr()) { + p_val->set_src_addr(p_val_dst->get_src_addr()); + } else if (p_val == p_val_dst) { //gateway of the entry lead to same entry + local_ip_list_t::iterator lip_iter; + local_ip_list_t lip_offloaded_list = g_p_net_device_table_mgr->get_ip_list(p_val->get_if_index()); + for (lip_iter = lip_offloaded_list.begin(); lip_offloaded_list.end() != lip_iter; lip_iter++) + { + ip_data_t ip = *lip_iter; + if(p_val->get_gw_addr() == ip.local_addr) { + p_val->set_gw(0); + p_val->set_src_addr(ip.local_addr); + break; + } + } + if (!p_val->get_src_addr()) + num_unresolved_src++; + } else { + num_unresolved_src++; + } + // gateway and source are equal, no need of gw. + if (p_val->get_src_addr() == p_val->get_gw_addr()) { + p_val->set_gw(0); + } + } else { + num_unresolved_src++; + } + } + } + } while (num_unresolved_src && prev_num_unresolved_src > num_unresolved_src); + + //for route entries which still have no src ip + for (int i = 0; i < m_tab.entries_num; i++) { + p_val = &m_tab.value[i]; + if (p_val->get_src_addr()) continue; + if (p_val->get_gw_addr()) { + rt_mgr_logdbg("could not figure out source ip for gw address. 
rtv = %s", p_val->to_str()); + } + // if still no src ip, get it from ioctl + struct sockaddr_in src_addr; + char *if_name = (char *)p_val->get_if_name(); + if (!get_ipv4_from_ifname(if_name, &src_addr)) { + p_val->set_src_addr(src_addr.sin_addr.s_addr); + } + else { + // Failed mapping if_name to IPv4 address + rt_mgr_logdbg("could not figure out source ip for rtv = %s", p_val->to_str()); + } + } +} + +bool route_table_mgr::parse_enrty(nlmsghdr *nl_header, route_val *p_val) +{ + int len; + struct rtmsg *rt_msg; + struct rtattr *rt_attribute; + + // get route entry header + rt_msg = (struct rtmsg *) NLMSG_DATA(nl_header); + + // we are not concerned about the local and default route table + if (rt_msg->rtm_family != AF_INET || rt_msg->rtm_table == RT_TABLE_LOCAL) + return false; + + p_val->set_protocol(rt_msg->rtm_protocol); + p_val->set_scope(rt_msg->rtm_scope); + p_val->set_type(rt_msg->rtm_type); + p_val->set_table_id(rt_msg->rtm_table); + + in_addr_t dst_mask = htonl(VMA_NETMASK(rt_msg->rtm_dst_len)); + p_val->set_dst_mask(dst_mask); + p_val->set_dst_pref_len(rt_msg->rtm_dst_len); + + len = RTM_PAYLOAD(nl_header); + rt_attribute = (struct rtattr *) RTM_RTA(rt_msg); + + for (;RTA_OK(rt_attribute, len);rt_attribute=RTA_NEXT(rt_attribute,len)) { + parse_attr(rt_attribute, p_val); + } + p_val->set_state(true); + p_val->set_str(); + return true; +} + +void route_table_mgr::parse_attr(struct rtattr *rt_attribute, route_val *p_val) +{ + switch (rt_attribute->rta_type) { + case RTA_DST: + p_val->set_dst_addr(*(in_addr_t *)RTA_DATA(rt_attribute)); + break; + // next hop IPv4 address + case RTA_GATEWAY: + p_val->set_gw(*(in_addr_t *)RTA_DATA(rt_attribute)); + break; + // unique ID associated with the network interface + case RTA_OIF: + p_val->set_if_index(*(int *)RTA_DATA(rt_attribute)); + char if_name[IFNAMSIZ]; + if_indextoname(p_val->get_if_index(),if_name); + p_val->set_if_name(if_name); + break; + case RTA_SRC: + case RTA_PREFSRC: + p_val->set_src_addr(*(in_addr_t 
*)RTA_DATA(rt_attribute)); + break; + case RTA_TABLE: + p_val->set_table_id(*(uint32_t *)RTA_DATA(rt_attribute)); + break; + case RTA_METRICS: + { + struct rtattr *rta = (struct rtattr *)RTA_DATA(rt_attribute); + int len = RTA_PAYLOAD(rt_attribute); + uint16_t type; + while (RTA_OK(rta, len)) { + type = rta->rta_type; + switch (type) { + case RTAX_MTU: + p_val->set_mtu(*(uint32_t *)RTA_DATA(rta)); + break; + default: + rt_mgr_logdbg("got unexpected METRICS %d %x", + type, *(uint32_t *)RTA_DATA(rta)); + break; + } + rta = RTA_NEXT(rta, len); + } + break; + } + default: + rt_mgr_logdbg("got unexpected type %d %x", rt_attribute->rta_type, + *(uint32_t *)RTA_DATA(rt_attribute)); + break; + } +} + +bool route_table_mgr::find_route_val(in_addr_t &dst, unsigned char table_id, route_val* &p_val) +{ + ip_address dst_addr = dst; + rt_mgr_logfunc("dst addr '%s'", dst_addr.to_str().c_str()); + + route_val *correct_route_val = NULL; + int longest_prefix = -1; + + for (int i = 0; i < m_tab.entries_num; i++) { + route_val* p_val_from_tbl = &m_tab.value[i]; + if (!p_val_from_tbl->is_deleted() && p_val_from_tbl->is_if_up()) { // value was not deleted + if(p_val_from_tbl->get_table_id() == table_id) { //found a match in routing table ID + if(p_val_from_tbl->get_dst_addr() == (dst & p_val_from_tbl->get_dst_mask())) { //found a match in routing table + if(p_val_from_tbl->get_dst_pref_len() > longest_prefix) { // this is the longest prefix match + longest_prefix = p_val_from_tbl->get_dst_pref_len(); + correct_route_val = p_val_from_tbl; + } + } + } + } + } + if (correct_route_val) { + ip_address dst_gw = correct_route_val->get_dst_addr(); + p_val = correct_route_val; + rt_mgr_logdbg("found route val[%p]: %s", p_val, p_val->to_str()); + return true; + } + + rt_mgr_logdbg("destination gw wasn't found"); + return false; +} + +bool route_table_mgr::route_resolve(IN route_rule_table_key key, OUT route_result &res) +{ + in_addr_t dst = key.get_dst_ip(); + ip_address dst_addr = dst; + 
rt_mgr_logdbg("dst addr '%s'", dst_addr.to_str().c_str()); + + route_val *p_val = NULL; + std::deque table_id_list; + + g_p_rule_table_mgr->rule_resolve(key, table_id_list); + + auto_unlocker lock(m_lock); + std::deque::iterator table_id_iter = table_id_list.begin(); + for (; table_id_iter != table_id_list.end(); table_id_iter++) { + if (find_route_val(dst, *table_id_iter, p_val)) { + res.p_src = p_val->get_src_addr(); + rt_mgr_logdbg("dst ip '%s' resolved to src addr " + "'%d.%d.%d.%d'", dst_addr.to_str().c_str(), + NIPQUAD(res.p_src)); + res.p_gw = p_val->get_gw_addr(); + rt_mgr_logdbg("dst ip '%s' resolved to gw addr '%d.%d.%d.%d'", + dst_addr.to_str().c_str(), NIPQUAD(res.p_gw)); + res.mtu = p_val->get_mtu(); + rt_mgr_logdbg("found route mtu %d", res.mtu); + return true; + } + } + /* prevent usage on false return */ + return false; +} + +void route_table_mgr::update_entry(INOUT route_entry* p_ent, bool b_register_to_net_dev /*= false*/) +{ + rt_mgr_logdbg("entry [%p]", p_ent); + auto_unlocker lock(m_lock); + if (p_ent && !p_ent->is_valid()) { //if entry is found in the collection and is not valid + rt_mgr_logdbg("route_entry is not valid-> update value"); + rule_entry* p_rr_entry = p_ent->get_rule_entry(); + std::deque* p_rr_val; + if (p_rr_entry && p_rr_entry->get_val(p_rr_val)) { + route_val* p_val = NULL; + in_addr_t peer_ip = p_ent->get_key().get_dst_ip(); + unsigned char table_id; + for (std::deque::iterator p_rule_val = p_rr_val->begin(); p_rule_val != p_rr_val->end(); p_rule_val++) { + table_id = (*p_rule_val)->get_table_id(); + if (find_route_val(peer_ip, table_id, p_val)) { + p_ent->set_val(p_val); + if (b_register_to_net_dev) { + //in_addr_t src_addr = p_val->get_src_addr(); + //net_device_val* p_ndv = g_p_net_device_table_mgr->get_net_device_val(src_addr); + + // Check if broadcast IP which is NOT supported + if (IS_BROADCAST_N(peer_ip)) { + rt_mgr_logdbg("Disabling Offload for route_entry '%s' - this is BC address", p_ent->to_str().c_str()); + // 
Need to route traffic to/from OS + // Prevent registering of net_device to route entry + } + // Check if: Local loopback over Ethernet case which was not supported before OFED 2.1 + /*else if (p_ndv && (p_ndv->get_transport_type() == VMA_TRANSPORT_ETH) && (peer_ip == src_addr)) { + rt_mgr_logdbg("Disabling Offload for route_entry '%s' - this is an Ethernet unicast loopback route", p_ent->to_str().c_str()); + // Need to route traffic to/from OS + // Prevent registering of net_device to route entry + }*/ + else { + // register to net device for bonding events + p_ent->register_to_net_device(); + } + } + // All good, validate the new route entry + p_ent->set_entry_valid(); + break; + } else { + rt_mgr_logdbg("could not find route val for route_entry '%s in table %u'", p_ent->to_str().c_str(), table_id); + } + } + } + else { + rt_mgr_logdbg("rule entry is not valid"); + } + } +} + +route_entry* route_table_mgr::create_new_entry(route_rule_table_key key, const observer *obs) +{ + // no need for lock - lock is activated in cache_collection_mgr::register_observer + + rt_mgr_logdbg(""); + NOT_IN_USE(obs); + route_entry* p_ent = new route_entry(key); + update_entry(p_ent, true); + rt_mgr_logdbg("new entry %p created successfully", p_ent); + return p_ent; +} + +void route_table_mgr::new_route_event(route_val* netlink_route_val) +{ + if (!netlink_route_val) { + rt_mgr_logdbg("Invalid route entry"); + return; + } + + if (m_tab.entries_num >= MAX_TABLE_SIZE) { + rt_mgr_logwarn("No available space for new route entry"); + return; + } + + auto_unlocker lock(m_lock); + route_val* p_route_val = &m_tab.value[m_tab.entries_num]; + p_route_val->set_dst_addr(netlink_route_val->get_dst_addr()); + p_route_val->set_dst_mask(netlink_route_val->get_dst_mask()); + p_route_val->set_dst_pref_len(netlink_route_val->get_dst_pref_len()); + p_route_val->set_src_addr(netlink_route_val->get_src_addr()); + p_route_val->set_gw(netlink_route_val->get_gw_addr()); + 
p_route_val->set_protocol(netlink_route_val->get_protocol()); + p_route_val->set_scope(netlink_route_val->get_scope()); + p_route_val->set_type(netlink_route_val->get_type()); + p_route_val->set_table_id(netlink_route_val->get_table_id()); + p_route_val->set_if_index(netlink_route_val->get_if_index()); + p_route_val->set_if_name(const_cast (netlink_route_val->get_if_name())); + p_route_val->set_mtu((netlink_route_val->get_mtu())); + p_route_val->set_state(true); + p_route_val->set_str(); + p_route_val->print_val(); + ++m_tab.entries_num; +} + +void route_table_mgr::notify_cb(event *ev) +{ + rt_mgr_logdbg("received route event from netlink"); + + route_nl_event *route_netlink_ev = dynamic_cast (ev); + if (!route_netlink_ev) { + rt_mgr_logwarn("Received non route event!!!"); + return; + } + + netlink_route_info* p_netlink_route_info = route_netlink_ev->get_route_info(); + if (!p_netlink_route_info) { + rt_mgr_logdbg("Received invalid route event!!!"); + return; + } + + switch(route_netlink_ev->nl_type) { + case RTM_NEWROUTE: + new_route_event(p_netlink_route_info->get_route_val()); + break; +#if 0 + case RTM_DELROUTE: + del_route_event(p_netlink_route_info->get_route_val()); + break; +#endif + default: + rt_mgr_logdbg("Route event (%u) is not handled", route_netlink_ev->nl_type); + break; + } +} diff --git a/src/vma/proto/route_table_mgr.h b/src/vma/proto/route_table_mgr.h new file mode 100644 index 0000000..a32ca2d --- /dev/null +++ b/src/vma/proto/route_table_mgr.h @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef ROUTE_TABLE_MGR_H +#define ROUTE_TABLE_MGR_H + +#include +#include +#include +#include "vma/infra/cache_subject_observer.h" +#include "vma/netlink/netlink_wrapper.h" +#include "vma/event/netlink_event.h" +#include "rule_table_mgr.h" +#include "route_entry.h" + +#define ADDR_LEN 46 // needs 16-bytes for IPv4, and 46-bytes for IPv6 + +typedef std::tr1::unordered_map in_addr_route_entry_map_t; +typedef std::tr1::unordered_map *> rt_tbl_cach_entry_map_t; + +struct route_result { + in_addr_t p_src; + in_addr_t p_gw; + uint32_t mtu; + route_result(): p_src(0), p_gw(0) ,mtu(0) {} +}; + +class route_table_mgr : public netlink_socket_mgr, public cache_table_mgr, public observer +{ +public: + route_table_mgr(); + virtual ~route_table_mgr(); + + bool route_resolve(IN route_rule_table_key key, OUT route_result &res); + + route_entry* create_new_entry(route_rule_table_key key, const observer *obs); + void update_entry(INOUT route_entry* p_ent, bool b_register_to_net_dev = false); + + virtual void notify_cb(event *ev); + +protected: + virtual bool parse_enrty(nlmsghdr *nl_header, route_val *p_val); + +private: + // in constructor creates route_entry for each net_dev, to receive events in case there are no other route_entrys + in_addr_route_entry_map_t m_rte_list_for_each_net_dev; + + bool find_route_val(in_addr_t &dst_addr, unsigned char table_id, route_val* &p_val); + + // save current main rt table + void update_tbl(); + void parse_attr(struct rtattr *rt_attribute, route_val *p_val); + + void rt_mgr_update_source_ip(); + + void new_route_event(route_val* netlink_route_val); +}; + +extern route_table_mgr* g_p_route_table_mgr; + +#endif /* ROUTE_TABLE_MGR_H */ diff --git a/src/vma/proto/route_val.cpp b/src/vma/proto/route_val.cpp new file mode 100644 index 0000000..7776fa8 --- /dev/null +++ b/src/vma/proto/route_val.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include +#include +#include "vma/util/if.h" + +#include "route_val.h" +#include "route_table_mgr.h" +#include "vma/dev/net_device_table_mgr.h" + +#define MODULE_NAME "rtv" + +#define rt_val_loginfo __log_info_info +#define rt_val_logdbg __log_info_dbg +#define rt_val_logfunc __log_info_func + +route_val::route_val() +{ + m_dst_addr = 0; + m_dst_mask = 0; + m_dst_pref_len = 0; + m_src_addr = 0; + m_gw = 0; + m_protocol = 0; + m_scope = 0; + m_type = 0; + m_table_id = 0; + memset(m_if_name, 0, IFNAMSIZ * sizeof(char)); + m_if_index = 0; + m_is_valid = false; + m_b_deleted = false; + m_b_if_up = true; + m_mtu = 0; + memset(m_str, 0, BUFF_SIZE * sizeof(char)); +} + +void route_val::set_str() +{ + char str_addr[INET_ADDRSTRLEN]; + char str_x[100] = {0}; + + strcpy(m_str, "dst:"); + + str_x[0] = '\0'; + if (m_dst_addr != 0) { + inet_ntop(AF_INET, &m_dst_addr_in_addr, str_addr, sizeof(str_addr)); + sprintf(str_x, " %-15s", str_addr); + } else { + sprintf(str_x, " %-15s", "default"); + } + strcat(m_str, str_x); + + str_x[0] = '\0'; + if (m_dst_mask != 0) { + inet_ntop(AF_INET, &m_dst_mask_in_addr, str_addr, sizeof(str_addr)); + sprintf(str_x, " netmask: %-15s", str_addr); + } + strcat(m_str, str_x); + + str_x[0] = '\0'; + if (m_gw != 0) { + inet_ntop(AF_INET, &m_gw_in_addr, str_addr, sizeof(str_addr)); + sprintf(str_x, " gw: %-15s", str_addr); + } + strcat(m_str, str_x); + + str_x[0] = '\0'; + sprintf(str_x, " dev: %-5s", m_if_name); + strcat(m_str, str_x); + + str_x[0] = '\0'; + if (m_src_addr != 0) { + inet_ntop(AF_INET, &m_src_addr_in_addr, str_addr, sizeof(str_addr)); + sprintf(str_x, " src: %-15s", str_addr); + } else { + sprintf(str_x, " "); + } + strcat(m_str, str_x); + + str_x[0] = '\0'; + if (m_table_id != RT_TABLE_MAIN) { + sprintf(str_x, " table :%-10u", m_table_id); + } else { + sprintf(str_x, " table :%-10s", "main"); + } + strcat(m_str, str_x); + + str_x[0] = '\0'; + sprintf(str_x, " scope %3d type %2d index %2d", m_scope, m_type, 
m_if_index); + strcat(m_str, str_x); + // add route metrics + if (m_mtu) { + sprintf(str_x, " mtu %d", m_mtu); + strcat(m_str, str_x); + } + if (m_b_deleted) { + sprintf(str_x, " ---> DELETED"); + } + strcat(m_str, str_x); +} + +void route_val::print_val() +{ + set_str(); + rt_val_logdbg("%s", to_str()); +} + +void route_val::set_mtu(uint32_t mtu) +{ + if (mtu > g_p_net_device_table_mgr->get_max_mtu()) { + rt_val_logdbg("route mtu cannot be bigger then max mtu set on devices"); + } else { + m_mtu = mtu; + } +} diff --git a/src/vma/proto/route_val.h b/src/vma/proto/route_val.h new file mode 100644 index 0000000..f6d9233 --- /dev/null +++ b/src/vma/proto/route_val.h @@ -0,0 +1,121 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef ROUTE_VAL_H +#define ROUTE_VAL_H + +#include +#include +#include "utils/bullseye.h" + +#define BUFF_SIZE 255 + +class route_val +{ +public: + route_val(); + virtual ~route_val() {}; + + inline void set_dst_addr(in_addr_t const &dst_addr) { m_dst_addr = dst_addr; }; + inline void set_dst_mask(in_addr_t const &dst_mask) { m_dst_mask = dst_mask; }; + inline void set_dst_pref_len(uint8_t dst_pref_len) { m_dst_pref_len = dst_pref_len; }; + inline void set_src_addr(in_addr_t const &src_addr) { m_src_addr = src_addr; }; + inline void set_gw(in_addr_t const &gw) { m_gw = gw; }; + inline void set_protocol(unsigned char protocol) { m_protocol = protocol; }; + inline void set_scope(unsigned char scope) { m_scope = scope; }; + inline void set_type(unsigned char type) { m_type = type; }; + inline void set_table_id(uint32_t table_id) { m_table_id = table_id; }; + void set_mtu(uint32_t mtu); + inline void set_if_index(int if_index) { m_if_index = if_index; }; + inline void set_if_name(char *if_name) { memcpy(m_if_name, if_name, IFNAMSIZ); }; + void set_str(); + + inline in_addr_t get_dst_addr() const { return m_dst_addr; }; + inline in_addr_t get_dst_mask() const { return m_dst_mask; }; + inline uint8_t get_dst_pref_len() const { return m_dst_pref_len; }; + inline in_addr_t get_src_addr() const { return m_src_addr; }; + inline in_addr_t get_gw_addr() const { return m_gw; }; + inline unsigned char get_protocol() const { return m_protocol; }; + inline unsigned char get_scope() const { return m_scope; }; + inline unsigned char get_type() const { return m_type; }; + inline uint32_t get_table_id() const { return m_table_id; }; + inline int get_if_index() const { return m_if_index; }; + inline 
const char* get_if_name() const { return m_if_name; }; + inline uint32_t get_mtu() const { return m_mtu; }; + + inline void set_state(bool state) { m_is_valid = state; }; + inline bool is_valid() const { return m_is_valid; }; + + inline bool is_deleted() const { return m_b_deleted; }; + + inline bool is_if_up() const { return m_b_if_up; }; + + void print_val(); + char* to_str() { return m_str; }; + +private: + + union { + in_addr_t m_dst_addr; + in_addr m_dst_addr_in_addr; + }; + union { + in_addr_t m_dst_mask; + in_addr m_dst_mask_in_addr; + }; + uint8_t m_dst_pref_len; + union { + in_addr_t m_src_addr; + in_addr m_src_addr_in_addr; + }; + union { + in_addr_t m_gw; + in_addr m_gw_in_addr; + }; + + unsigned char m_protocol; + unsigned char m_scope; + unsigned char m_type; + uint32_t m_table_id; + + char m_if_name[IFNAMSIZ]; + int m_if_index; + + bool m_is_valid; + bool m_b_deleted; + bool m_b_if_up; + uint32_t m_mtu; + char m_str[BUFF_SIZE]; // Nice str to represent route_val +}; + +#endif /* ROUTE_VAL_H */ diff --git a/src/vma/proto/rule_entry.cpp b/src/vma/proto/rule_entry.cpp new file mode 100644 index 0000000..38c90a6 --- /dev/null +++ b/src/vma/proto/rule_entry.cpp @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "rule_entry.h" +#include "rule_table_mgr.h" +#include "vma/infra/cache_subject_observer.h" + +#define MODULE_NAME "rre" +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[%s]:%d:%s() " +#undef __INFO__ +#define __INFO__ to_str().c_str() + +#define rr_entry_logdbg __log_info_dbg + +rule_entry::rule_entry(route_rule_table_key rrk) : + cache_entry_subject*>(rrk) +{ + m_val = &values; +} + +bool rule_entry::get_val(INOUT std::deque* &val) +{ + rr_entry_logdbg(""); + val = m_val; + return is_valid(); +} + diff --git a/src/vma/proto/rule_entry.h b/src/vma/proto/rule_entry.h new file mode 100644 index 0000000..1c21d75 --- /dev/null +++ b/src/vma/proto/rule_entry.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef RULE_ENTRY_H +#define RULE_ENTRY_H + +#include "vma/infra/cache_subject_observer.h" +#include "vma/proto/route_rule_table_key.h" +#include "rule_val.h" + + +// This class represent an entry in rule table cashed history. 
+class rule_entry : public cache_entry_subject*> +{ +public: + friend class rule_table_mgr; + + rule_entry(route_rule_table_key rrk); + + bool get_val(INOUT std::deque* &val); + + inline bool is_valid(){ + /* TODO for future rules live updates */ + /* for (std::deque::iterator val = m_val->begin(); val != m_val->end(); val++) { + if (!(*val)->is_valid()) { + return false; + } + } */ + return !m_val->empty(); + } + + inline const string to_str() const { return get_key().to_str(); }; + +private: + std::deque values; +}; + +#endif /* RULE_ENTRY_H */ diff --git a/src/vma/proto/rule_table_mgr.cpp b/src/vma/proto/rule_table_mgr.cpp new file mode 100644 index 0000000..454e3e6 --- /dev/null +++ b/src/vma/proto/rule_table_mgr.cpp @@ -0,0 +1,280 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "utils/bullseye.h" +#include "utils/lock_wrapper.h" +#include "vlogger/vlogger.h" +#include "vma/util/vtypes.h" +#include "vma/util/utils.h" +#include "vma/util/if.h" +#include "rule_table_mgr.h" +#include "vma/sock/socket_fd_api.h" +#include "vma/sock/sock-redirect.h" +#include "ip_address.h" + +// debugging macros +#define MODULE_NAME "rrm:" + +#define rr_mgr_if_logpanic __log_panic +#define rr_mgr_logerr __log_err +#define rr_mgr_logwarn __log_warn +#define rr_mgr_loginfo __log_info +#define rr_mgr_logdbg __log_dbg +#define rr_mgr_logfunc __log_func +#define rr_mgr_logfuncall __log_funcall + +rule_table_mgr* g_p_rule_table_mgr = NULL; + +rule_table_mgr::rule_table_mgr() : netlink_socket_mgr(RULE_DATA_TYPE), cache_table_mgr*>("rule_table_mgr") +{ + + rr_mgr_logdbg(""); + + //Read Rule table from kernel and save it in local variable. + update_tbl(); + + //Print table + print_val_tbl(); + + rr_mgr_logdbg("Done"); +} + +//This function uses Netlink to get routing rules saved in kernel then saved it locally. +void rule_table_mgr::update_tbl() +{ + auto_unlocker lock(m_lock); + + netlink_socket_mgr::update_tbl(); + + return; +} + +// Parse received rule entry into custom object (rule_val). +// Parameters: +// nl_header : object that contain rule entry. +// p_val : custom object that contain parsed rule data. +// return true if its not related to local or default table, false otherwise. 
+bool rule_table_mgr::parse_enrty(nlmsghdr *nl_header, rule_val *p_val) +{ + int len; + struct rtmsg *rt_msg; + struct rtattr *rt_attribute; + + // get rule entry header + rt_msg = (struct rtmsg *) NLMSG_DATA(nl_header); + + // we are not concerned about the local and default rule table + if (rt_msg->rtm_family != AF_INET || rt_msg->rtm_table == RT_TABLE_LOCAL) + return false; + + p_val->set_protocol(rt_msg->rtm_protocol); + p_val->set_scope(rt_msg->rtm_scope); + p_val->set_type(rt_msg->rtm_type); + p_val->set_tos(rt_msg->rtm_tos); + p_val->set_table_id(rt_msg->rtm_table); + + len = RTM_PAYLOAD(nl_header); + rt_attribute = (struct rtattr *) RTM_RTA(rt_msg); + + for (;RTA_OK(rt_attribute, len);rt_attribute=RTA_NEXT(rt_attribute,len)) { + parse_attr(rt_attribute, p_val); + } + p_val->set_state(true); + p_val->set_str(); + return true; +} + +// Parse received rule attribute for given rule. +// Parameters: +// rt_attribute : object that contain rule attribute. +// p_val : custom object that contain parsed rule data. +void rule_table_mgr::parse_attr(struct rtattr *rt_attribute, rule_val *p_val) +{ + switch (rt_attribute->rta_type) { + case FRA_PRIORITY: + p_val->set_priority(*(uint32_t *)RTA_DATA(rt_attribute)); + break; + case FRA_DST: + p_val->set_dst_addr(*(in_addr_t *)RTA_DATA(rt_attribute)); + break; + case FRA_SRC: + p_val->set_src_addr(*(in_addr_t *)RTA_DATA(rt_attribute)); + break; + case FRA_IFNAME: + p_val->set_iif_name((char *)RTA_DATA(rt_attribute)); + break; + case FRA_TABLE: + p_val->set_table_id(*(uint32_t *)RTA_DATA(rt_attribute)); + break; +#if DEFINED_FRA_OIFNAME + case FRA_OIFNAME: + p_val->set_oif_name((char *)RTA_DATA(rt_attribute)); + break; +#endif + default: + rr_mgr_logdbg("got undetected rta_type %d %x", rt_attribute->rta_type, *(uint32_t *)RTA_DATA(rt_attribute)); + break; + } +} + + +// Create rule entry object for given destination key and fill it with matching rule value from rule table. 
+// Parameters: +// key : key object that contain information about destination. +// obs : object that contain observer for specific rule entry. +// Returns created rule entry object. +rule_entry* rule_table_mgr::create_new_entry(route_rule_table_key key, const observer *obs) +{ + rr_mgr_logdbg(""); + NOT_IN_USE(obs); + rule_entry* p_ent = new rule_entry(key); + update_entry(p_ent); + rr_mgr_logdbg("new entry %p created successfully", p_ent); + return p_ent; +} + +// Update invalid rule entry with matching rule value from rule table. +// Parameters: +// p_ent : rule entry that will be updated if it is invalid. +void rule_table_mgr::update_entry(rule_entry* p_ent) +{ + rr_mgr_logdbg("entry [%p]", p_ent); + auto_unlocker lock(m_lock); + + if (p_ent && !p_ent->is_valid()) { //if entry is found in the collection and is not valid + + rr_mgr_logdbg("rule_entry is not valid-> update value"); + std::deque* p_rrv; + p_ent->get_val(p_rrv); + /* p_rrv->clear(); TODO for future rule live updates */ + if (!find_rule_val(p_ent->get_key(), p_rrv)) { + rr_mgr_logdbg("ERROR: could not find rule val for rule_entry '%s'", p_ent->to_str().c_str()); + } + } +} + +// Find rule form rule table that match given destination info. +// Parameters: +// key : key object that contain information about destination. +// p_val : list of rule_val object that will contain information about all rule that match destination info +// Returns true if at least one rule match destination info, false otherwise. 
+bool rule_table_mgr::find_rule_val(route_rule_table_key key, std::deque* &p_val) +{ + rr_mgr_logfunc("destination info %s:", key.to_str().c_str()); + + for (int index = 0; index < m_tab.entries_num; index++) { + rule_val* p_val_from_tbl = &m_tab.value[index]; + if (p_val_from_tbl->is_valid() && is_matching_rule(key, p_val_from_tbl)) { + p_val->push_back(p_val_from_tbl); + rr_mgr_logdbg("found rule val[%p]: %s", p_val_from_tbl, p_val_from_tbl->to_str()); + } + } + + return !p_val->empty(); +} + +// Check matching between given destination info. and specific rule from rule table. +// Parameters: +// key : key object that contain information about destination. +// p_val : rule_val object that contain information about specific rule from rule table +// Returns true if destination info match rule value, false otherwise. +bool rule_table_mgr::is_matching_rule(route_rule_table_key key, rule_val* p_val) +{ + + in_addr_t m_dst_ip = key.get_dst_ip(); + in_addr_t m_src_ip = key.get_src_ip(); + uint8_t m_tos = key.get_tos(); + + in_addr_t rule_dst_ip = p_val->get_dst_addr(); + in_addr_t rule_src_ip = p_val->get_src_addr(); + uint8_t rule_tos = p_val->get_tos(); + char* rule_iif_name = (char *)p_val->get_iif_name(); + char* rule_oif_name = (char *)p_val->get_oif_name(); + + bool is_match = false; + + // Only destination IP, source IP and TOS are checked with rule, since IIF and OIF is not filled in dst_entry object. 
+ if ((rule_dst_ip == 0) || (rule_dst_ip == m_dst_ip)) { // Check match in destination IP + + if ((rule_src_ip == 0) || (rule_src_ip == m_src_ip)) { // Check match in source IP + + if ((rule_tos == 0) || (rule_tos == m_tos)) { // Check match in TOS value + + if (strcmp(rule_iif_name, "") == 0) { // Check that rule doesn't contain IIF since we can't check match with + + if (strcmp(rule_oif_name, "") == 0) { // Check that rule doesn't contain OIF since we can't check match with + is_match = true; + } + } + } + } + } + + return is_match; +} + +// Find table ID for given destination info. +// Parameters: +// key : key object that contain information about destination. +// table_id_list : list that will contain table ID for all rule that match destination info +// Returns true if at least one rule match destination info, false otherwise. +bool rule_table_mgr::rule_resolve(route_rule_table_key key, std::deque &table_id_list) +{ + rr_mgr_logdbg("dst info: '%s'", key.to_str().c_str()); + + std::deque values; + std::deque* p_values = &values; + auto_unlocker lock(m_lock); + if (find_rule_val(key, p_values)) { + for (std::deque::iterator val = values.begin(); val != values.end(); val++) { + table_id_list.push_back((*val)->get_table_id()); + rr_mgr_logdbg("dst info: '%s' resolved to table ID '%u'", key.to_str().c_str(), (*val)->get_table_id()); + } + } + + return !table_id_list.empty(); +} + diff --git a/src/vma/proto/rule_table_mgr.h b/src/vma/proto/rule_table_mgr.h new file mode 100644 index 0000000..d670528 --- /dev/null +++ b/src/vma/proto/rule_table_mgr.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef RULE_TABLE_MGR_H +#define RULE_TABLE_MGR_H + +#include +#include +#include +#include "vma/infra/cache_subject_observer.h" +#include "vma/proto/netlink_socket_mgr.h" +#include "rule_entry.h" + +/* +* This class manages routing rule related operation such as getting rules from kernel, +* finding table ID for given destination info and cashing usage history for rule table. 
+*/ +class rule_table_mgr : public netlink_socket_mgr, public cache_table_mgr*> +{ +public: + rule_table_mgr(); + + rule_entry* create_new_entry(route_rule_table_key key, const observer *obs); + void update_entry(rule_entry* p_ent); + bool rule_resolve(route_rule_table_key key, std::deque &table_id_list); + +protected: + virtual bool parse_enrty(nlmsghdr *nl_header, rule_val *p_val); + virtual void update_tbl(); + +private: + + void parse_attr(struct rtattr *rt_attribute, rule_val *p_val); + + bool find_rule_val(route_rule_table_key key, std::deque* &p_val); + bool is_matching_rule(route_rule_table_key rrk, rule_val* p_val); +}; + +extern rule_table_mgr* g_p_rule_table_mgr; + +#endif /* RULE_TABLE_MGR_H */ diff --git a/src/vma/proto/rule_val.cpp b/src/vma/proto/rule_val.cpp new file mode 100644 index 0000000..b70f657 --- /dev/null +++ b/src/vma/proto/rule_val.cpp @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include + +#include "rule_val.h" +#include "rule_table_mgr.h" + +#define MODULE_NAME "rrv" + +#define rr_val_loginfo __log_info_info +#define rr_val_logdbg __log_info_dbg +#define rr_val_logfunc __log_info_func + +rule_val::rule_val(): cache_observer() +{ + m_protocol = 0; + m_scope = 0; + m_type = 0; + m_dst_addr = 0; + m_src_addr = 0; + memset(m_oif_name, 0, IFNAMSIZ * sizeof(char)); + memset(m_iif_name, 0, IFNAMSIZ * sizeof(char)); + m_priority = 0; + m_tos = 0; + m_table_id = 0; + m_is_valid = false; + memset(m_str, 0, BUFF_SIZE * sizeof(char)); + +} + +//This function build string that represent a row in the rule table. 
+void rule_val::set_str() +{ + char str_addr[INET_ADDRSTRLEN]; + char str_x[100] = {0}; + + sprintf(m_str, "Priority :%-10u", m_priority); + + if (m_src_addr != 0) { + inet_ntop(AF_INET, &m_src_addr_in_addr, str_addr, sizeof(str_addr)); + sprintf(str_x, " from :%-10s", str_addr); + } + strcat(m_str, str_x); + + str_x[0] = '\0'; + if (m_dst_addr != 0) { + inet_ntop(AF_INET, &m_dst_addr_in_addr, str_addr, sizeof(str_addr)); + sprintf(str_x, " to :%-12s", str_addr); + } + strcat(m_str, str_x); + + str_x[0] = '\0'; + if (m_tos != 0) + sprintf(str_x, " tos :%-11u", m_tos); + strcat(m_str, str_x); + + str_x[0] = '\0'; + if (strcmp(m_iif_name, "") != 0) + sprintf(str_x, " iif :%-11s", m_iif_name); + strcat(m_str, str_x); + + str_x[0] = '\0'; + if (strcmp(m_oif_name, "") != 0) + sprintf(str_x, " oif :%-11s", m_oif_name); + strcat(m_str, str_x); + + str_x[0] = '\0'; + if (m_table_id != RT_TABLE_MAIN) + sprintf(str_x, " lookup table :%-10u", m_table_id); + else + sprintf(str_x, " lookup table :%-10s", "main"); + strcat(m_str, str_x); +} + +//This function prints a string that represent a row in the rule table as debug log. +void rule_val::print_val() +{ + set_str(); + rr_val_logdbg("%s", to_str()); +} diff --git a/src/vma/proto/rule_val.h b/src/vma/proto/rule_val.h new file mode 100644 index 0000000..a7e04bf --- /dev/null +++ b/src/vma/proto/rule_val.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef RULE_VAL_H +#define RULE_VAL_H + +#include +#include +#include "vma/util/if.h" +#include "vma/infra/cache_subject_observer.h" + +#define BUFF_SIZE 255 + +/* +This class will contain information for given routing rule entry. 
+*/ +class rule_val : public cache_observer +{ +public: + rule_val(); + virtual ~rule_val() {}; + + inline void set_dst_addr(in_addr_t const &dst_addr) { m_dst_addr = dst_addr; }; + inline void set_src_addr(in_addr_t const &src_addr) { m_src_addr = src_addr; }; + inline void set_protocol(unsigned char protocol) { m_protocol = protocol; }; + inline void set_scope(unsigned char scope) { m_scope = scope; }; + inline void set_type(unsigned char type) { m_type = type; }; + inline void set_tos(unsigned char tos) { m_tos = tos; }; + inline void set_table_id(uint32_t table_id) { m_table_id = table_id; }; + inline void set_iif_name(char *iif_name) { memcpy(m_iif_name, iif_name, IFNAMSIZ); }; + inline void set_oif_name(char *oif_name) { memcpy(m_oif_name, oif_name, IFNAMSIZ); }; + inline void set_priority(uint32_t priority) { m_priority = priority; }; + + void set_str(); + + inline in_addr_t get_dst_addr() const { return m_dst_addr; }; + inline in_addr_t get_src_addr() const { return m_src_addr; }; + inline unsigned char get_tos() const { return m_tos; }; + inline uint32_t get_table_id() const { return m_table_id; }; + inline const char* get_iif_name() const { return m_iif_name; }; + inline const char* get_oif_name() const { return m_oif_name; }; + + inline void set_state(bool state) { m_is_valid = state; }; + inline bool is_valid() const { return m_is_valid; }; + + void print_val(); + char* to_str() { return m_str; }; + +private: + + unsigned char m_protocol; + unsigned char m_scope; + unsigned char m_type; + unsigned char m_tos; + + union { + in_addr_t m_dst_addr; + in_addr m_dst_addr_in_addr; + }; + union { + in_addr_t m_src_addr; + in_addr m_src_addr_in_addr; + }; + char m_iif_name[IFNAMSIZ]; + char m_oif_name[IFNAMSIZ]; + uint32_t m_priority; + uint32_t m_table_id; + + bool m_is_valid; + + char m_str[BUFF_SIZE]; +}; + +#endif /* RULE_VAL_H */ diff --git a/src/vma/proto/vma_lwip.cpp b/src/vma/proto/vma_lwip.cpp new file mode 100644 index 0000000..42ce85c --- /dev/null 
+++ b/src/vma/proto/vma_lwip.cpp @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "utils/rdtsc.h" +#include "vlogger/vlogger.h" + +#include "vma/event/event_handler_manager.h" +#include "vma/sock/sockinfo_tcp.h" +#include "vma/lwip/init.h" +#include "vma/lwip/tcp_impl.h" +#include "vma_lwip.h" + +// debugging macros +#define MODULE_NAME "lwip" +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME ":%s%d:%s() " +#undef __INFO__ +#define __INFO__ "" + + +#define lwip_logpanic __log_info_panic +#define lwip_logerr __log_info_err +#define lwip_logwarn __log_info_warn +#define lwip_loginfo __log_info_info +#define lwip_logdbg __log_info_dbg +#define lwip_logfunc __log_info_func +#define lwip_logfuncall __log_info_funcall + +int32_t enable_wnd_scale = 0; +u32_t rcv_wnd_scale = 0; + +u32_t vma_lwip::sys_now(void) +{ + struct timespec now; + + gettimefromtsc(&now); + return now.tv_sec * 1000 + now.tv_nsec / 1000000; +} + +u8_t vma_lwip::read_tcp_timestamp_option(void) +{ + u8_t res = (safe_mce_sys().tcp_ts_opt == TCP_TS_OPTION_FOLLOW_OS) ? safe_mce_sys().sysctl_reader.get_net_ipv4_tcp_timestamps() : (safe_mce_sys().tcp_ts_opt == TCP_TS_OPTION_ENABLE ? 
1 : 0); + if (res) { +#if LWIP_TCP_TIMESTAMPS + lwip_logdbg("TCP timestamp option has been enabled"); +#else + lwip_logwarn("Cannot enable TCP timestamp option because LWIP_TCP_TIMESTAMPS is not defined"); + res = 0; +#endif + } + return res; +} + +vma_lwip *g_p_lwip = 0; + +/** + * LWIP "network" driver code + */ + +vma_lwip::vma_lwip() +{ + m_run_timers = false; + + if (*g_p_vlogger_level >= VLOG_DEBUG) + __vma_print_conf_file(__instance_list); + + lwip_logdbg(""); + + lwip_cc_algo_module = (enum cc_algo_mod)safe_mce_sys().lwip_cc_algo_mod; + + lwip_tcp_mss = get_lwip_tcp_mss(safe_mce_sys().mtu, safe_mce_sys().lwip_mss); + BULLSEYE_EXCLUDE_BLOCK_END + + enable_ts_option = read_tcp_timestamp_option(); + int is_window_scaling_enabled = safe_mce_sys().sysctl_reader.get_tcp_window_scaling(); + if(is_window_scaling_enabled) { + int rmem_max_value = safe_mce_sys().sysctl_reader.get_tcp_rmem()->max_value; + int core_rmem_max = safe_mce_sys().sysctl_reader.get_net_core_rmem_max(); + enable_wnd_scale = 1; + rcv_wnd_scale = get_window_scaling_factor(rmem_max_value, core_rmem_max); + } else { + enable_wnd_scale = 0; + rcv_wnd_scale = 0; + } + + //Bring up LWIP + lwip_init(); + lwip_logdbg("LWIP subsystem initialized"); + + register_tcp_tx_pbuf_alloc(sockinfo_tcp::tcp_tx_pbuf_alloc); + register_tcp_tx_pbuf_free(sockinfo_tcp::tcp_tx_pbuf_free); + register_tcp_seg_alloc(sockinfo_tcp::tcp_seg_alloc); + register_tcp_seg_free(sockinfo_tcp::tcp_seg_free); + register_ip_output(sockinfo_tcp::ip_output); + register_tcp_state_observer(sockinfo_tcp::tcp_state_observer); + register_ip_route_mtu(sockinfo_tcp::get_route_mtu); + register_sys_now(sys_now); + register_sys_readv(orig_os_api.readv); + set_tmr_resolution(safe_mce_sys().tcp_timer_resolution_msec); + //tcp_ticks increases in the rate of tcp slow_timer + void *node = g_p_event_handler_manager->register_timer_event(safe_mce_sys().tcp_timer_resolution_msec * 2, this, PERIODIC_TIMER, 0); + if (!node) { + lwip_logdbg("LWIP: failed to 
register timer event"); + free_lwip_resources(); + throw_vma_exception("LWIP: failed to register timer event"); + } +} + +vma_lwip::~vma_lwip() +{ + free_lwip_resources(); +} + +void vma_lwip::free_lwip_resources(void) +{ + /* TODO - revert the constructor */ +} + +void vma_lwip::handle_timer_expired(void* user_data) { + NOT_IN_USE(user_data); + tcp_ticks++; +} + +uint32_t get_lwip_tcp_mss(uint32_t mtu, uint32_t lwip_mss) +{ + uint32_t _lwip_tcp_mss; + + /* + * lwip_tcp_mss calculation + * 1. safe_mce_sys().mtu==0 && safe_mce_sys().lwip_mss==0 ==> lwip_tcp_mss = 0 (namelyl-must be calculated per interface) + * 2. safe_mce_sys().mtu==0 && safe_mce_sys().lwip_mss!=0 ==> lwip_tcp_mss = safe_mce_sys().lwip_mss + * 3. safe_mce_sys().mtu!=0 && safe_mce_sys().lwip_mss==0 ==> lwip_tcp_mss = safe_mce_sys().mtu - IP header len - TCP header len (must be positive) + * 4. safe_mce_sys().mtu!=0 && safe_mce_sys().lwip_mss!=0 ==> lwip_tcp_mss = safe_mce_sys().lwip_mss + */ + switch (lwip_mss) { + case MSS_FOLLOW_MTU: /* 0 */ + switch(mtu) { + case MTU_FOLLOW_INTERFACE: + _lwip_tcp_mss = 0; /* MSS must follow the specific MTU per interface */ + break; + default: + // set MSS to match VMA_MTU, MSS is equal to (VMA_MTU-40), but forced to be at least 1. + _lwip_tcp_mss = (MAX(mtu, (IP_HLEN+TCP_HLEN+1)) - IP_HLEN-TCP_HLEN); + break; + } + break; + default: + _lwip_tcp_mss = (MAX(lwip_mss, 1)); + break; + } + return _lwip_tcp_mss; +} diff --git a/src/vma/proto/vma_lwip.h b/src/vma/proto/vma_lwip.h new file mode 100644 index 0000000..a334541 --- /dev/null +++ b/src/vma/proto/vma_lwip.h @@ -0,0 +1,102 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef _VMA_LWIP_H +#define _VMA_LWIP_H + +#include "vma/event/timer_handler.h" +#include "vma/proto/mem_buf_desc.h" +#include "vma/sock/pkt_rcvr_sink.h" +#include "vma/lwip/tcp.h" + +typedef enum vma_wr_tx_packet_attr { + /* 8 bits are reserved for TCP flags (see lwip/tcp.h) + * this option should be synchronized with lwip/tcp value + */ + /* retransmit operation. */ + VMA_TX_PACKET_REXMIT = TCP_WRITE_REXMIT, /* 0x08 */ + /* nop send operation. */ + VMA_TX_PACKET_DUMMY = TCP_WRITE_DUMMY, /* 0x10 */ + /* large segment offload operation. */ + VMA_TX_PACKET_TSO = TCP_WRITE_TSO, /* 0x20 */ + /* sendfile operation. 
*/ + VMA_TX_FILE = TCP_WRITE_FILE, /* 0x40 */ + + /* MLX5_ETH_WQE_L3_CSUM offload to HW L3 (IP) header checksum */ + VMA_TX_PACKET_L3_CSUM = (1 << 6), /* hardcoded values. It is the same as VMA_TX_FILE but there is no conflict */ + /* MLX5_ETH_WQE_L4_CSUM offload to HW L4 (TCP/UDP) header checksum */ + VMA_TX_PACKET_L4_CSUM = (1 << 7), /* hardcoded values */ + /* blocking send operation */ + VMA_TX_PACKET_BLOCK = (1 << 8), + /* Force SW checksum */ + VMA_TX_SW_CSUM = (1 << 9), +} vma_wr_tx_packet_attr; + +static inline bool is_set(vma_wr_tx_packet_attr state_, vma_wr_tx_packet_attr tx_mode_) +{ + return (uint32_t)state_ & (uint32_t)tx_mode_; +} + +static inline const char* lwip_cc_algo_str(uint32_t algo) +{ + switch (algo) { + case CC_MOD_CUBIC: return "(CUBIC)"; + case CC_MOD_NONE: return "(NONE)"; + case CC_MOD_LWIP: + default: return "(LWIP)"; + } +} + +class vma_lwip : public timer_handler +{ +public: + vma_lwip(); + virtual ~vma_lwip(); + + virtual void handle_timer_expired(void* user_data); + + static u32_t sys_now(void); + +private: + bool m_run_timers; + + void free_lwip_resources(void); + + static u8_t read_tcp_timestamp_option(void); +}; + +extern vma_lwip *g_p_lwip; + +uint32_t get_lwip_tcp_mss(uint32_t mtu, uint32_t lwip_mss); + +#endif diff --git a/src/vma/sock/cleanable_obj.h b/src/vma/sock/cleanable_obj.h new file mode 100644 index 0000000..3480171 --- /dev/null +++ b/src/vma/sock/cleanable_obj.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef CLEANABLE_OBJ_H_ +#define CLEANABLE_OBJ_H_ + +// This interface should be implemented by classes that we do not want to delete explicitly. +// For example, classes that inherit timer_handler should be deleted only from the context of the internal thread. +// Instead of calling delete for the object, call clean_obj() which should handle the deletion of the object. +class cleanable_obj +{ +public: + cleanable_obj(){ m_b_cleaned = false; }; + + virtual ~cleanable_obj(){}; + + virtual void clean_obj(){ set_cleaned(); delete this; }; + + bool is_cleaned(){ return m_b_cleaned; }; + +protected: + + void set_cleaned(){ m_b_cleaned = true; }; + +private: + + bool m_b_cleaned; // indicate that clean_obj() was called. 
+}; + +#endif /* CLEANABLE_OBJ_H_ */ diff --git a/src/vma/sock/fd_collection.cpp b/src/vma/sock/fd_collection.cpp new file mode 100644 index 0000000..99febfc --- /dev/null +++ b/src/vma/sock/fd_collection.cpp @@ -0,0 +1,682 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + + +#include + +#include "utils/bullseye.h" +#include "vma/util/libvma.h" +#include "fd_collection.h" +#include "sock-redirect.h" +#include "socket_fd_api.h" +#include "sockinfo_udp.h" +#include "pipeinfo.h" +#include "sockinfo_tcp.h" +#include "vma/iomux/epfd_info.h" + +#undef MODULE_NAME +#define MODULE_NAME "fdc:" +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + +#define fdcoll_logpanic __log_panic +#define fdcoll_logerr __log_err +#define fdcoll_logwarn __log_warn +#define fdcoll_loginfo __log_info +#define fdcoll_logdetails __log_details +#define fdcoll_logdbg __log_dbg +#define fdcoll_logfunc __log_func + + +fd_collection* g_p_fd_collection = NULL; + +fd_collection::fd_collection() : + lock_mutex_recursive("fd_collection"), + m_timer_handle(0), + m_b_sysvar_offloaded_sockets(safe_mce_sys().offloaded_sockets) +{ + fdcoll_logfunc(""); + + m_pendig_to_remove_lst.set_id("fd_collection (%p) : m_pendig_to_remove_lst", this); + + m_n_fd_map_size = 1024; + struct rlimit rlim; + if ((getrlimit(RLIMIT_NOFILE, &rlim) == 0) && ((int)rlim.rlim_max > m_n_fd_map_size)) + m_n_fd_map_size = rlim.rlim_max; + fdcoll_logdbg("using open files max limit of %d file descriptors", m_n_fd_map_size); + + m_p_sockfd_map = new socket_fd_api*[m_n_fd_map_size]; + memset(m_p_sockfd_map, 0, m_n_fd_map_size * sizeof(socket_fd_api*)); + + m_p_epfd_map = new epfd_info*[m_n_fd_map_size]; + memset(m_p_epfd_map, 0, m_n_fd_map_size * sizeof(epfd_info*)); + + m_p_cq_channel_map = new cq_channel_info*[m_n_fd_map_size]; + memset(m_p_cq_channel_map, 0, m_n_fd_map_size * sizeof(cq_channel_info*)); + + m_p_tap_map = new ring_tap*[m_n_fd_map_size]; + memset(m_p_tap_map, 0, m_n_fd_map_size * sizeof(ring_tap*)); +} + +fd_collection::~fd_collection() +{ + fdcoll_logfunc(""); + + clear(); + m_n_fd_map_size = -1; + + delete [] m_p_sockfd_map; + m_p_sockfd_map = NULL; + + delete [] m_p_epfd_map; + m_p_epfd_map = NULL; + + delete [] m_p_cq_channel_map; + m_p_cq_channel_map = NULL; + + 
delete [] m_p_tap_map; + m_p_tap_map = NULL; + + // TODO: check if NOT empty - apparently one of them contains 1 element according to debug printout from ~vma_list_t + m_epfd_lst.clear_without_cleanup(); + m_pendig_to_remove_lst.clear_without_cleanup(); + +} + +//Triggers connection close of all handled fds. +//This is important for TCP connection which needs some time to terminate the connection, +//before the connection can be finally and properly closed. +void fd_collection::prepare_to_close() +{ + lock(); + for (int fd = 0; fd < m_n_fd_map_size; ++fd) { + if (m_p_sockfd_map[fd]) { + if(!g_is_forked_child) { + socket_fd_api *p_sfd_api = get_sockfd(fd); + if (p_sfd_api) { + p_sfd_api->prepare_to_close(true); + } + } + } + } + unlock(); +} + +void fd_collection::clear() +{ + int fd; + + fdcoll_logfunc(""); + + if (!m_p_sockfd_map) + return; + + lock(); + + if (m_timer_handle) { + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = 0; + } + + /* internal thread should be already dead and + * these sockets can not been deleted through the it. + */ + while (!m_pendig_to_remove_lst.empty()) { + socket_fd_api *p_sfd_api = m_pendig_to_remove_lst.get_and_pop_back(); + p_sfd_api->clean_obj(); + } + + /* Clean up all left overs sockinfo + */ + for (fd = 0; fd < m_n_fd_map_size; ++fd) { + if (m_p_sockfd_map[fd]) { + if(!g_is_forked_child) { + socket_fd_api *p_sfd_api = get_sockfd(fd); + if (p_sfd_api) { + p_sfd_api->statistics_print(); + p_sfd_api->clean_obj(); + } + } + /**** Problem here - if one sockinfo is in a blocked call rx()/tx() then this will block too!!! 
+ * also - statistics_print() and destructor_helper() should not be called in two lines above, because they are called from the dtor + *delete m_p_sockfd_map[fd]; + */ + m_p_sockfd_map[fd] = NULL; + fdcoll_logdbg("destroyed fd=%d", fd); + } + + if (m_p_epfd_map[fd]) { + epfd_info *p_epfd = get_epfd(fd); + if (p_epfd) { + delete p_epfd; + } + m_p_epfd_map[fd] = NULL; + fdcoll_logdbg("destroyed epfd=%d", fd); + } + + if (m_p_cq_channel_map[fd]) { + cq_channel_info *p_cq_ch_info = get_cq_channel_fd(fd); + if (p_cq_ch_info) { + delete p_cq_ch_info; + } + m_p_cq_channel_map[fd] = NULL; + fdcoll_logdbg("destroyed cq_channel_fd=%d", fd); + } + + if (m_p_tap_map[fd]) { + m_p_tap_map[fd] = NULL; + fdcoll_logdbg("destroyed tapfd=%d", fd); + } + } + + unlock(); + fdcoll_logfunc("done"); +} + +int fd_collection::addsocket(int fd, int domain, int type, bool check_offload /*= false*/) +{ + transport_t transport; + const int SOCK_TYPE_MASK = 0xf; + int sock_type = type & SOCK_TYPE_MASK; + int sock_flags = type & ~SOCK_TYPE_MASK; + + if (check_offload && !create_offloaded_sockets()) { + fdcoll_logdbg("socket [fd=%d, domain=%d, type=%d] is not offloaded by thread rules or by VMA_OFFLOADED_SOCKETS", fd, domain, type); + return -1; + } + + // IPV4 domain only (at least today) + if (domain != AF_INET) + return -1; + + fdcoll_logfunc("fd=%d", fd); + + if (!is_valid_fd(fd)) + return -1; + + lock(); + + // Sanity check to remove any old sockinfo object using the same fd!! + socket_fd_api* p_sfd_api_obj = get_sockfd(fd); + BULLSEYE_EXCLUDE_BLOCK_START + if (p_sfd_api_obj) { + fdcoll_logwarn("[fd=%d] Deleting old duplicate sockinfo object (%p)", fd, p_sfd_api_obj); + unlock(); + handle_close(fd); + lock(); + } + BULLSEYE_EXCLUDE_BLOCK_END + + unlock(); + try { + switch (sock_type) { + case SOCK_DGRAM: + { transport = __vma_match_by_program(PROTO_UDP, safe_mce_sys().app_id); + if (transport == TRANS_OS) { + fdcoll_logdbg("All UDP rules are consistent and instructing to use OS. 
TRANSPORT: OS"); + return -1; + } + fdcoll_logdbg("UDP rules are either not consistent or instructing to use VMA. TRANSPORT: VMA"); + p_sfd_api_obj = new sockinfo_udp(fd); + break; + } + case SOCK_STREAM: + { + transport = __vma_match_by_program(PROTO_TCP, safe_mce_sys().app_id); + if (transport == TRANS_OS) { + fdcoll_logdbg("All TCP rules are consistent and instructing to use OS.transport == USE_OS"); + return -1; + } + fdcoll_logdbg("TCP rules are either not consistent or instructing to use VMA.transport == USE_VMA"); + p_sfd_api_obj = new sockinfo_tcp(fd); + break; + } + default: + fdcoll_logdbg("unsupported socket type=%d", sock_type); + return -1; + } + } catch (vma_exception& e) { + fdcoll_logdbg("recovering from %s", e.what()); + return -1; + } + lock(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (p_sfd_api_obj == NULL) { + fdcoll_logpanic("[fd=%d] Failed creating new sockinfo (%m)", fd); + } + BULLSEYE_EXCLUDE_BLOCK_END + + if (sock_flags) { + if (sock_flags & SOCK_NONBLOCK) + p_sfd_api_obj->fcntl(F_SETFL, O_NONBLOCK); + if (sock_flags & SOCK_CLOEXEC) + p_sfd_api_obj->fcntl(F_SETFD, FD_CLOEXEC); + } + + m_p_sockfd_map[fd] = p_sfd_api_obj; + + unlock(); + + return fd; +} + +bool fd_collection::create_offloaded_sockets() +{ + bool ret = m_b_sysvar_offloaded_sockets; + + lock(); + if (m_offload_thread_rule.find(pthread_self()) == m_offload_thread_rule.end()) { + unlock(); + return ret; + } + unlock(); + + return !ret; +} + +/* + * Create sockets on the given thread as offloaded/not-offloaded. + * pass true for offloaded, false for not-offloaded. 
+ */ +void fd_collection::offloading_rule_change_thread(bool offloaded, pthread_t tid) +{ + fdcoll_logdbg("tid=%ul, offloaded=%d", tid, offloaded); + + lock(); + if (offloaded == m_b_sysvar_offloaded_sockets) { + m_offload_thread_rule.erase(tid); + } else { + m_offload_thread_rule[tid] = 1; + } + unlock(); +} + +void fd_collection::statistics_print_helper(int fd, vlog_levels_t log_level) +{ + socket_fd_api* socket_fd; + epfd_info* epoll_fd; + + if ((socket_fd = get_sockfd(fd))) { + vlog_printf(log_level, "==================== SOCKET FD ===================\n"); + socket_fd->statistics_print(log_level); + goto found_fd; + } + if ((epoll_fd = get_epfd(fd))) { + vlog_printf(log_level, "==================== EPOLL FD ====================\n"); + epoll_fd->statistics_print(log_level); + goto found_fd; + } + + return; + +found_fd: + + vlog_printf(log_level, "==================================================\n"); +} + +void fd_collection::statistics_print(int fd, vlog_levels_t log_level) +{ + vlog_printf(log_level, "==================================================\n"); + if (fd) { + vlog_printf(log_level, "============ DUMPING FD %d STATISTICS ============\n", fd); + g_p_fd_collection->statistics_print_helper(fd, log_level); + } else { + vlog_printf(log_level, "======= DUMPING STATISTICS FOR ALL OPEN FDS ======\n"); + int fd_map_size = g_p_fd_collection->get_fd_map_size(); + for (int i = 0 ; i < fd_map_size ; i++) { + g_p_fd_collection->statistics_print_helper(i, log_level); + } + } + vlog_printf(log_level, "==================================================\n"); +} + +int fd_collection::addpipe(int fdrd, int fdwr) +{ + fdcoll_logfunc("fdrd=%d, fdwr=%d", fdrd, fdwr); + + if (!is_valid_fd(fdrd) || !is_valid_fd(fdwr)) + return -1; + + lock(); + + // Sanity check to remove any old objects using the same fd!! 
+ socket_fd_api* p_fdrd_api_obj = get_sockfd(fdrd); + BULLSEYE_EXCLUDE_BLOCK_START + if (p_fdrd_api_obj) { + fdcoll_logwarn("[fd=%d] Deleting old duplicate object (%p)", fdrd, p_fdrd_api_obj); + unlock(); + handle_close(fdrd, true); + lock(); + } + BULLSEYE_EXCLUDE_BLOCK_END + socket_fd_api* p_fdwr_api_obj = get_sockfd(fdwr); + BULLSEYE_EXCLUDE_BLOCK_START + if (p_fdwr_api_obj) { + fdcoll_logwarn("[fd=%d] Deleting old duplicate object (%p)", fdwr, p_fdwr_api_obj); + unlock(); + handle_close(fdwr, true); + lock(); + } + BULLSEYE_EXCLUDE_BLOCK_END + + unlock(); + p_fdrd_api_obj = new pipeinfo(fdrd); + p_fdwr_api_obj = new pipeinfo(fdwr); + lock(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (p_fdrd_api_obj == NULL) { + fdcoll_logpanic("[fd=%d] Failed creating new pipeinfo (%m)", fdrd); + } + if (p_fdwr_api_obj == NULL) { + fdcoll_logpanic("[fd=%d] Failed creating new pipeinfo (%m)", fdwr); + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_p_sockfd_map[fdrd] = p_fdrd_api_obj; + m_p_sockfd_map[fdwr] = p_fdwr_api_obj; + + unlock(); + + return 0; +} + +int fd_collection::addepfd(int epfd, int size) +{ + fdcoll_logfunc("epfd=%d", epfd); + + if (!is_valid_fd(epfd)) + return -1; + + lock(); + + // Sanity check to remove any old sockinfo object using the same fd!! 
+ epfd_info* p_fd_info = get_epfd(epfd); + if (p_fd_info) { + fdcoll_logwarn("[fd=%d] Deleting old duplicate sockinfo object (%p)", epfd, p_fd_info); + unlock(); + handle_close(epfd, true); + lock(); + } + + unlock(); + p_fd_info = new epfd_info(epfd, size); + lock(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (p_fd_info == NULL) { + fdcoll_logpanic("[fd=%d] Failed creating new sockinfo (%m)", epfd); + } + BULLSEYE_EXCLUDE_BLOCK_END + m_p_epfd_map[epfd] = p_fd_info; + m_epfd_lst.push_back(p_fd_info); + + unlock(); + + return 0; +} + + +int fd_collection::addtapfd(int tapfd, ring_tap* p_ring) +{ + fdcoll_logfunc("tapfd=%d, p_ring=%p", tapfd, p_ring); + + if (!is_valid_fd(tapfd)) + return -1; + + lock(); + + if (get_tapfd(tapfd)) { + fdcoll_logwarn("[tapfd=%d] already exist in the collection (ring %p)", tapfd, get_tapfd(tapfd)); + return -1; + } + + m_p_tap_map[tapfd] = p_ring; + + unlock(); + + return 0; +} + +int fd_collection::add_cq_channel_fd(int cq_ch_fd, ring* p_ring) +{ + fdcoll_logfunc("cq_ch_fd=%d", cq_ch_fd); + + if (!is_valid_fd(cq_ch_fd)) + return -1; + + lock(); + + epfd_info* p_fd_info = get_epfd(cq_ch_fd); + BULLSEYE_EXCLUDE_BLOCK_START + if (p_fd_info) { + fdcoll_logwarn("[fd=%d] Deleting old duplicate sockinfo object (%p)", cq_ch_fd, p_fd_info); + unlock(); + handle_close(cq_ch_fd, true); + lock(); + } + BULLSEYE_EXCLUDE_BLOCK_END + + // Sanity check to remove any old objects using the same fd!! 
+ socket_fd_api* p_cq_ch_fd_api_obj = get_sockfd(cq_ch_fd); + BULLSEYE_EXCLUDE_BLOCK_START + if (p_cq_ch_fd_api_obj) { + fdcoll_logwarn("[fd=%d] Deleting old duplicate object (%p)", cq_ch_fd, p_cq_ch_fd_api_obj); + unlock(); + handle_close(cq_ch_fd, true); + lock(); + } + BULLSEYE_EXCLUDE_BLOCK_END + + // Check if cq_channel_info was already created + cq_channel_info* p_cq_ch_info = get_cq_channel_fd(cq_ch_fd); + BULLSEYE_EXCLUDE_BLOCK_START + if (p_cq_ch_info) { + fdcoll_logwarn("cq channel fd already exists in fd_collection"); + m_p_cq_channel_map[cq_ch_fd] = NULL; + delete p_cq_ch_info; + p_cq_ch_info = NULL; + } + BULLSEYE_EXCLUDE_BLOCK_END + + unlock(); + p_cq_ch_info = new cq_channel_info(p_ring); + lock(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (p_cq_ch_info == NULL) { + fdcoll_logpanic("[fd=%d] Failed creating new cq_channel_info (%m)", cq_ch_fd); + } + BULLSEYE_EXCLUDE_BLOCK_END + m_p_cq_channel_map[cq_ch_fd] = p_cq_ch_info; + + unlock(); + + return 0; +} + +int fd_collection::del_sockfd(int fd, bool b_cleanup /*=false*/) +{ + int ret_val = -1; + socket_fd_api *p_sfd_api; + + p_sfd_api = get_sockfd(fd); + + if (p_sfd_api) { + //TCP socket need some timer to before it can be deleted, + //in order to gracefuly terminate TCP connection + //so we have to stages: + //1. Prepare to close: kikstarts TCP connection termination + //2. Socket deletion when TCP connection == CLOSED + if (p_sfd_api->prepare_to_close()) { + //the socket is already closable + ret_val = del(fd, b_cleanup, m_p_sockfd_map); + } + else { + lock(); + //The socket is not ready for close. + //Delete it from fd_col and add it to pending_to_remove list. + //This socket will be handled and destroyed now by fd_col. + //This will be done from fd_col timer handler. 
+ if (m_p_sockfd_map[fd] == p_sfd_api) { + m_p_sockfd_map[fd] = NULL; + m_pendig_to_remove_lst.push_front(p_sfd_api); + } + + if (m_pendig_to_remove_lst.size() == 1) { + //Activate timer + try { + m_timer_handle = g_p_event_handler_manager->register_timer_event(250, this, PERIODIC_TIMER, 0); + } catch (vma_exception &error) { + fdcoll_logdbg("recovering from %s", error.what()); + unlock(); + return -1; + } + } + unlock(); + ret_val = 0; + } + } + + return ret_val; +} + +int fd_collection::del_epfd(int fd, bool b_cleanup /*=false*/) +{ + return del(fd, b_cleanup, m_p_epfd_map); +} + +void fd_collection::remove_epfd_from_list(epfd_info* epfd) +{ + lock(); + m_epfd_lst.erase(epfd); + unlock(); +} + +int fd_collection::del_cq_channel_fd(int fd, bool b_cleanup /*=false*/) +{ + return del(fd, b_cleanup, m_p_cq_channel_map); +} + +void fd_collection::del_tapfd(int fd) +{ + if (!is_valid_fd(fd)) + return; + + lock(); + m_p_tap_map[fd] = NULL; + unlock(); +} + +template +int fd_collection::del(int fd, bool b_cleanup, cls **map_type) +{ + fdcoll_logfunc("fd=%d%s", fd, + b_cleanup ? 
", cleanup case: trying to remove old socket handler":""); + + if (!is_valid_fd(fd)) + return -1; + + lock(); + cls* p_obj = map_type[fd]; + if (p_obj) { + map_type[fd] = NULL; + unlock(); + p_obj->clean_obj(); + return 0; + } + if (!b_cleanup) { + fdcoll_logdbg("[fd=%d] Could not find related object", fd); + } + unlock(); + return -1; +} + +void fd_collection::handle_timer_expired(void* user_data) +{ + sock_fd_api_list_t::iterator itr; + fdcoll_logfunc(); + + lock(); + + NOT_IN_USE(user_data); + + for (itr = m_pendig_to_remove_lst.begin(); itr != m_pendig_to_remove_lst.end(); ) { + if((*itr)->is_closable()) { + fdcoll_logfunc("Closing:%d", (*itr)->get_fd()); + //The socket is ready to be closed, remove it from the list + delete it + socket_fd_api* p_sock_fd = *itr; + itr++; + m_pendig_to_remove_lst.erase(p_sock_fd); + + if (p_sock_fd) { + p_sock_fd->clean_obj(); + p_sock_fd = NULL; + } + + //Deactivate the timer since there are no any pending to remove socket to handle anymore + if (!m_pendig_to_remove_lst.size()) { + if (m_timer_handle) { + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = 0; + } + } + } + else { //The socket is not closable yet + sockinfo_tcp* si_tcp = dynamic_cast(*itr); + + if (si_tcp) { + //In case of TCP socket progress the TCP connection + fdcoll_logfunc("Call to handler timer of TCP socket:%d", (*itr)->get_fd()); + si_tcp->handle_timer_expired(NULL); + } + itr++; + } + } + + unlock(); +} + +void fd_collection::remove_from_all_epfds(int fd, bool passthrough) +{ + epfd_info_list_t::iterator itr; + + lock(); + for (itr = m_epfd_lst.begin(); itr != m_epfd_lst.end(); itr++) { + itr->fd_closed(fd, passthrough); + } + unlock(); + + return; +} diff --git a/src/vma/sock/fd_collection.h b/src/vma/sock/fd_collection.h new file mode 100644 index 0000000..d8f689d --- /dev/null +++ b/src/vma/sock/fd_collection.h @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef FD_COLLECTION_H +#define FD_COLLECTION_H + +#include +#include +#include "vlogger/vlogger.h" +#include "utils/lock_wrapper.h" +#include "vma/iomux/epfd_info.h" + +#include "vma/sock/socket_fd_api.h" +#include "vma/event/timer_handler.h" +#include "vma/event/event_handler_manager.h" +#include +#include "vma/dev/ring_tap.h" + +typedef vma_list_t sock_fd_api_list_t; +typedef vma_list_t epfd_info_list_t; + +typedef std::tr1::unordered_map offload_thread_rule_t; + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINER) +#define fdcoll_logfuncall(log_fmt, log_args...) ((void)0) +#else +#define fdcoll_logfuncall(log_fmt, log_args...) 
do { if (g_vlogger_level >= VLOG_FUNC_ALL) vlog_printf(VLOG_FUNC_ALL, "fdc:%d:%s() " log_fmt "\n", __LINE__, __FUNCTION__, ##log_args); } while (0) +#endif /* VMA_MAX_DEFINED_LOG_LEVEL */ + +class cq_channel_info: public cleanable_obj +{ +public: + cq_channel_info(ring* p_ring) : m_p_ring(p_ring) {}; + ~cq_channel_info() {}; + ring* get_ring() const { return m_p_ring; }; + +protected: + ring* m_p_ring; +}; + + +class fd_collection : private lock_mutex_recursive, public timer_handler +{ +public: + fd_collection(); + ~fd_collection(); + + /** + * Create and add a sockinfo. Use get_sock() to get it. + * @param domain e.g AF_INET. + * @param type e.g SOCK_DGRAM. + * @return socket fd or -1 on failure. + */ + int addsocket(int fd, int domain, int type, bool check_offload = false); + + /** + * Create pipeinfo. Use get_sock() to get it. + * @param fdrd Read fd. + * @param fdwr Write fd. + * @return 0 on success, -1 on failure. + */ + int addpipe(int fdrd, int fdwr); + + /** + * Create epfd_info. Use get_epfd() to get it. + * @param epfd epoll fd. + * @param size epoll fd size (as passed to epoll_create). + * @return 0 on success, -1 on failure. + */ + int addepfd(int epfd, int size); + + /** + * Create cq_channel_info. Use get_cq_channel_info() to get it. + * @param cq_ch_fd: cq channel fd. + * @param p_ring: pointer to ring which is the relevant rx_cq owner. + * @return 0 on success, -1 on failure. + */ + int add_cq_channel_fd(int cq_ch_fd, ring* p_ring); + + /** + * Add tap fd index to tap_map. + * @param tapfd: tap fd. + * @param p_ring: pointer to ring owner of the tap. + * @return 0 on success, -1 on failure. + */ + int addtapfd(int tapfd, ring_tap* p_ring); + + /** + * Remove pipeinfo/sockinfo. + */ + int del_sockfd(int fd, bool b_cleanup = false); + + /** + * Remove epfd_info. + */ + int del_epfd(int fd, bool b_cleanup = false); + void remove_epfd_from_list(epfd_info* epfd); + + /** + * Remove cq_channel_info. 
+ */ + int del_cq_channel_fd(int fd, bool b_cleanup = false); + + /** + * Remove tap_fd from tap_map. + */ + void del_tapfd(int fd); + + /** + * Call set_immediate_os_sample of the input fd. + */ + inline bool set_immediate_os_sample(int fd); + + /** + * Get sock_fd_api (sockinfo or pipeinfo) by fd. + */ + inline socket_fd_api* get_sockfd(int fd); + + /** + * Get epfd_info by fd. + */ + inline epfd_info* get_epfd(int fd); + + /** + * Get cq_channel_info by fd. + */ + inline cq_channel_info* get_cq_channel_fd(int fd); + + /** + * Get rint_tap by tap fd. + */ + inline ring_tap* get_tapfd(int fd); + + /** + * Get the fd_map size. + */ + inline int get_fd_map_size(); + + /** + * Remove fd from the collection of all epfd's + */ + void remove_from_all_epfds(int fd, bool passthrough); + + /** + * Remove everything from the collection. + */ + void clear(); + void prepare_to_close(); + + void offloading_rule_change_thread(bool offloaded, pthread_t tid); + + /** + * Dump fd statistics using VMA logger. + */ + void statistics_print(int fd, vlog_levels_t log_level); + +private: + template int del(int fd, bool b_cleanup, cls **map_type); + template inline cls* get(int fd, cls **map_type); + + int m_n_fd_map_size; + socket_fd_api** m_p_sockfd_map; + epfd_info** m_p_epfd_map; + cq_channel_info** m_p_cq_channel_map; + ring_tap** m_p_tap_map; + + epfd_info_list_t m_epfd_lst; + //Contains fds which are in closing process + sock_fd_api_list_t m_pendig_to_remove_lst; + + void* m_timer_handle; + + const bool m_b_sysvar_offloaded_sockets; + + //if (m_b_sysvar_offloaded_sockets is true) contain all threads that need not be offloaded. + //else contain all threads that need to be offloaded. + offload_thread_rule_t m_offload_thread_rule; + + inline bool is_valid_fd(int fd); + + inline bool create_offloaded_sockets(); + + //Fd collection timer implementation + //This gives context to handle pending to remove fds. 
+ //In case of TCP we recheck if TCP socket is closable and delete + //it if it does otherwise we run handle_timer of the socket to + //progress the TCP connection. + void handle_timer_expired(void* user_data); + + void statistics_print_helper(int fd, vlog_levels_t log_level); +}; + + +inline bool fd_collection::is_valid_fd(int fd) +{ + if (fd < 0 || fd >= m_n_fd_map_size) + return false; + return true; +} + +template +inline cls* fd_collection::get(int fd, cls **map_type) +{ + if (!is_valid_fd(fd)) + return NULL; + + cls* obj = map_type[fd]; + return obj; +} + +inline bool fd_collection::set_immediate_os_sample(int fd) +{ + epfd_info* epfd_fd; + ring_tap* p_ring; + + auto_unlocker locker(*this); + + if ((p_ring = get_tapfd(fd))) { + p_ring->set_tap_data_available(); + return true; + } + + if ((epfd_fd = get_epfd(fd))){ + epfd_fd->set_os_data_available(); + return true; + } + + return false; +} + +inline socket_fd_api* fd_collection::get_sockfd(int fd) +{ + return get(fd, m_p_sockfd_map); +} + +inline epfd_info* fd_collection::get_epfd(int fd) +{ + return get(fd, m_p_epfd_map); +} + +inline cq_channel_info* fd_collection::get_cq_channel_fd(int fd) +{ + return get(fd, m_p_cq_channel_map); +} + +inline ring_tap* fd_collection::get_tapfd(int fd) +{ + return get(fd, m_p_tap_map); +} + +inline int fd_collection::get_fd_map_size() +{ + return m_n_fd_map_size; +} + +extern fd_collection* g_p_fd_collection; + +inline socket_fd_api* fd_collection_get_sockfd(int fd) +{ + if (g_p_fd_collection) + return g_p_fd_collection->get_sockfd(fd); + return NULL; +} + +inline epfd_info* fd_collection_get_epfd(int fd) +{ + if (g_p_fd_collection) + return g_p_fd_collection->get_epfd(fd); + return NULL; +} + +#endif diff --git a/src/vma/sock/pipeinfo.cpp b/src/vma/sock/pipeinfo.cpp new file mode 100644 index 0000000..1b61509 --- /dev/null +++ b/src/vma/sock/pipeinfo.cpp @@ -0,0 +1,395 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + + +#include +#include "utils/bullseye.h" +#include + +#include "sock-redirect.h" + +#include "pipeinfo.h" + +#define MODULE_NAME "pi" +#undef VLOG_PRINTF +#define VLOG_PRINTF(log_level, log_fmt, log_args...) vlog_printf(log_level, "fd[%#x]:%s() " log_fmt "\n", m_fd, __FUNCTION__, ##log_args) +#define VLOG_PRINTF_DETAILS(log_level, log_fmt, log_args...) vlog_printf(log_level, MODULE_NAME ":%d:fd[%#x]:%s() " log_fmt "\n", __LINE__, m_fd, __FUNCTION__, ##log_args) + +#define pi_logpanic(log_fmt, log_args...) VLOG_PRINTF(VLOG_PANIC, log_fmt, ##log_args); throw; +#define pi_logerr(log_fmt, log_args...) 
VLOG_PRINTF(VLOG_ERROR, log_fmt, ##log_args) +#define pi_logwarn(log_fmt, log_args...) VLOG_PRINTF(VLOG_WARNING, log_fmt, ##log_args) +#define pi_loginfo(log_fmt, log_args...) VLOG_PRINTF(VLOG_INFO, log_fmt, ##log_args) + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_DEBUG) +#define pi_logdbg_no_funcname(log_fmt, log_args...) ((void)0) +#define pi_logdbg(log_fmt, log_args...) ((void)0) +#define si_logdbg_no_funcname(log_fmt, log_args...) ((void)0) +#else +#define pi_logdbg_no_funcname(log_fmt, log_args...) if (g_vlogger_level >= VLOG_DEBUG) vlog_printf(VLOG_DEBUG, MODULE_NAME ":%d:fd[%d]: " log_fmt "\n", __LINE__, m_fd, ##log_args) +#define pi_logdbg(log_fmt, log_args...) if (g_vlogger_level >= VLOG_DEBUG) VLOG_PRINTF_DETAILS(VLOG_DEBUG, log_fmt, ##log_args) +#define si_logdbg_no_funcname(log_fmt, log_args...) do { if (g_vlogger_level >= VLOG_DEBUG) vlog_printf(VLOG_DEBUG, MODULE_NAME "[fd=%d]:%d: " log_fmt "\n", m_fd, __LINE__, ##log_args); } while (0) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINE) +#define pi_logfunc(log_fmt, log_args...) ((void)0) +#else +#define pi_logfunc(log_fmt, log_args...) if (g_vlogger_level >= VLOG_FUNC) VLOG_PRINTF_DETAILS(VLOG_FUNC, log_fmt, ##log_args) +#endif + +#if (VMA_MAX_DEFINED_LOG_LEVEL < DEFINED_VLOG_FINER) +#define pi_logfuncall(log_fmt, log_args...) ((void)0) +#else +#define pi_logfuncall(log_fmt, log_args...) 
if (g_vlogger_level >= VLOG_FUNC_ALL) VLOG_PRINTF_DETAILS(VLOG_FUNC_ALL, log_fmt, ##log_args) +#endif /* VMA_MAX_DEFINED_LOG_LEVEL */ + +pipeinfo::pipeinfo(int fd) : socket_fd_api(fd), + m_lock("pipeinfo::m_lock"), + m_lock_rx("pipeinfo::m_lock_rx"), + m_lock_tx("pipeinfo::m_lock_tx") +{ + pi_logfunc(""); + + m_b_closed = true; + m_timer_handle = NULL; + + m_b_blocking = true; + + m_p_socket_stats = NULL; // mce_stats_instance_create_socket_block(); + if (m_p_socket_stats == NULL) { + // pi_logdbg("Got NULL from mce_stats_instance_create_socket_block, using local member"); + m_p_socket_stats = &m_socket_stats; + } + m_p_socket_stats->reset(); + m_p_socket_stats->fd = m_fd; + m_p_socket_stats->b_blocking = m_b_blocking; + m_p_socket_stats->n_rx_ready_pkt_count = 0; + m_p_socket_stats->counters.n_rx_ready_pkt_max = 0; + m_p_socket_stats->n_rx_ready_byte_count = 0; + m_p_socket_stats->n_tx_ready_byte_count = 0; + m_p_socket_stats->counters.n_rx_ready_byte_max = 0; + m_p_socket_stats->n_rx_zcopy_pkt_count = 0; + + m_b_closed = false; + + m_b_lbm_event_q_pipe_timer_on = false; + m_write_count = m_write_count_on_last_timer = 0; + m_write_count_no_change_count = 0; + + + pi_logfunc("done"); +} + +pipeinfo::~pipeinfo() +{ + m_b_closed = true; + pi_logfunc(""); + + + // Change to non-blocking socket so calling threads can exit + m_b_blocking = false; + + m_lock_tx.lock(); + m_lock_rx.lock(); + m_lock.lock(); + + if (m_timer_handle) { + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = NULL; + } + + statistics_print(); + + m_lock_tx.unlock(); + m_lock_rx.unlock(); + m_lock.unlock(); + + pi_logfunc("done"); +} + +void pipeinfo::clean_obj() +{ + if (is_cleaned()) { + return ; + } + + set_cleaned(); + m_timer_handle = NULL; + if (g_p_event_handler_manager->is_running()) { + g_p_event_handler_manager->unregister_timers_event_and_delete(this); + } else { + cleanable_obj::clean_obj(); + } +} + +int pipeinfo::fcntl(int __cmd, unsigned long 
int __arg) +{ + switch (__cmd) { + case F_SETFL: + { + pi_logfunc("cmd=F_SETFL, arg=%#x", __cmd, __arg); + if (__arg & O_NONBLOCK) { + pi_logdbg("set to non-blocking mode"); + m_b_blocking = false; + } + else { + pi_logdbg("set to blocked mode"); + m_b_blocking = true; + } + m_p_socket_stats->b_blocking = m_b_blocking; + } + break; + + case F_GETFL: /* Get file status flags. */ + pi_logfunc("F_GETFL, arg=%#x", __arg); + break; + + case F_GETFD: /* Get file descriptor flags. */ + pi_logfunc("F_GETFD, arg=%#x", __arg); + break; + + case F_SETFD: /* Set file descriptor flags. */ + pi_logfunc("F_SETFD, arg=%#x", __arg); + break; + + default: + pi_logfunc("cmd=%d, arg=%#x", __cmd, __arg); + break; + } + + return orig_os_api.fcntl(m_fd, __cmd, __arg); +} + +int pipeinfo::ioctl(unsigned long int __request, unsigned long int __arg) +{ + int *p_arg = (int *)__arg; + + switch (__request) { + case FIONBIO: + { + if (*p_arg) { + pi_logdbg("FIONBIO, arg=%d - set to non-blocking mode", *p_arg); + m_b_blocking = false; + } + else { + pi_logdbg("FIONBIO, arg=%d - set to blocked mode", *p_arg); + m_b_blocking = true; + } + + m_p_socket_stats->b_blocking = m_b_blocking; + } + break; + + default: + pi_logfunc("request=%d, arg=%#x", __request, __arg); + break; + } + + return orig_os_api.ioctl(m_fd, __request, __arg); +} + +ssize_t pipeinfo::rx(const rx_call_t call_type, iovec* p_iov, ssize_t sz_iov, + int* p_flags, sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg) +{ + pi_logfunc(""); + ssize_t ret = socket_fd_api::rx_os(call_type, p_iov, sz_iov, *p_flags, __from, __fromlen, __msg); + save_stats_rx_os(ret); + return ret; +} + +void pipeinfo::handle_timer_expired(void* user_data) +{ + NOT_IN_USE(user_data); + pi_logfunc("(m_write_count=%d)", m_write_count); + m_lock_tx.lock(); + write_lbm_pipe_enhance(); + m_lock_tx.unlock(); +} + +ssize_t pipeinfo::tx(vma_tx_call_attr_t &tx_arg) +{ + const iovec* p_iov = tx_arg.attr.msg.iov; + const ssize_t sz_iov = tx_arg.attr.msg.sz_iov; 
+ const int __flags = tx_arg.attr.msg.flags; + const struct sockaddr *__to = tx_arg.attr.msg.addr; + const socklen_t __tolen = tx_arg.attr.msg.len; + ssize_t ret = -1; + + pi_logfunc(""); + m_lock_tx.lock(); + switch (tx_arg.opcode) { + case TX_WRITE: + + if ((safe_mce_sys().mce_spec == MCE_SPEC_29WEST_LBM_29 || safe_mce_sys().mce_spec == MCE_SPEC_WOMBAT_FH_LBM_554) && + (p_iov[0].iov_len == 1) && (((char*)p_iov[0].iov_base)[0] == '\0')) { + + // We will pass one pipe write in every T usec + // + // 1) First signaling pipe write will go through, and triger timer logic + // 2) Then we'll send a single pipe writes every T usec (safe_mce_sys().mce_spec_param1) + // 3) We'll stop the timer once we have N cycles with no pipe write + // + + m_write_count++; + if (m_b_lbm_event_q_pipe_timer_on == false) { + m_timer_handle = g_p_event_handler_manager->register_timer_event(safe_mce_sys().mce_spec_param1/1000, this, PERIODIC_TIMER, 0); + m_b_lbm_event_q_pipe_timer_on = true; + m_write_count_on_last_timer = 0; + m_write_count_no_change_count = 0; + + pi_logdbg("\n\n\npipe_write DONE timer Reg\n\n\n"); + + // simulate a pipe_write + write_lbm_pipe_enhance(); + } + else if ((int)m_write_count > (int)(m_write_count_on_last_timer + safe_mce_sys().mce_spec_param2)) { + // simulate a pipe_write + write_lbm_pipe_enhance(); + } + + ret = 1; + } + else { + ret = orig_os_api.write(m_fd, p_iov[0].iov_base, p_iov[0].iov_len); + } + + break; + + case TX_SEND: + case TX_SENDTO: + case TX_SENDMSG: + default: + ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __to, __tolen); + break; + } + + save_stats_tx_os(ret); + m_lock_tx.unlock(); + return ret; +} + +void pipeinfo::write_lbm_pipe_enhance() +{ + pi_logfunc("(m_write_count=%d)", m_write_count); + + if (m_write_count == m_write_count_on_last_timer) { + // No pipe write happened during the last timer_expired() + m_write_count_no_change_count++; + + // After 3 of these stop timer + if (m_write_count_no_change_count >= 2 && 
m_b_lbm_event_q_pipe_timer_on) { + if (m_timer_handle) { + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = NULL; + } + m_b_lbm_event_q_pipe_timer_on = false; + + pi_logfunc("pipe_write DONE timer Un-Reg"); + } + } + + m_write_count = 0; + m_write_count_no_change_count = 0; + m_write_count_on_last_timer = 0; + + // Send the buffered data + char buf[10] = "\0"; + orig_os_api.write(m_fd, buf, 1); +} + +void pipeinfo::statistics_print() +{ + bool b_any_activiy = false; + if (m_p_socket_stats->counters.n_tx_sent_byte_count || m_p_socket_stats->counters.n_tx_sent_pkt_count || m_p_socket_stats->counters.n_tx_errors || m_p_socket_stats->counters.n_tx_drops) { + pi_logdbg_no_funcname("Tx Offload: %d KB / %d / %d / %d [bytes/packets/errors/drops]", m_p_socket_stats->counters.n_tx_sent_byte_count/1024, m_p_socket_stats->counters.n_tx_sent_pkt_count, m_p_socket_stats->counters.n_tx_errors, m_p_socket_stats->counters.n_tx_drops); + b_any_activiy = true; + } + if (m_p_socket_stats->counters.n_tx_os_bytes || m_p_socket_stats->counters.n_tx_os_packets || m_p_socket_stats->counters.n_tx_os_errors) { + pi_logdbg_no_funcname("Tx OS info: %d KB / %d / %d [bytes/packets/errors]", m_p_socket_stats->counters.n_tx_os_bytes/1024, m_p_socket_stats->counters.n_tx_os_packets, m_p_socket_stats->counters.n_tx_os_errors); + b_any_activiy = true; + } + if (m_p_socket_stats->counters.n_rx_bytes || m_p_socket_stats->counters.n_rx_packets || m_p_socket_stats->counters.n_rx_errors || m_p_socket_stats->counters.n_rx_eagain) { + pi_logdbg_no_funcname("Rx Offload: %d KB / %d / %d / %d [bytes/packets/errors/eagains]", m_p_socket_stats->counters.n_rx_bytes/1024, m_p_socket_stats->counters.n_rx_packets, m_p_socket_stats->counters.n_rx_errors, m_p_socket_stats->counters.n_rx_eagain); + b_any_activiy = true; + } + if (m_p_socket_stats->counters.n_rx_os_bytes || m_p_socket_stats->counters.n_rx_os_packets || m_p_socket_stats->counters.n_rx_os_errors) { + 
pi_logdbg_no_funcname("Rx OS info: %d KB / %d / %d [bytes/packets/errors]", m_p_socket_stats->counters.n_rx_os_bytes/1024, m_p_socket_stats->counters.n_rx_os_packets, m_p_socket_stats->counters.n_rx_os_errors); + b_any_activiy = true; + } + if (m_p_socket_stats->counters.n_rx_poll_miss || m_p_socket_stats->counters.n_rx_poll_hit) { + pi_logdbg_no_funcname("Rx poll: %d / %d (%2.2f%%) [miss/hit]", m_p_socket_stats->counters.n_rx_poll_miss, m_p_socket_stats->counters.n_rx_poll_hit, + (float)(m_p_socket_stats->counters.n_rx_poll_hit * 100) / (float)(m_p_socket_stats->counters.n_rx_poll_miss + m_p_socket_stats->counters.n_rx_poll_hit)); + b_any_activiy = true; + } + if (m_p_socket_stats->counters.n_rx_ready_byte_drop) { + si_logdbg_no_funcname("Rx byte: max %d / dropped %d (%2.2f%%) [limit is %d]", m_p_socket_stats->counters.n_rx_ready_byte_max, m_p_socket_stats->counters.n_rx_ready_byte_drop, + (m_p_socket_stats->counters.n_rx_packets ? (float)(m_p_socket_stats->counters.n_rx_ready_byte_drop * 100) / (float)m_p_socket_stats->counters.n_rx_packets : 0), + m_p_socket_stats->n_rx_ready_byte_limit); + b_any_activiy = true; + } + if (m_p_socket_stats->counters.n_rx_ready_pkt_drop) { + si_logdbg_no_funcname("Rx pkt : max %d / dropped %d (%2.2f%%)", m_p_socket_stats->counters.n_rx_ready_pkt_max, m_p_socket_stats->counters.n_rx_ready_pkt_drop, + (m_p_socket_stats->counters.n_rx_packets ? 
(float)(m_p_socket_stats->counters.n_rx_ready_pkt_drop * 100) / (float)m_p_socket_stats->counters.n_rx_packets : 0)); + b_any_activiy = true; + } + if (b_any_activiy == false) { + pi_logdbg_no_funcname("Rx and Tx where not active"); + } +} + +void pipeinfo::save_stats_rx_os(int bytes) +{ + if (bytes >= 0) { + m_p_socket_stats->counters.n_rx_os_bytes += bytes; + m_p_socket_stats->counters.n_rx_os_packets++; + }else if ( errno == EAGAIN ){ + m_p_socket_stats->counters.n_rx_os_eagain++; + } + else { + m_p_socket_stats->counters.n_rx_os_errors++; + } +} + +void pipeinfo::save_stats_tx_os(int bytes) +{ + if (bytes >= 0) { + m_p_socket_stats->counters.n_tx_os_bytes += bytes; + m_p_socket_stats->counters.n_tx_os_packets++; + }else if ( errno == EAGAIN ){ + m_p_socket_stats->counters.n_rx_os_eagain++; + } + else { + m_p_socket_stats->counters.n_tx_os_errors++; + } +} + + diff --git a/src/vma/sock/pipeinfo.h b/src/vma/sock/pipeinfo.h new file mode 100644 index 0000000..af5fa8f --- /dev/null +++ b/src/vma/sock/pipeinfo.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef PIPEINFO_H +#define PIPEINFO_H + +#include "socket_fd_api.h" +#include "utils/lock_wrapper.h" +#include +#include + +class pipeinfo : public socket_fd_api, public timer_handler +{ +public: + pipeinfo(int fd); + ~pipeinfo(); + + virtual void clean_obj(); + + int fcntl(int __cmd, unsigned long int __arg); + int ioctl(unsigned long int __request, unsigned long int __arg); + + // Process a Rx request, we might have a ready packet, or we might block until + // we have one (if sockinfo::m_b_blocking == true) + ssize_t rx(const rx_call_t call_type, struct iovec* p_iov, ssize_t sz_iov, + int* p_flags, struct sockaddr *__from = NULL, socklen_t *__fromlen = NULL, struct msghdr *__msg = NULL); + + // Process a Tx request, handle all that is needed to send the packet, we might block + // until the connection info is ready or a tx buffer is releast (if sockinfo::m_b_blocking == true) + ssize_t tx(vma_tx_call_attr_t &tx_arg); + + void statistics_print(); + + virtual inline fd_type_t get_type() { + return FD_TYPE_PIPE; + } + +private: + bool m_b_blocking; + + // Main mutex to protect from multi threaded access to sockinfo from sock-redirect + bool m_b_closed; + lock_mutex m_lock; + lock_mutex m_lock_rx; + lock_mutex m_lock_tx; + + socket_stats_t m_socket_stats; + socket_stats_t* m_p_socket_stats; + + void* m_timer_handle; + + int m_write_count; + int m_write_count_on_last_timer; + int m_write_count_no_change_count; + bool m_b_lbm_event_q_pipe_timer_on; + + void 
handle_timer_expired(void* user_data); + + void write_lbm_pipe_enhance(); + + void save_stats_rx_os(int bytes); + void save_stats_tx_os(int bytes); +}; + +#endif + diff --git a/src/vma/sock/pkt_rcvr_sink.h b/src/vma/sock/pkt_rcvr_sink.h new file mode 100644 index 0000000..e4a1003 --- /dev/null +++ b/src/vma/sock/pkt_rcvr_sink.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef PKT_RECVR_SINK_H +#define PKT_RECVR_SINK_H + +class mem_buf_desc_t; +class flow_tuple_with_local_if; +class ring; + +/* + * Class pkt_rcvr_sink + * An object must implement pkt_rcvr_sink to register with ib_conn_mgr_base + * The rx_joined_notify_cb() will be called when the IBCM is ready to start + * receiving packets (MC join is complete and CQ is mapped). + * The rx_diconnect_notify_cb() will be called before the IB stops receiving + * packets (CQ is being removed and MC leave is called). + * The rx_pkt_notify_cb() will be called when a ip packet is in the ready q for the socket. + * The implementing object should register the information and release calling context immediately. + * When no packet receivers (or transmitters) are registered the objects will be deleted +*/ +class pkt_rcvr_sink +{ +public: + virtual ~pkt_rcvr_sink() {}; + + // Callback from lower layer notifying new receive packets + // Return: 'true' if object queuing this receive packet + // 'false' if not interested in this receive packet + virtual bool rx_input_cb(mem_buf_desc_t* p_rx_pkt_mem_buf_desc_info, void* pv_fd_ready_array) = 0; + + // Callback from lower layer notifying completion of RX registration process + virtual void rx_add_ring_cb(flow_tuple_with_local_if &flow_key, ring* p_ring, bool is_migration = false) = 0; + + // Callback from lower layer notifying before RX resources deallocation + virtual void rx_del_ring_cb(flow_tuple_with_local_if &flow_key, ring* p_ring, bool is_migration = false) = 0; +}; + +#endif diff --git a/src/vma/sock/pkt_sndr_source.h b/src/vma/sock/pkt_sndr_source.h new file mode 100644 index 0000000..696cef7 --- /dev/null +++ b/src/vma/sock/pkt_sndr_source.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef PKT_SNDR_SOURCE_H +#define PKT_SNDR_SOURCE_H + +/** + * @class pkt_sndr_source + * An object must implement pkt_sndr_source to register with ib_conn_mgr_base + * When no packet transmitters (or receivers) are registered the objects will be + * deleted. + */ +class pkt_sndr_source +{ +public: + virtual ~pkt_sndr_source() {}; +}; + + +#endif diff --git a/src/vma/sock/sock-redirect.cpp b/src/vma/sock/sock-redirect.cpp new file mode 100644 index 0000000..5fe504b --- /dev/null +++ b/src/vma/sock/sock-redirect.cpp @@ -0,0 +1,2768 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "sock-redirect.h" + +#include +#include +#include +#include + +#include "utils/lock_wrapper.h" +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vma/vma_extra.h" + +#include + +#include "fd_collection.h" +#include "vma/util/instrumentation.h" + +using namespace std; + + +#define MODULE_NAME "srdr:" + +#define srdr_logpanic __log_panic +#define srdr_logerr __log_err +#define srdr_logwarn __log_warn +#define srdr_loginfo __log_info +#define srdr_logdbg __log_dbg +#define srdr_logfunc __log_func +#define srdr_logfuncall __log_funcall + +#define srdr_logdbg_entry __log_entry_dbg +#define srdr_logfunc_entry __log_entry_func +#define srdr_logfuncall_entry __log_entry_funcall + +#define srdr_logdbg_exit __log_exit_dbg +#define srdr_logfunc_exit __log_exit_func + +#define EP_MAX_EVENTS (int)((INT_MAX / sizeof(struct epoll_event))) + +struct os_api orig_os_api; +struct sigaction g_act_prev; +sighandler_t g_sighandler = NULL; +class ring_simple; +class ring_eth_cb; +class ring_eth_direct; + +template +void assign_dlsym(T &ptr, const char *name) { + ptr = reinterpret_cast(dlsym(RTLD_NEXT, name)); +} + +#define FD_MAP_SIZE (g_p_fd_collection ? 
g_p_fd_collection->get_fd_map_size() : 1024) + +#define DO_GLOBAL_CTORS() do { \ + int __res = do_global_ctors(); \ + if (__res) { \ + vlog_printf(VLOG_ERROR, "%s vma failed to start errno: %m\n", \ + __FUNCTION__, errno); \ + if (safe_mce_sys().exception_handling == vma_exception_handling::MODE_EXIT) { \ + exit(-1); \ + } \ + return -1; \ + } \ +} while (0) + +#define GET_ORIG_FUNC(__name) \ + if (!orig_os_api.__name) { \ + dlerror(); \ + assign_dlsym(orig_os_api.__name, #__name); \ + char *dlerror_str = dlerror(); \ + if (dlerror_str) { \ + __log_warn("dlsym returned with error '%s' when looking for '%s'", \ + dlerror_str, #__name); \ + } else { \ + __log_dbg("dlsym found %p for '%s()'", orig_os_api.__name , #__name); \ + } \ + } + +#define SET_EXTRA_API(dst, func, mask) do { \ + vma_api->dst = func; \ + vma_api->vma_extra_supported_mask |= mask; \ +} while(0); + +#define VERIFY_PASSTROUGH_CHANGED(__ret, __func_and_params__) do { \ + bool passthrough = p_socket_object->isPassthrough(); \ + __ret = __func_and_params__; \ + if (!passthrough && p_socket_object->isPassthrough()) { \ + handle_close(__fd, false, true); \ + } \ +} while(0); + +void get_orig_funcs() +{ + // Save pointer to original functions + GET_ORIG_FUNC(socket); + GET_ORIG_FUNC(close); + GET_ORIG_FUNC(close); + GET_ORIG_FUNC(__res_iclose); + GET_ORIG_FUNC(shutdown); + GET_ORIG_FUNC(listen); + GET_ORIG_FUNC(accept); + GET_ORIG_FUNC(accept4); + GET_ORIG_FUNC(bind); + GET_ORIG_FUNC(connect); + GET_ORIG_FUNC(setsockopt); + GET_ORIG_FUNC(getsockopt); + GET_ORIG_FUNC(fcntl); + GET_ORIG_FUNC(ioctl); + GET_ORIG_FUNC(getsockname); + GET_ORIG_FUNC(getpeername); + GET_ORIG_FUNC(read); + GET_ORIG_FUNC(__read_chk); + GET_ORIG_FUNC(readv); + GET_ORIG_FUNC(recv); + GET_ORIG_FUNC(__recv_chk); + GET_ORIG_FUNC(recvmsg); + GET_ORIG_FUNC(recvmmsg); + GET_ORIG_FUNC(recvfrom); + GET_ORIG_FUNC(__recvfrom_chk); + GET_ORIG_FUNC(write); + GET_ORIG_FUNC(writev); + GET_ORIG_FUNC(send); + GET_ORIG_FUNC(sendmsg); + 
GET_ORIG_FUNC(sendmmsg); + GET_ORIG_FUNC(sendto); + GET_ORIG_FUNC(sendfile); + GET_ORIG_FUNC(sendfile64); + GET_ORIG_FUNC(select); + GET_ORIG_FUNC(pselect); + GET_ORIG_FUNC(poll); + GET_ORIG_FUNC(ppoll); + GET_ORIG_FUNC(epoll_create); + GET_ORIG_FUNC(epoll_create1); + GET_ORIG_FUNC(epoll_ctl); + GET_ORIG_FUNC(epoll_wait); + GET_ORIG_FUNC(epoll_pwait); + GET_ORIG_FUNC(socketpair); + GET_ORIG_FUNC(pipe); + GET_ORIG_FUNC(open); + GET_ORIG_FUNC(creat); + GET_ORIG_FUNC(dup); + GET_ORIG_FUNC(dup2); + GET_ORIG_FUNC(clone); + GET_ORIG_FUNC(fork); + GET_ORIG_FUNC(vfork); + GET_ORIG_FUNC(daemon); + GET_ORIG_FUNC(sigaction); + GET_ORIG_FUNC(signal); +} + +const char* socket_get_domain_str(int domain) +{ + switch (domain) { + case AF_INET: return "AF_INET"; + case AF_INET6: return "AF_INET6"; + case AF_UNSPEC: return "AF_UNSPEC"; + case AF_LOCAL: return "AF_LOCAL"; + default: + break; + } + return ""; +} + +const char* socket_get_type_str(int type) +{ + switch (type) { + case SOCK_STREAM: return "SOCK_STREAM"; + case SOCK_DGRAM: return "SOCK_DGRAM"; + case SOCK_RAW: return "SOCK_RAW"; + default: + break; + } + return ""; +} + +// Format a sockaddr into a string for logging +char* sprintf_sockaddr(char* buf, int buflen, const struct sockaddr* _addr, socklen_t _addrlen) +{ + if ((_addrlen >= sizeof(struct sockaddr_in)) && (get_sa_family(_addr) == AF_INET)) { + in_addr_t in_addr = get_sa_ipv4_addr(_addr); + in_port_t in_port = get_sa_port(_addr); + /* cppcheck-suppress wrongPrintfScanfArgNum */ + snprintf(buf, buflen, "AF_INET, addr=%d.%d.%d.%d, port=%d", NIPQUAD(in_addr), ntohs(in_port)); + } + else { + snprintf(buf, buflen, "sa_family=%d", get_sa_family(_addr)); + } + return buf; +} + +#define VMA_DBG_SEND_MCPKT_COUNTER_STR "VMA_DBG_SEND_MCPKT_COUNTER" +#define VMA_DBG_SEND_MCPKT_MCGROUP_STR "VMA_DBG_SEND_MCPKT_MCGROUP" +static int dbg_check_if_need_to_send_mcpkt_setting = -1; // 1-Init, 0-Disabled, N>0-send mc packet on the Nth socket() call +static int 
dbg_check_if_need_to_send_mcpkt_counter = 1; +static int dbg_check_if_need_to_send_mcpkt_prevent_nested_calls = 0; + +void dbg_send_mcpkt() +{ + int fd = 0; + char *env_ptr = NULL; + if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) { + vlog_printf(VLOG_WARNING, "send_mc_packet_test:%d: socket() errno %d %m", __LINE__, errno); + exit(1); + } + + struct sockaddr_in addr_in; + struct sockaddr* p_addr = (struct sockaddr*)&addr_in; + + addr_in.sin_family = AF_INET; + addr_in.sin_port = INPORT_ANY; + addr_in.sin_addr.s_addr = INADDR_ANY; + if ((env_ptr = getenv(VMA_DBG_SEND_MCPKT_MCGROUP_STR)) == NULL) { + vlog_printf(VLOG_WARNING, "send_mc_packet_test:%d: Need to set '%s' parameter to dest ip (dot format)\n", __LINE__, VMA_DBG_SEND_MCPKT_MCGROUP_STR); + exit(2); + } + if (1 != inet_pton(AF_INET, env_ptr, &addr_in.sin_addr)) { + vlog_printf(VLOG_WARNING, "send_mc_packet_test:%d: Invalid input IP address: '%s' errno %d %m\n", __LINE__, env_ptr, errno); + exit(3); + } + + const char msgbuf[256] = "Hello Alex"; + + vlog_printf(VLOG_WARNING, "send_mc_packet_test:%d: Sending MC test packet to address: %d.%d.%d.%d [%s]\n", __LINE__, NIPQUAD(get_sa_ipv4_addr(p_addr)), VMA_DBG_SEND_MCPKT_MCGROUP_STR); + if (sendto(fd, msgbuf, strlen(msgbuf), 0, p_addr, sizeof(struct sockaddr)) < 0) + vlog_printf(VLOG_ERROR, "sendto mc_packet failed! 
errno %m\n", errno); + close(fd); +} + +void dbg_check_if_need_to_send_mcpkt() +{ + if (dbg_check_if_need_to_send_mcpkt_prevent_nested_calls) + return; + dbg_check_if_need_to_send_mcpkt_prevent_nested_calls = 1; + + // Read user setting + if (dbg_check_if_need_to_send_mcpkt_setting == -1) { + // Default will be 'Disbaled' + dbg_check_if_need_to_send_mcpkt_setting++; + + // Then we will read the user settings + char *env_ptr = NULL; + if ((env_ptr = getenv(VMA_DBG_SEND_MCPKT_COUNTER_STR)) != NULL) { + dbg_check_if_need_to_send_mcpkt_setting = atoi(env_ptr); + } + if (dbg_check_if_need_to_send_mcpkt_setting > 0) { + vlog_printf(VLOG_WARNING, "send_mc_packet_test: *************************************************************\n"); + vlog_printf(VLOG_WARNING, "send_mc_packet_test: Send test MC packet setting is: %d [%s]\n", dbg_check_if_need_to_send_mcpkt_setting, VMA_DBG_SEND_MCPKT_COUNTER_STR); + vlog_printf(VLOG_WARNING, "send_mc_packet_test: If you don't know what this means don't use '%s' VMA configuration parameter!\n", VMA_DBG_SEND_MCPKT_COUNTER_STR); + vlog_printf(VLOG_WARNING, "send_mc_packet_test: *************************************************************\n"); + } + } + + // Test for action + if (dbg_check_if_need_to_send_mcpkt_setting > 0) { + if (dbg_check_if_need_to_send_mcpkt_counter == dbg_check_if_need_to_send_mcpkt_setting) + { + // Actual send mc packet + dbg_send_mcpkt(); + } + else { + vlog_printf(VLOG_WARNING, "send_mc_packet_test:%d: Skipping this socket() call\n", __LINE__); + } + dbg_check_if_need_to_send_mcpkt_counter++; + } + dbg_check_if_need_to_send_mcpkt_prevent_nested_calls--; +} + +void handle_close(int fd, bool cleanup, bool passthrough) +{ + + srdr_logfunc("Cleanup fd=%d", fd); + + if (g_p_fd_collection) { + // Remove fd from all existing epoll sets + g_p_fd_collection->remove_from_all_epfds(fd, passthrough); + + if (fd_collection_get_sockfd(fd)) { + g_p_fd_collection->del_sockfd(fd, cleanup); + } + if (fd_collection_get_epfd(fd)) { + 
g_p_fd_collection->del_epfd(fd, cleanup); + } + + } +} + + +//----------------------------------------------------------------------------- +// extended API functions +//----------------------------------------------------------------------------- + +extern "C" +int vma_register_recv_callback(int __fd, vma_recv_callback_t __callback, void *__context) +{ + srdr_logfunc_entry("fd=%d", __fd); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + p_socket_object->register_callback(__callback, __context); + return 0; + } + errno = EINVAL; + return -1; +} + +extern "C" +int vma_recvfrom_zcopy(int __fd, void *__buf, size_t __nbytes, int *__flags, + struct sockaddr *__from, socklen_t *__fromlen) +{ + srdr_logfuncall_entry("fd=%d", __fd); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + *__flags |= MSG_VMA_ZCOPY; + return p_socket_object->rx(RX_RECVFROM, piov, 1, __flags, __from, __fromlen); + + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.recvfrom) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + return orig_os_api.recvfrom(__fd, __buf, __nbytes, *__flags, __from, __fromlen); +} + +extern "C" +int vma_free_packets(int __fd, struct vma_packet_t *pkts, size_t count) +{ + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + return p_socket_object->free_packets(pkts, count); + } + + errno = EINVAL; + return -1; +} + +static int dummy_vma_socketxtreme_poll(int fd, struct vma_completion_t* completions, unsigned int ncompletions, int flags) +{ + VLOG_PRINTF_ONCE_THEN_ALWAYS(VLOG_WARNING, VLOG_DEBUG, + "socketXtreme was not enabled during runtime. Set %s to use. 
Ignoring...", SYS_VAR_SOCKETXTREME, + fd, completions, ncompletions, flags); + errno = EOPNOTSUPP; + return -1; +} + +extern "C" +int vma_socketxtreme_poll(int fd, struct vma_completion_t* completions, unsigned int ncompletions, int flags) +{ + int ret_val = -1; + cq_channel_info* cq_ch_info = NULL; + + cq_ch_info = g_p_fd_collection->get_cq_channel_fd(fd); + + if (likely(cq_ch_info)) { + ring* p_ring = cq_ch_info->get_ring(); + + ret_val = p_ring->socketxtreme_poll(completions, ncompletions, flags); +#ifdef RDTSC_MEASURE_RX_PROCCESS_BUFFER_TO_RECIVEFROM + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_PROCCESS_RX_BUFFER_TO_RECIVEFROM]); +#endif //RDTSC_MEASURE_RX_PROCCESS_BUFFER_TO_RECIVEFROM + +#ifdef RDTSC_MEASURE_RX_LWIP_TO_RECEVEFROM + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_LWIP_TO_RECEVEFROM]); +#endif //RDTSC_MEASURE_RX_LWIP_TO_RECEVEFROM + +#ifdef RDTSC_MEASURE_RX_CQE_RECEIVEFROM + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_CQE_TO_RECEIVEFROM]); +#endif //RDTSC_MEASURE_RX_CQE_RECEIVEFROM + +#ifdef RDTSC_MEASURE_RECEIVEFROM_TO_SENDTO + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RECEIVEFROM_TO_SENDTO]); +#endif //RDTSC_MEASURE_RECEIVEFROM_TO_SENDTO + return ret_val; + } + else { + errno = EBADFD; + return ret_val; + } +} + +static int dummy_vma_socketxtreme_free_vma_packets(struct vma_packet_desc_t *packets, int num) +{ + VLOG_PRINTF_ONCE_THEN_ALWAYS(VLOG_WARNING, VLOG_DEBUG, + "socketXtreme was not enabled during runtime. Set %s to use. 
Ignoring...", SYS_VAR_SOCKETXTREME, + packets, num); + errno = EOPNOTSUPP; + return -1; +} + +extern "C" +int vma_socketxtreme_free_vma_packets(struct vma_packet_desc_t *packets, int num) +{ + mem_buf_desc_t* desc = NULL; + socket_fd_api* p_socket_object = NULL; + + if (likely(packets)) { + for (int i = 0; i < num; i++) { + desc = (mem_buf_desc_t*)packets[i].buff_lst; + if (desc) { + p_socket_object = (socket_fd_api*)desc->rx.context; + ring_slave* rng = desc->p_desc_owner; + if (p_socket_object) { + p_socket_object->free_buffs(packets[i].total_len); + } + if (rng) { + rng->reclaim_recv_buffers(desc); + } else { + goto err; + } + } else { + goto err; + } + } + } + else { + goto err; + } + + return 0; + +err: + errno = EINVAL; + return -1; +} + +static int dummy_vma_socketxtreme_ref_vma_buff(vma_buff_t *buff) +{ + VLOG_PRINTF_ONCE_THEN_ALWAYS(VLOG_WARNING, VLOG_DEBUG, + "socketXtreme was not enabled during runtime. Set %s to use. Ignoring...", SYS_VAR_SOCKETXTREME, + buff); + errno = EOPNOTSUPP; + return -1; +} + +extern "C" +int vma_socketxtreme_ref_vma_buff(vma_buff_t *buff) +{ + int ret_val = 0; + mem_buf_desc_t* desc = NULL; + + if (likely(buff)) { + desc = (mem_buf_desc_t*)buff; + ret_val = desc->lwip_pbuf_inc_ref_count(); + } + else { + errno = EINVAL; + ret_val = -1; + } + return ret_val; +} + +static int dummy_vma_socketxtreme_free_vma_buff(vma_buff_t *buff) +{ + VLOG_PRINTF_ONCE_THEN_ALWAYS(VLOG_WARNING, VLOG_DEBUG, + "socketXtreme was not enabled during runtime. Set %s to use. 
Ignoring...", SYS_VAR_SOCKETXTREME, + buff); + errno = EOPNOTSUPP; + return -1; +} + +extern "C" +int vma_socketxtreme_free_vma_buff(vma_buff_t *buff) +{ + int ret_val = 0; + mem_buf_desc_t* desc = NULL; + + if (likely(buff)) { + desc = (mem_buf_desc_t*)buff; + ring_slave* rng = desc->p_desc_owner; + ret_val = rng->reclaim_recv_single_buffer(desc); + } + else { + errno = EINVAL; + ret_val = -1; + } + return ret_val; +} + +extern "C" +int vma_get_socket_rings_num(int fd) +{ + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(fd); + if (p_socket_object && p_socket_object->check_rings()) { + return p_socket_object->get_rings_num(); + } + + return 0; +} + +extern "C" +int vma_get_socket_rings_fds(int fd, int *ring_fds, int ring_fds_sz) +{ + int* p_rings_fds = NULL; + socket_fd_api* p_socket_object = NULL; + int rings_num = 0; + + if (ring_fds_sz <= 0 || ring_fds == NULL) { + errno = EINVAL; + return -1; + } + p_socket_object = fd_collection_get_sockfd(fd); + if (p_socket_object && p_socket_object->check_rings()) { + p_rings_fds = p_socket_object->get_rings_fds(rings_num); + for (int i = 0; i < min(ring_fds_sz, rings_num); i++) { + ring_fds[i] = p_rings_fds[i]; + } + } + + return min(ring_fds_sz, rings_num); +} + +extern "C" +int vma_get_socket_tx_ring_fd(int sock_fd, struct sockaddr *to, socklen_t tolen) +{ + socket_fd_api* p_socket_object = fd_collection_get_sockfd(sock_fd); + + if (!p_socket_object) { + errno = EINVAL; + return -1; + } + return p_socket_object->get_socket_tx_ring_fd(to, tolen); +} + +extern "C" +int vma_add_conf_rule(const char *config_line) +{ + srdr_logdbg("adding conf rule: %s", config_line); + + int ret = __vma_parse_config_line(config_line); + + if (*g_p_vlogger_level >= VLOG_DEBUG) + __vma_print_conf_file(__instance_list); + + return ret; +} + +extern "C" +int vma_thread_offload(int offload, pthread_t tid) +{ + if (g_p_fd_collection) { + g_p_fd_collection->offloading_rule_change_thread(offload, tid); + } else { 
+ return -1; + } + + return 0; +} + +extern "C" +int vma_dump_fd_stats(int fd, int log_level) +{ + if (g_p_fd_collection) { + g_p_fd_collection->statistics_print(fd, log_level::from_int(log_level)); + return 0; + } + return -1; +} + +/* Multi Packet Receive Queue functionality is deprecated + * and is not going to be supported in the future releases + */ +extern "C" +int vma_cyclic_buffer_read(int fd, struct vma_completion_cb_t *completion, + size_t min, size_t max, int flags) +{ +#ifdef HAVE_MP_RQ + cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(fd); + if (p_cq_ch_info) { + ring_eth_cb* p_ring = (ring_eth_cb *)p_cq_ch_info->get_ring(); + if (likely(p_ring && p_ring->is_mp_ring())) { + return p_ring->cyclic_buffer_read(*completion, min, max, + flags); + } else { + vlog_printf(VLOG_ERROR, "could not find ring, got fd " + "%d\n", fd); + return -1; + } + } else { + vlog_printf(VLOG_ERROR, "could not find p_cq_ch_info, got fd " + "%d\n", fd); + return -1; + } +#else + VLOG_PRINTF_ONCE_THEN_ALWAYS(VLOG_WARNING, VLOG_DEBUG, "Striding RQ is no supported. ignoring...", fd, completion, min, max, flags); + errno = EOPNOTSUPP; + return -1; +#endif // HAVE_MP_RQ +} + +/* Multi Packet Receive Queue functionality is deprecated + * and is not going to be supported in the future releases + */ +extern "C" +int vma_get_mem_info(int fd, void **addr, size_t *length, uint32_t *lkey) +{ +#ifdef HAVE_MP_RQ + cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(fd); + if (!length || !lkey || !addr) { + vlog_printf(VLOG_ERROR, "invalid pointers given. 
fd: %d, addr " + "%p length %p lkey %p\n", fd, addr, length, lkey); + return -1; + } + if (p_cq_ch_info) { + ring_eth_cb *p_ring = dynamic_cast(p_cq_ch_info->get_ring()); + ibv_sge mem_info; + if (likely(p_ring && p_ring->get_mem_info(mem_info) == 0)) { + *addr = (void*)mem_info.addr; + *length = mem_info.length; + *lkey = mem_info.lkey; + return 0; + } else { + vlog_printf(VLOG_ERROR, "could not find ring_eth_cb, " + "got fd %d\n", fd); + } + } else { + vlog_printf(VLOG_ERROR, "could not find p_cq_ch_info, got fd " + "%d\n", fd); + } + return -1; +#else + NOT_IN_USE(addr); + NOT_IN_USE(length); + NOT_IN_USE(lkey); + VLOG_PRINTF_ONCE_THEN_ALWAYS(VLOG_WARNING, VLOG_DEBUG, + "vma_get_mem_info is no supported with this ring", fd); + errno = EOPNOTSUPP; + return -1; +#endif // HAVE_MP_RQ +} + +extern "C" +int vma_add_ring_profile(vma_ring_type_attr *profile, vma_ring_profile_key *res) +{ + if (!g_p_ring_profile) { + vlog_printf(VLOG_DEBUG, "%s g_p_ring_profile is null\n",__func__); + return -1; + } + *res = g_p_ring_profile->add_profile(profile); + return 0; +} + +extern "C" +int vma_modify_ring(struct vma_modify_ring_attr *mr_data) +{ + srdr_logfunc_entry("ring_fd=%d, mr_data=%p ", mr_data->ring_fd, mr_data); + int ret = -1; + cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(mr_data->ring_fd); + if (likely(p_cq_ch_info)) { + ring_simple* p_ring = dynamic_cast(p_cq_ch_info->get_ring()); + if (likely(p_ring)) { + if (VMA_MODIFY_RING_CQ_ARM & mr_data->comp_bit_mask) { + if (RING_ETH_CB == p_ring->get_type()) { + ret = p_ring->ack_and_arm_cq(CQT_RX); + } else if (RING_ETH_DIRECT == p_ring->get_type()) { + ret = p_ring->ack_and_arm_cq(CQT_TX); + } else { + vlog_printf(VLOG_ERROR, "Ring type [%d] is not supported\n", + p_ring->get_type()); + } + } else if (VMA_MODIFY_RING_CQ_MODERATION & mr_data->comp_bit_mask) { + p_ring->modify_cq_moderation(mr_data->cq_moderation.cq_moderation_period_usec, + mr_data->cq_moderation.cq_moderation_count); + ret = 0; + } 
else { + vlog_printf(VLOG_ERROR, "comp_mask [0x%x] is not supported\n", + mr_data->comp_bit_mask); + } + } else { + vlog_printf(VLOG_ERROR, "could not find ring_simple," + " got fd %d\n", mr_data->ring_fd); + } + } else { + vlog_printf(VLOG_ERROR, "could not find p_cq_ch_info, got fd " + "%d\n", mr_data->ring_fd); + } + + return ret; +} + +extern "C" +int vma_get_socket_netowrk_header(int __fd, void *ptr, uint16_t *len) +{ + srdr_logdbg_entry("fd=%d, ptr=%p len=%d", __fd, ptr, len); + + socket_fd_api* p_socket_object = fd_collection_get_sockfd(__fd); + + if (p_socket_object) { + return p_socket_object->get_socket_network_ptr(ptr, *len); + } + errno = EINVAL; + return -1; +} + +extern "C" +int vma_get_ring_direct_descriptors(int __fd, + struct vma_mlx_hw_device_data *data) +{ + srdr_logdbg_entry("fd=%d, ptr=%p ", __fd, data); + + cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(__fd); + if (p_cq_ch_info) { + ring_simple* p_ring = dynamic_cast(p_cq_ch_info->get_ring()); + if (likely(p_ring)) { + return p_ring->get_ring_descriptors(*data); + } else { + vlog_printf(VLOG_ERROR, "could not find ring_simple," + " got fd %d\n", __fd); + return -1; + } + } else { + vlog_printf(VLOG_ERROR, "could not find p_cq_ch_info, got fd " + "%d\n", __fd); + return -1; + } +} + +extern "C" +int vma_reg_mr_on_ring(int __fd, void *addr, size_t length, uint32_t *lkey) +{ + srdr_logdbg_entry("fd=%d, addr=%p length %zd key %p", __fd, addr, length, lkey); + + if (!lkey) { + vlog_printf(VLOG_DEBUG, "key is null fd %d, addr %p, length %zd\n", + __fd, addr, length); + errno = EINVAL; + return -1; + } + cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(__fd); + if (p_cq_ch_info) { + ring* p_ring = p_cq_ch_info->get_ring(); + if (likely(p_ring)) { + return p_ring->reg_mr(addr, length, *lkey); + } else { + vlog_printf(VLOG_ERROR, "could not find ring, got fd " + "%d\n", __fd); + return -1; + } + } else { + vlog_printf(VLOG_ERROR, "could not find p_cq_ch_info, 
got fd " + "%d\n", __fd); + return -1; + } +} + +extern "C" +int vma_dereg_mr_on_ring(int __fd, void *addr, size_t length) +{ + srdr_logdbg_entry("fd=%d, addr=%p ", __fd, addr); + + cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(__fd); + if (p_cq_ch_info) { + ring* p_ring = p_cq_ch_info->get_ring(); + if (likely(p_ring)) { + return p_ring->dereg_mr(addr, length); + } else { + vlog_printf(VLOG_ERROR, "could not find ring, got fd " + "%d\n", __fd); + return -1; + } + } else { + vlog_printf(VLOG_ERROR, "could not find p_cq_ch_info, got fd " + "%d\n", __fd); + return -1; + } +} + +extern "C" +int vma_get_dpcp_devices(uintptr_t **devices, size_t *devices_num) +{ +#ifdef DEFINED_DPCP + ib_context_map_t *ib_ctx_map = NULL; + ib_ctx_handler *p_ib_ctx_h = NULL; + size_t found_devs = 0; + + if (!devices_num) { + return EINVAL; + } + + ib_ctx_map = g_p_ib_ctx_handler_collection->get_ib_cxt_list(); + if (ib_ctx_map) { + ib_context_map_t::iterator iter; + + for (iter = ib_ctx_map->begin(); iter != ib_ctx_map->end(); iter++) { + p_ib_ctx_h = iter->second; + if (p_ib_ctx_h->get_dpcp_adapter()) { + if (devices && (found_devs < *devices_num)) { + devices[found_devs] = (uintptr_t*)p_ib_ctx_h->get_dpcp_adapter(); + } + found_devs++; + } + } + } + + *devices_num = found_devs; + srdr_logdbg_entry("returned %zd devices", found_devs); + + return 0; +#else + NOT_IN_USE(devices); + NOT_IN_USE(devices_num); + VLOG_PRINTF_ONCE_THEN_ALWAYS(VLOG_WARNING, VLOG_DEBUG, + "vma_get_dpcp_devices is no supported"); + errno = EOPNOTSUPP; + return -1; +#endif /* DEFINED_DPCP */ +} + +//----------------------------------------------------------------------------- +// replacement functions +//----------------------------------------------------------------------------- + +/* Create a new socket of type TYPE in domain DOMAIN, using + protocol PROTOCOL. If PROTOCOL is zero, one is chosen automatically. + Returns a file descriptor for the new socket, or -1 for errors. 
*/ +extern "C" +int socket(int __domain, int __type, int __protocol) +{ + return socket_internal(__domain, __type, __protocol, true); +} + +// allow calling our socket(...) implementation safely from within libvma.so +// this is critical in case VMA was loaded using dlopen and not using LD_PRELOAD +// TODO: look for additional such functions/calls +int socket_internal(int __domain, int __type, int __protocol, bool check_offload /*= false*/) +{ + bool offload_sockets = (__type & 0xf) == SOCK_DGRAM || (__type & 0xf) == SOCK_STREAM; + + if (offload_sockets) + DO_GLOBAL_CTORS(); + + dbg_check_if_need_to_send_mcpkt(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.socket) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + int fd = orig_os_api.socket(__domain, __type, __protocol); + vlog_printf(VLOG_DEBUG, "ENTER: %s(domain=%s(%d), type=%s(%d), protocol=%d) = %d\n",__func__, socket_get_domain_str(__domain), __domain, socket_get_type_str(__type), __type, __protocol, fd); + if (fd < 0) { + return fd; + } + + if (g_p_fd_collection) { + // Sanity check to remove any old sockinfo object using the same fd!! 
+ handle_close(fd, true); + + // Create new sockinfo object for this new socket + if (offload_sockets) + g_p_fd_collection->addsocket(fd, __domain, __type, check_offload); + } + + return fd; +} + +extern "C" +int close(int __fd) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.close) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + srdr_logdbg_entry("fd=%d", __fd); + + handle_close(__fd); + + return orig_os_api.close(__fd); +} + +extern "C" +void __res_iclose(res_state statp, bool free_addr) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.__res_iclose) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + srdr_logdbg_entry(""); + for (int ns = 0; ns < statp->_u._ext.nscount; ns++) { + int sock = statp->_u._ext.nssocks[ns]; + if (sock != -1) { + handle_close(sock); + } + } + orig_os_api.__res_iclose(statp, free_addr); +} + +/* Shut down all or part of the connection open on socket FD. + HOW determines what to shut down: + SHUT_RD = No more receptions; + SHUT_WR = No more transmissions; + SHUT_RDWR = No more receptions or transmissions. + Returns 0 on success, -1 for errors. 
*/
+extern "C"
+int shutdown(int __fd, int __how)
+{
+	srdr_logdbg_entry("fd=%d, how=%d", __fd, __how);
+
+	// Offloaded socket: VMA handles the shutdown itself.
+	socket_fd_api* sock_api = fd_collection_get_sockfd(__fd);
+	if (sock_api)
+		return sock_api->shutdown(__how);
+
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (!orig_os_api.shutdown) get_orig_funcs();
+	BULLSEYE_EXCLUDE_BLOCK_END
+
+	return orig_os_api.shutdown(__fd, __how);
+}
+
+extern "C"
+int listen(int __fd, int backlog)
+{
+	srdr_logdbg_entry("fd=%d, backlog=%d", __fd, backlog);
+
+	socket_fd_api* sock_api = fd_collection_get_sockfd(__fd);
+
+	if (sock_api) {
+		// Verify the socket can really be offloaded before listening.
+		const int rc = sock_api->prepareListen();
+		if (rc < 0)
+			return rc; //error
+		if (rc > 0) { //Passthrough
+			handle_close(__fd, false, true);
+			sock_api = NULL;
+		}
+	}
+	if (sock_api) {
+		return sock_api->listen(backlog);
+	}
+
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (!orig_os_api.listen) get_orig_funcs();
+	BULLSEYE_EXCLUDE_BLOCK_END
+
+	return orig_os_api.listen(__fd, backlog);
+}
+
+extern "C"
+int accept(int __fd, struct sockaddr *__addr, socklen_t *__addrlen)
+{
+	socket_fd_api* sock_api = fd_collection_get_sockfd(__fd);
+	if (sock_api)
+		return sock_api->accept(__addr, __addrlen);
+
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (!orig_os_api.accept) get_orig_funcs();
+	BULLSEYE_EXCLUDE_BLOCK_END
+
+	return orig_os_api.accept(__fd, __addr, __addrlen);
+}
+
+extern "C"
+int accept4(int __fd, struct sockaddr *__addr, socklen_t *__addrlen, int __flags)
+{
+	socket_fd_api* sock_api = fd_collection_get_sockfd(__fd);
+	if (sock_api)
+		return sock_api->accept4(__addr, __addrlen, __flags);
+
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (!orig_os_api.accept4) get_orig_funcs();
+	BULLSEYE_EXCLUDE_BLOCK_END
+
+	return orig_os_api.accept4(__fd, __addr, __addrlen, __flags);
+}
+
+/* Give the socket FD the local address ADDR (which is LEN bytes long). */
+extern "C"
+int bind(int __fd, const struct sockaddr *__addr, socklen_t __addrlen)
+{
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (!orig_os_api.bind) get_orig_funcs();
+	BULLSEYE_EXCLUDE_BLOCK_END
+
+	char buf[256];
+	NOT_IN_USE(buf); /* to suppress warning in case VMA_MAX_DEFINED_LOG_LEVEL */
+	srdr_logdbg_entry("fd=%d, %s", __fd, sprintf_sockaddr(buf, 256, __addr, __addrlen));
+
+	int rc = 0;
+	socket_fd_api* sock_api = fd_collection_get_sockfd(__fd);
+	if (sock_api) {
+		rc = sock_api->bind(__addr, __addrlen);
+		// VMA may decide during bind() that this socket cannot be
+		// offloaded; in that case drop it and retry through the OS.
+		if (sock_api->isPassthrough()) {
+			handle_close(__fd, false, true);
+			if (rc) {
+				rc = orig_os_api.bind(__fd, __addr, __addrlen);
+			}
+		}
+	} else {
+		rc = orig_os_api.bind(__fd, __addr, __addrlen);
+	}
+
+	if (rc >= 0)
+		srdr_logdbg_exit("returned with %d", rc);
+	else
+		srdr_logdbg_exit("failed (errno=%d %m)", errno);
+	return rc;
+}
+
+/* Open a connection on socket FD to peer at ADDR (which LEN bytes long).
+   For connectionless socket types, just set the default address to send to
+   and the only address from which to accept transmissions.
+   Return 0 on success, -1 for errors.
+
+   This function is a cancellation point and therefore not marked with
+   __THROW. */
+extern "C"
+int connect(int __fd, const struct sockaddr *__to, socklen_t __tolen)
+{
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (!orig_os_api.connect) get_orig_funcs();
+	BULLSEYE_EXCLUDE_BLOCK_END
+
+	char buf[256];
+	NOT_IN_USE(buf); /* to suppress warning in case VMA_MAX_DEFINED_LOG_LEVEL */
+	srdr_logdbg_entry("fd=%d, %s", __fd, sprintf_sockaddr(buf, 256, __to, __tolen));
+
+	int rc = 0;
+	socket_fd_api* sock_api = fd_collection_get_sockfd(__fd);
+	// Only AF_INET destinations on offloaded sockets go through VMA.
+	if (__to && __to->sa_family == AF_INET && sock_api) {
+		rc = sock_api->connect(__to, __tolen);
+		if (sock_api->isPassthrough()) {
+			handle_close(__fd, false, true);
+			if (rc) {
+				rc = orig_os_api.connect(__fd, __to, __tolen);
+			}
+		}
+	} else {
+		if (sock_api) {
+			sock_api->setPassthrough();
+		}
+		rc = orig_os_api.connect(__fd, __to, __tolen);
+	}
+
+	if (rc >= 0)
+		srdr_logdbg_exit("returned with %d", rc);
+	else
+		srdr_logdbg_exit("failed (errno=%d %m)", errno);
+	return rc;
+}
+
+/* Set socket FD's option OPTNAME at protocol level LEVEL
+   to *OPTVAL (which is OPTLEN bytes long).
+   Returns 0 on success, -1 for errors.
*/
+extern "C"
+int setsockopt(int __fd, int __level, int __optname,
+	       __const void *__optval, socklen_t __optlen)
+{
+	srdr_logdbg_entry("fd=%d, level=%d, optname=%d", __fd, __level, __optname);
+
+	if (NULL == __optval) {
+		errno = EFAULT;
+		return -1;
+	}
+
+	int ret = 0;
+	socket_fd_api* p_socket_object = NULL;
+
+	p_socket_object = fd_collection_get_sockfd(__fd);
+	if (p_socket_object) {
+		VERIFY_PASSTROUGH_CHANGED(ret, p_socket_object->setsockopt(__level, __optname, __optval, __optlen));
+	}
+	else {
+		BULLSEYE_EXCLUDE_BLOCK_START
+		if (!orig_os_api.setsockopt) get_orig_funcs();
+		BULLSEYE_EXCLUDE_BLOCK_END
+		ret = orig_os_api.setsockopt(__fd, __level, __optname, __optval, __optlen);
+	}
+
+	if (ret >= 0)
+		srdr_logdbg_exit("returned with %d", ret);
+	else
+		srdr_logdbg_exit("failed (errno=%d %m)", errno);
+	return ret;
+}
+
+/* Get socket FD's option OPTNAME at protocol level LEVEL
+   to *OPTVAL (which is OPTLEN bytes long).
+   Returns 0 on success, -1 for errors.
+   The magic tuple (fd==-1, SOL_SOCKET, SO_VMA_GET_API) hands the caller the
+   VMA extra-API function table instead of a regular socket option. */
+extern "C"
+int getsockopt(int __fd, int __level, int __optname,
+	       void *__optval, socklen_t *__optlen)
+{
+	srdr_logdbg_entry("fd=%d, level=%d, optname=%d", __fd, __level, __optname);
+
+	if (__fd == -1 && __level == SOL_SOCKET && __optname == SO_VMA_GET_API &&
+	    __optlen && *__optlen >= sizeof(struct vma_api_t*)) {
+		DO_GLOBAL_CTORS();
+		bool enable_socketxtreme = safe_mce_sys().enable_socketxtreme;
+		srdr_logdbg("User request for VMA Extra API pointers");
+		// NOTE(review): allocated on every query and never freed by VMA;
+		// ownership effectively passes to the application — confirm intent.
+		struct vma_api_t *vma_api = new struct vma_api_t();
+
+		vma_api->vma_extra_supported_mask = 0;
+		SET_EXTRA_API(register_recv_callback, vma_register_recv_callback, VMA_EXTRA_API_REGISTER_RECV_CALLBACK);
+		SET_EXTRA_API(recvfrom_zcopy, vma_recvfrom_zcopy, VMA_EXTRA_API_RECVFROM_ZCOPY);
+		SET_EXTRA_API(free_packets, vma_free_packets, VMA_EXTRA_API_FREE_PACKETS);
+		SET_EXTRA_API(add_conf_rule, vma_add_conf_rule, VMA_EXTRA_API_ADD_CONF_RULE);
+		SET_EXTRA_API(thread_offload, vma_thread_offload, VMA_EXTRA_API_THREAD_OFFLOAD);
+		SET_EXTRA_API(get_socket_rings_num, vma_get_socket_rings_num, VMA_EXTRA_API_GET_SOCKET_RINGS_NUM);
+		SET_EXTRA_API(get_socket_rings_fds, vma_get_socket_rings_fds, VMA_EXTRA_API_GET_SOCKET_RINGS_FDS);
+		SET_EXTRA_API(get_socket_tx_ring_fd, vma_get_socket_tx_ring_fd, VMA_EXTRA_API_GET_SOCKET_TX_RING_FD);
+		SET_EXTRA_API(vma_add_ring_profile, vma_add_ring_profile, VMA_EXTRA_API_ADD_RING_PROFILE);
+		SET_EXTRA_API(get_socket_network_header, vma_get_socket_netowrk_header, VMA_EXTRA_API_GET_SOCKET_NETWORK_HEADER);
+		SET_EXTRA_API(get_ring_direct_descriptors, vma_get_ring_direct_descriptors, VMA_EXTRA_API_GET_RING_DIRECT_DESCRIPTORS);
+		SET_EXTRA_API(register_memory_on_ring, vma_reg_mr_on_ring, VMA_EXTRA_API_REGISTER_MEMORY_ON_RING);
+		SET_EXTRA_API(deregister_memory_on_ring, vma_dereg_mr_on_ring, VMA_EXTRA_API_DEREGISTER_MEMORY_ON_RING);
+		SET_EXTRA_API(socketxtreme_free_vma_packets, enable_socketxtreme ? vma_socketxtreme_free_vma_packets : dummy_vma_socketxtreme_free_vma_packets, VMA_EXTRA_API_SOCKETXTREME_FREE_VMA_PACKETS);
+		SET_EXTRA_API(socketxtreme_poll, enable_socketxtreme ? vma_socketxtreme_poll : dummy_vma_socketxtreme_poll, VMA_EXTRA_API_SOCKETXTREME_POLL);
+		SET_EXTRA_API(socketxtreme_ref_vma_buff, enable_socketxtreme ? vma_socketxtreme_ref_vma_buff : dummy_vma_socketxtreme_ref_vma_buff, VMA_EXTRA_API_SOCKETXTREME_REF_VMA_BUFF);
+		SET_EXTRA_API(socketxtreme_free_vma_buff, enable_socketxtreme ? vma_socketxtreme_free_vma_buff : dummy_vma_socketxtreme_free_vma_buff, VMA_EXTRA_API_SOCKETXTREME_FREE_VMA_BUFF);
+		SET_EXTRA_API(dump_fd_stats, vma_dump_fd_stats, VMA_EXTRA_API_DUMP_FD_STATS);
+		SET_EXTRA_API(vma_cyclic_buffer_read, vma_cyclic_buffer_read, VMA_EXTRA_API_CYCLIC_BUFFER_READ);
+		SET_EXTRA_API(get_mem_info, vma_get_mem_info, VMA_EXTRA_API_GET_MEM_INFO);
+		SET_EXTRA_API(vma_modify_ring, vma_modify_ring, VMA_EXTRA_API_MODIFY_RING);
+		SET_EXTRA_API(get_dpcp_devices, vma_get_dpcp_devices, VMA_EXTRA_API_GET_DPCP_DEVICES);
+		*((vma_api_t**)__optval) = vma_api;
+		return 0;
+	}
+
+	int ret = 0;
+	socket_fd_api* p_socket_object = NULL;
+	p_socket_object = fd_collection_get_sockfd(__fd);
+	if (p_socket_object) {
+		VERIFY_PASSTROUGH_CHANGED(ret, p_socket_object->getsockopt(__level, __optname, __optval, __optlen));
+	} else {
+		BULLSEYE_EXCLUDE_BLOCK_START
+		if (!orig_os_api.getsockopt) get_orig_funcs();
+		BULLSEYE_EXCLUDE_BLOCK_END
+		ret = orig_os_api.getsockopt(__fd, __level, __optname, __optval, __optlen);
+	}
+
+	if (ret >= 0)
+		srdr_logdbg_exit("returned with %d", ret);
+	else
+		srdr_logdbg_exit("failed (errno=%d %m)", errno);
+	return ret;
+}
+
+/* Do the file control operation described by CMD on FD.
+   The remaining arguments are interpreted depending on CMD.
+
+   This function is a cancellation point and therefore not marked with
+   __THROW.
+   NOTE: VMA throw will never occur during handling of any command.
+   VMA will only throw in case VMA doesn't know to handle a command and the
+   user requested explicitly that VMA will throw an exception in such a case
+   by setting VMA_EXCEPTION_HANDLING accordingly (see README.txt)
+ */
+extern "C"
+int fcntl(int __fd, int __cmd, ...)
+{
+	srdr_logfunc_entry("fd=%d, cmd=%d", __fd, __cmd);
+
+	int res = -1;
+	va_list va;
+	va_start(va, __cmd);
+	unsigned long int arg = va_arg(va, unsigned long int);
+	va_end(va);
+
+	socket_fd_api* p_socket_object = NULL;
+	p_socket_object = fd_collection_get_sockfd(__fd);
+	if (p_socket_object) {
+		VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->fcntl(__cmd, arg));
+	} else {
+		BULLSEYE_EXCLUDE_BLOCK_START
+		if (!orig_os_api.fcntl) get_orig_funcs();
+		BULLSEYE_EXCLUDE_BLOCK_END
+		res = orig_os_api.fcntl(__fd, __cmd, arg);
+	}
+
+	// NOTE(review): drops VMA state of the *original* fd on F_DUPFD — looks
+	// intentional (dup'ed fds are not offloaded) but worth confirming.
+	if (__cmd == F_DUPFD) {
+		handle_close(__fd);
+	}
+
+	/* Fix: exit log previously tested an unused 'ret' (always 0), so the
+	 * failure branch was unreachable; report the real result instead. */
+	if (res >= 0)
+		srdr_logfunc_exit("returned with %d", res);
+	else
+		srdr_logfunc_exit("failed (errno=%d %m)", errno);
+	return res;
+}
+
+/* Perform the I/O control operation specified by REQUEST on FD.
+   One argument may follow; its presence and type depend on REQUEST.
+   Return value depends on REQUEST.  Usually -1 indicates error. */
+extern "C"
+int ioctl (int __fd, unsigned long int __request, ...)
+{
+	/* Fix: %d with an unsigned long argument is undefined; use %lu. */
+	srdr_logfunc_entry("fd=%d, request=%lu", __fd, __request);
+
+	int res = -1;
+	va_list va;
+	va_start(va, __request);
+	unsigned long int arg = va_arg(va, unsigned long int);
+	va_end(va);
+
+	socket_fd_api* p_socket_object = NULL;
+	p_socket_object = fd_collection_get_sockfd(__fd);
+	if (p_socket_object && arg) {
+		VERIFY_PASSTROUGH_CHANGED(res, p_socket_object->ioctl(__request, arg));
+	} else {
+		BULLSEYE_EXCLUDE_BLOCK_START
+		if (!orig_os_api.ioctl) get_orig_funcs();
+		BULLSEYE_EXCLUDE_BLOCK_END
+		res = orig_os_api.ioctl(__fd, __request, arg);
+	}
+
+	/* Fix: exit log previously tested an unused 'ret' (always 0). */
+	if (res >= 0)
+		srdr_logfunc_exit("returned with %d", res);
+	else
+		srdr_logfunc_exit("failed (errno=%d %m)", errno);
+	return res;
+}
+
+extern "C"
+int getsockname(int __fd, struct sockaddr *__name, socklen_t *__namelen)
+{
+	srdr_logdbg_entry("fd=%d", __fd);
+
+	int ret = 0;
+	socket_fd_api* p_socket_object = NULL;
+	p_socket_object = fd_collection_get_sockfd(__fd);
+	if (p_socket_object) {
+		ret = p_socket_object->getsockname(__name, __namelen);
+
+		// Optionally push a dummy send to warm up the TX path.
+		if (safe_mce_sys().trigger_dummy_send_getsockname) {
+			char buf[264] = {0};
+			struct iovec msg_iov = {&buf, sizeof(buf)};
+			struct msghdr msg = {NULL, 0, &msg_iov, 1, NULL, 0, 0};
+			int ret_send = sendmsg(__fd, &msg, VMA_SND_FLAGS_DUMMY);
+			srdr_logdbg("Triggered dummy message for socket fd=%d (ret_send=%d)", __fd, ret_send);
+			NOT_IN_USE(ret_send);
+		}
+	}
+	else {
+		BULLSEYE_EXCLUDE_BLOCK_START
+		if (!orig_os_api.getsockname) get_orig_funcs();
+		BULLSEYE_EXCLUDE_BLOCK_END
+		ret = orig_os_api.getsockname(__fd, __name, __namelen);
+	}
+
+	if (ret >= 0)
+		srdr_logdbg_exit("returned with %d", ret);
+	else
+		srdr_logdbg_exit("failed (errno=%d %m)", errno);
+	return ret;
+}
+
+extern "C"
+int getpeername(int __fd, struct sockaddr *__name, socklen_t *__namelen)
+{
+	srdr_logdbg_entry("fd=%d", __fd);
+
+	int ret = 0;
+	socket_fd_api* p_socket_object = NULL;
+	p_socket_object = fd_collection_get_sockfd(__fd);
+	if (p_socket_object) {
+		ret = p_socket_object->getpeername(__name, __namelen);
+	}
+	else {
+		BULLSEYE_EXCLUDE_BLOCK_START
+		if (!orig_os_api.getpeername) get_orig_funcs();
+		BULLSEYE_EXCLUDE_BLOCK_END
+		ret = orig_os_api.getpeername(__fd, __name, __namelen);
+	}
+
+	if (ret >= 0)
+		srdr_logdbg_exit("returned with %d", ret);
+	else
+		srdr_logdbg_exit("failed (errno=%d %m)", errno);
+	return ret;
+}
+
+
+/* Read NBYTES into BUF from FD.  Return the
+   number read, -1 for errors or 0 for EOF.
+
+   This function is a cancellation point and therefore not marked with
+   __THROW. */
+extern "C"
+ssize_t read(int __fd, void *__buf, size_t __nbytes)
+{
+	srdr_logfuncall_entry("fd=%d", __fd);
+
+	socket_fd_api* p_socket_object = NULL;
+	p_socket_object = fd_collection_get_sockfd(__fd);
+	if (p_socket_object) {
+		struct iovec piov[1];
+		piov[0].iov_base = __buf;
+		piov[0].iov_len = __nbytes;
+		int dummy_flags = 0;
+		return p_socket_object->rx(RX_READ, piov, 1, &dummy_flags);
+	}
+
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (!orig_os_api.read) get_orig_funcs();
+	BULLSEYE_EXCLUDE_BLOCK_END
+
+	return orig_os_api.read(__fd, __buf, __nbytes);
+}
+
+/* Checks that the buffer is big enough to contain the number of bytes
+ * the user requests to read. If the buffer is too small, aborts,
+ * else read NBYTES into BUF from FD.  Return the
+   number read, -1 for errors or 0 for EOF.
+
+   This function is a cancellation point and therefore not marked with
+   __THROW.
*/ +extern "C" +ssize_t __read_chk(int __fd, void *__buf, size_t __nbytes, size_t __buflen) +{ + srdr_logfuncall_entry("fd=%d", __fd); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + BULLSEYE_EXCLUDE_BLOCK_START + if (__nbytes > __buflen) { + srdr_logpanic("buffer overflow detected"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + int dummy_flags = 0; + return p_socket_object->rx(RX_READ, piov, 1, &dummy_flags); + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.__read_chk) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.__read_chk(__fd, __buf, __nbytes, __buflen); +} + +/* Read COUNT blocks into VECTOR from FD. Return the + number of bytes read, -1 for errors or 0 for EOF. + + This function is a cancellation point and therefore not marked with + __THROW. */ + +extern "C" +ssize_t readv(int __fd, const struct iovec *iov, int iovcnt) +{ + srdr_logfuncall_entry("fd=%d", __fd); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec* piov = (struct iovec*)iov; + int dummy_flags = 0; + return p_socket_object->rx(RX_READV, piov, iovcnt, &dummy_flags); + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.readv) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.readv(__fd, iov, iovcnt); +} + +/* Read N bytes into BUF from socket FD. + Returns the number read or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. 
*/ +extern "C" +ssize_t recv(int __fd, void *__buf, size_t __nbytes, int __flags) +{ + srdr_logfuncall_entry("fd=%d", __fd); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + return p_socket_object->rx(RX_RECV, piov, 1, &__flags); + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.recv) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.recv(__fd, __buf, __nbytes, __flags); +} + +/* Checks that the buffer is big enough to contain the number of bytes + the user requests to read. If the buffer is too small, aborts, + else read N bytes into BUF from socket FD. + Returns the number read or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. */ +extern "C" +ssize_t __recv_chk(int __fd, void *__buf, size_t __nbytes, size_t __buflen, int __flags) +{ + srdr_logfuncall_entry("fd=%d", __fd); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + BULLSEYE_EXCLUDE_BLOCK_START + if (__nbytes > __buflen) { + srdr_logpanic("buffer overflow detected"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + return p_socket_object->rx(RX_RECV, piov, 1, &__flags); + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.__recv_chk) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.__recv_chk(__fd, __buf, __nbytes, __buflen, __flags); +} + +/* Receive a message as described by MESSAGE from socket FD. + Returns the number of bytes read or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. 
*/ +extern "C" +ssize_t recvmsg(int __fd, struct msghdr *__msg, int __flags) +{ + srdr_logfuncall_entry("fd=%d", __fd); + + if (__msg == NULL) { + srdr_logdbg("NULL msghdr"); + errno = EINVAL; + return -1; + } + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + __msg->msg_flags = 0; + return p_socket_object->rx(RX_RECVMSG, __msg->msg_iov, __msg->msg_iovlen, &__flags, (__SOCKADDR_ARG)__msg->msg_name, (socklen_t*)&__msg->msg_namelen, __msg); + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.recvmsg) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.recvmsg(__fd, __msg, __flags); +} + +/* The following definitions are for kernels previous to 2.6.32 which dont support recvmmsg */ +#ifndef HAVE_STRUCT_MMSGHDR +#ifndef __INTEL_COMPILER +struct mmsghdr { + struct msghdr msg_hdr; // Message header + unsigned int msg_len; // Number of received bytes for header +}; +#endif +#endif + +#ifndef MSG_WAITFORONE +#define MSG_WAITFORONE 0x10000 //recvmmsg(): block until 1+ packets avail +#endif + +/* Receive multiple messages as described by MESSAGE from socket FD. + Returns the number of messages received or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. 
*/ +extern "C" +#ifdef RECVMMSG_WITH_CONST_TIMESPEC +int recvmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags, const struct timespec *__timeout) +#else +int recvmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags, struct timespec *__timeout) +#endif +{ + int num_of_msg=0; + struct timespec start_time = TIMESPEC_INITIALIZER, current_time = TIMESPEC_INITIALIZER, delta_time = TIMESPEC_INITIALIZER; + + srdr_logfuncall_entry("fd=%d, mmsghdr length=%d flags=%x", __fd, __vlen, __flags); + + if (__mmsghdr == NULL) { + srdr_logdbg("NULL mmsghdr"); + errno = EINVAL; + return -1; + } + + if (__timeout) { + gettime(&start_time); + } + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + int ret = 0; + for (unsigned int i=0; i<__vlen; i++) { + int flags = __flags; + __mmsghdr[i].msg_hdr.msg_flags = 0; + ret = p_socket_object->rx(RX_RECVMSG, __mmsghdr[i].msg_hdr.msg_iov, __mmsghdr[i].msg_hdr.msg_iovlen, &flags, + (__SOCKADDR_ARG)__mmsghdr[i].msg_hdr.msg_name, (socklen_t*)&__mmsghdr[i].msg_hdr.msg_namelen, &__mmsghdr[i].msg_hdr); + if (ret < 0){ + break; + } + num_of_msg++; + __mmsghdr[i].msg_len = ret; + if ((i==0) && (flags & MSG_WAITFORONE)) { + __flags |= MSG_DONTWAIT; + } + if (__timeout) { + gettime(¤t_time); + ts_sub(¤t_time, &start_time, &delta_time); + if (ts_cmp(&delta_time, __timeout, >)) { + break; + } + } + } + if (num_of_msg || ret == 0) { + //todo save ret for so_error if ret != 0(see kernel) + return num_of_msg; + } else { + return ret; + } + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.recvmmsg) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.recvmmsg(__fd, __mmsghdr, __vlen, __flags, __timeout); +} + + +/* Read N bytes into BUF through socket FD. + If ADDR is not NULL, fill in *ADDR_LEN bytes of it with tha address of + the sender, and store the actual size of the address in *ADDR_LEN. 
+ Returns the number of bytes read or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. */ +extern "C" +ssize_t recvfrom(int __fd, void *__buf, size_t __nbytes, int __flags, + struct sockaddr *__from, socklen_t *__fromlen) +{ + ssize_t ret_val = 0; + + srdr_logfuncall_entry("fd=%d", __fd); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + ret_val = p_socket_object->rx(RX_RECVFROM, piov, 1, &__flags, __from, __fromlen); + } + else { + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.recvfrom) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + ret_val = orig_os_api.recvfrom(__fd, __buf, __nbytes, __flags, __from, __fromlen); + } +#ifdef RDTSC_MEASURE_RX_PROCCESS_BUFFER_TO_RECIVEFROM + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_PROCCESS_RX_BUFFER_TO_RECIVEFROM]); +#endif //RDTSC_MEASURE_RX_PROCCESS_BUFFER_TO_RECIVEFROM + +#ifdef RDTSC_MEASURE_RX_LWIP_TO_RECEVEFROM + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_LWIP_TO_RECEVEFROM]); +#endif //RDTSC_MEASURE_RX_LWIP_TO_RECEVEFROM + +#ifdef RDTSC_MEASURE_RX_CQE_RECEIVEFROM + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_CQE_TO_RECEIVEFROM]); +#endif //RDTSC_MEASURE_RX_CQE_RECEIVEFROM + +#ifdef RDTSC_MEASURE_RECEIVEFROM_TO_SENDTO + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RECEIVEFROM_TO_SENDTO]); +#endif //RDTSC_MEASURE_RECEIVEFROM_TO_SENDTO + return ret_val; +} + +/* Checks that the buffer is big enough to contain the number of bytes + the user requests to read. If the buffer is too small, aborts, + else read N bytes into BUF through socket FD. + If ADDR is not NULL, fill in *ADDR_LEN bytes of it with tha address of + the sender, and store the actual size of the address in *ADDR_LEN. + Returns the number of bytes read or -1 for errors. 
+ + This function is a cancellation point and therefore not marked with + __THROW. */ +extern "C" +ssize_t __recvfrom_chk(int __fd, void *__buf, size_t __nbytes, size_t __buflen, int __flags, + struct sockaddr *__from, socklen_t *__fromlen) +{ + srdr_logfuncall_entry("fd=%d", __fd); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + BULLSEYE_EXCLUDE_BLOCK_START + if (__nbytes > __buflen) { + srdr_logpanic("buffer overflow detected"); + } + BULLSEYE_EXCLUDE_BLOCK_END + + struct iovec piov[1]; + piov[0].iov_base = __buf; + piov[0].iov_len = __nbytes; + return p_socket_object->rx(RX_RECVFROM, piov, 1, &__flags, __from, __fromlen); + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.__recvfrom_chk) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.__recvfrom_chk(__fd, __buf, __nbytes, __buflen, __flags, __from, __fromlen); +} + +/* Write N bytes of BUF to FD. Return the number written, or -1. + + This function is a cancellation point and therefore not marked with + __THROW. */ +extern "C" +ssize_t write(int __fd, __const void *__buf, size_t __nbytes) +{ + srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1] = {{(void*)__buf, __nbytes}}; + vma_tx_call_attr_t tx_arg; + + tx_arg.opcode = TX_WRITE; + tx_arg.attr.msg.iov = piov; + tx_arg.attr.msg.sz_iov = 1; + + return p_socket_object->tx(tx_arg); + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.write) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.write(__fd, __buf, __nbytes); +} + +/* Write IOCNT blocks from IOVEC to FD. Return the number written, or -1. + + This function is a cancellation point and therefore not marked with + __THROW. 
*/ +extern "C" +ssize_t writev(int __fd, const struct iovec *iov, int iovcnt) +{ + srdr_logfuncall_entry("fd=%d, %d iov blocks", __fd, iovcnt); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + vma_tx_call_attr_t tx_arg; + + tx_arg.opcode = TX_WRITEV; + tx_arg.attr.msg.iov = (struct iovec *)iov; + tx_arg.attr.msg.sz_iov = iovcnt; + + return p_socket_object->tx(tx_arg); + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.writev) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.writev(__fd, iov, iovcnt); +} + + +/* Send N bytes of BUF to socket FD. Returns the number sent or -1. + + This function is a cancellation point and therefore not marked with + __THROW. */ +extern "C" +ssize_t send(int __fd, __const void *__buf, size_t __nbytes, int __flags) +{ + srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + struct iovec piov[1] = {{(void*)__buf, __nbytes}}; + vma_tx_call_attr_t tx_arg; + + tx_arg.opcode = TX_SEND; + tx_arg.attr.msg.iov = piov; + tx_arg.attr.msg.sz_iov = 1; + tx_arg.attr.msg.flags = __flags; + + return p_socket_object->tx(tx_arg); + } + + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.send) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.send(__fd, __buf, __nbytes, __flags); +} + +/* Sends a message as described by MESSAGE to socket FD. + Returns the number of bytes read or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. 
*/ +extern "C" +ssize_t sendmsg(int __fd, __const struct msghdr *__msg, int __flags) +{ + srdr_logfuncall_entry("fd=%d", __fd); + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + vma_tx_call_attr_t tx_arg; + + tx_arg.opcode = TX_SENDMSG; + tx_arg.attr.msg.iov = __msg->msg_iov; + tx_arg.attr.msg.sz_iov = (ssize_t)__msg->msg_iovlen; + tx_arg.attr.msg.flags = __flags; + tx_arg.attr.msg.addr = (struct sockaddr *)(__CONST_SOCKADDR_ARG)__msg->msg_name; + tx_arg.attr.msg.len = (socklen_t)__msg->msg_namelen; + + return p_socket_object->tx(tx_arg); + } + + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.sendmsg) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.sendmsg(__fd, __msg, __flags); + +} + +/* Send multiple messages as described by MESSAGE from socket FD. + Returns the number of messages sent or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. 
*/ +extern "C" +int sendmmsg(int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags) +{ + int num_of_msg=0; + + srdr_logfuncall_entry("fd=%d, mmsghdr length=%d flags=%x", __fd, __vlen, __flags); + + if (__mmsghdr == NULL) { + srdr_logdbg("NULL mmsghdr"); + errno = EINVAL; + return -1; + } + + socket_fd_api* p_socket_object = NULL; + p_socket_object = fd_collection_get_sockfd(__fd); + if (p_socket_object) { + for (unsigned int i=0; i<__vlen; i++) { + vma_tx_call_attr_t tx_arg; + + tx_arg.opcode = TX_SENDMSG; + tx_arg.attr.msg.iov = __mmsghdr[i].msg_hdr.msg_iov; + tx_arg.attr.msg.sz_iov = (ssize_t)__mmsghdr[i].msg_hdr.msg_iovlen; + tx_arg.attr.msg.flags = __flags; + tx_arg.attr.msg.addr = (struct sockaddr *)(__SOCKADDR_ARG)__mmsghdr[i].msg_hdr.msg_name; + tx_arg.attr.msg.len = (socklen_t)__mmsghdr[i].msg_hdr.msg_namelen; + + int ret = p_socket_object->tx(tx_arg); + if (ret < 0){ + if (num_of_msg) + return num_of_msg; + else + return ret; + } + num_of_msg++; + __mmsghdr[i].msg_len = ret; + } + return num_of_msg; + } + + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.sendmmsg) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.sendmmsg(__fd, __mmsghdr, __vlen, __flags); +} + +/* Send N bytes of BUF on socket FD to peer at address ADDR (which is + ADDR_LEN bytes long). Returns the number sent, or -1 for errors. + + This function is a cancellation point and therefore not marked with + __THROW. 
 */
extern "C"
ssize_t sendto(int __fd, __const void *__buf, size_t __nbytes, int __flags,
	       const struct sockaddr *__to, socklen_t __tolen)
{
#ifdef RDTSC_MEASURE_TX_SENDTO_TO_AFTER_POST_SEND
	RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_SENDTO_TO_AFTER_POST_SEND]);
#endif //RDTSC_MEASURE_TX_SENDTO_TO_AFTER_POST_SEND

#ifdef RDTSC_MEASURE_RECEIVEFROM_TO_SENDTO
	RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RECEIVEFROM_TO_SENDTO]);
#endif //RDTSC_MEASURE_RECEIVEFROM_TO_SENDTO (fixed: comment previously named the wrong guard)
	srdr_logfuncall_entry("fd=%d, nbytes=%d", __fd, __nbytes);

	socket_fd_api* p_socket_object = NULL;
	p_socket_object = fd_collection_get_sockfd(__fd);
	if (p_socket_object) {
		// Offloaded socket: wrap the flat buffer in a one-entry iovec.
		struct iovec piov[1] = {{(void*)__buf, __nbytes}};
		vma_tx_call_attr_t tx_arg;

		tx_arg.opcode = TX_SENDTO;
		tx_arg.attr.msg.iov = piov;
		tx_arg.attr.msg.sz_iov = 1;
		tx_arg.attr.msg.flags = __flags;
		tx_arg.attr.msg.addr = (struct sockaddr *)__to;
		tx_arg.attr.msg.len = __tolen;

		return p_socket_object->tx(tx_arg);
	}

	// Ignore dummy messages for OS
	if (unlikely(IS_DUMMY_PACKET(__flags))) {
		errno = EINVAL;
		return -1;
	}
	BULLSEYE_EXCLUDE_BLOCK_START
	if (!orig_os_api.sendto) get_orig_funcs();
	BULLSEYE_EXCLUDE_BLOCK_END

	return orig_os_api.sendto(__fd, __buf, __nbytes, __flags, __to, __tolen);
}

/* Common implementation behind sendfile()/sendfile64().
 * Streams 'count' bytes of in_fd (starting at *offset, or the file's current
 * offset when offset==NULL) into the offloaded socket. TCP sockets get the
 * TX_FILE zero-copy path; otherwise the file is mmap'ed (or read()) and sent
 * as ordinary TX_WRITE calls. On success the in_fd file offset and *offset
 * are updated to mirror kernel sendfile(2) semantics. */
static ssize_t sendfile_helper(socket_fd_api* p_socket_object, int in_fd, __off64_t *offset, size_t count)
{
	ssize_t totSent = 0;
	struct stat64 stat_buf;
	__off64_t orig_offset = 0;
	__off64_t cur_offset = 0;
	struct iovec piov[1] = {{NULL, 0}};
	vma_tx_call_attr_t tx_arg;
	sockinfo* s = (sockinfo*)p_socket_object;

	if (p_socket_object->get_type() != FD_TYPE_SOCKET) {
		errno = EBADF;
		return -1;
	}

	// Remember where in_fd currently points so it can be restored/advanced later.
	orig_offset = lseek64(in_fd, 0, SEEK_CUR);
	if (orig_offset < 0) {
		errno = ESPIPE;
		return -1;
	}

	cur_offset = (offset ? *offset : orig_offset);
	if (offset && (lseek64(in_fd, cur_offset, SEEK_SET) == -1)) {
		errno = EINVAL;
		return -1;
	}

	// The requested range must lie entirely within the file.
	if ((fstat64(in_fd, &stat_buf) == -1) ||
	    ((__off64_t)stat_buf.st_size < (__off64_t)(cur_offset + count))) {
		errno = EOVERFLOW;
		return -1;
	}

	if (PROTO_TCP == s->get_protocol()) {
		// TCP: hand the fd itself down the stack (TX_FILE path).
		tx_arg.opcode = TX_FILE;
		tx_arg.attr.msg.iov = piov;
		tx_arg.attr.msg.sz_iov = 1;

		piov[0].iov_base = (void *)&in_fd;
		piov[0].iov_len = count;

		totSent = p_socket_object->tx(tx_arg);
	} else {
		__off64_t pa_offset = 0;
		size_t pa_count = 0;
		struct flock64 lock;

		tx_arg.opcode = TX_WRITE;
		tx_arg.attr.msg.iov = piov;
		tx_arg.attr.msg.sz_iov = 1;

		/* The off argument of mmap() is constrained to be aligned and
		 * sized according to the value returned by sysconf()
		 */
		pa_offset = cur_offset & ~(sysconf(_SC_PAGE_SIZE) - 1);
		pa_count = count + cur_offset - pa_offset;

		// Take a shared read lock so the mapped range is stable while sending.
		lock.l_type = F_RDLCK;
		lock.l_whence = SEEK_SET;
		lock.l_start = pa_offset;
		lock.l_len = pa_count;
		lock.l_pid = 0;

		/* try to use mmap() approach */
		if (-1 != (fcntl(in_fd, F_SETLK, &lock))) {
			void *addr = NULL;
			addr = mmap64(NULL, pa_count, PROT_READ, MAP_SHARED | MAP_NORESERVE, in_fd, pa_offset);
			if (MAP_FAILED != addr) {
				ssize_t toRead, numSent = 0;

				// Send the mapping one page at a time.
				while (count > 0) {
					toRead = min(sysconf(_SC_PAGE_SIZE), (ssize_t)count);

					piov[0].iov_base = (void *)((uintptr_t)addr + cur_offset - pa_offset + totSent);
					piov[0].iov_len = toRead;

					numSent = p_socket_object->tx(tx_arg);
					if (numSent == -1) {
						break;
					}

					count -= numSent;
					totSent += numSent;
				}
				(void)munmap(addr, pa_count);
			}
			lock.l_type = F_UNLCK;
			(void)fcntl(in_fd, F_SETLK, &lock);
		}

		/* fallback on read() approach */
		if (totSent <= 0) {
			char buf[sysconf(_SC_PAGE_SIZE)];
			ssize_t toRead, numRead, numSent = 0;

			while (count > 0) {
				toRead = min(sizeof(buf), count);
				// Bypass any redirected read(); go straight to the OS.
				numRead = orig_os_api.read(in_fd, buf, toRead);
				if (numRead <= 0) {
					break;
				}

				piov[0].iov_base = (void *)buf;
				piov[0].iov_len = numRead;

				numSent = p_socket_object->tx(tx_arg);
				if (numSent == -1) {
					break;
				}

				count -= numSent;
				totSent += numSent;
			}
		}
	}

	// Mirror sendfile(2): with an explicit offset the fd position is restored
	// and *offset advanced; otherwise the fd position itself advances.
	if (totSent > 0) {
		if (offset != NULL) {
			(void)lseek64(in_fd, (orig_offset), SEEK_SET);
			*offset = *offset + totSent;
		} else {
			(void)lseek64(in_fd, (orig_offset + totSent), SEEK_SET);
		}
	}

	return totSent;
}

extern "C"
ssize_t sendfile(int out_fd, int in_fd, off_t *offset, size_t count)
{
	srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, offset, offset ? *offset : 0, count);

	socket_fd_api* p_socket_object = fd_collection_get_sockfd(out_fd);
	if (!p_socket_object) {
		if (!orig_os_api.sendfile) get_orig_funcs();
		return orig_os_api.sendfile(out_fd, in_fd, offset, count);
	}

	return sendfile_helper(p_socket_object, in_fd, offset, count);
}

extern "C"
ssize_t sendfile64(int out_fd, int in_fd, __off64_t *offset, size_t count)
{
	srdr_logfuncall_entry("out_fd=%d, in_fd=%d, offset=%p, *offset=%zu, count=%d", out_fd, in_fd, offset, offset ?
*offset : 0, count); + + socket_fd_api* p_socket_object = fd_collection_get_sockfd(out_fd); + if (!p_socket_object) { + if (!orig_os_api.sendfile64) get_orig_funcs(); + return orig_os_api.sendfile64(out_fd, in_fd, offset, count); + } + + return sendfile_helper(p_socket_object, in_fd, offset, count); +} + +// Format a fd_set into a string for logging +// Check nfd to know how many 32 bits hexs do we want to sprintf into user buffer +const char* sprintf_fdset(char* buf, int buflen, int __nfds, fd_set *__fds) +{ + if (buflen<1) + return "(null)"; + buf[0] = '\0'; + + if ((__nfds <= 0) || (__fds == NULL)) + return "(null)"; + + int fdsize = 1 + ((__nfds-1) / (8*sizeof(uint32_t))); + switch (fdsize) { + case 1: + snprintf(buf, buflen, "%08x", ((uint32_t*)__fds)[0]); + break; + case 2: + snprintf(buf, buflen, "%08x %08x", ((uint32_t*)__fds)[1], ((uint32_t*)__fds)[0]); + break; + case 3: + snprintf(buf, buflen, "%08x %08x %08x", ((uint32_t*)__fds)[2], ((uint32_t*)__fds)[1], ((uint32_t*)__fds)[0]); + break; + case 4: + snprintf(buf, buflen, "%08x %08x %08x %08x", ((uint32_t*)__fds)[3], ((uint32_t*)__fds)[2], ((uint32_t*)__fds)[1], ((uint32_t*)__fds)[0]); + break; + case 5: + snprintf(buf, buflen, "%08x %08x %08x %08x %08x", ((uint32_t*)__fds)[4], ((uint32_t*)__fds)[3], ((uint32_t*)__fds)[2], ((uint32_t*)__fds)[1], ((uint32_t*)__fds)[0]); + break; + case 6: + snprintf(buf, buflen, "%08x %08x %08x %08x %08x %08x", ((uint32_t*)__fds)[5], ((uint32_t*)__fds)[4], ((uint32_t*)__fds)[3], ((uint32_t*)__fds)[2], ((uint32_t*)__fds)[1], ((uint32_t*)__fds)[0]); + break; + default: + buf[0] = '\0'; + } + return buf; +} + +/* Check the first NFDS descriptors each in READFDS (if not NULL) for read + readiness, in WRITEFDS (if not NULL) for write readiness, and in EXCEPTFDS + (if not NULL) for exceptional conditions. If TIMis not NULL, time out + after waiting the interval specified therein. Returns the number of ready + descriptors, or -1 for errors. 
+ + This function is a cancellation point and therefore not marked with + __THROW. */ +int select_helper(int __nfds, + fd_set *__readfds, + fd_set * __writefds, + fd_set * __exceptfds, + struct timeval * __timeout, + const sigset_t *__sigmask = NULL) +{ + int off_rfds_buffer[__nfds]; + io_mux_call::offloaded_mode_t off_modes_buffer[__nfds]; + + if (g_vlogger_level >= VLOG_FUNC) { + const int tmpbufsize = 256; + char tmpbuf[tmpbufsize], tmpbuf2[tmpbufsize]; + NOT_IN_USE(tmpbufsize); /* to suppress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf); /* to suppress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf2); /* to suppress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + srdr_logfunc("readfds: %s, writefds: %s", + sprintf_fdset(tmpbuf, tmpbufsize, __nfds, __readfds), + sprintf_fdset(tmpbuf2, tmpbufsize, __nfds, __writefds)); + } + + try { + select_call scall(off_rfds_buffer, off_modes_buffer, + __nfds, __readfds, __writefds, __exceptfds, __timeout, __sigmask); + int rc = scall.call(); + + if (g_vlogger_level >= VLOG_FUNC) { + const int tmpbufsize = 256; + char tmpbuf[tmpbufsize], tmpbuf2[tmpbufsize]; + NOT_IN_USE(tmpbufsize); /* to suppress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf); /* to suppress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + NOT_IN_USE(tmpbuf2); /* to suppress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + srdr_logfunc_exit("readfds: %s, writefds: %s", + sprintf_fdset(tmpbuf, tmpbufsize, __nfds, __readfds), + sprintf_fdset(tmpbuf2, tmpbufsize, __nfds, __writefds)); + } + + return rc; + } + catch (io_mux_call::io_error&) { + srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno); + return -1; + } +} + +extern "C" +int select(int __nfds, + fd_set *__readfds, + fd_set * __writefds, + fd_set * __exceptfds, + struct timeval * __timeout) +{ + if (!g_p_fd_collection) { + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.select) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + return 
orig_os_api.select(__nfds, __readfds, __writefds, __exceptfds, __timeout); + } + + if (__timeout) { + srdr_logfunc_entry("nfds=%d, timeout=(%d sec, %d usec)", + __nfds, __timeout->tv_sec, __timeout->tv_usec); + } else { + srdr_logfunc_entry("nfds=%d, timeout=(infinite)", __nfds); + } + + return select_helper(__nfds, __readfds, __writefds, __exceptfds, __timeout); +} + +extern "C" +int pselect(int __nfds, + fd_set *__readfds, + fd_set *__writefds, + fd_set *__errorfds, + const struct timespec *__timeout, + const sigset_t *__sigmask) +{ + if (!g_p_fd_collection) { + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.pselect) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + return orig_os_api.pselect(__nfds, __readfds, __writefds, __errorfds, __timeout, __sigmask); + } + + struct timeval select_time; + if (__timeout) { + srdr_logfunc_entry("nfds=%d, timeout=(%d sec, %d nsec)", + __nfds, __timeout->tv_sec, __timeout->tv_nsec); + select_time.tv_sec = __timeout->tv_sec; + select_time.tv_usec = __timeout->tv_nsec / 1000; + } else { + srdr_logfunc_entry("nfds=%d, timeout=(infinite)", __nfds); + } + + return select_helper(__nfds, __readfds, __writefds, __errorfds, __timeout ? &select_time : NULL, __sigmask); +} + +/* Poll the file descriptors described by the NFDS structures starting at + FDS. If TIMis nonzero and not -1, allow TIMmilliseconds for + an event to occur; if TIMis -1, block until an event occurs. + Returns the number of file descriptors with events, zero if timed out, + or -1 for errors. 
 */
int poll_helper(struct pollfd *__fds, nfds_t __nfds, int __timeout, const sigset_t *__sigmask = NULL)
{
	// Per-call scratch buffers for the mux engine, sized by the fd count
	// (stack VLAs; working_fds_arr has one extra slot for VMA's internal fd).
	int off_rfd_buffer[__nfds];
	io_mux_call::offloaded_mode_t off_modes_buffer[__nfds];
	int lookup_buffer[__nfds];
	pollfd working_fds_arr[__nfds + 1];

	try {
		poll_call pcall(off_rfd_buffer, off_modes_buffer, lookup_buffer, working_fds_arr,
				__fds, __nfds, __timeout, __sigmask);

		int rc = pcall.call();
		srdr_logfunc_exit("rc = %d", rc);
		return rc;
	}
	catch (io_mux_call::io_error&) {
		srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno);
		return -1;
	}
}

extern "C"
int poll(struct pollfd *__fds, nfds_t __nfds, int __timeout)
{
	// Before VMA is fully up (or after teardown) go straight to the OS.
	if (!g_p_fd_collection) {
		BULLSEYE_EXCLUDE_BLOCK_START
		if (!orig_os_api.poll) get_orig_funcs();
		BULLSEYE_EXCLUDE_BLOCK_END
		return orig_os_api.poll(__fds, __nfds, __timeout);
	}

	srdr_logfunc_entry("nfds=%d, timeout=(%d milli-sec)", __nfds, __timeout);

	return poll_helper(__fds, __nfds, __timeout);
}

extern "C"
int ppoll(struct pollfd *__fds, nfds_t __nfds, const struct timespec *__timeout, const sigset_t *__sigmask)
{
	if (!g_p_fd_collection) {
		BULLSEYE_EXCLUDE_BLOCK_START
		if (!orig_os_api.ppoll) get_orig_funcs();
		BULLSEYE_EXCLUDE_BLOCK_END
		return orig_os_api.ppoll(__fds, __nfds, __timeout, __sigmask);
	}

	// Convert timespec to milliseconds (truncating); NULL means block forever.
	int timeout = (__timeout == NULL) ? -1 :
	              (__timeout->tv_sec * 1000 + __timeout->tv_nsec / 1000000);

	srdr_logfunc_entry("nfds=%d, timeout=(%d milli-sec)", __nfds, timeout);

	return poll_helper(__fds, __nfds, timeout, __sigmask);
}

// Register a newly created epoll fd with VMA's fd collection.
void vma_epoll_create(int epfd, int size)
{
	if (g_p_fd_collection) {
		// Sanity check to remove any old sockinfo object using the same fd!!
		handle_close(epfd, true);

		// insert epfd to fd_collection as epfd_info
		g_p_fd_collection->addepfd(epfd, size);
	}
}

/* Creates an epoll instance. Returns fd for the new instance.
+ The "size" parameter is a hint specifying the number of file + descriptors to be associated with the new instance. The fd + returned by epoll_create() should be closed with close(). */ +extern "C" +int epoll_create(int __size) +{ + DO_GLOBAL_CTORS(); + + if (__size <= 0 ) { + srdr_logdbg("invalid size (size=%d) - must be a positive integer\n", __size); + errno = EINVAL; + return -1; + } + + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.epoll_create) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + int epfd = orig_os_api.epoll_create(__size + 1); // +1 for the cq epfd + srdr_logdbg("ENTER: (size=%d) = %d\n", __size, epfd); + + if (epfd <=0) + return epfd; + + vma_epoll_create(epfd, 8); + + return epfd; +} + +extern "C" +int epoll_create1(int __flags) +{ + DO_GLOBAL_CTORS(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.epoll_create1) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + int epfd = orig_os_api.epoll_create1(__flags); + srdr_logdbg("ENTER: (flags=%d) = %d\n", __flags, epfd); + + if (epfd <=0) + return epfd; + + vma_epoll_create(epfd, 8); + + return epfd; +} + +/* Manipulate an epoll instance "epfd". Returns 0 in case of success, + -1 in case of error ("errno" variable will contain the specific + error code). The "op" parameter is one of the EPOLL_CTL_* + constants defined above. The "fd" parameter is the target of the + operation. The "event" parameter describes which events the caller + is interested in and any associated user data. 
*/ +extern "C" +int epoll_ctl(int __epfd, int __op, int __fd, struct epoll_event *__event) +{ + const static char * op_names[] = { + "", + "ADD", + "DEL", + "MOD" + }; + NOT_IN_USE(op_names); /* to suppress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + if (__event) { + srdr_logfunc_entry("epfd=%d, op=%s, fd=%d, events=%#x, data=%x", + __epfd, op_names[__op], __fd, __event->events, __event->data.u64); + } + else { + srdr_logfunc_entry("epfd=%d, op=%s, fd=%d, event=NULL", __epfd, op_names[__op], __fd); + } + + int rc = -1; + epfd_info *epfd_info = fd_collection_get_epfd(__epfd); + if (!epfd_info) { + errno = EBADF; + } + else { + // TODO handle race - if close() gets here.. + rc = epfd_info->ctl(__op, __fd, __event); + } + + srdr_logfunc_exit("rc = %d", rc); + return rc; +} + +/* Wait for events on an epoll instance "epfd". Returns the number of + triggered events returned in "events" buffer. Or -1 in case of + error with the "errno" variable set to the specific error code. The + "events" parameter is a buffer that will contain triggered + events. The "maxevents" is the maximum number of events to be + returned ( usually size of "events" ). The "timeout" parameter + specifies the maximum wait time in milliseconds (-1 == infinite). 
 */
inline int epoll_wait_helper(int __epfd, struct epoll_event *__events, int __maxevents, int __timeout, const sigset_t *__sigmask = NULL)
{
	// Bound maxevents before using it to size the stack VLA below.
	if (__maxevents <= 0 || __maxevents > EP_MAX_EVENTS) {
		srdr_logdbg("invalid value for maxevents: %d", __maxevents);
		errno = EINVAL;
		return -1;
	}

	epoll_event extra_events_buffer[__maxevents];

	try {
		epoll_wait_call epcall(extra_events_buffer, NULL,
				       __epfd, __events, __maxevents, __timeout, __sigmask);

		int rc = epcall.get_current_events(); // returns ready nfds
		if (rc <= 0) {
			// if no ready nfds available then check all lower level queues (VMA ring's and OS queues)
			epcall.init_offloaded_fds();
			rc = epcall.call();
		}

		srdr_logfunc_exit("rc = %d", rc);
		return rc;
	}
	catch (io_mux_call::io_error&) {
		srdr_logfunc_exit("io_mux_call::io_error (errno=%d %m)", errno);
		return -1;
	}
}

extern "C"
int epoll_wait(int __epfd, struct epoll_event *__events, int __maxevents, int __timeout)
{
	srdr_logfunc_entry("epfd=%d, maxevents=%d, timeout=(%d milli-sec)", __epfd, __maxevents, __timeout);

	return epoll_wait_helper(__epfd, __events, __maxevents, __timeout);
}

extern "C"
int epoll_pwait(int __epfd, struct epoll_event *__events, int __maxevents, int __timeout, const sigset_t *__sigmask)
{
	srdr_logfunc_entry("epfd=%d, maxevents=%d, timeout=(%d milli-sec)", __epfd, __maxevents, __timeout);

	return epoll_wait_helper(__epfd, __events, __maxevents, __timeout, __sigmask);
}

/* Create two new sockets, of type TYPE in domain DOMAIN and using
   protocol PROTOCOL, which are connected to each other, and put file
   descriptors for them in FDS[0] and FDS[1]. If PROTOCOL is zero,
   one will be chosen automatically. Returns 0 on success, -1 for errors.
*/ +extern "C" +int socketpair(int __domain, int __type, int __protocol, int __sv[2]) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.socketpair) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + int ret = orig_os_api.socketpair(__domain, __type, __protocol, __sv); + + srdr_logdbg("(domain=%s(%d) type=%s(%d) protocol=%d, fd[%d,%d]) = %d\n", socket_get_domain_str(__domain), __domain, socket_get_type_str(__type), __type, __protocol, __sv[0], __sv[1], ret); + + // Sanity check to remove any old sockinfo object using the same fd!! + if (ret == 0 && g_p_fd_collection) { + handle_close(__sv[0], true); + handle_close(__sv[1], true); + } + + return ret; +} + +/* Create a one-way communication channel (pipe). + If successful, two file descriptors are stored in PIPEDES; + bytes written on PIPEDES[1] can be read from PIPEDES[0]. + Returns 0 if successful, -1 if not. */ +extern "C" +int pipe(int __filedes[2]) +{ + bool offload_pipe = safe_mce_sys().mce_spec == MCE_SPEC_29WEST_LBM_29 || + safe_mce_sys().mce_spec == MCE_SPEC_WOMBAT_FH_LBM_554; + if (offload_pipe) + DO_GLOBAL_CTORS(); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.pipe) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + int ret = orig_os_api.pipe(__filedes); + srdr_logdbg("(fd[%d,%d]) = %d\n", __filedes[0], __filedes[1], ret); + + if (ret == 0 && g_p_fd_collection) { + // Sanity check to remove any old sockinfo object using the same fd!! + int fdrd = __filedes[0]; + handle_close(fdrd, true); + int fdwr = __filedes[1]; + handle_close(fdwr, true); + + // Create new pipeinfo object for this new fd pair + if (offload_pipe) + g_p_fd_collection->addpipe(fdrd, fdwr); + } + + return ret; +} + +extern "C" +int open(__const char *__file, int __oflag, ...) 
+{ + va_list va; + va_start(va, __oflag); + mode_t mode = va_arg(va, mode_t); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.open) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + int fd = orig_os_api.open(__file, __oflag, mode); + va_end(va); + + srdr_logdbg("(file=%s, flags=%#x, mode=%#x) = %d\n", __file, __oflag, mode, fd); + + // Sanity check to remove any old sockinfo object using the same fd!! + handle_close(fd, true); + + return fd; +} + +extern "C" +int creat(const char *__pathname, mode_t __mode) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.creat) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + int fd = orig_os_api.creat(__pathname, __mode); + + srdr_logdbg("(pathname=%s, mode=%#x) = %d\n", __pathname, __mode, fd); + + // Sanity check to remove any old sockinfo object using the same fd!! + handle_close(fd, true); + + return fd; +} + +/* Duplicate FD, returning a new file descriptor on the same file. */ +extern "C" +int dup(int __fd) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.dup) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + int fid = orig_os_api.dup(__fd); + + srdr_logdbg("(fd=%d) = %d\n", __fd, fid); + + // Sanity check to remove any old sockinfo object using the same fd!! + handle_close(fid, true); + + return fid; +} + +/* Duplicate FD to FD2, closing FD2 and making it open on the same file. */ +extern "C" +int dup2(int __fd, int __fd2) +{ + if (safe_mce_sys().close_on_dup2 && __fd != __fd2) { + srdr_logdbg("oldfd=%d, newfd=%d. Closing %d in VMA.\n", __fd, __fd2, __fd2); + handle_close(__fd2); + } + + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.dup2) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + int fid = orig_os_api.dup2(__fd, __fd2); + + srdr_logdbg("(fd=%d, fd2=%d) = %d\n", __fd, __fd2, fid); + + // Sanity check to remove any old sockinfo object using the same fd!! 
+ handle_close(fid, true); + + return fid; +} + +#ifdef _CHANGE_CLONE_PROTO_IN_SLES_10_ +extern "C" +int clone(int (*__fn)(void *), void *__child_stack, int __flags, void *__arg) +{ + srdr_logfunc_entry("flags=%#x", __flags); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.clone) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + return orig_os_api.clone(__fn, __child_stack, __flags, __arg); +} +#endif + +/* Clone the calling process, creating an exact copy. + Return -1 for errors, 0 to the new process, + and the process ID of the new process to the old process. */ + +extern "C" +pid_t fork(void) +{ + srdr_logdbg("ENTER: **********\n"); + + if (!g_init_global_ctors_done) { + set_env_params(); + prepare_fork(); + } + + if (!g_init_ibv_fork_done) + srdr_logdbg("ERROR: ibv_fork_init failed, the effect of an application calling fork() is undefined!!\n"); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!orig_os_api.fork) get_orig_funcs(); + BULLSEYE_EXCLUDE_BLOCK_END + + pid_t pid = orig_os_api.fork(); + if (pid == 0) { + g_is_forked_child = true; + srdr_logdbg_exit("Child Process: returned with %d", pid); + // Child's process - restart module + vlog_stop(); + + + // In case of child process, we want all global objects to re-construct + reset_globals(); + + g_init_global_ctors_done = false; + sock_redirect_exit(); + + safe_mce_sys().get_env_params(); + vlog_start("VMA", safe_mce_sys().log_level, safe_mce_sys().log_filename, safe_mce_sys().log_details, safe_mce_sys().log_colors); + if (vma_rdma_lib_reset()) { + srdr_logerr("Child Process: rdma_lib_reset failed %m", + errno); + } + srdr_logdbg_exit("Child Process: starting with %d", getpid()); + g_is_forked_child = false; + sock_redirect_main(); + } + else if (pid > 0) { + srdr_logdbg_exit("Parent Process: returned with %d", pid); + } + else { + srdr_logdbg_exit("failed (errno=%d %m)", errno); + } + + return pid; +} + +/* Redirect vfork to fork */ +extern "C" +pid_t vfork(void) +{ + return fork(); +} + +/* Put the program in 
the background, and dissociate from the controlling
   terminal. If NOCHDIR is zero, do `chdir ("/")'. If NOCLOSE is zero,
   redirects stdin, stdout, and stderr to /dev/null. */
extern "C"
int daemon(int __nochdir, int __noclose)
{
	srdr_logdbg("ENTER: ***** (%d, %d) *****\n", __nochdir, __noclose);

	if (!g_init_global_ctors_done) {
		set_env_params();
		prepare_fork();
	}

	BULLSEYE_EXCLUDE_BLOCK_START
	if (!orig_os_api.daemon) get_orig_funcs();
	BULLSEYE_EXCLUDE_BLOCK_END

	int ret = orig_os_api.daemon(__nochdir, __noclose);
	if (ret == 0) {
		// daemon() forked internally - we are now the child process and must
		// rebuild VMA state, exactly like the child branch of fork().
		g_is_forked_child = true;
		srdr_logdbg_exit("returned with %d", ret);

		// Child's process - restart module
		vlog_stop();

		// In case of child process, we want all global objects to re-construct
		reset_globals();

		g_init_global_ctors_done = false;
		sock_redirect_exit();

		safe_mce_sys().get_env_params();
		vlog_start("VMA", safe_mce_sys().log_level, safe_mce_sys().log_filename, safe_mce_sys().log_details, safe_mce_sys().log_colors);
		if (vma_rdma_lib_reset()) {
			srdr_logerr("Child Process: rdma_lib_reset failed %m",
				    errno);
		}
		srdr_logdbg_exit("Child Process: starting with %d", getpid());
		g_is_forked_child = false;
		sock_redirect_main();
	}
	else {
		srdr_logdbg_exit("failed (errno=%d %m)", errno);
	}
	return ret;
}

// SIGINT trampoline installed by sigaction() below: flags VMA shutdown,
// then chains to the handler the application had registered previously.
static void handler_intr(int sig)
{
	switch (sig) {
	case SIGINT:
		g_b_exit = true;
		srdr_logdbg("Catch Signal: SIGINT (%d)\n", sig);
		break;
	default:
		srdr_logdbg("Catch Signal: %d\n", sig);
		break;
	}

	if (g_act_prev.sa_handler)
		g_act_prev.sa_handler(sig);
}

extern "C"
int sigaction(int signum, const struct sigaction *act, struct sigaction *oldact)
{
	int ret = 0;

	BULLSEYE_EXCLUDE_BLOCK_START
	if (!orig_os_api.sigaction) get_orig_funcs();
	BULLSEYE_EXCLUDE_BLOCK_END

	// With handle_sigintr enabled, VMA interposes its own SIGINT handler
	// and remembers the application's handler in g_act_prev.
	if (safe_mce_sys().handle_sigintr) {
		srdr_logdbg_entry("signum=%d, act=%p, oldact=%p", signum, act, oldact);

		switch (signum) {
		case SIGINT:
			if (oldact &&
g_act_prev.sa_handler) {
				// Report the application's previous handler, not VMA's trampoline.
				*oldact = g_act_prev;
			}
			if (act) {
				// Install VMA's wrapper so shutdown can be flagged, then
				// chain to the user's handler from handler_intr().
				struct sigaction vma_action;
				vma_action.sa_handler = handler_intr;
				vma_action.sa_flags = 0;
				sigemptyset(&vma_action.sa_mask);

				ret = orig_os_api.sigaction(SIGINT, &vma_action, NULL);

				if (ret < 0) {
					srdr_logdbg("Failed to register VMA SIGINT handler, calling to original sigaction handler\n");
					break;
				}
				srdr_logdbg("Registered VMA SIGINT handler\n");
				g_act_prev = *act;
			}
			if (ret >= 0)
				srdr_logdbg_exit("returned with %d", ret);
			else
				srdr_logdbg_exit("failed (errno=%d %m)", errno);

			return ret;
			break;
		default:
			break;
		}
	}
	ret = orig_os_api.sigaction(signum, act, oldact);

	if (safe_mce_sys().handle_sigintr) {
		if (ret >= 0)
			srdr_logdbg_exit("returned with %d", ret);
		else
			srdr_logdbg_exit("failed (errno=%d %m)", errno);
	}
	return ret;
}

// SIGINT trampoline installed by signal() below: flags VMA shutdown,
// then chains to the handler the application registered via signal().
static void handle_signal(int signum)
{
	srdr_logdbg_entry("Caught signal! signum=%d", signum);

	if (signum == SIGINT) {
		g_b_exit = true;
	}

	if (g_sighandler) {
		g_sighandler(signum);
	}
}

extern "C"
sighandler_t signal(int signum, sighandler_t handler)
{
	srdr_logdbg_entry("signum=%d, handler=%p", signum, handler);

	if (!orig_os_api.signal) get_orig_funcs();

	if (handler && handler != SIG_ERR && handler != SIG_DFL && handler != SIG_IGN) {
		// Only SIGINT is supported for now
		if (signum == SIGINT) {
			g_sighandler = handler;
			return orig_os_api.signal(SIGINT, &handle_signal);
		}
	}

	return orig_os_api.signal(signum, handler);
}
diff --git a/src/vma/sock/sock-redirect.h b/src/vma/sock/sock-redirect.h
new file mode 100644
index 0000000..7030f2b
--- /dev/null
+++ b/src/vma/sock/sock-redirect.h
@@ -0,0 +1,189 @@
/*
 * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
 *
 * This software is available to you under a choice of one of two
 * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef SOCK_REDIRECT_H +#define SOCK_REDIRECT_H + + +//if you need select with more than 1024 sockets - enable this +#ifndef SELECT_BIG_SETSIZE +#define SELECT_BIG_SETSIZE 0 +#endif + +#if SELECT_BIG_SETSIZE +#include +#if (__GLIBC__ > 2) || (__GLIBC__ == 2 && __GLIBC_MINOR__ >= 2) +#include +#undef __FD_SETSIZE +#define __FD_SETSIZE 32768 +#endif +#endif //SELECT_BIG_SETSIZE + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include +#include +#include + +#include + +struct mmsghdr; + +// Format a fd_set into a string for logging +// Check nfd to know how many 32 bits hexs do we want to sprintf into user buffer +const char* sprintf_fdset(char* buf, int buflen, int __nfds, fd_set *__fds); + + +/** + *----------------------------------------------------------------------------- + * variables to hold the function-pointers to original functions + *----------------------------------------------------------------------------- + */ + +struct os_api { + int (*creat) (const char *__pathname, mode_t __mode); + int (*open) (__const char *__file, int __oflag, ...); + int (*dup) (int fildes); + int (*dup2) (int fildes, int fildes2); + int (*pipe) (int __filedes[2]); + int (*socket) (int __domain, int __type, int __protocol); + int (*socketpair) (int __domain, int __type, int __protocol, int __sv[2]); + + int (*close) (int __fd); + int (*__res_iclose) (res_state statp, bool free_addr); + int (*shutdown) (int __fd, int __how); + + int (*accept) (int __fd, struct sockaddr *__addr, socklen_t *__addrlen); + int (*accept4) (int __fd, struct sockaddr *__addr, socklen_t *__addrlen, int __flags); + int (*bind) (int __fd, const struct sockaddr *__addr, socklen_t __addrlen); + int (*connect) (int __fd, const struct sockaddr *__to, socklen_t __tolen); + int (*listen) 
(int __fd, int __backlog); + + int (*setsockopt) (int __fd, int __level, int __optname, __const void *__optval, socklen_t __optlen); + int (*getsockopt) (int __fd, int __level, int __optname, void *__optval, socklen_t *__optlen); + int (*fcntl) (int __fd, int __cmd, ...); + int (*ioctl) (int __fd, unsigned long int __request, ...); + int (*getsockname) (int __fd, struct sockaddr *__name,socklen_t *__namelen); + int (*getpeername) (int __fd, struct sockaddr *__name,socklen_t *__namelen); + + ssize_t (*read) (int __fd, void *__buf, size_t __nbytes); + ssize_t (*__read_chk) (int __fd, void *__buf, size_t __nbytes, size_t __buflen); + ssize_t (*readv) (int __fd, const struct iovec *iov, int iovcnt); + ssize_t (*recv) (int __fd, void *__buf, size_t __n, int __flags); + ssize_t (*__recv_chk) (int __fd, void *__buf, size_t __n, size_t __buflen, int __flags); + ssize_t (*recvmsg) (int __fd, struct msghdr *__message, int __flags); + int (*recvmmsg) (int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags, const struct timespec *__timeout); + + ssize_t (*recvfrom) (int __fd, void *__restrict __buf, size_t __n, int __flags, struct sockaddr *__from, socklen_t *__fromlen); + ssize_t (*__recvfrom_chk) (int __fd, void *__restrict __buf, size_t __n, size_t __buflen, int __flags, struct sockaddr *__from, socklen_t *__fromlen); + + + ssize_t (*write) (int __fd, __const void *__buf, size_t __n); + ssize_t (*writev) (int __fd, const struct iovec *iov, int iovcnt); + ssize_t (*send) (int __fd, __const void *__buf, size_t __n, int __flags); + ssize_t (*sendmsg) (int __fd, __const struct msghdr *__message, int __flags); + ssize_t (*sendmmsg) (int __fd, struct mmsghdr *__mmsghdr, unsigned int __vlen, int __flags); + ssize_t (*sendto) (int __fd, __const void *__buf, size_t __n,int __flags, const struct sockaddr *__to, socklen_t __tolen); + ssize_t (*sendfile) (int out_fd, int in_fd, off_t *offset, size_t count); + ssize_t (*sendfile64) (int out_fd, int in_fd, __off64_t 
*offset, size_t count); + + int (*select) (int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__exceptfds, struct timeval *__timeout); + int (*pselect) (int __nfds, fd_set *__readfds, fd_set *__writefds, fd_set *__errorfds, const struct timespec *__timeout, const sigset_t *__sigmask); + + int (*poll) (struct pollfd *__fds, nfds_t __nfds, int __timeout); + int (*ppoll) (struct pollfd *__fds, nfds_t __nfds, const struct timespec *__timeout, const sigset_t *__sigmask); + int (*epoll_create) (int __size); + int (*epoll_create1) (int __flags); + int (*epoll_ctl) (int __epfd, int __op, int __fd, struct epoll_event *__event); + int (*epoll_wait) (int __epfd, struct epoll_event *__events, int __maxevents, int __timeout); + int (*epoll_pwait) (int __epfd, struct epoll_event *__events, int __maxevents, int __timeout, const sigset_t *sigmask); + + int (*clone) (int (*__fn)(void *), void *__child_stack, int __flags, void *__arg); + pid_t (*fork) (void); + pid_t (*vfork) (void); + int (*daemon) (int __nochdir, int __noclose); + + int (*sigaction) (int signum, const struct sigaction *act, struct sigaction *oldact); + sighandler_t (*signal) (int signum, sighandler_t handler); +}; + +/** + *----------------------------------------------------------------------------- + * variables to hold the function-pointers to original functions + *----------------------------------------------------------------------------- + */ +extern os_api orig_os_api; + +extern void get_orig_funcs(); + +extern iomux_stats_t* g_p_select_stats; +extern iomux_stats_t* g_p_poll_stats; +extern iomux_stats_t* g_p_epoll_stats; + +int do_global_ctors(); +void reset_globals(); +void handle_close(int fd, bool cleanup = false, bool passthrough = false); + +// allow calling our socket(...) 
implementation safely from within libvma.so +// this is critical in case VMA was loaded using dlopen and not using LD_PRELOAD +// TODO: look for additional such functions/calls +int socket_internal(int __domain, int __type, int __protocol, bool check_offload = false); + +#endif //SOCK_REDIRECT_H + + diff --git a/src/vma/sock/socket_fd_api.cpp b/src/vma/sock/socket_fd_api.cpp new file mode 100644 index 0000000..03727f0 --- /dev/null +++ b/src/vma/sock/socket_fd_api.cpp @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include + +#include +#include +#include "utils/bullseye.h" +#include "sock-redirect.h" + +#include "socket_fd_api.h" + +#define MODULE_NAME "sapi" +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[fd=%d]:%d:%s() " +#undef __INFO__ +#define __INFO__ m_fd + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + +socket_fd_api::socket_fd_api(int fd) : m_epoll_event_flags(0), m_fd(fd), m_n_sysvar_select_poll_os_ratio(safe_mce_sys().select_poll_os_ratio), m_econtext(NULL) +{ +} + +socket_fd_api::~socket_fd_api() +{ +} + + +void socket_fd_api::destructor_helper() +{ +} + +int socket_fd_api::shutdown(int __how) +{ + __log_info_func(""); + int ret = orig_os_api.shutdown(m_fd, __how); + if (ret) { + __log_info_dbg("shutdown failed (ret=%d %m)", ret); + } + return ret; +} + +int socket_fd_api::bind(const sockaddr *__addr, socklen_t __addrlen) +{ + __log_info_func(""); + int ret = orig_os_api.bind(m_fd, __addr, __addrlen); + if (ret) { + __log_info_dbg("bind failed (ret=%d %m)", ret); + } + return ret; +} + +int socket_fd_api::connect(const sockaddr *__to, socklen_t __tolen) +{ + __log_info_func(""); + int ret = orig_os_api.connect(m_fd, __to, __tolen); + if (ret) { + __log_info_dbg("connect failed (ret=%d %m)", ret); + } + return ret; +} + +int socket_fd_api::accept(struct sockaddr *__addr, socklen_t *__addrlen) +{ + __log_info_func(""); + int ret = orig_os_api.accept(m_fd, __addr, __addrlen); + if (ret < 0) { + __log_info_dbg("accept failed (ret=%d %m)", ret); + } + return ret; +} + +int socket_fd_api::accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) +{ + __log_info_func(""); + int ret = orig_os_api.accept4(m_fd, __addr, __addrlen, __flags); + if (ret < 0) { + __log_info_dbg("accept4 failed (ret=%d %m)", ret); + } + return ret; +} + +int socket_fd_api::listen(int backlog) +{ + __log_info_func(""); + int ret = orig_os_api.listen(m_fd, backlog); + if (ret < 0) { + __log_info_dbg("listen failed (ret=%d %m)", ret); + } + 
return ret; +} + +int socket_fd_api::getsockname(sockaddr *__name, socklen_t *__namelen) +{ + __log_info_func(""); + int ret = orig_os_api.getsockname(m_fd, __name, __namelen); + if (ret) { + __log_info_dbg("getsockname failed (ret=%d %m)", ret); + } + return ret; +} + +int socket_fd_api::getpeername(sockaddr *__name, socklen_t *__namelen) +{ + __log_info_func(""); + int ret = orig_os_api.getpeername(m_fd, __name, __namelen); + if (ret) { + __log_info_dbg("getpeername failed (ret=%d %m)", ret); + } + return ret; +} + +int socket_fd_api::setsockopt(int __level, int __optname, + __const void *__optval, socklen_t __optlen) +{ + __log_info_func(""); + int ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + if (ret) { + __log_info_dbg("setsockopt failed (ret=%d %m)", ret); + } + return ret; +} + +int socket_fd_api::getsockopt(int __level, int __optname, void *__optval, + socklen_t *__optlen) +{ + __log_info_func(""); + int ret = orig_os_api.getsockopt(m_fd, __level, __optname, __optval, __optlen); + if (ret) { + __log_info_dbg("getsockopt failed (ret=%d %m)", ret); + } + return ret; +} + + +bool socket_fd_api::is_readable(uint64_t *p_poll_sn, fd_array_t* p_fd_array) +{ + NOT_IN_USE(p_poll_sn); + NOT_IN_USE(p_fd_array); + __log_info_funcall(""); + return false; +} + +void socket_fd_api::set_immediate_os_sample() +{ + __log_info_funcall(""); + return; +} + +void socket_fd_api::unset_immediate_os_sample() +{ + __log_info_funcall(""); + return; +} + +bool socket_fd_api::is_writeable() +{ + __log_info_funcall(""); + return true; +} + +bool socket_fd_api::is_errorable(int *errors) +{ + NOT_IN_USE(errors); + __log_info_funcall(""); + return false; +} + +void socket_fd_api::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) +{ + int epoll_fd = get_epoll_context_fd(); + + // Socket data + vlog_printf(log_level, "Fd number : %d\n", m_fd); + if (epoll_fd) { + vlog_printf(log_level, "Socket epoll Fd : %d\n", epoll_fd); + vlog_printf(log_level, 
"Socket epoll flags : 0x%x\n", m_fd_rec.events); + } + +} + +ssize_t socket_fd_api::rx_os(const rx_call_t call_type, iovec* p_iov, + ssize_t sz_iov, const int flags, sockaddr *__from, + socklen_t *__fromlen, struct msghdr *__msg) +{ + errno = 0; + switch (call_type) { + case RX_READ: + __log_info_func("calling os receive with orig read"); + return orig_os_api.read(m_fd, p_iov[0].iov_base, p_iov[0].iov_len); + + case RX_READV: + __log_info_func("calling os receive with orig readv"); + return orig_os_api.readv(m_fd, p_iov, sz_iov); + + case RX_RECV: + __log_info_func("calling os receive with orig recv"); + return orig_os_api.recv(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, + flags); + + case RX_RECVFROM: + __log_info_func("calling os receive with orig recvfrom"); + return orig_os_api.recvfrom(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, + flags, __from, __fromlen); + + case RX_RECVMSG: { + __log_info_func("calling os receive with orig recvmsg"); + return orig_os_api.recvmsg(m_fd, __msg, flags); + } + } + return (ssize_t) -1; +} + +ssize_t socket_fd_api::tx_os(const tx_call_t call_type, + const iovec* p_iov, const ssize_t sz_iov, + const int __flags, const sockaddr *__to, + const socklen_t __tolen) +{ + errno = 0; + + // Ignore dummy messages for OS + if (unlikely(IS_DUMMY_PACKET(__flags))) { + errno = EINVAL; + return -1; + } + + switch (call_type) { + case TX_WRITE: + __log_info_func("calling os transmit with orig write"); + return orig_os_api.write(m_fd, p_iov[0].iov_base, p_iov[0].iov_len); + + case TX_WRITEV: + __log_info_func("calling os transmit with orig writev"); + return orig_os_api.writev(m_fd, p_iov, sz_iov); + + case TX_SEND: + __log_info_func("calling os transmit with orig send"); + return orig_os_api.send(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, + __flags); + + case TX_SENDTO: + __log_info_func("calling os transmit with orig sendto"); + return orig_os_api.sendto(m_fd, p_iov[0].iov_base, p_iov[0].iov_len, + __flags, __to, __tolen); + + case TX_SENDMSG: { 
+ msghdr __message; + memset(&__message, 0, sizeof(__message)); + __message.msg_iov = (iovec*) p_iov; + __message.msg_iovlen = sz_iov; + __message.msg_name = (void*) __to; + __message.msg_namelen = __tolen; + + __log_info_func("calling os transmit with orig sendmsg"); + return orig_os_api.sendmsg(m_fd, &__message, __flags); + } + default: + __log_info_func("calling undefined os call type!"); + break; + } + return (ssize_t) -1; +} + +int socket_fd_api::register_callback(vma_recv_callback_t callback, void *context) +{ + NOT_IN_USE(callback); + NOT_IN_USE(context); + return -1; +} + +int socket_fd_api::free_packets(struct vma_packet_t *pkts, size_t count) +{ + NOT_IN_USE(pkts); + NOT_IN_USE(count); + return -1; +} + +int socket_fd_api::free_buffs(uint16_t len) +{ + NOT_IN_USE(len); + return -1; +} + +int socket_fd_api::add_epoll_context(epfd_info *epfd) +{ + if (!m_econtext) { + // This socket is not registered to any epfd + m_econtext = epfd; + return 0; + } else { + // Currently VMA does not support more then 1 epfd listed + errno = (m_econtext == epfd) ? 
EEXIST : ENOMEM; + return -1; + } +} + +void socket_fd_api::remove_epoll_context(epfd_info *epfd) +{ + if (m_econtext == epfd) + m_econtext = NULL; +} + +void socket_fd_api::notify_epoll_context(uint32_t events) +{ + if (m_econtext) { + m_econtext->insert_epoll_event_cb(this, events); + } +} + +void socket_fd_api::notify_epoll_context_add_ring(ring* ring) +{ + if (m_econtext) { + m_econtext->increase_ring_ref_count(ring); + } +} + +void socket_fd_api::notify_epoll_context_remove_ring(ring* ring) +{ + if (m_econtext){ + m_econtext->decrease_ring_ref_count(ring); + } +} + +bool socket_fd_api::notify_epoll_context_verify(epfd_info *epfd) +{ + return m_econtext == epfd; +} + +void socket_fd_api::notify_epoll_context_fd_is_offloaded() +{ + if (m_econtext) { + m_econtext->remove_fd_from_epoll_os(m_fd); + } +} + +int socket_fd_api::get_epoll_context_fd() { + if (m_econtext) { + return m_econtext->get_epoll_fd(); + } + return 0; +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif diff --git a/src/vma/sock/socket_fd_api.h b/src/vma/sock/socket_fd_api.h new file mode 100644 index 0000000..c522649 --- /dev/null +++ b/src/vma/sock/socket_fd_api.h @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef SOCKET_FD_API_H +#define SOCKET_FD_API_H + +#include "config.h" +#include +#include "vma/vma_extra.h" + +#include +#include +#include + +#ifndef SOCK_NONBLOCK +#define SOCK_NONBLOCK 04000 +#endif +#ifndef SOCK_CLOEXEC +#define SOCK_CLOEXEC 02000000 +#endif +#ifndef SO_MAX_PACING_RATE +#define SO_MAX_PACING_RATE 47 +#endif + +#define IS_DUMMY_PACKET(flags) (flags & VMA_SND_FLAGS_DUMMY) + +class cq_mgr; +class epfd_info; +class mem_buf_desc_t; + +struct epoll_fd_rec +{ + uint32_t events; + epoll_data epdata; + int offloaded_index; // offloaded fd index + 1 + + epoll_fd_rec() { + reset(); + } + + void reset() { + this->events = 0; + memset(&this->epdata, 0, sizeof(this->epdata)); + this->offloaded_index = 0; + } +}; + +typedef enum { + TX_WRITE = 13, TX_WRITEV, TX_SEND, TX_SENDTO, TX_SENDMSG, TX_FILE, TX_UNDEF +} tx_call_t; + +/* This structure describes an attributes of send operation + * Used attributes can depend on a type of operation + * attr.file - is used by TX_FILE + * attr.msg - is used by TX_WRITE, TX_WRITEV, TX_SEND, TX_SENDTO, TX_SENDMSG + */ +typedef struct vma_tx_call_attr { + tx_call_t opcode; + union { + struct { + struct iovec *iov; + ssize_t sz_iov; + int flags; + struct sockaddr * addr; + socklen_t len; + } msg; + } attr; + 
+ vma_tx_call_attr() { + opcode = TX_UNDEF; + memset(&attr, 0, sizeof(attr)); + } +} vma_tx_call_attr_t; + +typedef enum { + RX_READ = 23, RX_READV, RX_RECV, RX_RECVFROM, RX_RECVMSG +} rx_call_t; + +#define FD_ARRAY_MAX 24 +typedef struct { + // coverity[member_decl] + int fd_list[FD_ARRAY_MAX]; // Note: An FD might appear twice in the list, + // the user of this array will need to handle it correctly + int fd_max; + int fd_count; +} fd_array_t; + +enum fd_type_t{ + FD_TYPE_SOCKET = 0, + FD_TYPE_PIPE, +}; + +typedef vma_list_t vma_desc_list_t; + +/** + * + * class socket_fd_api + * + */ + +class socket_fd_api: public cleanable_obj +{ +public: + socket_fd_api(int fd); + virtual ~socket_fd_api(); + + virtual void setPassthrough() {} + virtual bool isPassthrough() {return false;} + + virtual int prepareListen() {return 0;} + + virtual void destructor_helper(); + + virtual int shutdown(int __how); + + virtual int listen(int backlog); + + virtual int accept(struct sockaddr *__addr, socklen_t *__addrlen); + + virtual int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags); + + virtual int bind(const sockaddr *__addr, socklen_t __addrlen); + + virtual int connect(const sockaddr *__to, socklen_t __tolen); + + virtual int getsockname(sockaddr *__name, socklen_t *__namelen); + virtual int getpeername(sockaddr *__name, socklen_t *__namelen); + + virtual int setsockopt(int __level, int __optname, + __const void *__optval, socklen_t __optlen); + + virtual int getsockopt(int __level, int __optname, void *__optval, + socklen_t *__optlen); + virtual int fcntl(int __cmd, unsigned long int __arg) = 0; + + virtual int ioctl(unsigned long int __request, unsigned long int __arg) = 0; + + virtual ssize_t rx(const rx_call_t call_type, iovec* iov, + const ssize_t iovlen, int* p_flags = 0, + sockaddr *__from = NULL, + socklen_t *__fromlen = NULL, + struct msghdr *__msg = NULL) = 0; + + virtual bool is_readable(uint64_t *p_poll_sn, + fd_array_t* p_fd_array = NULL); + + 
virtual bool is_writeable(); + + virtual bool is_errorable(int *errors); + + // Instructing the socket to immediately sample/un-sample the OS in receive flow + virtual void set_immediate_os_sample(); + virtual void unset_immediate_os_sample(); + + virtual bool is_closable(){ return true; } + + + //In some cases we need the socket can't be deleted immidiatly + //(for example STREAME sockets) + //This prepares the socket for termination and return true if the + //Return val: true is the socket is already closable and false otherwise + virtual bool prepare_to_close(bool process_shutdown = false) { NOT_IN_USE(process_shutdown); return is_closable(); } + + virtual ssize_t tx(vma_tx_call_attr_t &tx_arg) = 0; + + virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); + + virtual int register_callback(vma_recv_callback_t callback, void *context); + + virtual int free_packets(struct vma_packet_t *pkts, size_t count); + + /* This function is used for socketxtreme mode */ + virtual int free_buffs(uint16_t len); + + virtual int get_fd( ) const { return m_fd; }; + + // true if fd must be skipped from OS select() + // If m_n_sysvar_select_poll_os_ratio == 0, it means that user configured VMA not to poll os (i.e. TRUE...) 
+ virtual bool skip_os_select() { return (!m_n_sysvar_select_poll_os_ratio); }; + + virtual fd_type_t get_type() = 0; + + virtual void consider_rings_migration() {} + + virtual int add_epoll_context(epfd_info *epfd); + virtual void remove_epoll_context(epfd_info *epfd); + int get_epoll_context_fd(); + + // Calling OS transmit + ssize_t tx_os(const tx_call_t call_type, const iovec* p_iov, + const ssize_t sz_iov, const int __flags, + const sockaddr *__to, const socklen_t __tolen); + + static inline size_t pendig_to_remove_node_offset(void) {return NODE_OFFSET(socket_fd_api, pendig_to_remove_node);} + list_node pendig_to_remove_node; + + static inline size_t socket_fd_list_node_offset(void) {return NODE_OFFSET(socket_fd_api, socket_fd_list_node);} + list_node socket_fd_list_node; + + static inline size_t ep_ready_fd_node_offset(void) {return NODE_OFFSET(socket_fd_api, ep_ready_fd_node);} + list_node ep_ready_fd_node; + uint32_t m_epoll_event_flags; + + static inline size_t ep_info_fd_node_offset(void) {return NODE_OFFSET(socket_fd_api, ep_info_fd_node);} + list_node ep_info_fd_node; + epoll_fd_rec m_fd_rec; + + virtual int get_rings_num() {return 0;} + virtual bool check_rings() {return false;} + virtual int* get_rings_fds(int& res_length) { res_length=0; return NULL;} + virtual int get_socket_network_ptr(void *ptr, uint16_t &len) { NOT_IN_USE(ptr);NOT_IN_USE(len);errno=ENOSYS;return -1;}; + virtual int get_socket_tx_ring_fd(struct sockaddr *to, socklen_t tolen) { ;NOT_IN_USE(to);NOT_IN_USE(tolen);errno=ENOSYS; return -1; } +protected: + void notify_epoll_context(uint32_t events); + void notify_epoll_context_add_ring(ring* ring); + void notify_epoll_context_remove_ring(ring* ring); + bool notify_epoll_context_verify(epfd_info *epfd); + void notify_epoll_context_fd_is_offloaded(); + + // identification information + int m_fd; + const uint32_t m_n_sysvar_select_poll_os_ratio; + epfd_info *m_econtext; + + // Calling OS receive + ssize_t rx_os(const rx_call_t call_type, 
iovec* p_iov, ssize_t sz_iov, + const int flags, sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg); +}; +#endif diff --git a/src/vma/sock/sockinfo.cpp b/src/vma/sock/sockinfo.cpp new file mode 100644 index 0000000..6749dea --- /dev/null +++ b/src/vma/sock/sockinfo.cpp @@ -0,0 +1,1740 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "sockinfo.h" + +#include +#include +#include + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "vma/util/if.h" +#include "vma/proto/route_table_mgr.h" +#include "sock-redirect.h" +#include "fd_collection.h" +#include "vma/dev/ring_simple.h" + + +#define MODULE_NAME "si" +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[fd=%d]:%d:%s() " +#undef __INFO__ +#define __INFO__ m_fd + +#define si_logpanic __log_info_panic +#define si_logerr __log_info_err +#define si_logwarn __log_info_warn +#define si_loginfo __log_info_info +#define si_logdbg __log_info_dbg +#define si_logfunc __log_info_func +#define si_logfuncall __log_info_funcall + + +sockinfo::sockinfo(int fd): + socket_fd_api(fd), + m_b_blocking(true), + m_b_pktinfo(false), + m_b_rcvtstamp(false), + m_b_rcvtstampns(false), + m_n_tsing_flags(0), + m_protocol(PROTO_UNDEFINED), + m_lock_rcv(MODULE_NAME "::m_lock_rcv"), + m_lock_snd(MODULE_NAME "::m_lock_snd"), + m_state(SOCKINFO_OPENED), + m_p_connected_dst_entry(NULL), + m_so_bindtodevice_ip(INADDR_ANY), + m_p_rx_ring(0), + m_rx_reuse_buf_pending(false), + m_rx_reuse_buf_postponed(false), + m_rx_ring_map_lock(MODULE_NAME "::m_rx_ring_map_lock"), + m_n_rx_pkt_ready_list_count(0), m_rx_pkt_ready_offset(0), m_rx_ready_byte_count(0), + m_n_sysvar_rx_num_buffs_reuse(safe_mce_sys().rx_bufs_batch), + m_n_sysvar_rx_poll_num(safe_mce_sys().rx_poll_num), + m_ring_alloc_log_rx(safe_mce_sys().ring_allocation_logic_rx), + m_ring_alloc_log_tx(safe_mce_sys().ring_allocation_logic_tx), + m_pcp(0), + m_rx_callback(NULL), + m_rx_callback_context(NULL), + m_fd_context((void *)((uintptr_t)m_fd)), + m_flow_tag_id(0), + m_flow_tag_enabled(false), + m_n_uc_ttl(safe_mce_sys().sysctl_reader.get_net_ipv4_ttl()), + m_tcp_flow_is_5t(false), + m_p_rings_fds(NULL) + +{ + m_ring_alloc_logic = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this); + m_rx_epfd = orig_os_api.epoll_create(128); + if (unlikely(m_rx_epfd == -1)) { + 
throw_vma_exception("create internal epoll"); + } + wakeup_set_epoll_fd(m_rx_epfd); + + m_p_socket_stats = &m_socket_stats; // Save stats as local copy and allow state publisher to copy from this location + vma_stats_instance_create_socket_block(m_p_socket_stats); + m_p_socket_stats->reset(); + m_p_socket_stats->fd = m_fd; + m_p_socket_stats->inode = fd2inode(m_fd); + m_p_socket_stats->b_blocking = m_b_blocking; + m_p_socket_stats->ring_alloc_logic_rx = m_ring_alloc_log_rx.get_ring_alloc_logic(); + m_p_socket_stats->ring_alloc_logic_tx = m_ring_alloc_log_tx.get_ring_alloc_logic(); + m_p_socket_stats->ring_user_id_rx = m_ring_alloc_logic.calc_res_key_by_logic(); + m_p_socket_stats->ring_user_id_tx = + ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx, this).calc_res_key_by_logic(); + m_rx_reuse_buff.n_buff_num = 0; + memset(&m_so_ratelimit, 0, sizeof(vma_rate_limit_t)); + set_flow_tag(m_fd + 1); + + m_socketxtreme.ec.clear(); + m_socketxtreme.completion = NULL; + m_socketxtreme.last_buff_lst = NULL; +} + +sockinfo::~sockinfo() +{ + m_state = SOCKINFO_CLOSED; + + // Change to non-blocking socket so calling threads can exit + m_b_blocking = false; + orig_os_api.close(m_rx_epfd); // this will wake up any blocked thread in rx() call to orig_os_api.epoll_wait() + if (m_p_rings_fds) { + delete[] m_p_rings_fds; + m_p_rings_fds = NULL; + } + vma_stats_instance_remove_socket_block(m_p_socket_stats); +} + +void sockinfo::set_blocking(bool is_blocked) +{ + if (is_blocked) { + si_logdbg("set socket to blocked mode"); + m_b_blocking = true; + } + else { + si_logdbg("set socket to non-blocking mode"); + m_b_blocking = false; + } + + // Update statistics info + m_p_socket_stats->b_blocking = m_b_blocking; +} + +int sockinfo::fcntl(int __cmd, unsigned long int __arg) +{ + switch (__cmd) { + case F_SETFL: + { + si_logdbg("cmd=F_SETFL, arg=%#x", __arg); + if (__arg & O_NONBLOCK) + set_blocking(false); + else + set_blocking(true); + } + break; + case F_GETFL: /* Get file status 
flags. */ + si_logfunc("cmd=F_GETFL, arg=%#x", __arg); + break; + + case F_GETFD: /* Get file descriptor flags. */ + si_logfunc("cmd=F_GETFD, arg=%#x", __arg); + break; + + case F_SETFD: /* Set file descriptor flags. */ + si_logfunc("cmd=F_SETFD, arg=%#x", __arg); + break; + + default: + char buf[128]; + snprintf(buf, sizeof(buf), "unimplemented fcntl cmd=%#x, arg=%#x", (unsigned)__cmd, (unsigned)__arg); + buf[ sizeof(buf)-1 ] = '\0'; + + VLOG_PRINTF_INFO(safe_mce_sys().exception_handling.get_log_severity(), "%s", buf); + int rc = handle_exception_flow(); + switch (rc) { + case -1: + return rc; + case -2: + vma_throw_object_with_msg(vma_unsupported_api, buf); + } + break; + } + si_logdbg("going to OS for fcntl cmd=%d, arg=%#x", __cmd, __arg); + return orig_os_api.fcntl(m_fd, __cmd, __arg); +} + +int sockinfo::set_ring_attr(vma_ring_alloc_logic_attr *attr) +{ + if ((attr->comp_mask & VMA_RING_ALLOC_MASK_RING_ENGRESS) && attr->engress) { + if (set_ring_attr_helper(&m_ring_alloc_log_tx, attr)) { + return SOCKOPT_NO_VMA_SUPPORT; + } + ring_alloc_logic_updater du(get_fd(), m_lock_snd, m_ring_alloc_log_tx, m_p_socket_stats); + update_header_field(&du); + m_p_socket_stats->ring_alloc_logic_tx = m_ring_alloc_log_tx.get_ring_alloc_logic(); + m_p_socket_stats->ring_user_id_tx = + ring_allocation_logic_tx(get_fd(), m_ring_alloc_log_tx, this).calc_res_key_by_logic(); + } + if ((attr->comp_mask & VMA_RING_ALLOC_MASK_RING_INGRESS) && attr->ingress) { + ring_alloc_logic_attr old_key(*m_ring_alloc_logic.get_key()); + + if (set_ring_attr_helper(&m_ring_alloc_log_rx, attr)) { + return SOCKOPT_NO_VMA_SUPPORT; + } + m_ring_alloc_logic = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this); + + if (m_rx_nd_map.size()) { + auto_unlocker locker(m_rx_migration_lock); + do_rings_migration(old_key); + } + + m_p_socket_stats->ring_alloc_logic_rx = m_ring_alloc_log_rx.get_ring_alloc_logic(); + m_p_socket_stats->ring_user_id_rx = m_ring_alloc_logic.calc_res_key_by_logic(); + } + + 
return SOCKOPT_INTERNAL_VMA_SUPPORT; +} + +int sockinfo::set_ring_attr_helper(ring_alloc_logic_attr *sock_attr, + vma_ring_alloc_logic_attr *user_attr) +{ + if (user_attr->comp_mask & VMA_RING_ALLOC_MASK_RING_PROFILE_KEY) { + if (sock_attr->get_ring_profile_key()) { + si_logdbg("ring_profile_key is already set and " + "cannot be changed"); + return -1; + } + sock_attr->set_ring_profile_key(user_attr->ring_profile_key); + } + + sock_attr->set_ring_alloc_logic(user_attr->ring_alloc_logic); + + if (user_attr->comp_mask & VMA_RING_ALLOC_MASK_RING_USER_ID) + sock_attr->set_user_id_key(user_attr->user_id); + + return 0; +} + +int sockinfo::ioctl(unsigned long int __request, unsigned long int __arg) +{ + + int *p_arg = (int *)__arg; + + switch (__request) { + case FIONBIO: + { + si_logdbg("request=FIONBIO, arg=%d", *p_arg); + if (*p_arg) + set_blocking(false); + else + set_blocking(true); + } + break; + + case FIONREAD: + { + si_logfunc("request=FIONREAD, arg=%d", *p_arg); + int ret = rx_verify_available_data(); + if (ret >= 0) { + *p_arg = ret; + return 0; + } + return ret; + } + break; + case SIOCGIFVLAN: /* prevent error print */ + break; + default: + char buf[128]; + snprintf(buf, sizeof(buf), "unimplemented ioctl request=%#x, flags=%#x", (unsigned)__request, (unsigned)__arg); + buf[ sizeof(buf)-1 ] = '\0'; + + VLOG_PRINTF_INFO(safe_mce_sys().exception_handling.get_log_severity(), "%s", buf); + int rc = handle_exception_flow(); + switch (rc) { + case -1: + return rc; + case -2: + vma_throw_object_with_msg(vma_unsupported_api, buf); + } + break; + } + + si_logdbg("going to OS for ioctl request=%d, flags=%x", __request, __arg); + return orig_os_api.ioctl(m_fd, __request, __arg); +} + +int sockinfo::setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen) +{ + int ret = SOCKOPT_PASS_TO_OS; + + if (__level == SOL_SOCKET) { + switch(__optname) { + case SO_VMA_USER_DATA: + if (__optlen == sizeof(m_fd_context)) { + m_fd_context = *(void **)__optval; + 
ret = SOCKOPT_INTERNAL_VMA_SUPPORT;
+			} else {
+				ret = SOCKOPT_NO_VMA_SUPPORT;
+				errno = EINVAL;
+			}
+			break;
+		case SO_VMA_RING_USER_MEMORY:
+			if (__optval) {
+				if (__optlen == sizeof(iovec)) {
+					iovec *attr = (iovec *)__optval;
+					m_ring_alloc_log_rx.set_memory_descriptor(*attr);
+					m_ring_alloc_logic = ring_allocation_logic_rx(get_fd(), m_ring_alloc_log_rx, this);
+					if (m_p_rx_ring || m_rx_ring_map.size()) {
+						si_logwarn("user asked to assign memory for "
+							   "RX ring but ring already exists");
+					}
+					ret = SOCKOPT_INTERNAL_VMA_SUPPORT;
+				} else {
+					ret = SOCKOPT_NO_VMA_SUPPORT;
+					errno = EINVAL;
+					si_logdbg("SOL_SOCKET, SO_VMA_RING_USER_MEMORY - "
+						  "bad length expected %d got %d",
+						  sizeof(iovec), __optlen);
+				}
+			}
+			else {
+				ret = SOCKOPT_NO_VMA_SUPPORT;
+				errno = EINVAL;
+				si_logdbg("SOL_SOCKET, SO_VMA_RING_USER_MEMORY - NOT HANDLED, optval == NULL");
+			}
+			break;
+		case SO_VMA_FLOW_TAG:
+			if (__optval) {
+				if (__optlen == sizeof(uint32_t)) {
+					if (set_flow_tag(*(uint32_t*)__optval)) {
+						si_logdbg("SO_VMA_FLOW_TAG, set "
+							  "socket %d to flow id %d", /* BUGFIX(review): m_fd is an int; it was formatted with %s (undefined behavior / crash when debug logging is enabled) */
+							  m_fd, m_flow_tag_id);
+						// not supported in OS
+						ret = SOCKOPT_INTERNAL_VMA_SUPPORT;
+					} else {
+						ret = SOCKOPT_NO_VMA_SUPPORT;
+						errno = EINVAL;
+					}
+				} else {
+					ret = SOCKOPT_NO_VMA_SUPPORT;
+					errno = EINVAL;
+					si_logdbg("SO_VMA_FLOW_TAG, bad length "
+						  "expected %d got %d",
+						  sizeof(uint32_t), __optlen);
+					break;
+				}
+			} else {
+				ret = SOCKOPT_NO_VMA_SUPPORT;
+				errno = EINVAL;
+				si_logdbg("SO_VMA_FLOW_TAG - NOT HANDLED, "
+					  "optval == NULL");
+			}
+			break;
+		case SO_TIMESTAMP:
+		case SO_TIMESTAMPNS:
+			if (__optval) {
+				m_b_rcvtstamp = *(bool*)__optval;
+				if (__optname == SO_TIMESTAMPNS)
+					m_b_rcvtstampns = m_b_rcvtstamp;
+				si_logdbg("SOL_SOCKET, %s=%s", setsockopt_so_opt_to_str(__optname), (m_b_rcvtstamp ?
"true" : "false")); + } + else { + si_logdbg("SOL_SOCKET, %s=\"???\" - NOT HANDLED, optval == NULL", setsockopt_so_opt_to_str(__optname)); + } + break; + + case SO_TIMESTAMPING: + if (__optval) { + uint8_t val = *(uint8_t*)__optval; + + // SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_HARDWARE is NOT supported. + if (val & (SOF_TIMESTAMPING_TX_SOFTWARE | SOF_TIMESTAMPING_TX_HARDWARE)) { + ret = SOCKOPT_NO_VMA_SUPPORT; + errno = EOPNOTSUPP; + si_logdbg("SOL_SOCKET, SOF_TIMESTAMPING_TX_SOFTWARE and SOF_TIMESTAMPING_TX_HARDWARE is not supported, errno set to EOPNOTSUPP"); + } + + if (val & (SOF_TIMESTAMPING_RAW_HARDWARE | SOF_TIMESTAMPING_RX_HARDWARE)) { + if (g_p_net_device_table_mgr->get_ctx_time_conversion_mode() == TS_CONVERSION_MODE_DISABLE){ + if (safe_mce_sys().hw_ts_conversion_mode == TS_CONVERSION_MODE_DISABLE) { + ret = SOCKOPT_NO_VMA_SUPPORT; + errno = EPERM; + si_logdbg("SOL_SOCKET, SOF_TIMESTAMPING_RAW_HARDWARE and SOF_TIMESTAMPING_RX_HARDWARE socket options were disabled (VMA_HW_TS_CONVERSION = %d) , errno set to EPERM", TS_CONVERSION_MODE_DISABLE); + } else { + ret = SOCKOPT_NO_VMA_SUPPORT; + errno = ENODEV; + si_logdbg("SOL_SOCKET, SOF_TIMESTAMPING_RAW_HARDWARE and SOF_TIMESTAMPING_RX_HARDWARE is not supported by device(s), errno set to ENODEV"); + } + } + } + + m_n_tsing_flags = val; + si_logdbg("SOL_SOCKET, SO_TIMESTAMPING=%u", m_n_tsing_flags); + } + else { + si_logdbg("SOL_SOCKET, %s=\"???\" - NOT HANDLED, optval == NULL", setsockopt_so_opt_to_str(__optname)); + } + break; + case SO_VMA_RING_ALLOC_LOGIC: + if (__optval) { + uint32_t val = ((vma_ring_alloc_logic_attr*) __optval)->comp_mask; + + if (val & (VMA_RING_ALLOC_MASK_RING_PROFILE_KEY | VMA_RING_ALLOC_MASK_RING_USER_ID | + VMA_RING_ALLOC_MASK_RING_INGRESS | VMA_RING_ALLOC_MASK_RING_ENGRESS)) { + if (__optlen == sizeof(vma_ring_alloc_logic_attr)) { + vma_ring_alloc_logic_attr *attr = (vma_ring_alloc_logic_attr *)__optval; + return set_ring_attr(attr); + } + else { + ret = 
SOCKOPT_NO_VMA_SUPPORT; + errno = EINVAL; + si_logdbg("SOL_SOCKET, %s=\"???\" - bad length expected %d got %d", + setsockopt_so_opt_to_str(__optname), + sizeof(vma_ring_alloc_logic_attr), __optlen); + break; + } + } + else { + ret = SOCKOPT_NO_VMA_SUPPORT; + errno = EINVAL; + si_logdbg("SOL_SOCKET, %s=\"???\" - bad optval (%d)", setsockopt_so_opt_to_str(__optname), val); + } + } + else { + ret = SOCKOPT_NO_VMA_SUPPORT; + errno = EINVAL; + si_logdbg("SOL_SOCKET, %s=\"???\" - NOT HANDLED, optval == NULL", setsockopt_so_opt_to_str(__optname)); + } + break; + case SO_VMA_SHUTDOWN_RX: + shutdown_rx(); + ret = SOCKOPT_INTERNAL_VMA_SUPPORT; + break; + default: + break; + } + } else if (__level == IPPROTO_IP) { + switch(__optname) { + case IP_TTL: + if (__optlen < sizeof(m_n_uc_ttl)) { + ret = SOCKOPT_NO_VMA_SUPPORT; + errno = EINVAL; + } else { + int val = __optlen < sizeof(val) ? (uint8_t) *(uint8_t *)__optval : (int) *(int *)__optval; + if (val != -1 && (val < 1 || val > 255)) { + ret = SOCKOPT_NO_VMA_SUPPORT; + errno = EINVAL; + } else { + m_n_uc_ttl = (val == -1) ? 
safe_mce_sys().sysctl_reader.get_net_ipv4_ttl() : (uint8_t) val; + header_ttl_updater du(m_n_uc_ttl, false); + update_header_field(&du); + si_logdbg("IPPROTO_IP, optname=IP_TTL (%d)", m_n_uc_ttl); + } + } + break; + default: + break; + } + } + + si_logdbg("ret (%d)", ret); + return ret; +} + +int sockinfo::getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) +{ + int ret = -1; + + switch (__level) { + case SOL_SOCKET: + switch(__optname) { + case SO_VMA_USER_DATA: + if (*__optlen == sizeof(m_fd_context)) { + *(void **)__optval = m_fd_context; + ret = 0; + } else { + errno = EINVAL; + } + break; + case SO_VMA_FLOW_TAG: + if (*__optlen >= sizeof(uint32_t)) { + *(uint32_t*)__optval = m_flow_tag_id; + ret = 0; + } else { + errno = EINVAL; + } + break; + case SO_MAX_PACING_RATE: + if (*__optlen == sizeof(struct vma_rate_limit_t)) { + *(struct vma_rate_limit_t*)__optval = m_so_ratelimit; + *__optlen = sizeof(struct vma_rate_limit_t); + si_logdbg("(SO_MAX_PACING_RATE) value: %d, %d, %d", + (*(struct vma_rate_limit_t*)__optval).rate, + (*(struct vma_rate_limit_t*)__optval).max_burst_sz, + (*(struct vma_rate_limit_t*)__optval).typical_pkt_sz); + } else if (*__optlen == sizeof(uint32_t)) { + *(uint32_t*)__optval = KB_TO_BYTE(m_so_ratelimit.rate); + *__optlen = sizeof(uint32_t); + si_logdbg("(SO_MAX_PACING_RATE) value: %d", + *(int *)__optval); + ret = 0; + } else { + errno = EINVAL; + } + break; + } + } + + return ret; +} + +//////////////////////////////////////////////////////////////////////////////// +bool sockinfo::try_un_offloading() // un-offload the socket if possible +{ + if (!this->isPassthrough()) { + setPassthrough(); + } + + return true; +} + +//////////////////////////////////////////////////////////////////////////////// +int sockinfo::get_sock_by_L3_L4(in_protocol_t protocol, in_addr_t ip, in_port_t port) +{ + int map_size = g_p_fd_collection->get_fd_map_size(); + for (int i = 0; i < map_size; i++) { + socket_fd_api* p_sock_i = 
g_p_fd_collection->get_sockfd(i); + if (!p_sock_i || p_sock_i->get_type() != FD_TYPE_SOCKET) continue; + sockinfo* s = (sockinfo*)p_sock_i; + if (protocol == s->m_protocol && ip == s->m_bound.get_in_addr() && port == s->m_bound.get_in_port()) return i; + } + return -1; +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif +int sockinfo::rx_wait(int &poll_count, bool is_blocking) +{ + int ret_val = 0; + ret_val = rx_wait_helper(poll_count, is_blocking); + return ret_val; +} + +int sockinfo::rx_wait_helper(int &poll_count, bool is_blocking) +{ + int ret; + uint64_t poll_sn = 0; + epoll_event rx_epfd_events[SI_RX_EPFD_EVENT_MAX]; + rx_ring_map_t::iterator rx_ring_iter; + + // poll for completion + si_logfunc(""); + + poll_count++; + + for (rx_ring_iter = m_rx_ring_map.begin(); rx_ring_iter != m_rx_ring_map.end(); rx_ring_iter++) { + //BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely(rx_ring_iter->second->refcnt <= 0)) { + si_logerr("Attempted to poll illegal cq"); + continue; + } + //BULLSEYE_EXCLUDE_BLOCK_END + ret = rx_ring_iter->first->poll_and_process_element_rx(&poll_sn); + if (ret > 0) { + si_logfuncall("got %d elements sn=%llu", ret, (unsigned long long)poll_sn); + return ret; + } + } + + if (poll_count < m_n_sysvar_rx_poll_num || m_n_sysvar_rx_poll_num == -1) { + return 0; + } + + // if we polling too much - go to sleep + si_logfunc("too many polls without data blocking=%d", is_blocking); + if (g_b_exit) + return -1; + + if (!is_blocking) { + /* if we are in non blocking mode - return EAGAIN */ + errno = EAGAIN; + return -1; + } + + for (rx_ring_iter = m_rx_ring_map.begin(); rx_ring_iter != m_rx_ring_map.end(); rx_ring_iter++) { + if (rx_ring_iter->second->refcnt <= 0) { + continue; + } + // coverity[check_return] + rx_ring_iter->first->request_notification(CQT_RX, poll_sn); + } + + ret = orig_os_api.epoll_wait(m_rx_epfd, rx_epfd_events, SI_RX_EPFD_EVENT_MAX, -1); + + if (ret < 0) + return -1; + if (ret == 0) + return 0; + + for (int event_idx = 0; 
event_idx < ret; ++event_idx) { + int cq_channel_fd = rx_epfd_events[event_idx].data.fd; + cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(cq_channel_fd); + if (p_cq_ch_info) { + ring* p_ring = p_cq_ch_info->get_ring(); + if (p_ring) { + p_ring->wait_for_notification_and_process_element(cq_channel_fd, &poll_sn); + } + } + + // TODO: need to handle wakeup + } + return 0; +} + +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + +void sockinfo::save_stats_rx_offload(int nbytes) +{ + if (nbytes >= 0) { + m_p_socket_stats->counters.n_rx_bytes += nbytes; + m_p_socket_stats->counters.n_rx_packets++; + } + else if (errno == EAGAIN) { + m_p_socket_stats->counters.n_rx_eagain++; + } + else { + m_p_socket_stats->counters.n_rx_errors++; + } +} + +void sockinfo::save_stats_rx_os(int bytes) +{ + if (bytes >= 0) { + m_p_socket_stats->counters.n_rx_os_bytes += bytes; + m_p_socket_stats->counters.n_rx_os_packets++; + }else if ( errno == EAGAIN ){ + m_p_socket_stats->counters.n_rx_os_eagain++; + } + else { + m_p_socket_stats->counters.n_rx_os_errors++; + } +} + +void sockinfo::save_stats_tx_os(int bytes) +{ + if (bytes >= 0) { + m_p_socket_stats->counters.n_tx_os_bytes += bytes; + m_p_socket_stats->counters.n_tx_os_packets++; + }else if ( errno == EAGAIN ){ + m_p_socket_stats->counters.n_rx_os_eagain++; + } + else { + m_p_socket_stats->counters.n_tx_os_errors++; + } +} + +size_t sockinfo::handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, int* p_out_flags) +{ + NOT_IN_USE(payload_size); + NOT_IN_USE(in_flags); + *p_out_flags &= ~MSG_TRUNC; //don't handle msg_trunc + return total_rx; +} + +bool sockinfo::attach_receiver(flow_tuple_with_local_if &flow_key) +{ + // This function should be called from within mutex protected context of the sockinfo!!! 
+ + si_logdbg("Attaching to %s", flow_key.to_str()); + + // Protect against local loopback used as local_if & peer_ip + // rdma_cm will accept it but we don't want to offload it + if (flow_key.is_local_loopback()) { + si_logdbg("VMA does not offload local loopback IP address"); + return false; + } + + if (m_rx_flow_map.find(flow_key) != m_rx_flow_map.end()) { + si_logdbg("already attached %s", flow_key.to_str()); + return false; + } + + // Allocate resources on specific interface (create ring) + net_device_resources_t* p_nd_resources = create_nd_resources((const ip_address)flow_key.get_local_if()); + if (NULL == p_nd_resources) { + // any error which occurred inside create_nd_resources() was already printed. No need to reprint errors here + return false; + } + + // Map flow in local map + m_rx_flow_map[flow_key] = p_nd_resources->p_ring; + + // Attach tuple + BULLSEYE_EXCLUDE_BLOCK_START + unlock_rx_q(); + if (!p_nd_resources->p_ring->attach_flow(flow_key, this)) { + lock_rx_q(); + si_logdbg("Failed to attach %s to ring %p", flow_key.to_str(), p_nd_resources->p_ring); + return false; + } + set_rx_packet_processor(); + lock_rx_q(); + BULLSEYE_EXCLUDE_BLOCK_END + + // Registered as receiver successfully + si_logdbg("Attached %s to ring %p", flow_key.to_str(), p_nd_resources->p_ring); + + + // Verify 5 tuple over 3 tuple + if (flow_key.is_5_tuple()) + { + // Check and remove lesser 3 tuple + flow_tuple_with_local_if flow_key_3t(flow_key.get_dst_ip(), flow_key.get_dst_port(), INADDR_ANY, INPORT_ANY, flow_key.get_protocol(), flow_key.get_local_if()); + rx_flow_map_t::iterator rx_flow_iter = m_rx_flow_map.find(flow_key_3t); + if (rx_flow_iter != m_rx_flow_map.end()) { + si_logdbg("Removing (and detaching) 3 tuple now that we added a stronger 5 tuple"); + detach_receiver(flow_key_3t); + } + } + + return true; +} + +bool sockinfo::detach_receiver(flow_tuple_with_local_if &flow_key) +{ + si_logdbg("Unregistering receiver: %s", flow_key.to_str()); + + // TODO ALEXR: DO we 
need to return a 3 tuple instead of a 5 tuple being removed? + // if (peer_ip != INADDR_ANY && peer_port != INPORT_ANY); + + // Find ring associated with this tuple + rx_flow_map_t::iterator rx_flow_iter = m_rx_flow_map.find(flow_key); + BULLSEYE_EXCLUDE_BLOCK_START + if (rx_flow_iter == m_rx_flow_map.end()) { + si_logdbg("Failed to find ring associated with: %s", flow_key.to_str()); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + ring* p_ring = rx_flow_iter->second; + + si_logdbg("Detaching %s from ring %p", flow_key.to_str(), p_ring); + + // Detach tuple + unlock_rx_q(); + p_ring->detach_flow(flow_key, this); + lock_rx_q(); + + // Un-map flow from local map + m_rx_flow_map.erase(rx_flow_iter); + + return destroy_nd_resources((const ip_address)flow_key.get_local_if()); +} + +net_device_resources_t* sockinfo::create_nd_resources(const ip_address ip_local) +{ + net_device_resources_t* p_nd_resources = NULL; + + // Check if we are already registered to net_device with the local ip as observers + rx_net_device_map_t::iterator rx_nd_iter = m_rx_nd_map.find(ip_local.get_in_addr()); + if (rx_nd_iter == m_rx_nd_map.end()) { + + // Need to register as observer to net_device + net_device_resources_t nd_resources; + nd_resources.refcnt = 0; + nd_resources.p_nde = NULL; + nd_resources.p_ndv = NULL; + nd_resources.p_ring = NULL; + + BULLSEYE_EXCLUDE_BLOCK_START + cache_entry_subject* p_ces = NULL; + if (!g_p_net_device_table_mgr->register_observer(ip_local, &m_rx_nd_observer, &p_ces)) { + si_logdbg("Failed registering as observer for local ip %s", ip_local.to_str().c_str()); + goto err; + } + nd_resources.p_nde = (net_device_entry*)p_ces; + if (!nd_resources.p_nde) { + si_logerr("Got NULL net_devide_entry for local ip %s", ip_local.to_str().c_str()); + goto err; + } + if (!nd_resources.p_nde->get_val(nd_resources.p_ndv)) { + si_logerr("Got net_device_val=NULL (interface is not offloaded) for local ip %s", ip_local.to_str().c_str()); + goto err; + } + + unlock_rx_q(); + 
m_rx_ring_map_lock.lock(); + resource_allocation_key *key; + if (m_rx_ring_map.size() && m_ring_alloc_logic.is_logic_support_migration()) { + key = m_ring_alloc_logic.get_key(); + } else { + key = m_ring_alloc_logic.create_new_key(ip_local.get_in_addr()); + } + m_rx_ring_map_lock.unlock(); + nd_resources.p_ring = nd_resources.p_ndv->reserve_ring(key); + lock_rx_q(); + if (!nd_resources.p_ring) { + si_logdbg("Failed to reserve ring for allocation key %s on ip %s", + m_ring_alloc_logic.get_key()->to_str(), ip_local.to_str().c_str()); + goto err; + } + + // Add new net_device to rx_map + m_rx_nd_map[ip_local.get_in_addr()] = nd_resources; + + rx_nd_iter = m_rx_nd_map.find(ip_local.get_in_addr()); + if (rx_nd_iter == m_rx_nd_map.end()) { + si_logerr("Failed to find rx_nd_iter"); + goto err; + } + BULLSEYE_EXCLUDE_BLOCK_END + + } + + // Now we have the net_device object (created or found) + p_nd_resources = &rx_nd_iter->second; + + /* just increment reference counter on attach */ + p_nd_resources->refcnt++; + + // Save the new CQ from ring (dummy_flow_key is not used) + { + flow_tuple_with_local_if dummy_flow_key(m_bound, m_connected, m_protocol, ip_local.get_in_addr()); + rx_add_ring_cb(dummy_flow_key, p_nd_resources->p_ring); + } + + return p_nd_resources; +err: + return NULL; +} + +bool sockinfo::destroy_nd_resources(const ip_address ip_local) +{ + net_device_resources_t* p_nd_resources = NULL; + rx_net_device_map_t::iterator rx_nd_iter = m_rx_nd_map.find(ip_local.get_in_addr()); + BULLSEYE_EXCLUDE_BLOCK_START + if (rx_nd_iter == m_rx_nd_map.end()) { + si_logerr("Failed to net_device associated with: %s", ip_local.to_str().c_str()); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + p_nd_resources = &(rx_nd_iter->second); + + p_nd_resources->refcnt--; + + // Release the new CQ from ring (dummy_flow_key is not used) + { + flow_tuple_with_local_if dummy_flow_key(m_bound, m_connected, m_protocol, ip_local.get_in_addr()); + rx_del_ring_cb(dummy_flow_key, 
p_nd_resources->p_ring); + } + + if (p_nd_resources->refcnt == 0) { + + // Release ring reference + BULLSEYE_EXCLUDE_BLOCK_START + unlock_rx_q(); + resource_allocation_key *key; + if (m_ring_alloc_logic.is_logic_support_migration()) { + key = m_ring_alloc_logic.get_key(); + } else { + key = m_ring_alloc_logic.create_new_key(ip_local.get_in_addr()); + } + if (!p_nd_resources->p_ndv->release_ring(key)) { + lock_rx_q(); + si_logerr("Failed to release ring for allocation key %s on ip %s", + m_ring_alloc_logic.get_key()->to_str(), + ip_local.to_str().c_str()); + return false; + } + lock_rx_q(); + + // Release observer reference + if (!g_p_net_device_table_mgr->unregister_observer(ip_local, &m_rx_nd_observer)) { + si_logerr("Failed registering as observer for lip %s", ip_local.to_str().c_str()); + return false; + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_rx_nd_map.erase(rx_nd_iter); + } + + return true; +} + +void sockinfo::do_rings_migration(resource_allocation_key &old_key) +{ + lock_rx_q(); + + uint64_t new_calc_id = m_ring_alloc_logic.calc_res_key_by_logic(); + uint64_t old_calc_id = old_key.get_user_id_key(); + resource_allocation_key *new_key = m_ring_alloc_logic.get_key(); + // Check again if migration is needed before migration + if (old_key.get_user_id_key() == new_calc_id && + old_key.get_ring_alloc_logic() == new_key->get_ring_alloc_logic()) { + unlock_rx_q(); + return; + } + + // Update key to new ID + new_key->set_user_id_key(new_calc_id); + rx_net_device_map_t::iterator rx_nd_iter = m_rx_nd_map.begin(); + while (rx_nd_iter != m_rx_nd_map.end()) { + net_device_resources_t* p_nd_resources = &(rx_nd_iter->second); + ring* p_old_ring = p_nd_resources->p_ring; + unlock_rx_q(); + ring* new_ring = p_nd_resources->p_ndv->reserve_ring(new_key); + if (new_ring == p_old_ring) { + if (!p_nd_resources->p_ndv->release_ring(&old_key)) { + si_logerr("Failed to release ring for allocation key %s", + old_key.to_str()); + new_key->set_user_id_key(old_calc_id); + 
m_ring_alloc_logic.enable_migration(false); + si_logwarn("Migration is disabled due to failure"); + } + lock_rx_q(); + rx_nd_iter++; + continue; + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!new_ring) { + ip_address ip_local(rx_nd_iter->first); + si_logerr("Failed to reserve ring for allocation key %s on lip %s", + new_key->to_str(), ip_local.to_str().c_str()); + new_key->set_user_id_key(old_calc_id); + m_ring_alloc_logic.enable_migration(false); + si_logwarn("Migration is disabled due to failure"); + lock_rx_q(); + rx_nd_iter++; + continue; + } + BULLSEYE_EXCLUDE_BLOCK_END + lock_rx_q(); + rx_flow_map_t::iterator rx_flow_iter = m_rx_flow_map.begin(); + while (rx_flow_iter != m_rx_flow_map.end()) { + + ring* p_ring = rx_flow_iter->second; + if (p_ring != p_old_ring) { + rx_flow_iter++; // Pop next flow rule + continue; + } + + flow_tuple_with_local_if flow_key = rx_flow_iter->first; + // Save the new CQ from ring + rx_add_ring_cb(flow_key, new_ring, true); + + // Attach tuple + BULLSEYE_EXCLUDE_BLOCK_START + unlock_rx_q(); + if (!new_ring->attach_flow(flow_key, this)) { + si_logerr("Failed to attach %s to ring %p", flow_key.to_str(), new_ring); + rx_del_ring_cb(flow_key, new_ring, true); + if (!p_nd_resources->p_ndv->release_ring(new_key)) { + si_logerr("Failed to release ring for allocation key %s", + new_key->to_str()); + } + new_ring = NULL; + break; + } + lock_rx_q(); + BULLSEYE_EXCLUDE_BLOCK_END + + rx_flow_iter->second = new_ring; + + // Registered as receiver successfully + si_logdbg("Attached %s to ring %p", flow_key.to_str(), new_ring); + + si_logdbg("Detaching %s from ring %p", flow_key.to_str(), p_old_ring); + // Detach tuple + unlock_rx_q(); + p_old_ring->detach_flow(flow_key, this); + lock_rx_q(); + rx_del_ring_cb(flow_key, p_old_ring, true); + + rx_flow_iter++; // Pop next flow rule; + } + + if (!new_ring) { + ip_address ip_local(rx_nd_iter->first); + si_logerr("Failed to reserve ring for allocation key %s on lip %s", + new_key->to_str(), 
ip_local.to_str().c_str()); + new_key->set_user_id_key(old_calc_id); + m_ring_alloc_logic.enable_migration(false); + si_logwarn("Migration is disabled due to failure"); + lock_rx_q(); + rx_nd_iter++; + continue; + } + + unlock_rx_q(); + m_rx_ring_map_lock.lock(); + lock_rx_q(); + if (!m_p_rx_ring && m_rx_ring_map.size() == 1) { + m_p_rx_ring = m_rx_ring_map.begin()->first; + } + unlock_rx_q(); + m_rx_ring_map_lock.unlock(); + + // Release ring reference + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_nd_resources->p_ndv->release_ring(&old_key)) { + ip_address ip_local(rx_nd_iter->first); + si_logerr("Failed to release ring for allocation key %s on lip %s", + old_key.to_str(), ip_local.to_str().c_str()); + } + lock_rx_q(); + BULLSEYE_EXCLUDE_BLOCK_END + p_nd_resources->p_ring = new_ring; + rx_nd_iter++; + } + + unlock_rx_q(); + m_p_socket_stats->counters.n_rx_migrations++; +} + +void sockinfo::consider_rings_migration() +{ + if (m_ring_alloc_logic.is_logic_support_migration()) { + if(!m_rx_migration_lock.trylock()) { + if (m_ring_alloc_logic.should_migrate_ring()) { + ring_alloc_logic_attr old_key(*m_ring_alloc_logic.get_key()); + do_rings_migration(old_key); + } + m_rx_migration_lock.unlock(); + } + } +} + +int sockinfo::add_epoll_context(epfd_info *epfd) +{ + int ret = 0; + rx_ring_map_t::const_iterator sock_ring_map_iter; + + m_rx_ring_map_lock.lock(); + lock_rx_q(); + + ret = socket_fd_api::add_epoll_context(epfd); + if (ret < 0) { + goto unlock_locks; + } + + sock_ring_map_iter = m_rx_ring_map.begin(); + while (sock_ring_map_iter != m_rx_ring_map.end()) { + notify_epoll_context_add_ring(sock_ring_map_iter->first); + sock_ring_map_iter++; + } + +unlock_locks: + + unlock_rx_q(); + m_rx_ring_map_lock.unlock(); + + return ret; +} + +void sockinfo::remove_epoll_context(epfd_info *epfd) +{ + m_rx_ring_map_lock.lock(); + lock_rx_q(); + + if (!notify_epoll_context_verify(epfd)) { + unlock_rx_q(); + m_rx_ring_map_lock.unlock(); + return; + } + + rx_ring_map_t::const_iterator 
sock_ring_map_iter = m_rx_ring_map.begin(); + while (sock_ring_map_iter != m_rx_ring_map.end()) { + notify_epoll_context_remove_ring(sock_ring_map_iter->first); + sock_ring_map_iter++; + } + + socket_fd_api::remove_epoll_context(epfd); + + unlock_rx_q(); + m_rx_ring_map_lock.unlock(); +} + +void sockinfo::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) +{ + const char * const in_protocol_str[] = { + "PROTO_UNDEFINED", + "PROTO_UDP", + "PROTO_TCP", + "PROTO_ALL", + }; + + const char * const m_state_str[] = { + "SOCKINFO_OPENED", + "SOCKINFO_CLOSING", + "SOCKINFO_CLOSED", + }; + + bool b_any_activity = false; + + socket_fd_api::statistics_print(log_level); + + vlog_printf(log_level, "Bind info : %s\n", m_bound.to_str()); + vlog_printf(log_level, "Connection info : %s\n", m_connected.to_str()); + vlog_printf(log_level, "Protocol : %s\n", in_protocol_str[m_protocol]); + vlog_printf(log_level, "Is closed : %s\n", m_state_str[m_state]); + vlog_printf(log_level, "Is blocking : %s\n", m_b_blocking ? "true" : "false"); + vlog_printf(log_level, "Rx reuse buffer pending : %s\n", m_rx_reuse_buf_pending ? "true" : "false"); + vlog_printf(log_level, "Rx reuse buffer postponed : %s\n", m_rx_reuse_buf_postponed ? "true" : "false"); + + if (m_p_connected_dst_entry) { + vlog_printf(log_level, "Is offloaded : %s\n", m_p_connected_dst_entry->is_offloaded() ? 
"true" : "false"); + } + + if (m_p_socket_stats->ring_alloc_logic_rx == RING_LOGIC_PER_USER_ID) + vlog_printf(log_level, "RX Ring User ID : %lu\n", m_p_socket_stats->ring_user_id_rx); + if (m_p_socket_stats->ring_alloc_logic_tx == RING_LOGIC_PER_USER_ID) + vlog_printf(log_level, "TX Ring User ID : %lu\n", m_p_socket_stats->ring_user_id_tx); + + if (m_p_socket_stats->counters.n_tx_sent_byte_count || m_p_socket_stats->counters.n_tx_sent_pkt_count || m_p_socket_stats->counters.n_tx_errors || m_p_socket_stats->counters.n_tx_drops ) { + vlog_printf(log_level, "Tx Offload : %d KB / %d / %d / %d [bytes/packets/drops/errors]\n", m_p_socket_stats->counters.n_tx_sent_byte_count/1024, m_p_socket_stats->counters.n_tx_sent_pkt_count, m_p_socket_stats->counters.n_tx_drops, m_p_socket_stats->counters.n_tx_errors); + b_any_activity = true; + } + if (m_p_socket_stats->counters.n_tx_os_bytes || m_p_socket_stats->counters.n_tx_os_packets || m_p_socket_stats->counters.n_tx_os_errors) { + vlog_printf(log_level, "Tx OS info : %d KB / %d / %d [bytes/packets/errors]\n", m_p_socket_stats->counters.n_tx_os_bytes/1024, m_p_socket_stats->counters.n_tx_os_packets, m_p_socket_stats->counters.n_tx_os_errors); + b_any_activity = true; + } + if (m_p_socket_stats->counters.n_tx_dummy) { + vlog_printf(log_level, "Tx Dummy messages : %d\n", m_p_socket_stats->counters.n_tx_dummy); + b_any_activity = true; + } + if (m_p_socket_stats->counters.n_rx_bytes || m_p_socket_stats->counters.n_rx_packets || m_p_socket_stats->counters.n_rx_errors || m_p_socket_stats->counters.n_rx_eagain || m_p_socket_stats->n_rx_ready_pkt_count) { + vlog_printf(log_level, "Rx Offload : %d KB / %d / %d / %d [bytes/packets/eagains/errors]\n", m_p_socket_stats->counters.n_rx_bytes/1024, m_p_socket_stats->counters.n_rx_packets, m_p_socket_stats->counters.n_rx_eagain, m_p_socket_stats->counters.n_rx_errors); + + if (m_p_socket_stats->counters.n_rx_packets) { + float rx_drop_percentage = 0; + if 
(m_p_socket_stats->n_rx_ready_pkt_count) + rx_drop_percentage = (float)(m_p_socket_stats->counters.n_rx_ready_byte_drop * 100) / (float)m_p_socket_stats->counters.n_rx_packets; + vlog_printf(log_level, "Rx byte : max %d / dropped %d (%2.2f%%) / limit %d\n", m_p_socket_stats->counters.n_rx_ready_byte_max, m_p_socket_stats->counters.n_rx_ready_byte_drop, rx_drop_percentage, m_p_socket_stats->n_rx_ready_byte_limit); + + if (m_p_socket_stats->n_rx_ready_pkt_count) + rx_drop_percentage = (float)(m_p_socket_stats->counters.n_rx_ready_pkt_drop * 100) / (float)m_p_socket_stats->counters.n_rx_packets; + vlog_printf(log_level, "Rx pkt : max %d / dropped %d (%2.2f%%)\n", m_p_socket_stats->counters.n_rx_ready_pkt_max, m_p_socket_stats->counters.n_rx_ready_pkt_drop, rx_drop_percentage); + } + + b_any_activity = true; + } + if (m_p_socket_stats->counters.n_rx_os_bytes || m_p_socket_stats->counters.n_rx_os_packets || m_p_socket_stats->counters.n_rx_os_errors || m_p_socket_stats->counters.n_rx_os_eagain) { + vlog_printf(log_level, "Rx OS info : %d KB / %d / %d / %d [bytes/packets/eagains/errors]\n", m_p_socket_stats->counters.n_rx_os_bytes/1024, m_p_socket_stats->counters.n_rx_os_packets, m_p_socket_stats->counters.n_rx_os_eagain, m_p_socket_stats->counters.n_rx_os_errors); + b_any_activity = true; + } + if (m_p_socket_stats->counters.n_rx_poll_miss || m_p_socket_stats->counters.n_rx_poll_hit) { + float rx_poll_hit_percentage = (float)(m_p_socket_stats->counters.n_rx_poll_hit * 100) / (float)(m_p_socket_stats->counters.n_rx_poll_miss + m_p_socket_stats->counters.n_rx_poll_hit); + vlog_printf(log_level, "Rx poll : %d / %d (%2.2f%%) [miss/hit]\n", m_p_socket_stats->counters.n_rx_poll_miss, m_p_socket_stats->counters.n_rx_poll_hit, rx_poll_hit_percentage); + b_any_activity = true; + } + if (b_any_activity == false) { + vlog_printf(log_level, "Socket activity : Rx and Tx where not active\n"); + } +} + +void sockinfo::rx_add_ring_cb(flow_tuple_with_local_if &flow_key, ring* p_ring, 
bool is_migration /*= false*/) +{ + si_logdbg(""); + NOT_IN_USE(flow_key); + NOT_IN_USE(is_migration); + + bool notify_epoll = false; + + // Add the rx ring to our rx ring map + unlock_rx_q(); + m_rx_ring_map_lock.lock(); + lock_rx_q(); + rx_ring_map_t::iterator rx_ring_iter = m_rx_ring_map.find(p_ring->get_parent()); + if (rx_ring_iter == m_rx_ring_map.end()) { + // First map of this cq mgr + ring_info_t* p_ring_info = new ring_info_t(); + m_rx_ring_map[p_ring] = p_ring_info; + p_ring_info->refcnt = 1; + p_ring_info->rx_reuse_info.n_buff_num = 0; + + /* m_p_rx_ring is updated in following functions: + * - rx_add_ring_cb() + * - rx_del_ring_cb() + * - do_rings_migration() + */ + if (m_rx_ring_map.size() == 1) { + m_p_rx_ring = m_rx_ring_map.begin()->first; + } + + notify_epoll = true; + + // Add this new CQ channel fd to the rx epfd handle (no need to wake up any sleeping thread about this new fd) + epoll_event ev = {0, {0}}; + ev.events = EPOLLIN; + int num_ring_rx_fds = p_ring->get_num_resources(); + int *ring_rx_fds_array = p_ring->get_rx_channel_fds(); + + for (int i = 0; i < num_ring_rx_fds; i++) { + int cq_ch_fd = ring_rx_fds_array[i]; + + ev.data.fd = cq_ch_fd; + + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely( orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_ADD, cq_ch_fd, &ev))) { + si_logerr("failed to add cq channel fd to internal epfd errno=%d (%m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + do_wakeup(); // A ready wce can be pending due to the drain logic (cq channel will not wake up by itself) + } else { + // Increase ref count on cq_mgr object + rx_ring_iter->second->refcnt++; + } + + unlock_rx_q(); + m_rx_ring_map_lock.unlock(); + + if (notify_epoll) { + // todo m_econtext is not protected by socket lock because epfd->m_ring_map_lock should be first in order. + // possible race between removal of fd from epoll (epoll_ctl del, or epoll close) and here. + // need to add a third-side lock (fd_collection?) to sync between epoll and socket. 
+ notify_epoll_context_add_ring(p_ring); + } + + lock_rx_q(); +} + +void sockinfo::rx_del_ring_cb(flow_tuple_with_local_if &flow_key, ring* p_ring, bool is_migration /* = false */) +{ + si_logdbg(""); + NOT_IN_USE(flow_key); + + bool notify_epoll = false; + + // Remove the rx cq_mgr from our rx cq map + unlock_rx_q(); + m_rx_ring_map_lock.lock(); + lock_rx_q(); + + descq_t temp_rx_reuse; + temp_rx_reuse.set_id("sockinfo (%p), fd = %d : rx_del_ring_cb temp_rx_reuse", this, m_fd); + descq_t temp_rx_reuse_global; + temp_rx_reuse_global.set_id("sockinfo (%p), fd = %d : rx_del_ring_cb temp_rx_reuse_global", this, m_fd); + + ring* base_ring = p_ring->get_parent(); + rx_ring_map_t::iterator rx_ring_iter = m_rx_ring_map.find(base_ring); + BULLSEYE_EXCLUDE_BLOCK_START + if (rx_ring_iter != m_rx_ring_map.end()) { + BULLSEYE_EXCLUDE_BLOCK_END + ring_info_t* p_ring_info = rx_ring_iter->second; + // Decrease ref count on cq_mgr object + p_ring_info->refcnt--; + + // Is this the last reference to this cq_mgr? 
+ if (p_ring_info->refcnt == 0) { + + // Get rid of all rx ready buffers from this cq_mgr owner + if (!is_migration) move_owned_rx_ready_descs(base_ring, &temp_rx_reuse); + + // Move all cq_mgr->rx_reuse buffers to temp reuse queue related to p_rx_cq_mgr + move_owned_descs(base_ring, &temp_rx_reuse, &p_ring_info->rx_reuse_info.rx_reuse); + move_not_owned_descs(base_ring, &temp_rx_reuse_global, &p_ring_info->rx_reuse_info.rx_reuse); + if (p_ring_info->rx_reuse_info.rx_reuse.size()) { + si_logerr("possible buffer leak, p_ring_info->rx_reuse_buff still contain %d buffers.", p_ring_info->rx_reuse_info.rx_reuse.size()); + } + + int num_ring_rx_fds = base_ring->get_num_resources(); + int *ring_rx_fds_array = base_ring->get_rx_channel_fds(); + + for (int i = 0; i < num_ring_rx_fds; i++) { + int cq_ch_fd = ring_rx_fds_array[i]; + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely( orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_DEL, cq_ch_fd, NULL))) { + si_logerr("failed to delete cq channel fd from internal epfd (errno=%d %m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + notify_epoll = true; + + m_rx_ring_map.erase(base_ring); + delete p_ring_info; + + if (m_p_rx_ring == base_ring) { + /* Remove event from rx ring if it is active + * or just reinitialize + * ring should not have events related closed socket + * in wait list + */ + m_p_rx_ring->del_ec(&m_socketxtreme.ec); + if (m_rx_ring_map.size() == 1) { + m_p_rx_ring = m_rx_ring_map.begin()->first; + } else { + m_p_rx_ring = NULL; + } + + move_owned_descs(base_ring, &temp_rx_reuse, &m_rx_reuse_buff.rx_reuse); + move_not_owned_descs(base_ring, &temp_rx_reuse_global, &m_rx_reuse_buff.rx_reuse); + + m_rx_reuse_buff.n_buff_num = m_rx_reuse_buff.rx_reuse.size(); + } + } + } + else { + si_logerr("oops, ring not found in map, so we can't remove it ???"); + } + unlock_rx_q(); + m_rx_ring_map_lock.unlock(); + + if (notify_epoll) { + // todo m_econtext is not protected by socket lock because epfd->m_ring_map_lock should be first in 
order. + // possible race between removal of fd from epoll (epoll_ctl del, or epoll close) and here. + // need to add a third-side lock (fd_collection?) to sync between epoll and socket. + notify_epoll_context_remove_ring(base_ring); + } + + if (temp_rx_reuse.size() > 0) { // no need for m_lock_rcv since temp_rx_reuse is on the stack + // Get rid of all rx reuse buffers from temp reuse queue + // Without m_lock_rcv.lock()!!! + unsigned int counter = 1<<20; + while (temp_rx_reuse.size() > 0 && counter--) { + if (base_ring->reclaim_recv_buffers(&temp_rx_reuse)) + break; + sched_yield(); + } + if (temp_rx_reuse.size() > 0) //Awareness: we do this without buffer_poll lock after all other tries failed + g_buffer_pool_rx->put_buffers_after_deref_thread_safe(&temp_rx_reuse); + } + + if (temp_rx_reuse_global.size() > 0) { + g_buffer_pool_rx->put_buffers_after_deref_thread_safe(&temp_rx_reuse_global); + } + + lock_rx_q(); +} + +// Move all owner's rx ready packets to 'toq' +void sockinfo::move_owned_rx_ready_descs(ring* p_ring, descq_t *toq) +{ + // Assume locked by owner!!! 
+ + mem_buf_desc_t *temp; + const size_t size = get_size_m_rx_pkt_ready_list(); + for (size_t i = 0 ; i < size; i++) { + temp = get_front_m_rx_pkt_ready_list(); + pop_front_m_rx_pkt_ready_list(); + if (!p_ring->is_member(temp->p_desc_owner)) { + push_back_m_rx_pkt_ready_list(temp); + continue; + } + m_n_rx_pkt_ready_list_count--; + m_p_socket_stats->n_rx_ready_pkt_count--; + + m_rx_ready_byte_count -= temp->rx.sz_payload; + m_p_socket_stats->n_rx_ready_byte_count -= temp->rx.sz_payload; + toq->push_back(temp); + } +} + +bool sockinfo::attach_as_uc_receiver(role_t role, bool skip_rules /* = false */) +{ + sock_addr addr(m_bound.get_p_sa()); + in_addr_t local_if; + bool ret = true; + + /* m_so_bindtodevice_ip has high priority */ + if (m_so_bindtodevice_ip != INADDR_ANY) { + local_if = m_so_bindtodevice_ip; + addr.set_in_addr(local_if); // we should pass correct ip-address information in case SO_BINDTODEVICE is used + si_logdbg("Attaching using bind to device rule"); + } + else { + local_if = m_bound.get_in_addr(); + si_logdbg("Attaching using bind to ip rule"); + } + + if (local_if != INADDR_ANY) { + si_logdbg("Attached to specific local if: (%d.%d.%d.%d) addr: %s", NIPQUAD(local_if), addr.to_str()); + + transport_t target_family = TRANS_VMA; + if (!skip_rules) target_family = find_target_family(role, addr.get_p_sa()); + if (target_family == TRANS_VMA) { + flow_tuple_with_local_if flow_key(addr, m_connected, m_protocol, local_if); + ret = ret && attach_receiver(flow_key); + } + } + else { + si_logdbg("Attaching to all offload if addr: %s", addr.to_str()); + + local_ip_list_t::iterator lip_iter; + local_ip_list_t lip_offloaded_list = g_p_net_device_table_mgr->get_ip_list(); + for (lip_iter = lip_offloaded_list.begin(); ret && lip_offloaded_list.end() != lip_iter; lip_iter++) + { + ip_data_t ip = *lip_iter; + local_if = ip.local_addr; + addr.set_in_addr(local_if); + transport_t target_family = TRANS_VMA; + if (!skip_rules) target_family = find_target_family(role, 
addr.get_p_sa()); + if (target_family == TRANS_VMA) { + flow_tuple_with_local_if flow_key(addr, m_connected, m_protocol, local_if); + ret = ret && attach_receiver(flow_key); + } + } + } + + return ret; +} + +transport_t sockinfo::find_target_family(role_t role, struct sockaddr* sock_addr_first, struct sockaddr* sock_addr_second /* = NULL */) +{ + transport_t target_family = TRANS_DEFAULT; + switch (role) { + case ROLE_TCP_SERVER: + target_family = __vma_match_tcp_server(TRANS_VMA, safe_mce_sys().app_id, sock_addr_first, sizeof(struct sockaddr)); + break; + case ROLE_TCP_CLIENT: + target_family = __vma_match_tcp_client(TRANS_VMA, safe_mce_sys().app_id, sock_addr_first, sizeof(struct sockaddr), sock_addr_second, sizeof(struct sockaddr)); + break; + case ROLE_UDP_RECEIVER: + target_family = __vma_match_udp_receiver(TRANS_VMA, safe_mce_sys().app_id, sock_addr_first, sizeof(struct sockaddr)); + break; + case ROLE_UDP_SENDER: + target_family = __vma_match_udp_sender(TRANS_VMA, safe_mce_sys().app_id, sock_addr_first, sizeof(struct sockaddr)); + break; + case ROLE_UDP_CONNECT: + target_family = __vma_match_udp_connect(TRANS_VMA, safe_mce_sys().app_id, sock_addr_first, sizeof(struct sockaddr), sock_addr_second, sizeof(struct sockaddr)); + break; + BULLSEYE_EXCLUDE_BLOCK_START + default: + break; + BULLSEYE_EXCLUDE_BLOCK_END + } + return target_family; +} + +void sockinfo::shutdown_rx() +{ + // Unregister this receiver from all ring's in our list + rx_flow_map_t::iterator rx_flow_iter = m_rx_flow_map.begin(); + while (rx_flow_iter != m_rx_flow_map.end()) { + flow_tuple_with_local_if detach_key = rx_flow_iter->first; + detach_receiver(detach_key); + rx_flow_iter = m_rx_flow_map.begin(); // Pop next flow rule + } + + /* Destroy resources in case they are allocated using SO_BINDTODEVICE call */ + if (m_rx_nd_map.size()) { + destroy_nd_resources(m_so_bindtodevice_ip); + } + si_logdbg("shutdown RX"); +} + +void sockinfo::destructor_helper() +{ + shutdown_rx(); + // Delete all 
dst_entry in our list + if (m_p_connected_dst_entry) { + delete m_p_connected_dst_entry; + } + m_p_connected_dst_entry = NULL; +} + + +int sockinfo::register_callback(vma_recv_callback_t callback, void *context) +{ + m_rx_callback = callback; + m_rx_callback_context = context; + return 0; +} + +int sockinfo::modify_ratelimit(dst_entry* p_dst_entry, struct vma_rate_limit_t &rate_limit) +{ + if (m_ring_alloc_log_tx.get_ring_alloc_logic() == RING_LOGIC_PER_SOCKET || + m_ring_alloc_log_tx.get_ring_alloc_logic() == RING_LOGIC_PER_USER_ID) { + + if (p_dst_entry) { + int ret = p_dst_entry->modify_ratelimit(rate_limit); + + if (!ret) + m_so_ratelimit = rate_limit; + // value is in bytes (per second). we need to convert it to kilo-bits (per second) + return ret; + } else { + m_so_ratelimit = rate_limit; + } + return 0; + } + si_logwarn("VMA is not configured with TX ring allocation logic per " + "socket or user-id."); + return -1; +} + +int sockinfo::get_rings_num() +{ + int count = 0; + + if (is_socketxtreme()) { + /* socketXtreme mode support just single ring */ + return 1; + } + rx_ring_map_t::iterator it = m_rx_ring_map.begin(); + for (; it != m_rx_ring_map.end(); ++it) { + count += it->first->get_num_resources(); + } + return count; +} + +int* sockinfo::get_rings_fds(int &res_length) +{ + res_length = 0; + int index = 0; + + if (is_socketxtreme()) { + /* socketXtreme mode support just single ring */ + res_length = 1; + return m_p_rx_ring->get_rx_channel_fds(); + } + + if (m_p_rings_fds) { + return m_p_rings_fds; + } + res_length = get_rings_num(); + m_p_rings_fds = new int[res_length]; + + rx_ring_map_t::iterator it = m_rx_ring_map.begin(); + for (; it != m_rx_ring_map.end(); ++it) { + int *p_n_rx_channel_fds = it->first->get_rx_channel_fds(); + for (int j = 0; j < it->first->get_num_resources(); ++j) { + int fd = p_n_rx_channel_fds[j]; + if (fd != -1) { + m_p_rings_fds[index] = fd; + ++index; + } else { + si_logdbg("got ring with fd -1"); + } + } + } + return 
m_p_rings_fds; +} + +int sockinfo::get_socket_network_ptr(void *ptr, uint16_t &len) +{ + if (!m_p_connected_dst_entry) { + si_logdbg("dst entry no created fd %d", m_fd); + errno = ENOTCONN; + return -1; + } + header* hdr = m_p_connected_dst_entry->get_network_header(); + if (hdr->m_total_hdr_len == 0) { + si_logdbg("header not created yet fd %d", m_fd); + errno = ENOTCONN; + return -1; + } + if (!ptr) { + len = hdr->m_total_hdr_len; + return 0; + } + if (ptr && len >= hdr->m_total_hdr_len) { + len = hdr->m_total_hdr_len; + memcpy(ptr, ((uint8_t*)hdr->m_actual_hdr_addr), len); + return 0; + } + errno = ENOBUFS; + return -1; +} + +int sockinfo::setsockopt_kernel(int __level, int __optname, const void *__optval, + socklen_t __optlen, int supported, bool allow_privileged) +{ + if (!supported) { + char buf[256]; + snprintf(buf, sizeof(buf), "unimplemented setsockopt __level=%#x, __optname=%#x, [__optlen (%d) bytes of __optval=%.*s]", (unsigned)__level, (unsigned)__optname, __optlen, __optlen, (char*)__optval); + buf[ sizeof(buf)-1 ] = '\0'; + + VLOG_PRINTF_INFO(safe_mce_sys().exception_handling.get_log_severity(), "%s", buf); + int rc = handle_exception_flow(); + switch (rc) { + case -1: + return rc; + case -2: + vma_throw_object_with_msg(vma_unsupported_api, buf); + } + } + + si_logdbg("going to OS for setsockopt level %d optname %d", __level, __optname); + int ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret) { + if (EPERM == errno && allow_privileged) { + si_logdbg("setsockopt failure is suppressed (ret=%d %m)", ret); + ret = 0; + errno = 0; + } + else { + si_logdbg("setsockopt failed (ret=%d %m)", ret); + } + } + BULLSEYE_EXCLUDE_BLOCK_END + + return ret; +} + +int sockinfo::set_sockopt_prio(__const void *__optval, socklen_t __optlen) +{ + if (__optlen < sizeof(int)) { + si_logdbg("bad parameter size in set_sockopt_prio"); + errno = EINVAL; + return -1; + } + uint32_t val = *(uint32_t*)__optval; + 
if (m_pcp != val) { + m_pcp = val; + si_logdbg("set socket pcp to be %d", m_pcp); + header_pcp_updater du(m_pcp); + update_header_field(&du); + } + return 0; + +} + +/** + * Function to process SW & HW timestamps + */ +void sockinfo::process_timestamps(mem_buf_desc_t* p_desc) +{ + // keep the sw_timestamp the same to all sockets + if ((m_b_rcvtstamp || + (m_n_tsing_flags & + (SOF_TIMESTAMPING_RX_SOFTWARE | SOF_TIMESTAMPING_SOFTWARE))) && + !p_desc->rx.timestamps.sw.tv_sec) { + clock_gettime(CLOCK_REALTIME, &(p_desc->rx.timestamps.sw)); + } + + // convert hw timestamp to system time + if (m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { + ring_simple* owner_ring = (ring_simple*) p_desc->p_desc_owner; + if (owner_ring) { + owner_ring->convert_hw_time_to_system_time(p_desc->rx.hw_raw_timestamp, &p_desc->rx.timestamps.hw); + } + } +} + +void sockinfo::handle_recv_timestamping(struct cmsg_state *cm_state) +{ + struct { + struct timespec systime; + struct timespec hwtimetrans; + struct timespec hwtimeraw; + } tsing; + + memset(&tsing, 0, sizeof(tsing)); + + timestamps_t* packet_timestamps = get_socket_timestamps(); + struct timespec* packet_systime = &packet_timestamps->sw; + + // Only fill in SO_TIMESTAMPNS if both requested. + // This matches the kernel behavior. 
+ if (m_b_rcvtstampns) { + insert_cmsg(cm_state, SOL_SOCKET, SO_TIMESTAMPNS, packet_systime, sizeof(*packet_systime)); + } else if (m_b_rcvtstamp) { + struct timeval tv; + tv.tv_sec = packet_systime->tv_sec; + tv.tv_usec = packet_systime->tv_nsec/1000; + insert_cmsg(cm_state, SOL_SOCKET, SO_TIMESTAMP, &tv, sizeof(tv)); + } + + // Handle timestamping options + // Only support rx time stamps at this time + int support = m_n_tsing_flags & (SOF_TIMESTAMPING_SOFTWARE | SOF_TIMESTAMPING_RAW_HARDWARE); + if (!support) { + return; + } + + if (m_n_tsing_flags & SOF_TIMESTAMPING_SOFTWARE) { + tsing.systime = packet_timestamps->sw; + } + + if (m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { + tsing.hwtimeraw = packet_timestamps->hw; + } + + insert_cmsg(cm_state, SOL_SOCKET, SO_TIMESTAMPING, &tsing, sizeof(tsing)); +} + +void sockinfo::insert_cmsg(struct cmsg_state * cm_state, int level, int type, void *data, int len) +{ + if (!cm_state->cmhdr || + cm_state->mhdr->msg_flags & MSG_CTRUNC) + return; + + // Ensure there is enough space for the data payload + const unsigned int cmsg_len = CMSG_LEN(len); + if (cmsg_len > cm_state->mhdr->msg_controllen - cm_state->cmsg_bytes_consumed) { + cm_state->mhdr->msg_flags |= MSG_CTRUNC; + return; + } + + // Fill in the cmsghdr + cm_state->cmhdr->cmsg_level = level; + cm_state->cmhdr->cmsg_type = type; + cm_state->cmhdr->cmsg_len = cmsg_len; + memcpy(CMSG_DATA(cm_state->cmhdr), data, len); + + // Update bytes consumed to update msg_controllen later + cm_state->cmsg_bytes_consumed += CMSG_SPACE(len); + + // Advance to next cmsghdr + // can't simply use CMSG_NXTHDR() due to glibc bug 13500 + struct cmsghdr *next = (struct cmsghdr*)((char*)cm_state->cmhdr + + CMSG_ALIGN(cm_state->cmhdr->cmsg_len)); + if ((char*)(next + 1) > + ((char*)cm_state->mhdr->msg_control + cm_state->mhdr->msg_controllen)) + cm_state->cmhdr = NULL; + else + cm_state->cmhdr = next; +} + +void sockinfo::handle_cmsg(struct msghdr * msg) +{ + struct cmsg_state cm_state; + 
+ cm_state.mhdr = msg; + cm_state.cmhdr = CMSG_FIRSTHDR(msg); + cm_state.cmsg_bytes_consumed = 0; + + if (m_b_pktinfo) handle_ip_pktinfo(&cm_state); + if (m_b_rcvtstamp || m_n_tsing_flags) handle_recv_timestamping(&cm_state); + + cm_state.mhdr->msg_controllen = cm_state.cmsg_bytes_consumed; +} diff --git a/src/vma/sock/sockinfo.h b/src/vma/sock/sockinfo.h new file mode 100644 index 0000000..9ba9ad3 --- /dev/null +++ b/src/vma/sock/sockinfo.h @@ -0,0 +1,591 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include + +#include "config.h" +#include "vlogger/vlogger.h" +#include "utils/lock_wrapper.h" +#include "vma/vma_extra.h" +#include "vma/util/data_updater.h" +#include "vma/util/sock_addr.h" +#include "vma/util/vma_stats.h" +#include "vma/util/sys_vars.h" +#include "vma/util/wakeup_pipe.h" +#include "vma/proto/flow_tuple.h" +#include "vma/proto/mem_buf_desc.h" +#include "vma/proto/dst_entry.h" +#include "vma/dev/net_device_table_mgr.h" +#include "vma/dev/ring_simple.h" +#include "vma/dev/ring_allocation_logic.h" + +#include "socket_fd_api.h" +#include "pkt_rcvr_sink.h" +#include "pkt_sndr_source.h" +#include "sock-redirect.h" + +#ifndef BASE_SOCKINFO_H +#define BASE_SOCKINFO_H + +#define SI_RX_EPFD_EVENT_MAX 16 +#define BYTE_TO_KB(byte_value) ((byte_value) / 125) +#define KB_TO_BYTE(kbit_value) ((kbit_value) * 125) + +#if DEFINED_MISSING_NET_TSTAMP +enum { + SOF_TIMESTAMPING_TX_HARDWARE = (1<<0), + SOF_TIMESTAMPING_TX_SOFTWARE = (1<<1), + SOF_TIMESTAMPING_RX_HARDWARE = (1<<2), + SOF_TIMESTAMPING_RX_SOFTWARE = (1<<3), + SOF_TIMESTAMPING_SOFTWARE = (1<<4), + SOF_TIMESTAMPING_SYS_HARDWARE = (1<<5), + SOF_TIMESTAMPING_RAW_HARDWARE = (1<<6), + SOF_TIMESTAMPING_MASK = + (SOF_TIMESTAMPING_RAW_HARDWARE - 1) | + SOF_TIMESTAMPING_RAW_HARDWARE +}; +#else +#include +#endif + +#ifndef SO_TIMESTAMPNS +#define SO_TIMESTAMPNS 35 +#endif + +#ifndef SO_TIMESTAMPING +#define SO_TIMESTAMPING 37 +#endif + +#ifndef SO_REUSEPORT +#define SO_REUSEPORT 15 +#endif + +struct cmsg_state +{ + struct msghdr *mhdr; + struct cmsghdr *cmhdr; + size_t cmsg_bytes_consumed; +}; + +#define NOTIFY_ON_EVENTS(context, events) context->set_events(events) + +struct buff_info_t { + buff_info_t(){ + rx_reuse.set_id("buff_info_t (%p) : rx_reuse", this); + n_buff_num = 0; + } + + int n_buff_num; + descq_t rx_reuse; +}; + +typedef struct { + net_device_entry* p_nde; + net_device_val* p_ndv; + ring* p_ring; + int refcnt; +} net_device_resources_t; + +typedef std::tr1::unordered_map 
rx_net_device_map_t; + +/* + * Sockinfo setsockopt() return values + */ +#define SOCKOPT_INTERNAL_VMA_SUPPORT 0 // Internal socket option, should not pass request to OS. +#define SOCKOPT_NO_VMA_SUPPORT -1 // Socket option was found but not supported, error should be returned to user. +#define SOCKOPT_PASS_TO_OS 1 // Should pass to TCP/UDP level or OS. + +namespace std { namespace tr1 { +template<> +class hash +{ +public: + size_t operator()(const flow_tuple_with_local_if &key) const + { + flow_tuple_with_local_if* tmp_key = (flow_tuple_with_local_if*)&key; + return tmp_key->hash(); + } +}; +}} +typedef std::tr1::unordered_map rx_flow_map_t; + +typedef struct { + int refcnt; + buff_info_t rx_reuse_info; +} ring_info_t; + +typedef std::tr1::unordered_map rx_ring_map_t; + +// see route.c in Linux kernel +const uint8_t ip_tos2prio[16] = { + 0, 0, 0, 0, + 2, 2, 2, 2, + 6, 6, 6, 6, + 4, 4, 4, 4 +}; + +class sockinfo : public socket_fd_api, public pkt_rcvr_sink, public pkt_sndr_source, public wakeup_pipe +{ +public: + sockinfo(int fd); + virtual ~sockinfo(); + +#if _BullseyeCoverage + #pragma BullseyeCoverage off +#endif + // don't put mt lock around sockinfo just yet + void lock(){}; + void unlock() {}; +#if _BullseyeCoverage + #pragma BullseyeCoverage on +#endif + + enum sockinfo_state { + SOCKINFO_OPENED, + SOCKINFO_CLOSING, + SOCKINFO_CLOSED + }; + + virtual void consider_rings_migration(); + + virtual int add_epoll_context(epfd_info *epfd); + virtual void remove_epoll_context(epfd_info *epfd); + + inline bool tcp_flow_is_5t(void) { return m_tcp_flow_is_5t; } + inline void set_tcp_flow_is_5t(void) { m_tcp_flow_is_5t = true; } + inline bool set_flow_tag(uint32_t flow_tag_id) { + if (flow_tag_id && (flow_tag_id != FLOW_TAG_MASK)) { + m_flow_tag_id = flow_tag_id; + m_flow_tag_enabled = true; + return true; + } + m_flow_tag_id = FLOW_TAG_MASK; + return false; + } + inline bool flow_tag_enabled(void) { return m_flow_tag_enabled; } + inline int get_rx_epfd(void) { return 
m_rx_epfd; } + + virtual bool flow_in_reuse(void) { return false;}; + virtual int* get_rings_fds(int &res_length); + virtual int get_rings_num(); + virtual int get_socket_network_ptr(void *ptr, uint16_t &len); + virtual bool check_rings() {return m_p_rx_ring ? true: false;} + virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); + uint32_t get_flow_tag_val() { return m_flow_tag_id; } + inline in_protocol_t get_protocol(void) { return m_protocol; } + +protected: + bool m_b_blocking; + bool m_b_pktinfo; + bool m_b_rcvtstamp; + bool m_b_rcvtstampns; + uint8_t m_n_tsing_flags; + in_protocol_t m_protocol; + + lock_spin_recursive m_lock_rcv; + lock_mutex m_lock_snd; + lock_mutex m_rx_migration_lock; + + sockinfo_state m_state; // socket current state + sock_addr m_bound; + sock_addr m_connected; + dst_entry* m_p_connected_dst_entry; + + in_addr_t m_so_bindtodevice_ip; + + socket_stats_t m_socket_stats; + socket_stats_t* m_p_socket_stats; + + int m_rx_epfd; + cache_observer m_rx_nd_observer; + rx_net_device_map_t m_rx_nd_map; + rx_flow_map_t m_rx_flow_map; + // we either listen on ALL system cqs or bound to the specific cq + ring* m_p_rx_ring; //used in TCP/UDP + buff_info_t m_rx_reuse_buff; //used in TCP instead of m_rx_ring_map + bool m_rx_reuse_buf_pending; //used to periodically return buffers, even if threshold was not reached + bool m_rx_reuse_buf_postponed; //used to mark threshold was reached, but free was not done yet + inline void set_rx_reuse_pending(bool is_pending = true) {m_rx_reuse_buf_pending = is_pending;} + + rx_ring_map_t m_rx_ring_map; // CQ map + lock_mutex_recursive m_rx_ring_map_lock; + ring_allocation_logic_rx m_ring_alloc_logic; + + loops_timer m_loops_timer; + + /** + * list of pending ready packet on the Rx, + * each element is a pointer to the ib_conn_mgr that holds this ready rx datagram + */ + int m_n_rx_pkt_ready_list_count; + size_t m_rx_pkt_ready_offset; + size_t m_rx_ready_byte_count; + + const int 
m_n_sysvar_rx_num_buffs_reuse; + const int32_t m_n_sysvar_rx_poll_num; + ring_alloc_logic_attr m_ring_alloc_log_rx; + ring_alloc_logic_attr m_ring_alloc_log_tx; + uint32_t m_pcp; + + struct { + /* Track internal events to return in socketxtreme_poll() + * Current design support single event for socket at a particular time + */ + struct ring_ec ec; + struct vma_completion_t* completion; + struct vma_buff_t* last_buff_lst; + } m_socketxtreme; + + // Callback function pointer to support VMA extra API (vma_extra.h) + vma_recv_callback_t m_rx_callback; + void* m_rx_callback_context; // user context + struct vma_rate_limit_t m_so_ratelimit; + void* m_fd_context; // Context data stored with socket + uint32_t m_flow_tag_id; // Flow Tag for this socket + bool m_flow_tag_enabled; // for this socket + uint8_t m_n_uc_ttl; // time to live + bool m_tcp_flow_is_5t; // to bypass packet analysis + + int* m_p_rings_fds; + virtual void set_blocking(bool is_blocked); + virtual int fcntl(int __cmd, unsigned long int __arg); + virtual int ioctl(unsigned long int __request, unsigned long int __arg); + virtual int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); + int setsockopt_kernel(int __level, int __optname, const void *__optval, socklen_t __optlen, int supported, bool allow_priv); + virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); + + virtual mem_buf_desc_t* get_front_m_rx_pkt_ready_list() = 0; + virtual size_t get_size_m_rx_pkt_ready_list() = 0; + virtual void pop_front_m_rx_pkt_ready_list() = 0; + virtual void push_back_m_rx_pkt_ready_list(mem_buf_desc_t* buff) = 0; + + int rx_wait(int &poll_count, bool is_blocking = true); + int rx_wait_helper(int &poll_count, bool is_blocking = true); + + void save_stats_rx_os(int bytes); + void save_stats_tx_os(int bytes); + void save_stats_rx_offload(int nbytes); + + virtual int rx_verify_available_data() = 0; + virtual void update_header_field(data_updater *updater) = 0; 
+ virtual mem_buf_desc_t *get_next_desc (mem_buf_desc_t *p_desc) = 0; + virtual mem_buf_desc_t* get_next_desc_peek(mem_buf_desc_t *p_desc, int& rx_pkt_ready_list_idx) = 0; + virtual timestamps_t* get_socket_timestamps() = 0; + virtual void update_socket_timestamps(timestamps_t * ts) = 0; + virtual void post_deqeue (bool release_buff) = 0; + + virtual int zero_copy_rx (iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) = 0; + int register_callback(vma_recv_callback_t callback, void *context); + + virtual size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, int* p_out_flags); + + bool attach_receiver(flow_tuple_with_local_if &flow_key); + bool detach_receiver(flow_tuple_with_local_if &flow_key); + net_device_resources_t* create_nd_resources(const ip_address ip_local); + bool destroy_nd_resources(const ip_address ip_local); + void do_rings_migration(resource_allocation_key &old_key); + int set_ring_attr(vma_ring_alloc_logic_attr *attr); + int set_ring_attr_helper(ring_alloc_logic_attr *sock_attr, vma_ring_alloc_logic_attr *attr); + + // Attach to all relevant rings for offloading receive flows - always used from slow path + // According to bounded information we need to attach to all UC relevant flows + // If local_ip is ANY then we need to attach to all offloaded interfaces OR to the one our connected_ip is routed to + bool attach_as_uc_receiver(role_t role, bool skip_rules = false); + virtual void set_rx_packet_processor(void) = 0; + transport_t find_target_family(role_t role, struct sockaddr *sock_addr_first, struct sockaddr *sock_addr_second = NULL); + + // This callback will notify that socket is ready to receive and map the cq. 
+ virtual void rx_add_ring_cb(flow_tuple_with_local_if &flow_key, ring* p_ring, bool is_migration = false); + virtual void rx_del_ring_cb(flow_tuple_with_local_if &flow_key, ring* p_ring, bool is_migration = false); + + virtual void lock_rx_q() {m_lock_rcv.lock();} + virtual void unlock_rx_q() {m_lock_rcv.unlock();} + + void shutdown_rx(); + void destructor_helper(); + int modify_ratelimit(dst_entry* p_dst_entry, struct vma_rate_limit_t &rate_limit); + + void move_owned_rx_ready_descs(ring* p_ring, descq_t* toq); // Move all owner's rx ready packets ro 'toq' + int set_sockopt_prio(__const void *__optval, socklen_t __optlen); + + virtual void handle_ip_pktinfo(struct cmsg_state *cm_state) = 0; + inline void handle_recv_timestamping(struct cmsg_state *cm_state); + void insert_cmsg(struct cmsg_state *cm_state, int level, int type, void *data, int len); + void handle_cmsg(struct msghdr * msg); + void process_timestamps(mem_buf_desc_t* p_desc); + + virtual bool try_un_offloading(); // un-offload the socket if possible + + virtual inline void do_wakeup() { + if (!is_socketxtreme()) { + wakeup_pipe::do_wakeup(); + } + } + + inline bool is_socketxtreme() { + return (m_p_rx_ring && m_p_rx_ring->is_socketxtreme()); + } + + inline void set_events(uint64_t events) { + static int enable_socketxtreme = safe_mce_sys().enable_socketxtreme; + + if (enable_socketxtreme && m_state == SOCKINFO_OPENED) { + /* Collect all events if rx ring is enabled */ + if (is_socketxtreme()) { + if (m_socketxtreme.completion) { + if (!m_socketxtreme.completion->events) { + m_socketxtreme.completion->user_data = (uint64_t)m_fd_context; + } + m_socketxtreme.completion->events |= events; + } + else { + if (!m_socketxtreme.ec.completion.events) { + m_socketxtreme.ec.completion.user_data = (uint64_t)m_fd_context; + m_p_rx_ring->put_ec(&m_socketxtreme.ec); + } + m_socketxtreme.ec.completion.events |= events; + } + } + } + + socket_fd_api::notify_epoll_context((uint32_t)events); + } + + // This function 
validates the ipoib's properties + // Input params: + // 1. IF name (can be alias) + // 2. IF flags + // 3. general path to ipoib property file (for example: /sys/class/net/%s/mtu) + // 4. the expected value of the property + // 5. size of the property + // Output params: + // 1. property sysfs filename + // 2. physical IF name (stripped alias) + // Return Value + // Type: INT + // Val: -1 Reading from the sys file failed + // 1 Reading succeeded but the actual prop value != expected + // 0 Reading succeeded and actual prop value == expected one + //TODO need to copy this function from util + //int validate_ipoib_prop(char* ifname, unsigned int ifflags, const char param_file[], const char *val, int size, char *filename, char * base_ifname); + + inline void fetch_peer_info(sockaddr_in *p_peer_addr, sockaddr_in *__from, socklen_t *__fromlen) + { + *__from = *p_peer_addr; + *__fromlen = sizeof(sockaddr_in); + } + + inline int dequeue_packet(iovec *p_iov, ssize_t sz_iov, + sockaddr_in *__from, socklen_t *__fromlen, + int in_flags, int *p_out_flags) + { + mem_buf_desc_t *pdesc; + int total_rx = 0; + uint32_t nbytes, pos; + bool relase_buff = true; + + bool is_peek = in_flags & MSG_PEEK; + int rx_pkt_ready_list_idx = 1; + int rx_pkt_ready_offset = m_rx_pkt_ready_offset; + + pdesc = get_front_m_rx_pkt_ready_list(); + void *iov_base = (uint8_t*)pdesc->rx.frag.iov_base + m_rx_pkt_ready_offset; + size_t bytes_left = pdesc->rx.frag.iov_len - m_rx_pkt_ready_offset; + size_t payload_size = pdesc->rx.sz_payload; + + if (__from && __fromlen) + fetch_peer_info(&pdesc->rx.src, __from, __fromlen); + + if (in_flags & MSG_VMA_ZCOPY) { + relase_buff = false; + total_rx = zero_copy_rx(p_iov, pdesc, p_out_flags); + if (unlikely(total_rx < 0)) + return -1; + m_rx_pkt_ready_offset = 0; + } + else { + for (int i = 0; i < sz_iov && pdesc; i++) { + pos = 0; + while (pos < p_iov[i].iov_len && pdesc) { + nbytes = p_iov[i].iov_len - pos; + if (nbytes > bytes_left) nbytes = bytes_left; + 
memcpy((char *)(p_iov[i].iov_base) + pos, iov_base, nbytes); + pos += nbytes; + total_rx += nbytes; + m_rx_pkt_ready_offset += nbytes; + bytes_left -= nbytes; + iov_base = (uint8_t*)iov_base + nbytes; + if (m_b_rcvtstamp || m_n_tsing_flags) update_socket_timestamps(&pdesc->rx.timestamps); + if(bytes_left <= 0) { + if (unlikely(is_peek)) { + pdesc = get_next_desc_peek(pdesc, rx_pkt_ready_list_idx); + } else { + pdesc = get_next_desc(pdesc); + } + m_rx_pkt_ready_offset = 0; + if (pdesc) { + iov_base = pdesc->rx.frag.iov_base; + bytes_left = pdesc->rx.frag.iov_len; + } + } + + } + } + + } + + if (unlikely(is_peek)) { + m_rx_pkt_ready_offset = rx_pkt_ready_offset; //if MSG_PEEK is on, m_rx_pkt_ready_offset must be zero-ed + //save_stats_rx_offload(total_rx); //TODO?? + } + else { + m_rx_ready_byte_count -= total_rx; + m_p_socket_stats->n_rx_ready_byte_count -= total_rx; + post_deqeue(relase_buff); + save_stats_rx_offload(total_rx); + } + + total_rx = handle_msg_trunc(total_rx, payload_size, in_flags, p_out_flags); + + return total_rx; + } + + inline void reuse_buffer(mem_buf_desc_t *buff) + { + set_rx_reuse_pending(false); + ring* p_ring = buff->p_desc_owner->get_parent(); + rx_ring_map_t::iterator iter = m_rx_ring_map.find(p_ring); + if(likely(iter != m_rx_ring_map.end())){ + descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; + int& n_buff_num = iter->second->rx_reuse_info.n_buff_num; + rx_reuse->push_back(buff); + n_buff_num += buff->rx.n_frags; + if(n_buff_num < m_n_sysvar_rx_num_buffs_reuse){ + return; + } + if(n_buff_num >= 2 * m_n_sysvar_rx_num_buffs_reuse){ + if (p_ring->reclaim_recv_buffers(rx_reuse)) { + n_buff_num = 0; + } else { + g_buffer_pool_rx->put_buffers_after_deref_thread_safe(rx_reuse); + n_buff_num = 0; + } + m_rx_reuse_buf_postponed = false; + } else { + m_rx_reuse_buf_postponed = true; + } + } + else{ + // Retuned buffer to global pool when owner can't be found + // In case ring was deleted while buffers where still queued + 
vlog_printf(VLOG_DEBUG, "Buffer owner not found\n"); + // Awareness: these are best efforts: decRef without lock in case no CQ + if(buff->dec_ref_count() <= 1 && (buff->lwip_pbuf.pbuf.ref-- <= 1)) + g_buffer_pool_rx->put_buffers_thread_safe(buff); + + } + } + + inline void move_owned_descs(ring* p_ring, descq_t *toq, descq_t *fromq) + { + // Assume locked by owner!!! + + mem_buf_desc_t *temp; + const size_t size = fromq->size(); + for (size_t i = 0 ; i < size; i++) { + temp = fromq->front(); + fromq->pop_front(); + if (p_ring->is_member(temp->p_desc_owner)) + toq->push_back(temp); + else + fromq->push_back(temp); + } + } + + static const char * setsockopt_so_opt_to_str(int opt) + { + switch (opt) { + case SO_REUSEADDR: return "SO_REUSEADDR"; + case SO_REUSEPORT: return "SO_REUSEPORT"; + case SO_BROADCAST: return "SO_BROADCAST"; + case SO_RCVBUF: return "SO_RCVBUF"; + case SO_SNDBUF: return "SO_SNDBUF"; + case SO_TIMESTAMP: return "SO_TIMESTAMP"; + case SO_TIMESTAMPNS: return "SO_TIMESTAMPNS"; + case SO_BINDTODEVICE: return "SO_BINDTODEVICE"; + case SO_VMA_RING_ALLOC_LOGIC: return "SO_VMA_RING_ALLOC_LOGIC"; + case SO_MAX_PACING_RATE: return "SO_MAX_PACING_RATE"; + case SO_VMA_FLOW_TAG: return "SO_VMA_FLOW_TAG"; + case SO_VMA_SHUTDOWN_RX: return "SO_VMA_SHUTDOWN_RX"; + default: break; + } + return "UNKNOWN SO opt"; + } + + inline void move_not_owned_descs(ring* p_ring, descq_t *toq, descq_t *fromq) + { + // Assume locked by owner!!! 
+ + mem_buf_desc_t *temp; + const size_t size = fromq->size(); + for (size_t i = 0 ; i < size; i++) { + temp = fromq->front(); + fromq->pop_front(); + if (p_ring->is_member(temp->p_desc_owner)) + fromq->push_back(temp); + else + toq->push_back(temp); + } + } + + + int get_sock_by_L3_L4(in_protocol_t protocol, in_addr_t ip, in_port_t port); + + ////////////////////////////////////////////////////////////////// + int handle_exception_flow(){ + if (safe_mce_sys().exception_handling.is_suit_un_offloading()) { + try_un_offloading(); + } + if (safe_mce_sys().exception_handling == vma_exception_handling::MODE_RETURN_ERROR) { + errno = EINVAL; + return -1; + } + if (safe_mce_sys().exception_handling == vma_exception_handling::MODE_ABORT) { + return -2; + } + return 0; + } + ////////////////////////////////////////////////////////////////// +}; + +#endif /* BASE_SOCKINFO_H */ diff --git a/src/vma/sock/sockinfo_tcp.cpp b/src/vma/sock/sockinfo_tcp.cpp new file mode 100644 index 0000000..663461b --- /dev/null +++ b/src/vma/sock/sockinfo_tcp.cpp @@ -0,0 +1,4765 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include +#include "vma/util/if.h" + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "utils/lock_wrapper.h" +#include "utils/rdtsc.h" +#include "vma/util/libvma.h" +#include "vma/util/instrumentation.h" +#include "vma/util/list.h" +#include "vma/util/agent.h" +#include "vma/event/event_handler_manager.h" +#include "vma/proto/route_table_mgr.h" +#include "vma/proto/vma_lwip.h" +#include "vma/proto/dst_entry_tcp.h" +#include "vma/iomux/io_mux_call.h" + +#include "sock-redirect.h" +#include "fd_collection.h" +#include "sockinfo_tcp.h" + + +// debugging macros +#define MODULE_NAME "si_tcp" + +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[fd=%d]:%d:%s() " + +#undef __INFO__ +#define __INFO__ m_fd + +#define si_tcp_logpanic __log_info_panic +#define si_tcp_logerr __log_info_err +#define si_tcp_logwarn __log_info_warn +#define si_tcp_loginfo __log_info_info +#define si_tcp_logdbg __log_info_dbg +#define si_tcp_logfunc __log_info_func +#define si_tcp_logfuncall __log_info_funcall + +#define BLOCK_THIS_RUN(blocking, flags) (blocking && !(flags & MSG_DONTWAIT)) +#define TCP_SEG_COMPENSATION 64 + +tcp_seg_pool *g_tcp_seg_pool = NULL; +tcp_timers_collection* g_tcp_timers_collection = NULL; + +/* + * The following socket options are inherited by a connected TCP socket from the listening socket: + * SO_DEBUG, SO_DONTROUTE, SO_KEEPALIVE, SO_LINGER, SO_OOBINLINE, SO_RCVBUF, SO_RCVLOWAT, SO_SNDBUF, + * SO_SNDLOWAT, TCP_MAXSEG, 
TCP_NODELAY. + */ +static bool is_inherited_option(int __level, int __optname) +{ + bool ret = false; + if (__level == SOL_SOCKET) { + switch (__optname) { + case SO_DEBUG: + case SO_DONTROUTE: + case SO_KEEPALIVE: + case SO_LINGER: + case SO_OOBINLINE: + case SO_RCVBUF: + case SO_RCVLOWAT: + case SO_SNDBUF: + case SO_SNDLOWAT: + case SO_VMA_RING_ALLOC_LOGIC: + ret = true; + } + } else if (__level == IPPROTO_TCP) { + switch (__optname) { + case TCP_MAXSEG: + case TCP_NODELAY: + ret = true; + } + } else if (__level == IPPROTO_IP) { + switch (__optname) { + case IP_TTL: + ret = true; + } + } + + return ret; +} + +/**/ +/** inlining functions can only help if they are implemented before their usage **/ +/**/ + +inline void sockinfo_tcp::lock_tcp_con() +{ + m_tcp_con_lock.lock(); +} + +inline void sockinfo_tcp::unlock_tcp_con() +{ + if (m_timer_pending) { + tcp_timer(); + } + + m_tcp_con_lock.unlock(); +} + +inline void sockinfo_tcp::init_pbuf_custom(mem_buf_desc_t *p_desc) +{ + p_desc->lwip_pbuf.pbuf.flags = PBUF_FLAG_IS_CUSTOM; + p_desc->lwip_pbuf.pbuf.len = p_desc->lwip_pbuf.pbuf.tot_len = (p_desc->sz_data - p_desc->rx.tcp.n_transport_header_len); + p_desc->lwip_pbuf.pbuf.ref = 1; + p_desc->lwip_pbuf.pbuf.type = PBUF_REF; + p_desc->lwip_pbuf.pbuf.next = NULL; + p_desc->lwip_pbuf.pbuf.payload = (u8_t *)p_desc->p_buffer + p_desc->rx.tcp.n_transport_header_len; +} + +/* change default rx_wait impl to flow based one */ +inline int sockinfo_tcp::rx_wait(int &poll_count, bool is_blocking) +{ + int ret_val = 0; + unlock_tcp_con(); + ret_val = rx_wait_helper(poll_count, is_blocking); + lock_tcp_con(); + return ret_val; +} + +inline int sockinfo_tcp::rx_wait_lockless(int & poll_count, bool is_blocking) +{ + if (m_timer_pending) { + m_tcp_con_lock.lock(); + tcp_timer(); + m_tcp_con_lock.unlock(); + } + + return rx_wait_helper(poll_count, is_blocking); +} + +inline void sockinfo_tcp::return_pending_rx_buffs() +{ + // force reuse of buffers especially for avoiding deadlock in 
case all buffers were taken and we can NOT get new FIN packets that will release buffers + if (m_sysvar_buffer_batching_mode == BUFFER_BATCHING_NO_RECLAIM || !m_rx_reuse_buff.n_buff_num) + return; + + if (m_rx_reuse_buf_pending) { + if (m_p_rx_ring && m_p_rx_ring->reclaim_recv_buffers(&m_rx_reuse_buff.rx_reuse)) { + } else { + g_buffer_pool_rx->put_buffers_after_deref_thread_safe(&m_rx_reuse_buff.rx_reuse); + } + m_rx_reuse_buff.n_buff_num = 0; + set_rx_reuse_pending(false); + } + else { + set_rx_reuse_pending(true); + } +} + +inline void sockinfo_tcp::return_pending_tx_buffs() +{ + if (m_sysvar_buffer_batching_mode == BUFFER_BATCHING_NO_RECLAIM || !m_p_connected_dst_entry) + return; + + m_p_connected_dst_entry->return_buffers_pool(); +} + +//todo inline void sockinfo_tcp::return_pending_tcp_segs() + +inline void sockinfo_tcp::reuse_buffer(mem_buf_desc_t *buff) +{ + set_rx_reuse_pending(false); + if (likely(m_p_rx_ring)) { + m_rx_reuse_buff.n_buff_num += buff->rx.n_frags; + m_rx_reuse_buff.rx_reuse.push_back(buff); + if (m_rx_reuse_buff.n_buff_num < m_n_sysvar_rx_num_buffs_reuse) { + return; + } + if (m_rx_reuse_buff.n_buff_num >= 2 * m_n_sysvar_rx_num_buffs_reuse) { + if (m_p_rx_ring->reclaim_recv_buffers(&m_rx_reuse_buff.rx_reuse)) { + m_rx_reuse_buff.n_buff_num = 0; + } else { + g_buffer_pool_rx->put_buffers_after_deref_thread_safe(&m_rx_reuse_buff.rx_reuse); + m_rx_reuse_buff.n_buff_num = 0; + } + m_rx_reuse_buf_postponed = false; + } else { + m_rx_reuse_buf_postponed = true; + } + } + else { + sockinfo::reuse_buffer(buff); + } +} + +sockinfo_tcp::sockinfo_tcp(int fd): + sockinfo(fd), + m_timer_handle(NULL), + m_timer_pending(false), + m_sysvar_buffer_batching_mode(safe_mce_sys().buffer_batching_mode), + m_sysvar_tcp_ctl_thread(safe_mce_sys().tcp_ctl_thread), + m_sysvar_internal_thread_tcp_timer_handling(safe_mce_sys().internal_thread_tcp_timer_handling), + m_sysvar_rx_poll_on_tx_tcp(safe_mce_sys().rx_poll_on_tx_tcp) +{ + si_tcp_logfuncall(""); + + 
m_accepted_conns.set_id("sockinfo_tcp (%p), fd = %d : m_accepted_conns", this, m_fd); + m_rx_pkt_ready_list.set_id("sockinfo_tcp (%p), fd = %d : m_rx_pkt_ready_list", this, m_fd); + m_rx_cb_dropped_list.set_id("sockinfo_tcp (%p), fd = %d : m_rx_cb_dropped_list", this, m_fd); + m_rx_ctl_packets_list.set_id("sockinfo_tcp (%p), fd = %d : m_rx_ctl_packets_list", this, m_fd); + m_rx_ctl_reuse_list.set_id("sockinfo_tcp (%p), fd = %d : m_rx_ctl_reuse_list", this, m_fd); + + m_last_syn_tsc = 0; + + m_linger.l_linger = 0; + m_linger.l_onoff = 0; + + m_bound.set_sa_family(AF_INET); + m_protocol = PROTO_TCP; + m_p_socket_stats->socket_type = SOCK_STREAM; + + memset(&m_rx_timestamps, 0, sizeof(m_rx_timestamps)); + + m_sock_state = TCP_SOCK_INITED; + m_conn_state = TCP_CONN_INIT; + m_conn_timeout = CONNECT_DEFAULT_TIMEOUT_MS; + setPassthrough(false); // by default we try to accelerate + si_tcp_logdbg("tcp socket created"); + + tcp_pcb_init(&m_pcb, TCP_PRIO_NORMAL); + + si_tcp_logdbg("new pcb %p pcb state %d", &m_pcb, get_tcp_state(&m_pcb)); + tcp_arg(&m_pcb, this); + tcp_ip_output(&m_pcb, sockinfo_tcp::ip_output); + tcp_recv(&m_pcb, sockinfo_tcp::rx_lwip_cb); + tcp_err(&m_pcb, sockinfo_tcp::err_lwip_cb); + tcp_sent(&m_pcb, sockinfo_tcp::ack_recvd_lwip_cb); + m_pcb.my_container = this; + + m_n_pbufs_rcvd = m_n_pbufs_freed = 0; + + m_parent = NULL; + m_iomux_ready_fd_array = NULL; + + /* SNDBUF accounting */ + m_sndbuff_max = 0; + /* RCVBUF accounting */ + m_rcvbuff_max = safe_mce_sys().sysctl_reader.get_tcp_rmem()->default_value; + + m_rcvbuff_current = 0; + m_rcvbuff_non_tcp_recved = 0; + m_received_syn_num = 0; + m_vma_thr = false; + + m_ready_conn_cnt = 0; + m_backlog = INT_MAX; + report_connected = false; + + m_error_status = 0; + + m_tcp_seg_count = 0; + m_tcp_seg_in_use = 0; + m_tcp_seg_list = g_tcp_seg_pool->get_tcp_segs(TCP_SEG_COMPENSATION); + if (m_tcp_seg_list) m_tcp_seg_count += TCP_SEG_COMPENSATION; + m_tx_consecutive_eagain_count = 0; + + // Disable Nagle algorithm 
if VMA_TCP_NODELAY flag was set. + if (safe_mce_sys().tcp_nodelay) { + try { + int tcp_nodelay = 1; + setsockopt(IPPROTO_TCP, TCP_NODELAY, &tcp_nodelay, sizeof(tcp_nodelay)); + } catch (vma_error&) { + // We should not be here + } + } + + // Enable Quickack if VMA_TCP_QUICKACK flag was set. + if (safe_mce_sys().tcp_quickack) { + try { + int tcp_quickack = 1; + setsockopt(IPPROTO_TCP, TCP_QUICKACK, &tcp_quickack, sizeof(tcp_quickack)); + } catch (vma_error&) { + // We should not be here + } + } + + si_tcp_logdbg("TCP PCB FLAGS: 0x%x", m_pcb.flags); + g_p_agent->register_cb((agent_cb_t)&sockinfo_tcp::put_agent_msg, (void *)this); + si_tcp_logfunc("done"); +} + +sockinfo_tcp::~sockinfo_tcp() +{ + si_tcp_logfunc(""); + + lock_tcp_con(); + + if (!is_closable()) { + /* Force closing TCP connection + * tcp state should be as CLOSED after finishing this call + */ + prepare_to_close(true); + } + + do_wakeup(); + + destructor_helper(); + + // Release preallocated buffers + tcp_tx_preallocted_buffers_free(&m_pcb); + + if (m_tcp_seg_in_use) { + si_tcp_logwarn("still %d tcp segs in use!", m_tcp_seg_in_use); + } + if (m_tcp_seg_count) { + g_tcp_seg_pool->put_tcp_segs(m_tcp_seg_list); + } + + while (!m_socket_options_list.empty()) { + socket_option_t* opt = m_socket_options_list.front(); + m_socket_options_list.pop_front(); + delete(opt); + } + + unlock_tcp_con(); + + if (m_n_rx_pkt_ready_list_count || m_rx_ready_byte_count || m_rx_pkt_ready_list.size() || m_rx_ring_map.size() || m_rx_reuse_buff.n_buff_num || m_rx_reuse_buff.rx_reuse.size() || m_rx_cb_dropped_list.size() || m_rx_ctl_packets_list.size() || m_rx_peer_packets.size() || m_rx_ctl_reuse_list.size()) + si_tcp_logerr("not all buffers were freed. protocol=TCP. 
m_n_rx_pkt_ready_list_count=%d, m_rx_ready_byte_count=%d, m_rx_pkt_ready_list.size()=%d, m_rx_ring_map.size()=%d, m_rx_reuse_buff.n_buff_num=%d, m_rx_reuse_buff.rx_reuse.size=%d, m_rx_cb_dropped_list.size=%d, m_rx_ctl_packets_list.size=%d, m_rx_peer_packets.size=%d, m_rx_ctl_reuse_list.size=%d", + m_n_rx_pkt_ready_list_count, m_rx_ready_byte_count, (int)m_rx_pkt_ready_list.size() ,(int)m_rx_ring_map.size(), m_rx_reuse_buff.n_buff_num, m_rx_reuse_buff.rx_reuse.size(), m_rx_cb_dropped_list.size(), m_rx_ctl_packets_list.size(), m_rx_peer_packets.size(), m_rx_ctl_reuse_list.size()); + + g_p_agent->unregister_cb((agent_cb_t)&sockinfo_tcp::put_agent_msg, (void *)this); + + si_tcp_logdbg("sock closed"); +} + +void sockinfo_tcp::clean_obj() +{ + if (is_cleaned()) { + return ; + } + + lock_tcp_con(); + set_cleaned(); + + /* Remove group timers from g_tcp_timers_collection */ + if (g_p_event_handler_manager->is_running() && m_timer_handle) { + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + } + + m_timer_handle = NULL; + if (g_p_event_handler_manager->is_running()) { + g_p_event_handler_manager->unregister_timers_event_and_delete(this); + unlock_tcp_con(); + } else { + unlock_tcp_con(); + cleanable_obj::clean_obj(); + } +} + +bool sockinfo_tcp::prepare_listen_to_close() +{ + //assume locked by sockinfo_tcp lock + + //remove the sockets from the accepted connections list + while (!m_accepted_conns.empty()) + { + sockinfo_tcp *new_sock = m_accepted_conns.get_and_pop_front(); + new_sock->m_sock_state = TCP_SOCK_INITED; + class flow_tuple key; + sockinfo_tcp::create_flow_tuple_key_from_pcb(key, &(new_sock->m_pcb)); + m_syn_received.erase(key); + m_ready_conn_cnt--; + new_sock->lock_tcp_con(); + new_sock->m_parent = NULL; + new_sock->abort_connection(); + new_sock->unlock_tcp_con(); + close(new_sock->get_fd()); + } + + // remove the sockets from the syn_received connections list + syn_received_map_t::iterator syn_received_itr; + for (syn_received_itr = 
m_syn_received.begin(); syn_received_itr != m_syn_received.end(); ) + { + sockinfo_tcp *new_sock = (sockinfo_tcp *)(syn_received_itr->second->my_container); + new_sock->m_sock_state = TCP_SOCK_INITED; + syn_received_map_t::iterator syn_received_itr_erase = syn_received_itr; + syn_received_itr++; + m_syn_received.erase(syn_received_itr_erase); + m_received_syn_num--; + new_sock->lock_tcp_con(); + new_sock->m_parent = NULL; + new_sock->abort_connection(); + new_sock->unlock_tcp_con(); + close(new_sock->get_fd()); + } + + return true; +} + +bool sockinfo_tcp::prepare_to_close(bool process_shutdown /* = false */) +{ + + lock_tcp_con(); + + si_tcp_logdbg(""); + + bool is_listen_socket = is_server() || get_tcp_state(&m_pcb) == LISTEN; + + /* + * consider process_shutdown: + * workaround for LBM which does not close the listen sockets properly on process shutdown. + * as a result they become ready for select, but calling accept return failure. + * see RM#390019 + */ + + // listen, accepted or connected socket + if ((is_listen_socket && !process_shutdown) || m_sock_state == TCP_SOCK_CONNECTED_RD + || m_sock_state == TCP_SOCK_CONNECTED_WR || m_sock_state == TCP_SOCK_CONNECTED_RDWR) { + m_sock_state = TCP_SOCK_BOUND; + } + + if (!is_listen_socket && m_n_rx_pkt_ready_list_count) { + abort_connection(); + } + + m_rx_ready_byte_count += m_rx_pkt_ready_offset; + m_p_socket_stats->n_rx_ready_byte_count += m_rx_pkt_ready_offset; + while (m_n_rx_pkt_ready_list_count) + { + mem_buf_desc_t* p_rx_pkt_desc = m_rx_pkt_ready_list.get_and_pop_front(); + m_n_rx_pkt_ready_list_count--; + m_p_socket_stats->n_rx_ready_pkt_count--; + m_rx_ready_byte_count -= p_rx_pkt_desc->rx.sz_payload; + m_p_socket_stats->n_rx_ready_byte_count -= p_rx_pkt_desc->rx.sz_payload; + reuse_buffer(p_rx_pkt_desc); + } + + while (!m_rx_ctl_packets_list.empty()) { + /* coverity[double_lock] TODO: RM#1049980 */ + m_rx_ctl_packets_list_lock.lock(); + if (m_rx_ctl_packets_list.empty()) { + 
m_rx_ctl_packets_list_lock.unlock(); + break; + } + mem_buf_desc_t* p_rx_pkt_desc = m_rx_ctl_packets_list.get_and_pop_front(); + /* coverity[double_unlock] TODO: RM#1049980 */ + m_rx_ctl_packets_list_lock.unlock(); + reuse_buffer(p_rx_pkt_desc); + } + + for (peer_map_t::iterator itr = m_rx_peer_packets.begin(); itr != m_rx_peer_packets.end(); ++itr) { + vma_desc_list_t &peer_packets = itr->second; + // loop on packets of a peer + while (!peer_packets.empty()) { + // get packet from list and reuse them + mem_buf_desc_t* desc = peer_packets.get_and_pop_front(); + reuse_buffer(desc); + } + } + m_rx_peer_packets.clear(); + + while (!m_rx_ctl_reuse_list.empty()) { + mem_buf_desc_t* p_rx_pkt_desc = m_rx_ctl_reuse_list.get_and_pop_front(); + reuse_buffer(p_rx_pkt_desc); + } + + while (!m_rx_cb_dropped_list.empty()) { + mem_buf_desc_t* p_rx_pkt_desc = m_rx_cb_dropped_list.get_and_pop_front(); + reuse_buffer(p_rx_pkt_desc); + } + + return_reuse_buffers_postponed(); + + /* According to "UNIX Network Programming" third edition, + * setting SO_LINGER with timeout 0 prior to calling close() + * will cause the normal termination sequence not to be initiated. + * If l_onoff is nonzero and l_linger is zero, TCP aborts the connection when it is closed. 
+ * That is, TCP discards any data still remaining in the socket + * send buffer and sends an RST to the peer, not the normal four-packet connection + * termination sequence + * If process_shutdown is set as True do abort() with setting tcp state as CLOSED + */ + if (get_tcp_state(&m_pcb) != LISTEN && + (process_shutdown || (m_linger.l_onoff && !m_linger.l_linger))) { + abort_connection(); + } else { + tcp_close(&m_pcb); + + if (is_listen_socket) { + tcp_accept(&m_pcb, 0); + tcp_syn_handled((struct tcp_pcb_listen*)(&m_pcb), 0); + tcp_clone_conn((struct tcp_pcb_listen*)(&m_pcb), 0); + prepare_listen_to_close(); //close pending to accept sockets + } else { + tcp_recv(&m_pcb, sockinfo_tcp::rx_drop_lwip_cb); + tcp_sent(&m_pcb, 0); + } + + //todo should we do this each time we get into prepare_to_close ? + if (get_tcp_state(&m_pcb) != LISTEN) { + handle_socket_linger(); + } + } + + m_state = SOCKINFO_CLOSING; + NOTIFY_ON_EVENTS(this, EPOLLHUP); + + do_wakeup(); + + if (m_econtext) { + m_econtext->fd_closed(m_fd); + } + + unlock_tcp_con(); + + return (is_closable()); +} + +void sockinfo_tcp::handle_socket_linger() { + timeval start, current, elapsed; + long int linger_time_usec; + int poll_cnt = 0; + + linger_time_usec = (!m_linger.l_onoff /*|| !m_b_blocking */) ? 0 : m_linger.l_linger * USEC_PER_SEC; + si_tcp_logdbg("Going to linger for max time of %lu usec", linger_time_usec); + memset(&elapsed, 0,sizeof(elapsed)); + gettime(&start); + while ((tv_to_usec(&elapsed) <= linger_time_usec) && (m_pcb.unsent || m_pcb.unacked)) { + /* SOCKETXTREME WA: Don't call rx_wait() in order not to miss VMA events in socketxtreme_poll() flow. + * TBD: find proper solution! 
+ * rx_wait(poll_cnt, false); + * */ + if (!is_socketxtreme()) { + rx_wait(poll_cnt, false); + } + tcp_output(&m_pcb); + gettime(¤t); + tv_sub(¤t, &start, &elapsed); + } + + if (m_linger.l_onoff && (m_pcb.unsent || m_pcb.unacked)) { + if (m_linger.l_linger > 0 /*&& m_b_blocking*/) { + errno = ERR_WOULDBLOCK; + } + } +} + +// This method will be on syn received on the passive side of a TCP connection +void sockinfo_tcp::create_dst_entry() +{ + if (!m_p_connected_dst_entry) { + socket_data data = { m_fd, m_n_uc_ttl, m_pcb.tos, m_pcp }; + m_p_connected_dst_entry = new dst_entry_tcp(m_connected.get_in_addr(), + m_connected.get_in_port(), + m_bound.get_in_port(), + data, + m_ring_alloc_log_tx); + + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_connected_dst_entry) { + si_tcp_logerr("Failed to allocate m_p_connected_dst_entry"); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + if (!m_bound.is_anyaddr()) { + m_p_connected_dst_entry->set_bound_addr(m_bound.get_in_addr()); + } + if (m_so_bindtodevice_ip) { + m_p_connected_dst_entry->set_so_bindtodevice_addr(m_so_bindtodevice_ip); + } + } +} + +void sockinfo_tcp::lock_rx_q() +{ + lock_tcp_con(); +} + +void sockinfo_tcp::unlock_rx_q() +{ + unlock_tcp_con(); +} + +void sockinfo_tcp::tcp_timer() +{ + if (m_state == SOCKINFO_CLOSED) { + return; + } + + tcp_tmr(&m_pcb); + m_timer_pending = false; + + return_pending_rx_buffs(); + return_pending_tx_buffs(); +} + +bool sockinfo_tcp::prepare_dst_to_send(bool is_accepted_socket /* = false */) +{ + bool ret_val = false; + + if(m_p_connected_dst_entry) { + if (is_accepted_socket) { + ret_val = m_p_connected_dst_entry->prepare_to_send(m_so_ratelimit, true, false); + } else { + ret_val = m_p_connected_dst_entry->prepare_to_send(m_so_ratelimit, false, true); + } + +#ifdef DEFINED_TSO + if (ret_val) { + /* dst_entry has resolved tx ring, + * so it is a time to provide TSO information to PCB + */ + m_pcb.tso.max_buf_sz = std::min(safe_mce_sys().tx_buf_size, + 
m_p_connected_dst_entry->get_ring()->get_max_payload_sz()); + m_pcb.tso.max_payload_sz = m_p_connected_dst_entry->get_ring()->get_max_payload_sz(); + m_pcb.tso.max_header_sz = m_p_connected_dst_entry->get_ring()->get_max_header_sz(); + m_pcb.tso.max_send_sge = m_p_connected_dst_entry->get_ring()->get_max_send_sge(); + } +#endif /* DEFINED_TSO */ + } + return ret_val; +} + + +unsigned sockinfo_tcp::tx_wait(int & err, bool is_blocking) +{ + unsigned sz = tcp_sndbuf(&m_pcb); + int poll_count = 0; + si_tcp_logfunc("sz = %d rx_count=%d", sz, m_n_rx_pkt_ready_list_count); + err = 0; + while (is_rts() && (sz = tcp_sndbuf(&m_pcb)) == 0) { + err = rx_wait(poll_count, is_blocking); + //AlexV:Avoid from going to sleep, for the blocked socket of course, since + // progress engine may consume an arrived credit and it will not wakeup the + //transmit thread. + if (unlikely(err < 0)) { + return 0; + } + if (unlikely(g_b_exit)) { + errno = EINTR; + return 0; + } + if (is_blocking) { + /* force out TCP data to avoid spinning in this loop + * in case data is not seen on rx + */ + tcp_output(&m_pcb); + poll_count = 0; + } + } + si_tcp_logfunc("end sz=%d rx_count=%d", sz, m_n_rx_pkt_ready_list_count); + return sz; +} + +bool sockinfo_tcp::check_dummy_send_conditions(const int flags, const iovec* p_iov, const ssize_t sz_iov) +{ + // Calculate segment max length + uint8_t optflags = TF_SEG_OPTS_DUMMY_MSG; + uint16_t mss_local = MIN(m_pcb.mss, m_pcb.snd_wnd_max / 2); + mss_local = mss_local ? 
mss_local : m_pcb.mss; + + #if LWIP_TCP_TIMESTAMPS + if ((m_pcb.flags & TF_TIMESTAMP)) { + optflags |= TF_SEG_OPTS_TS; + mss_local = MAX(mss_local, LWIP_TCP_OPT_LEN_TS + 1); + } + #endif /* LWIP_TCP_TIMESTAMPS */ + + u16_t max_len = mss_local - LWIP_TCP_OPT_LENGTH(optflags); + + // Calculate window size + u32_t wnd = MIN(m_pcb.snd_wnd, m_pcb.cwnd); + + return !m_pcb.unsent && // Unsent queue should be empty + !(flags & MSG_MORE) && // Verify MSG_MORE flags is not set + sz_iov == 1 && // We want to prevent a case in which we call tcp_write() for scatter/gather element. + p_iov->iov_len && // We have data to sent + p_iov->iov_len <= max_len && // Data will not be split into more then one segment + wnd && // Window is not empty + (p_iov->iov_len + m_pcb.snd_lbb - m_pcb.lastack) <= wnd; // Window allows the dummy packet it to be sent +} + +void sockinfo_tcp::put_agent_msg(void *arg) +{ + sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)arg; + struct vma_msg_state data; + + /* Ignore listen socket at the moment */ + if (p_si_tcp->is_server() || get_tcp_state(&p_si_tcp->m_pcb) == LISTEN) { + return ; + } + + data.hdr.code = VMA_MSG_STATE; + data.hdr.ver = VMA_AGENT_VER; + data.hdr.pid = getpid(); + data.fid = p_si_tcp->get_fd(); + data.state = get_tcp_state(&p_si_tcp->m_pcb); + data.type = SOCK_STREAM; + data.src_ip = p_si_tcp->m_bound.get_in_addr(); + data.src_port = p_si_tcp->m_bound.get_in_port(); + data.dst_ip = p_si_tcp->m_connected.get_in_addr(); + data.dst_port = p_si_tcp->m_connected.get_in_port(); + + g_p_agent->put((const void*)&data, sizeof(data), (intptr_t)data.fid); +} + +ssize_t sockinfo_tcp::tx(vma_tx_call_attr_t &tx_arg) +{ + iovec* p_iov = tx_arg.attr.msg.iov; + ssize_t sz_iov = tx_arg.attr.msg.sz_iov; + int __flags = tx_arg.attr.msg.flags; + struct sockaddr *__dst = tx_arg.attr.msg.addr; + socklen_t __dstlen = tx_arg.attr.msg.len; + int errno_tmp = errno; + int total_tx = 0; + unsigned tx_size; + err_t err; + unsigned pos = 0; + int ret = 0; + int poll_count = 
0; + uint8_t apiflags = 0; + bool is_dummy = false; + bool block_this_run = false; + void *tx_ptr = NULL; + + /* Let allow OS to process all invalid scenarios to avoid any + * inconsistencies in setting errno values + */ + if (unlikely((m_sock_offload != TCP_SOCK_LWIP) || + (NULL == p_iov) || + (0 >= sz_iov) || + (NULL == p_iov[0].iov_base))) { + goto tx_packet_to_os; + } + +#ifdef VMA_TIME_MEASURE + TAKE_T_TX_START; +#endif + +retry_is_ready: + + if (unlikely(!is_rts())) { + + if (m_conn_state == TCP_CONN_CONNECTING) { + si_tcp_logdbg("TX while async-connect on socket go to poll"); + rx_wait_helper(poll_count, false); + if (m_conn_state == TCP_CONN_CONNECTED) goto retry_is_ready; + si_tcp_logdbg("TX while async-connect on socket return EAGAIN"); + errno = EAGAIN; + } else if (m_conn_state == TCP_CONN_RESETED) { + si_tcp_logdbg("TX on reseted socket"); + errno = ECONNRESET; + } else if (m_conn_state == TCP_CONN_ERROR) { + si_tcp_logdbg("TX on connection failed socket"); + errno = ECONNREFUSED; + } else { + si_tcp_logdbg("TX on disconnected socket"); + errno = EPIPE; + } + +#ifdef VMA_TIME_MEASURE + INC_ERR_TX_COUNT; +#endif + + return -1; + } + si_tcp_logfunc("tx: iov=%p niovs=%d", p_iov, sz_iov); + + if (unlikely(m_sysvar_rx_poll_on_tx_tcp)) { + rx_wait_helper(poll_count, false); + } + + lock_tcp_con(); + + is_dummy = IS_DUMMY_PACKET(__flags); + block_this_run = BLOCK_THIS_RUN(m_b_blocking, __flags); + + if (unlikely(is_dummy)) { + apiflags |= VMA_TX_PACKET_DUMMY; + if (!check_dummy_send_conditions(__flags, p_iov, sz_iov)) { + unlock_tcp_con(); + errno = EAGAIN; + return -1; + } + } + + if (tx_arg.opcode == TX_FILE) { + apiflags |= VMA_TX_FILE; + } + +#ifdef DEFINED_TCP_TX_WND_AVAILABILITY + if (!tcp_is_wnd_available(&m_pcb, p_iov[0].iov_len)) { + unlock_tcp_con(); + errno = EAGAIN; + return -1; + } +#endif + + for (int i = 0; i < sz_iov; i++) { + si_tcp_logfunc("iov:%d base=%p len=%d", i, p_iov[i].iov_base, p_iov[i].iov_len); + + pos = 0; + tx_ptr = 
p_iov[i].iov_base; + while (pos < p_iov[i].iov_len) { + tx_size = tcp_sndbuf(&m_pcb); + + /* Process a case when space is not available at the sending socket + * to hold the message to be transmitted + * Nonblocking socket: + * - no data is buffered: return (-1) and EAGAIN + * - some data is buffered: return number of bytes ready to be sent + * Blocking socket: + * - block until space is available + */ + if (tx_size == 0) { + if (unlikely(!is_rts())) { + si_tcp_logdbg("TX on disconnected socket"); + ret = -1; + errno = ECONNRESET; + goto err; + } + //force out TCP data before going on wait() + tcp_output(&m_pcb); + + /* Set return values for nonblocking socket and finish processing */ + if (!block_this_run) { + // non blocking socket should return inorder not to tx_wait() + if (total_tx > 0) { + m_tx_consecutive_eagain_count = 0; + goto done; + } + else { + m_tx_consecutive_eagain_count++; + if (m_tx_consecutive_eagain_count >= TX_CONSECUTIVE_EAGAIN_THREASHOLD) { + // in case of zero sndbuf and non-blocking just try once polling CQ for ACK + rx_wait(poll_count, false); + m_tx_consecutive_eagain_count = 0; + } + ret = -1; + errno = EAGAIN; + goto err; + } + } + + tx_size = tx_wait(ret, true); + } + + if (tx_size > p_iov[i].iov_len - pos) { + tx_size = p_iov[i].iov_len - pos; + } +retry_write: + if (unlikely(!is_rts())) { + si_tcp_logdbg("TX on disconnected socket"); + ret = -1; + errno = ECONNRESET; + goto err; + } + if (unlikely(g_b_exit)) { + ret = -1; + errno = EINTR; + si_tcp_logdbg("returning with: EINTR"); + goto err; + } + + err = tcp_write(&m_pcb, tx_ptr, tx_size, apiflags); + if (unlikely(err != ERR_OK)) { + if (unlikely(err == ERR_CONN)) { // happens when remote drops during big write + si_tcp_logdbg("connection closed: tx'ed = %d", total_tx); + shutdown(SHUT_WR); + if (total_tx > 0) { + goto done; + } + errno = EPIPE; + unlock_tcp_con(); +#ifdef VMA_TIME_MEASURE + INC_ERR_TX_COUNT; +#endif + return -1; + } + if (unlikely(err != ERR_MEM)) { + // we should 
not get here... + BULLSEYE_EXCLUDE_BLOCK_START + si_tcp_logpanic("tcp_write return: %d", err); + BULLSEYE_EXCLUDE_BLOCK_END + } + /* Set return values for nonblocking socket and finish processing */ + if (!block_this_run) { + if (total_tx > 0) { + goto done; + } else { + ret = -1; + errno = EAGAIN; + goto err; + } + } + + rx_wait(poll_count, true); + + //AlexV:Avoid from going to sleep, for the blocked socket of course, since + // progress engine may consume an arrived credit and it will not wakeup the + //transmit thread. + poll_count = 0; + + goto retry_write; + } + tx_ptr = (tx_arg.opcode == TX_FILE ? tx_ptr : (void *)((char *)tx_ptr + tx_size)); + pos += tx_size; + total_tx += tx_size; + } + } +done: + + tcp_output(&m_pcb); // force data out + + if (unlikely(is_dummy)) { + m_p_socket_stats->counters.n_tx_dummy++; + } else if (total_tx) { + m_p_socket_stats->counters.n_tx_sent_byte_count += total_tx; + m_p_socket_stats->counters.n_tx_sent_pkt_count++; + m_p_socket_stats->n_tx_ready_byte_count += total_tx; + } + + unlock_tcp_con(); + +#ifdef VMA_TIME_MEASURE + TAKE_T_TX_END; +#endif + /* Restore errno on function entry in case success */ + errno = errno_tmp; + + return total_tx; + +err: +#ifdef VMA_TIME_MEASURE + INC_ERR_TX_COUNT; +#endif + + // nothing send nb mode or got some other error + if (errno == EAGAIN) + m_p_socket_stats->counters.n_tx_drops++; + else + m_p_socket_stats->counters.n_tx_errors++; + unlock_tcp_con(); + return ret; + +tx_packet_to_os: +#ifdef VMA_TIME_MEASURE + INC_GO_TO_OS_TX_COUNT; +#endif + + ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __dst, __dstlen); + save_stats_tx_os(ret); + return ret; +} + +#ifdef DEFINED_TSO +err_t sockinfo_tcp::ip_output(struct pbuf *p, void* v_p_conn, uint16_t flags) +{ + sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)(((struct tcp_pcb*)v_p_conn)->my_container); + dst_entry *p_dst = p_si_tcp->m_p_connected_dst_entry; + int max_count = p_si_tcp->m_pcb.tso.max_send_sge; + tcp_iovec 
lwip_iovec[max_count]; + vma_send_attr attr = {(vma_wr_tx_packet_attr)0, 0}; + int count = 0; + + /* maximum number of sge can not exceed this value */ + while (p && (count < max_count)) { + lwip_iovec[count].iovec.iov_base = p->payload; + lwip_iovec[count].iovec.iov_len = p->len; + lwip_iovec[count].p_desc = (mem_buf_desc_t*)p; + p = p->next; + count++; + } + + /* Sanity check */ + if (unlikely(p)) { + vlog_printf(VLOG_ERROR, "Number of buffers in request exceed %d, so silently dropped.", max_count); + return ERR_OK; + } + + attr.flags = (vma_wr_tx_packet_attr)flags; + attr.mss = p_si_tcp->m_pcb.mss; + if (likely((p_dst->is_valid()))) { + p_dst->fast_send((struct iovec *)lwip_iovec, count, attr); + } else { + p_dst->slow_send((struct iovec *)lwip_iovec, count, attr, p_si_tcp->m_so_ratelimit); + } + + if (p_dst->try_migrate_ring(p_si_tcp->m_tcp_con_lock)) { + p_si_tcp->m_p_socket_stats->counters.n_tx_migrations++; + } + + if (is_set(attr.flags, VMA_TX_PACKET_REXMIT)) { + p_si_tcp->m_p_socket_stats->counters.n_tx_retransmits++; + } + + return ERR_OK; +} + +err_t sockinfo_tcp::ip_output_syn_ack(struct pbuf *p, void* v_p_conn, uint16_t flags) +{ + iovec iovec[64]; + struct iovec* p_iovec = iovec; + tcp_iovec tcp_iovec_temp; //currently we pass p_desc only for 1 size iovec, since for bigger size we allocate new buffers + sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)(((struct tcp_pcb*)v_p_conn)->my_container); + dst_entry *p_dst = p_si_tcp->m_p_connected_dst_entry; + int count = 1; + vma_wr_tx_packet_attr attr; + + //ASSERT_NOT_LOCKED(p_si_tcp->m_tcp_con_lock); + + if (likely(!p->next)) { // We should hit this case 99% of cases + tcp_iovec_temp.iovec.iov_base = p->payload; + tcp_iovec_temp.iovec.iov_len = p->len; + tcp_iovec_temp.p_desc = (mem_buf_desc_t*)p; + __log_dbg("p_desc=%p,p->len=%d ", p, p->len); + p_iovec = (struct iovec*)&tcp_iovec_temp; + } else { + for (count = 0; count < 64 && p; ++count) { + iovec[count].iov_base = p->payload; + iovec[count].iov_len = p->len; 
+ p = p->next; + } + + // We don't expect pbuf chain at all + if (p) { + vlog_printf(VLOG_ERROR, "pbuf chain size > 64!!! silently dropped."); + return ERR_OK; + } + } + + attr = (vma_wr_tx_packet_attr)flags; + if (is_set(attr, VMA_TX_PACKET_REXMIT)) + p_si_tcp->m_p_socket_stats->counters.n_tx_retransmits++; + + ((dst_entry_tcp*)p_dst)->slow_send_neigh(p_iovec, count, p_si_tcp->m_so_ratelimit); + + return ERR_OK; +} +#else +err_t sockinfo_tcp::ip_output(struct pbuf *p, void* v_p_conn, int is_rexmit, uint8_t is_dummy) +{ + iovec iovec[64]; + struct iovec* p_iovec = iovec; + tcp_iovec tcp_iovec_temp; //currently we pass p_desc only for 1 size iovec, since for bigger size we allocate new buffers + sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)(((struct tcp_pcb*)v_p_conn)->my_container); + dst_entry *p_dst = p_si_tcp->m_p_connected_dst_entry; + int count = 1; + + if (likely(!p->next)) { // We should hit this case 99% of cases + tcp_iovec_temp.iovec.iov_base = p->payload; + tcp_iovec_temp.iovec.iov_len = p->len; + tcp_iovec_temp.p_desc = (mem_buf_desc_t*)p; + p_iovec = (struct iovec*)&tcp_iovec_temp; + } else { + for (count = 0; count < 64 && p; ++count) { + iovec[count].iov_base = p->payload; + iovec[count].iov_len = p->len; + p = p->next; + } + + // We don't expect pbuf chain at all + if (p) { + vlog_printf(VLOG_ERROR, "pbuf chain size > 64!!! 
silently dropped."); + return ERR_OK; + } + } + + if (likely((p_dst->is_valid()))) { + p_dst->fast_send(p_iovec, count, is_dummy, false, is_rexmit); + } else { + p_dst->slow_send(p_iovec, count, is_dummy, p_si_tcp->m_so_ratelimit, false, is_rexmit); + } + + if (p_dst->try_migrate_ring(p_si_tcp->m_tcp_con_lock)) { + p_si_tcp->m_p_socket_stats->counters.n_tx_migrations++; + } + + if (is_rexmit) { + p_si_tcp->m_p_socket_stats->counters.n_tx_retransmits++; + } + + return ERR_OK; +} + +err_t sockinfo_tcp::ip_output_syn_ack(struct pbuf *p, void* v_p_conn, int is_rexmit, uint8_t is_dummy) +{ + NOT_IN_USE(is_dummy); + + iovec iovec[64]; + struct iovec* p_iovec = iovec; + tcp_iovec tcp_iovec_temp; //currently we pass p_desc only for 1 size iovec, since for bigger size we allocate new buffers + sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)(((struct tcp_pcb*)v_p_conn)->my_container); + dst_entry *p_dst = p_si_tcp->m_p_connected_dst_entry; + int count = 1; + + //ASSERT_NOT_LOCKED(p_si_tcp->m_tcp_con_lock); + + if (likely(!p->next)) { // We should hit this case 99% of cases + tcp_iovec_temp.iovec.iov_base = p->payload; + tcp_iovec_temp.iovec.iov_len = p->len; + tcp_iovec_temp.p_desc = (mem_buf_desc_t*)p; + __log_dbg("p_desc=%p,p->len=%d ", p, p->len); + p_iovec = (struct iovec*)&tcp_iovec_temp; + } else { + for (count = 0; count < 64 && p; ++count) { + iovec[count].iov_base = p->payload; + iovec[count].iov_len = p->len; + p = p->next; + } + + // We don't expect pbuf chain at all + if (p) { + vlog_printf(VLOG_ERROR, "pbuf chain size > 64!!! 
silently dropped."); + return ERR_OK; + } + } + + if (is_rexmit) + p_si_tcp->m_p_socket_stats->counters.n_tx_retransmits++; + + ((dst_entry_tcp*)p_dst)->slow_send_neigh(p_iovec, count, p_si_tcp->m_so_ratelimit); + + return ERR_OK; +} +#endif /* DEFINED_TSO */ + +/*static*/void sockinfo_tcp::tcp_state_observer(void* pcb_container, enum tcp_state new_state) +{ + sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)pcb_container; + p_si_tcp->m_p_socket_stats->tcp_state = new_state; + + /* Update daemon about actual state for offloaded connection */ + if (likely(p_si_tcp->m_sock_offload == TCP_SOCK_LWIP)) { + p_si_tcp->put_agent_msg((void *)p_si_tcp); + } +} + +uint16_t sockinfo_tcp::get_route_mtu(struct tcp_pcb *pcb) +{ + sockinfo_tcp *tcp_sock = (sockinfo_tcp *)pcb->my_container; + // in case of listen m_p_connected_dst_entry is still NULL + if (tcp_sock->m_p_connected_dst_entry) { + return tcp_sock->m_p_connected_dst_entry->get_route_mtu(); + } + route_result res; + + g_p_route_table_mgr->route_resolve(route_rule_table_key(pcb->local_ip.addr, pcb->remote_ip.addr, pcb->tos), res); + + if (res.mtu) { + vlog_printf(VLOG_DEBUG, "Using route mtu %u\n", res.mtu); + return res.mtu; + } + net_device_val* ndv = g_p_net_device_table_mgr->get_net_device_val(res.p_src); + if (ndv && ndv->get_mtu() > 0) { + return ndv->get_mtu(); + } + vlog_printf(VLOG_DEBUG, "Could not find device, mtu 0 is used\n"); + return 0; +} + +void sockinfo_tcp::err_lwip_cb(void *pcb_container, err_t err) +{ + + if (!pcb_container) return; + sockinfo_tcp *conn = (sockinfo_tcp *)pcb_container; + __log_dbg("[fd=%d] sock=%p lwip_pcb=%p err=%d\n", conn->m_fd, conn, &(conn->m_pcb), err); + + if (get_tcp_state(&conn->m_pcb) == LISTEN && err == ERR_RST) { + vlog_printf(VLOG_ERROR, "listen socket should not receive RST"); + return; + } + + if (conn->m_parent != NULL) { + //in case we got RST before we accepted the connection + int delete_fd = 0; + sockinfo_tcp *parent = conn->m_parent; + bool locked_by_me = false; + if 
(conn->m_tcp_con_lock.is_locked_by_me()) { + locked_by_me = true; + conn->unlock_tcp_con(); + } + if ((delete_fd = parent->handle_child_FIN(conn))) { + //close will clean sockinfo_tcp object and the opened OS socket + close(delete_fd); + if (locked_by_me) + conn->lock_tcp_con(); //todo sock and fd_collection destruction race? if so, conn might be invalid? delay close to internal thread? + return; + } + if (locked_by_me) + conn->lock_tcp_con(); + } + + /* + * In case we got RST from the other end we need to marked this socket as ready to read for epoll + */ + if ((conn->m_sock_state == TCP_SOCK_CONNECTED_RD + || conn->m_sock_state == TCP_SOCK_CONNECTED_RDWR + || conn->m_sock_state == TCP_SOCK_ASYNC_CONNECT + || conn->m_conn_state == TCP_CONN_CONNECTING) + && PCB_IN_ACTIVE_STATE(&conn->m_pcb)) { + if (err == ERR_RST) { + if (conn->m_sock_state == TCP_SOCK_ASYNC_CONNECT) + NOTIFY_ON_EVENTS(conn, (EPOLLIN|EPOLLERR|EPOLLHUP)); + else + NOTIFY_ON_EVENTS(conn, (EPOLLIN|EPOLLERR|EPOLLHUP|EPOLLRDHUP)); + /* TODO what about no route to host type of errors, need to add EPOLLERR in this case ? */ + } else { // ERR_TIMEOUT + NOTIFY_ON_EVENTS(conn, (EPOLLIN|EPOLLHUP)); + } + + + /* SOCKETXTREME comment: + * Add this fd to the ready fd list + * Note: No issue is expected in case socketxtreme_poll() usage because 'pv_fd_ready_array' is null + * in such case and as a result update_fd_array() call means nothing + */ + + io_mux_call::update_fd_array(conn->m_iomux_ready_fd_array, conn->m_fd); + } + + conn->m_conn_state = TCP_CONN_FAILED; + if (err == ERR_TIMEOUT) { + conn->m_conn_state = TCP_CONN_TIMEOUT; + conn->m_error_status = ETIMEDOUT; + } else if (err == ERR_RST) { + if (conn->m_sock_state == TCP_SOCK_ASYNC_CONNECT) { + conn->m_conn_state = TCP_CONN_ERROR; + conn->m_error_status = ECONNREFUSED; + } else { + conn->m_conn_state = TCP_CONN_RESETED; + } + } + + //Avoid binding twice in case of calling connect again after previous call failed. 
+ if (conn->m_sock_state != TCP_SOCK_BOUND) { //TODO: maybe we need to exclude more states? + conn->m_sock_state = TCP_SOCK_INITED; + } + + /* In general VMA should avoid calling unregister_timer_event() for the same timer handle twice. + * It is protected by checking m_timer_handle for NULL value that should be under lock. + * In order to save locking time a quick check is done first to ensure that indeed the specific + * timer has not been freed (avoiding the lock/unlock). + * The 2nd check is to avoid a race of the timer been freed while the lock has been taken. + */ + if (conn->m_timer_handle) { + conn->lock_tcp_con(); + if (conn->m_timer_handle) { + g_p_event_handler_manager->unregister_timer_event(conn, conn->m_timer_handle); + conn->m_timer_handle = NULL; + } + conn->unlock_tcp_con(); + } + + conn->do_wakeup(); +} + +bool sockinfo_tcp::process_peer_ctl_packets(vma_desc_list_t &peer_packets) +{ + // 2.1 loop on packets of a peer + while (!peer_packets.empty()) { + // 2.1.1 get packet from list and find its pcb + mem_buf_desc_t* desc = peer_packets.front(); + + if (0 != m_tcp_con_lock.trylock()) { + /* coverity[missing_unlock] */ + return false; + } + + struct tcp_pcb *pcb = get_syn_received_pcb(desc->rx.src.sin_addr.s_addr, + desc->rx.src.sin_port, + desc->rx.dst.sin_addr.s_addr, + desc->rx.dst.sin_port); + + // 2.1.2 get the pcb and sockinfo + if (!pcb) { + pcb = &m_pcb; + } + sockinfo_tcp *sock = (sockinfo_tcp*)pcb->my_container; + + if (sock == this) { // my socket - consider the backlog for the case I am listen socket + if (m_syn_received.size() >= (size_t)m_backlog && desc->rx.tcp.p_tcp_h->syn) { + m_tcp_con_lock.unlock(); + break; // skip to next peer + } else if (safe_mce_sys().tcp_max_syn_rate && desc->rx.tcp.p_tcp_h->syn) { + static tscval_t tsc_delay = get_tsc_rate_per_second() / safe_mce_sys().tcp_max_syn_rate; + tscval_t tsc_now; + gettimeoftsc(&tsc_now); + if (tsc_now - m_last_syn_tsc < tsc_delay) { + m_tcp_con_lock.unlock(); + break; + } else { 
+ m_last_syn_tsc = tsc_now; + } + } + } + else { // child socket from a listener context - switch to child lock + m_tcp_con_lock.unlock(); + if (sock->m_tcp_con_lock.trylock()) { + break; // skip to next peer + } + } + + // 2.1.3 process the packet and remove it from list + peer_packets.pop_front(); + sock->m_vma_thr = true; + // -- start loop + desc->inc_ref_count(); + L3_level_tcp_input((pbuf *)desc, pcb); + + if (desc->dec_ref_count() <= 1) + sock->m_rx_ctl_reuse_list.push_back(desc); // under sock's lock + // -- end loop + sock->m_vma_thr = false; + + sock->m_tcp_con_lock.unlock(); + + } + return true; +} + +void sockinfo_tcp::process_my_ctl_packets() +{ + si_tcp_logfunc(""); + + // 0. fast swap of m_rx_ctl_packets_list with temp_list under lock + vma_desc_list_t temp_list; + + m_rx_ctl_packets_list_lock.lock(); + temp_list.splice_tail(m_rx_ctl_packets_list); + m_rx_ctl_packets_list_lock.unlock(); + + + if (m_backlog == INT_MAX) { // this is a child - no need to demux packets + process_peer_ctl_packets(temp_list); + + if (!temp_list.empty()) { + m_rx_ctl_packets_list_lock.lock(); + m_rx_ctl_packets_list.splice_head(temp_list); + m_rx_ctl_packets_list_lock.unlock(); + } + return; + } + + // 1. demux packets in the listener list to map of list per peer (for child this will be skipped) + while (!temp_list.empty()) { + mem_buf_desc_t* desc = temp_list.get_and_pop_front(); + peer_key pk(desc->rx.src.sin_addr.s_addr, desc->rx.src.sin_port); + + + static const unsigned int MAX_SYN_RCVD = m_sysvar_tcp_ctl_thread > CTL_THREAD_DISABLE ? 
safe_mce_sys().sysctl_reader.get_tcp_max_syn_backlog() : 0; + // NOTE: currently, in case tcp_ctl_thread is disabled, only established backlog is supported (no syn-rcvd backlog) + unsigned int num_con_waiting = m_rx_peer_packets.size(); + + if (num_con_waiting < MAX_SYN_RCVD) { + m_rx_peer_packets[pk].push_back(desc); + } + else { // map is full + peer_map_t::iterator iter = m_rx_peer_packets.find(pk); + if(iter != m_rx_peer_packets.end()) + { + // entry already exists, we can concatenate our packet + iter->second.push_back(desc); + } + else { + // drop the packet + if (desc->dec_ref_count() <= 1) { + si_tcp_logdbg("CTL packet drop. established-backlog=%d (limit=%d) num_con_waiting=%d (limit=%d)", + (int)m_syn_received.size(), m_backlog, num_con_waiting, MAX_SYN_RCVD); + m_rx_ctl_reuse_list.push_back(desc); + } + } + } + } + + // 2. loop on map of peers and process list of packets per peer + peer_map_t::iterator itr = m_rx_peer_packets.begin(); + while (itr != m_rx_peer_packets.end()) { + vma_desc_list_t &peer_packets = itr->second; + if (!process_peer_ctl_packets(peer_packets)) + return; + // prepare for next map iteration + if (peer_packets.empty()) + m_rx_peer_packets.erase(itr++); // // advance itr before invalidating it by erase (itr++ returns the value before advance) + else + ++itr; + } +} + +void sockinfo_tcp::process_children_ctl_packets() +{ + // handle children + while (!m_ready_pcbs.empty()) { + if (m_tcp_con_lock.trylock()) { + return; + } + ready_pcb_map_t::iterator itr = m_ready_pcbs.begin(); + if (itr == m_ready_pcbs.end()) { + /* coverity[double_unlock] TODO: RM#1049980 */ + m_tcp_con_lock.unlock(); + break; + } + sockinfo_tcp *sock = (sockinfo_tcp*)itr->first->my_container; + /* coverity[double_unlock] TODO: RM#1049980 */ + m_tcp_con_lock.unlock(); + + if (sock->m_tcp_con_lock.trylock()) { + break; + } + sock->m_vma_thr = true; + + while (!sock->m_rx_ctl_packets_list.empty()) { + sock->m_rx_ctl_packets_list_lock.lock(); + if 
(sock->m_rx_ctl_packets_list.empty()) { + sock->m_rx_ctl_packets_list_lock.unlock(); + break; + } + mem_buf_desc_t* desc = sock->m_rx_ctl_packets_list.get_and_pop_front(); + sock->m_rx_ctl_packets_list_lock.unlock(); + desc->inc_ref_count(); + L3_level_tcp_input((pbuf *)desc, &sock->m_pcb); + if (desc->dec_ref_count() <= 1) //todo reuse needed? + sock->m_rx_ctl_reuse_list.push_back(desc); + } + sock->m_vma_thr = false; + sock->m_tcp_con_lock.unlock(); + + if (m_tcp_con_lock.trylock()) { + break; + } + + /* coverity[double_lock] TODO: RM#1049980 */ + sock->m_rx_ctl_packets_list_lock.lock(); + if (sock->m_rx_ctl_packets_list.empty()) + m_ready_pcbs.erase(&sock->m_pcb); + sock->m_rx_ctl_packets_list_lock.unlock(); + + /* coverity[double_unlock] TODO: RM#1049980 */ + m_tcp_con_lock.unlock(); + } +} + +void sockinfo_tcp::process_reuse_ctl_packets() +{ + while (!m_rx_ctl_reuse_list.empty()) { + if (m_tcp_con_lock.trylock()) { + return; + } + mem_buf_desc_t* desc = m_rx_ctl_reuse_list.get_and_pop_front(); + reuse_buffer(desc); + /* coverity[double_unlock] TODO: RM#1049980 */ + m_tcp_con_lock.unlock(); + } +} + +void sockinfo_tcp::process_rx_ctl_packets() +{ + si_tcp_logfunc(""); + + process_my_ctl_packets(); + process_children_ctl_packets(); + process_reuse_ctl_packets(); +} + +//Execute TCP timers of this connection +void sockinfo_tcp::handle_timer_expired(void* user_data) +{ + NOT_IN_USE(user_data); + si_tcp_logfunc(""); + + if (m_sysvar_tcp_ctl_thread > CTL_THREAD_DISABLE) + process_rx_ctl_packets(); + + if (m_sysvar_internal_thread_tcp_timer_handling == INTERNAL_THREAD_TCP_TIMER_HANDLING_DEFERRED) { + // DEFERRED. 
if Internal thread is here first and m_timer_pending is false it jsut + // sets it as true for its next iteration (within 100ms), letting + // application threads have a chance of running tcp_timer() + if (m_timer_pending) { + if (m_tcp_con_lock.trylock()) { + return; + } + tcp_timer(); + m_tcp_con_lock.unlock(); + } + m_timer_pending = true; + } + else { // IMMEDIATE + // Set the pending flag before getting the lock, so in the rare case of + // a race with unlock_tcp_con(), the timer will be called twice. If we set + // the flag after trylock(), the timer may not be called in case of a race. + + // any thread (internal or application) will try locking + // and running the tcp_timer + m_timer_pending = true; + if (m_tcp_con_lock.trylock()) { + return; + } + + tcp_timer(); + m_tcp_con_lock.unlock(); + } +} + +void sockinfo_tcp::abort_connection() +{ + tcp_abort(&(m_pcb)); +} + +int sockinfo_tcp::handle_child_FIN(sockinfo_tcp* child_conn) +{ + lock_tcp_con(); + + sock_list_t::iterator conns_iter; + for(conns_iter = m_accepted_conns.begin(); conns_iter != m_accepted_conns.end(); conns_iter++) { + if (*(conns_iter) == child_conn) { + unlock_tcp_con(); + return 0; //don't close conn, it can be accepted + } + } + + if (m_ready_pcbs.find(&child_conn->m_pcb) != m_ready_pcbs.end()) { + m_ready_pcbs.erase(&child_conn->m_pcb); + } + + // remove the connection from m_syn_received and close it by caller + class flow_tuple key; + sockinfo_tcp::create_flow_tuple_key_from_pcb(key, &(child_conn->m_pcb)); + if (!m_syn_received.erase(key)) { + si_tcp_logfunc("Can't find the established pcb in syn received list"); + } + else { + si_tcp_logdbg("received FIN before accept() was called"); + m_received_syn_num--; + child_conn->m_parent = NULL; + unlock_tcp_con(); + child_conn->lock_tcp_con(); + child_conn->abort_connection(); + child_conn->unlock_tcp_con(); + return (child_conn->get_fd()); + } + unlock_tcp_con(); + return 0; +} + +err_t sockinfo_tcp::ack_recvd_lwip_cb(void *arg, struct 
tcp_pcb *tpcb, u16_t ack) +{ + sockinfo_tcp *conn = (sockinfo_tcp *)arg; + + NOT_IN_USE(tpcb); /* to suppress warning in case VMA_MAX_DEFINED_LOG_LEVEL */ + assert((uintptr_t)tpcb->my_container == (uintptr_t)arg); + + vlog_func_enter(); + + ASSERT_LOCKED(conn->m_tcp_con_lock); + + conn->m_p_socket_stats->n_tx_ready_byte_count -= ack; + + NOTIFY_ON_EVENTS(conn, EPOLLOUT); + + vlog_func_exit(); + + return ERR_OK; +} + +err_t sockinfo_tcp::rx_lwip_cb(void *arg, struct tcp_pcb *pcb, + struct pbuf *p, err_t err) +{ + + sockinfo_tcp *conn = (sockinfo_tcp *)arg; + uint32_t bytes_to_tcp_recved, non_tcp_receved_bytes_remaining, bytes_to_shrink; + int rcv_buffer_space; + + NOT_IN_USE(pcb); + assert((uintptr_t)pcb->my_container == (uintptr_t)arg); + + vlog_func_enter(); + + ASSERT_LOCKED(conn->m_tcp_con_lock); + + //if is FIN + if (unlikely(!p)) { + + if (conn->is_server()) { + vlog_printf(VLOG_ERROR, "listen socket should not receive FIN"); + return ERR_OK; + } + + NOTIFY_ON_EVENTS(conn, EPOLLIN|EPOLLRDHUP); + + /* SOCKETXTREME comment: + * Add this fd to the ready fd list + * Note: No issue is expected in case socketxtreme_poll() usage because 'pv_fd_ready_array' is null + * in such case and as a result update_fd_array() call means nothing + */ + io_mux_call::update_fd_array(conn->m_iomux_ready_fd_array, conn->m_fd); + conn->do_wakeup(); + + //tcp_close(&(conn->m_pcb)); + //TODO: should be a move into half closed state (shut rx) instead of complete close + tcp_shutdown(&(conn->m_pcb), 1, 0); + __log_dbg("[fd=%d] null pbuf sock(%p %p) err=%d\n", conn->m_fd, &(conn->m_pcb), pcb, err); + + if (conn->is_rts() || ((conn->m_sock_state == TCP_SOCK_ASYNC_CONNECT) && (conn->m_conn_state == TCP_CONN_CONNECTED))) { + conn->m_sock_state = TCP_SOCK_CONNECTED_WR; + } else { + conn->m_sock_state = TCP_SOCK_BOUND; + } + /* + * We got FIN, means that we will not receive any new data + * Need to remove the callback functions + */ + tcp_recv(&(conn->m_pcb), sockinfo_tcp::rx_drop_lwip_cb); + + 
if (conn->m_parent != NULL) { + //in case we got FIN before we accepted the connection + int delete_fd = 0; + sockinfo_tcp *parent = conn->m_parent; + /* TODO need to add some refcount inside parent in case parent and child are closed together*/ + conn->unlock_tcp_con(); + if ((delete_fd = parent->handle_child_FIN(conn))) { + //close will clean sockinfo_tcp object and the opened OS socket + close(delete_fd); + conn->lock_tcp_con(); //todo sock and fd_collection destruction race? if so, conn might be invalid? delay close to internal thread? + return ERR_ABRT; + } + conn->lock_tcp_con(); + } + return ERR_OK; + } + if (unlikely(err != ERR_OK)) { + // notify io_mux + NOTIFY_ON_EVENTS(conn, EPOLLERR); + + conn->do_wakeup(); + vlog_printf(VLOG_ERROR, "%s:%d %s\n", __func__, __LINE__, "recv error!!!\n"); + pbuf_free(p); + conn->m_sock_state = TCP_SOCK_INITED; + return err; + } + mem_buf_desc_t *p_first_desc = (mem_buf_desc_t *)p; + + p_first_desc->rx.sz_payload = p->tot_len; + p_first_desc->rx.n_frags = 0; + + mem_buf_desc_t *p_curr_desc = p_first_desc; + + pbuf *p_curr_buff = p; + conn->m_connected.get_sa(p_first_desc->rx.src); + + while (p_curr_buff) { + p_curr_desc->rx.context = conn; + p_first_desc->rx.n_frags++; + p_curr_desc->rx.frag.iov_base = p_curr_buff->payload; + p_curr_desc->rx.frag.iov_len = p_curr_buff->len; + p_curr_desc->p_next_desc = (mem_buf_desc_t *)p_curr_buff->next; + conn->process_timestamps(p_curr_desc); + p_curr_buff = p_curr_buff->next; + p_curr_desc = p_curr_desc->p_next_desc; + } + + vma_recv_callback_retval_t callback_retval = VMA_PACKET_RECV; + + if (conn->m_rx_callback && !conn->m_vma_thr && !conn->m_n_rx_pkt_ready_list_count) { + mem_buf_desc_t *tmp; + vma_info_t pkt_info; + int nr_frags = 0; + + pkt_info.struct_sz = sizeof(pkt_info); + pkt_info.packet_id = (void*)p_first_desc; + pkt_info.src = &p_first_desc->rx.src; + pkt_info.dst = &p_first_desc->rx.dst; + pkt_info.socket_ready_queue_pkt_count = 
conn->m_p_socket_stats->n_rx_ready_pkt_count; + pkt_info.socket_ready_queue_byte_count = conn->m_p_socket_stats->n_rx_ready_byte_count; + + if (conn->m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { + pkt_info.hw_timestamp = p_first_desc->rx.timestamps.hw; + } + if (p_first_desc->rx.timestamps.sw.tv_sec) { + pkt_info.sw_timestamp = p_first_desc->rx.timestamps.sw; + } + + // fill io vector array with data buffer pointers + iovec iov[p_first_desc->rx.n_frags]; + nr_frags = 0; + for (tmp = p_first_desc; tmp; tmp = tmp->p_next_desc) { + iov[nr_frags++] = tmp->rx.frag; + } + + // call user callback + callback_retval = conn->m_rx_callback(conn->m_fd, nr_frags, iov, &pkt_info, conn->m_rx_callback_context); + } + + if (callback_retval == VMA_PACKET_DROP) { + conn->m_rx_cb_dropped_list.push_back(p_first_desc); + + // In ZERO COPY case we let the user's application manage the ready queue + } + else { + if (conn->is_socketxtreme()) { + /* Update vma_completion with + * VMA_SOCKETXTREME_PACKET related data + */ + struct vma_completion_t *completion; + struct vma_buff_t *buf_lst; + + if (conn->m_socketxtreme.completion) { + completion = conn->m_socketxtreme.completion; + buf_lst = conn->m_socketxtreme.last_buff_lst; + } else { + completion = &conn->m_socketxtreme.ec.completion; + buf_lst = conn->m_socketxtreme.ec.last_buff_lst; + } + + if (!buf_lst) { + completion->packet.buff_lst = (struct vma_buff_t*)p_first_desc; + completion->packet.total_len = p->tot_len; + completion->src = p_first_desc->rx.src; + completion->packet.num_bufs = p_first_desc->rx.n_frags; + + if (conn->m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { + completion->packet.hw_timestamp = p_first_desc->rx.timestamps.hw; + } + + NOTIFY_ON_EVENTS(conn, VMA_SOCKETXTREME_PACKET); + conn->save_stats_rx_offload(completion->packet.total_len); + } + else { + mem_buf_desc_t* prev_lst_tail_desc = (mem_buf_desc_t*)buf_lst; + mem_buf_desc_t* list_head_desc = (mem_buf_desc_t*)completion->packet.buff_lst; + 
prev_lst_tail_desc->p_next_desc = p_first_desc; + list_head_desc->rx.n_frags += p_first_desc->rx.n_frags; + p_first_desc->rx.n_frags = 0; + completion->packet.total_len += p->tot_len; + completion->packet.num_bufs += list_head_desc->rx.n_frags; + pbuf_cat((pbuf*)completion->packet.buff_lst, p); + } + } + else { + if (callback_retval == VMA_PACKET_RECV) { + // Save rx packet info in our ready list + conn->m_rx_pkt_ready_list.push_back(p_first_desc); + conn->m_n_rx_pkt_ready_list_count++; + conn->m_rx_ready_byte_count += p->tot_len; + conn->m_p_socket_stats->n_rx_ready_byte_count += p->tot_len; + conn->m_p_socket_stats->n_rx_ready_pkt_count++; + conn->m_p_socket_stats->counters.n_rx_ready_pkt_max = + max((uint32_t)conn->m_p_socket_stats->n_rx_ready_pkt_count, + conn->m_p_socket_stats->counters.n_rx_ready_pkt_max); + conn->m_p_socket_stats->counters.n_rx_ready_byte_max = + max((uint32_t)conn->m_p_socket_stats->n_rx_ready_byte_count, + conn->m_p_socket_stats->counters.n_rx_ready_byte_max); + } + // notify io_mux + NOTIFY_ON_EVENTS(conn, EPOLLIN); + } + io_mux_call::update_fd_array(conn->m_iomux_ready_fd_array, conn->m_fd); + + if (callback_retval != VMA_PACKET_HOLD) { + //OLG: Now we should wakeup all threads that are sleeping on this socket. 
+ conn->do_wakeup(); + } else { + conn->m_p_socket_stats->n_rx_zcopy_pkt_count++; + } + } + + /* + * RCVBUFF Accounting: tcp_recved here(stream into the 'internal' buffer) only if the user buffer is not 'filled' + */ + rcv_buffer_space = max(0, conn->m_rcvbuff_max - conn->m_rcvbuff_current - (int)conn->m_pcb.rcv_wnd_max_desired); + if (callback_retval == VMA_PACKET_DROP) { + bytes_to_tcp_recved = (int)p->tot_len; + } else { + bytes_to_tcp_recved = min(rcv_buffer_space, (int)p->tot_len); + conn->m_rcvbuff_current += p->tot_len; + } + + if (likely(bytes_to_tcp_recved > 0)) { + tcp_recved(&(conn->m_pcb), bytes_to_tcp_recved); + } + + non_tcp_receved_bytes_remaining = p->tot_len - bytes_to_tcp_recved; + + if (non_tcp_receved_bytes_remaining > 0) { + bytes_to_shrink = 0; + if (conn->m_pcb.rcv_wnd_max > conn->m_pcb.rcv_wnd_max_desired) { + bytes_to_shrink = MIN(conn->m_pcb.rcv_wnd_max - conn->m_pcb.rcv_wnd_max_desired, non_tcp_receved_bytes_remaining); + conn->m_pcb.rcv_wnd_max -= bytes_to_shrink; + } + conn->m_rcvbuff_non_tcp_recved += non_tcp_receved_bytes_remaining - bytes_to_shrink; + } + + vlog_func_exit(); + return ERR_OK; +} + +err_t sockinfo_tcp::rx_drop_lwip_cb(void *arg, struct tcp_pcb *tpcb, + struct pbuf *p, err_t err) +{ + NOT_IN_USE(tpcb); + NOT_IN_USE(arg); + + vlog_func_enter(); + + if (!p) { + return ERR_OK; + } + if (unlikely(err != ERR_OK)) { //not suppose to get here + return err; + } + + return ERR_CONN; +} + +int sockinfo_tcp::handle_rx_error(bool is_blocking) +{ + int ret = -1; + + lock_tcp_con(); + + if (g_b_exit) { + errno = EINTR; + si_tcp_logdbg("returning with: EINTR"); + } else if (!is_rtr()) { + if (m_conn_state == TCP_CONN_INIT) { + si_tcp_logdbg("RX on never connected socket"); + errno = ENOTCONN; + } else if (m_conn_state == TCP_CONN_CONNECTING) { + si_tcp_logdbg("RX while async-connect on socket"); + errno = EAGAIN; + } else if (m_conn_state == TCP_CONN_RESETED) { + si_tcp_logdbg("RX on reseted socket"); + m_conn_state = TCP_CONN_FAILED; 
+ errno = ECONNRESET; + } else { + si_tcp_logdbg("RX on disconnected socket - EOF"); + ret = 0; + } + } + + if ((errno == EBUSY || errno == EWOULDBLOCK) && !is_blocking) { + errno = EAGAIN; + } + +#ifdef VMA_TIME_MEASURE + INC_ERR_RX_COUNT; +#endif + + if (errno == EAGAIN) { + m_p_socket_stats->counters.n_rx_eagain++; + } else { + m_p_socket_stats->counters.n_rx_errors++; + } + + unlock_tcp_con(); + + return ret; +} + +// +// FIXME: we should not require lwip lock for rx +// +ssize_t sockinfo_tcp::rx(const rx_call_t call_type, iovec* p_iov, ssize_t sz_iov, int* p_flags, sockaddr *__from, socklen_t *__fromlen, struct msghdr *__msg) +{ + int errno_tmp = errno; + int total_rx = 0; + int poll_count = 0; + int bytes_to_tcp_recved; + size_t total_iov_sz = 1; + int out_flags = 0; + int in_flags = *p_flags; + bool block_this_run = BLOCK_THIS_RUN(m_b_blocking, in_flags); + + m_loops_timer.start(); + + si_tcp_logfuncall(""); + if (unlikely(m_sock_offload != TCP_SOCK_LWIP)) { + int ret = 0; +#ifdef VMA_TIME_MEASURE + INC_GO_TO_OS_RX_COUNT; +#endif + ret = socket_fd_api::rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg); + save_stats_rx_os(ret); + return ret; + } + +#ifdef VMA_TIME_MEASURE + TAKE_T_RX_START; +#endif + + if (unlikely((in_flags & MSG_WAITALL) && !(in_flags & MSG_PEEK))) { + total_iov_sz = 0; + for (int i = 0; i < sz_iov; i++) { + total_iov_sz += p_iov[i].iov_len; + } + if (total_iov_sz == 0) + return 0; + } + + si_tcp_logfunc("rx: iov=%p niovs=%d", p_iov, sz_iov); + /* poll rx queue till we have something */ + lock_tcp_con(); + return_reuse_buffers_postponed(); + unlock_tcp_con(); + + while (m_rx_ready_byte_count < total_iov_sz) { + if (unlikely(g_b_exit ||!is_rtr() || (rx_wait_lockless(poll_count, block_this_run) < 0))) { + return handle_rx_error(block_this_run); + } + } + + lock_tcp_con(); + + si_tcp_logfunc("something in rx queues: %d %p", m_n_rx_pkt_ready_list_count, m_rx_pkt_ready_list.front()); + + total_rx = dequeue_packet(p_iov, sz_iov, 
(sockaddr_in *)__from, __fromlen, in_flags, &out_flags); + if (__msg) handle_cmsg(__msg); + + /* + * RCVBUFF Accounting: Going 'out' of the internal buffer: if some bytes are not tcp_recved yet - do that. + * The packet might not be 'acked' (tcp_recved) + * + */ + if (!(in_flags & (MSG_PEEK | MSG_VMA_ZCOPY))) { + m_rcvbuff_current -= total_rx; + + // data that was not tcp_recved should do it now. + if ( m_rcvbuff_non_tcp_recved > 0 ) { + bytes_to_tcp_recved = min(m_rcvbuff_non_tcp_recved, total_rx); + tcp_recved(&m_pcb, bytes_to_tcp_recved); + m_rcvbuff_non_tcp_recved -= bytes_to_tcp_recved; + } + } + + unlock_tcp_con(); + + si_tcp_logfunc("rx completed, %d bytes sent", total_rx); + +#ifdef VMA_TIME_MEASURE + if (0 < total_rx) + TAKE_T_RX_END; +#endif + /* Restore errno on function entry in case success */ + errno = errno_tmp; + + return total_rx; +} + +void sockinfo_tcp::register_timer() +{ + if( m_timer_handle == NULL) { + m_timer_handle = g_p_event_handler_manager->register_timer_event(safe_mce_sys().tcp_timer_resolution_msec , this, PERIODIC_TIMER, 0, g_tcp_timers_collection); + }else { + si_tcp_logdbg("register_timer was called more than once. 
Something might be wrong, or connect was called twice."); + } +} + +void sockinfo_tcp::queue_rx_ctl_packet(struct tcp_pcb* pcb, mem_buf_desc_t *p_desc) +{ + /* in tcp_ctl_thread mode, always lock the child first*/ + p_desc->inc_ref_count(); + if (!p_desc->rx.tcp.gro) + init_pbuf_custom(p_desc); + else + p_desc->rx.tcp.gro = 0; + sockinfo_tcp *sock = (sockinfo_tcp*)pcb->my_container; + + sock->m_rx_ctl_packets_list_lock.lock(); + sock->m_rx_ctl_packets_list.push_back(p_desc); + sock->m_rx_ctl_packets_list_lock.unlock(); + + if (sock != this) { + m_ready_pcbs[pcb] = 1; + } + + if (m_sysvar_tcp_ctl_thread == CTL_THREAD_WITH_WAKEUP) + g_p_event_handler_manager->wakeup_timer_event(this, m_timer_handle); + + return; +} + +bool sockinfo_tcp::rx_input_cb(mem_buf_desc_t* p_rx_pkt_mem_buf_desc_info, void* pv_fd_ready_array) +{ + struct tcp_pcb* pcb = NULL; + int dropped_count = 0; + + lock_tcp_con(); + + m_iomux_ready_fd_array = (fd_array_t*)pv_fd_ready_array; + + /* Try to process socketxtreme_poll() completion directly */ + if (p_rx_pkt_mem_buf_desc_info->rx.socketxtreme_polled) { + m_socketxtreme.completion = m_p_rx_ring->get_comp(); + m_socketxtreme.last_buff_lst = NULL; + } + + if (unlikely(get_tcp_state(&m_pcb) == LISTEN)) { + pcb = get_syn_received_pcb(p_rx_pkt_mem_buf_desc_info->rx.src.sin_addr.s_addr, + p_rx_pkt_mem_buf_desc_info->rx.src.sin_port, + p_rx_pkt_mem_buf_desc_info->rx.dst.sin_addr.s_addr, + p_rx_pkt_mem_buf_desc_info->rx.dst.sin_port); + bool established_backlog_full = false; + if (!pcb) { + pcb = &m_pcb; + + /// respect TCP listen backlog - See redmine issue #565962 + /// distinguish between backlog of established sockets vs. backlog of syn-rcvd + static const unsigned int MAX_SYN_RCVD = m_sysvar_tcp_ctl_thread > CTL_THREAD_DISABLE ? 
safe_mce_sys().sysctl_reader.get_tcp_max_syn_backlog() : 0; + // NOTE: currently, in case tcp_ctl_thread is disabled, only established backlog is supported (no syn-rcvd backlog) + + unsigned int num_con_waiting = m_rx_peer_packets.size(); + + // 1st - check established backlog + if(num_con_waiting > 0 || (m_syn_received.size() >= (size_t)m_backlog && p_rx_pkt_mem_buf_desc_info->rx.tcp.p_tcp_h->syn) ) { + established_backlog_full = true; + } + + // 2nd - check that we allow secondary backlog (don't check map of peer packets to avoid races) + if (MAX_SYN_RCVD == 0 && established_backlog_full) { + // TODO: consider check if we can now drain into Q of established + si_tcp_logdbg("SYN/CTL packet drop. established-backlog=%d (limit=%d) num_con_waiting=%d (limit=%d)", + (int)m_syn_received.size(), m_backlog, num_con_waiting, MAX_SYN_RCVD); + m_socketxtreme.completion = NULL; + m_socketxtreme.last_buff_lst = NULL; + unlock_tcp_con(); + return false;// return without inc_ref_count() => packet will be dropped + } + } + if (m_sysvar_tcp_ctl_thread > CTL_THREAD_DISABLE || established_backlog_full) { /* 2nd check only worth when MAX_SYN_RCVD>0 for non tcp_ctl_thread */ + queue_rx_ctl_packet(pcb, p_rx_pkt_mem_buf_desc_info); // TODO: need to trigger queue pulling from accept in case no tcp_ctl_thread + m_socketxtreme.completion = NULL; + m_socketxtreme.last_buff_lst = NULL; + unlock_tcp_con(); + return true; + } + } + else { + pcb = &m_pcb; + } + p_rx_pkt_mem_buf_desc_info->inc_ref_count(); + + if (!p_rx_pkt_mem_buf_desc_info->rx.tcp.gro) init_pbuf_custom(p_rx_pkt_mem_buf_desc_info); + else p_rx_pkt_mem_buf_desc_info->rx.tcp.gro = 0; + + dropped_count = m_rx_cb_dropped_list.size(); + + sockinfo_tcp *sock = (sockinfo_tcp*)pcb->my_container; + if (sock != this) { + sock->m_tcp_con_lock.lock(); + } + + sock->m_vma_thr = p_rx_pkt_mem_buf_desc_info->rx.is_vma_thr; +#ifdef RDTSC_MEASURE_RX_READY_POLL_TO_LWIP + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_READY_POLL_TO_LWIP]); 
+#endif //RDTSC_MEASURE_RX_READY_POLL_TO_LWIP + +#ifdef RDTSC_MEASURE_RX_LWIP + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_MEASURE_RX_LWIP]); +#endif //RDTSC_MEASURE_RX_LWIP + L3_level_tcp_input((pbuf *)p_rx_pkt_mem_buf_desc_info, pcb); + +#ifdef RDTSC_MEASURE_RX_LWIP + RDTSC_TAKE_END(g_rdtsc_instr_info_arr[RDTSC_FLOW_MEASURE_RX_LWIP]); +#endif //RDTSC_MEASURE_RX_LWIP + +#ifdef RDTSC_MEASURE_RX_LWIP_TO_RECEVEFROM + RDTSC_TAKE_START(g_rdtsc_instr_info_arr[RDTSC_FLOW_RX_LWIP_TO_RECEVEFROM]); +#endif //RDTSC_MEASURE_RX_LWIP_TO_RECEVEFROM + sock->m_vma_thr = false; + + if (sock != this) { + if (unlikely(sock->m_socketxtreme.completion)) { + sock->m_socketxtreme.completion = NULL; + sock->m_socketxtreme.last_buff_lst = NULL; + } + sock->m_tcp_con_lock.unlock(); + } + + m_iomux_ready_fd_array = NULL; + m_socketxtreme.completion = NULL; + m_socketxtreme.last_buff_lst = NULL; + p_rx_pkt_mem_buf_desc_info->rx.socketxtreme_polled = false; + + while (dropped_count--) { + mem_buf_desc_t* p_rx_pkt_desc = m_rx_cb_dropped_list.get_and_pop_front(); + reuse_buffer(p_rx_pkt_desc); + } + + unlock_tcp_con(); + + return true; +} + +/** + * try to connect to the dest over RDMA cm + * try fallback to the OS connect (TODO) + */ +int sockinfo_tcp::connect(const sockaddr *__to, socklen_t __tolen) +{ + int ret = 0; + + NOT_IN_USE(__tolen); + + lock_tcp_con(); + + /* Connection was closed by RST, timeout, ICMP error + * or another process disconnected us. + * Socket should be recreated. 
+ */ + if (report_connected && is_errorable(&ret)) { + errno = ECONNABORTED; + unlock_tcp_con(); + return -1; + } + + // Calling connect more than once should return error codes + if (m_sock_state != TCP_SOCK_INITED && m_sock_state != TCP_SOCK_BOUND) { + switch (m_sock_state) { + case TCP_SOCK_CONNECTED_RD: + case TCP_SOCK_CONNECTED_WR: + case TCP_SOCK_CONNECTED_RDWR: + if (report_connected) { + report_connected = false; + unlock_tcp_con(); + return 0; + } + errno = EISCONN; + break; + case TCP_SOCK_ASYNC_CONNECT: + errno = EALREADY; + break; + default: + // print error so we can better track apps not following our assumptions ;) + si_tcp_logerr("socket is in wrong state for connect: %d", m_sock_state); + errno = EADDRINUSE; + break; + } + unlock_tcp_con(); + return -1; + } + + // take local ip from new sock and local port from acceptor + if (m_sock_state != TCP_SOCK_BOUND && bind(m_bound.get_p_sa(), m_bound.get_socklen()) == -1) { + setPassthrough(); + unlock_tcp_con(); + si_tcp_logdbg("non offloaded socket --> connect only via OS"); + return -1; + } + + // setup peer address + // TODO: Currenlty we don't check the if __to is supported and legal + // socket-redirect probably should do this + m_connected.set(*((sockaddr *)__to)); + + create_dst_entry(); + if (!m_p_connected_dst_entry) { + setPassthrough(); + unlock_tcp_con(); + si_tcp_logdbg("non offloaded socket --> connect only via OS"); + return -1; + } + + prepare_dst_to_send(false); + + // update it after route was resolved and device was updated + m_p_socket_stats->bound_if = m_p_connected_dst_entry->get_src_addr(); + + sockaddr_in remote_addr; + remote_addr.sin_family = AF_INET; + remote_addr.sin_addr.s_addr = m_p_connected_dst_entry->get_dst_addr(); + remote_addr.sin_port = m_p_connected_dst_entry->get_dst_port(); + sock_addr local_addr(m_bound.get_p_sa()); + if (local_addr.is_anyaddr()) + local_addr.set_in_addr(m_p_connected_dst_entry->get_src_addr()); + + if (!m_p_connected_dst_entry->is_offloaded() + || 
find_target_family(ROLE_TCP_CLIENT, (sockaddr*)&remote_addr, local_addr.get_p_sa()) != TRANS_VMA) { + setPassthrough(); + unlock_tcp_con(); + si_tcp_logdbg("non offloaded socket --> connect only via OS"); + return -1; + } else { + notify_epoll_context_fd_is_offloaded(); //remove fd from os epoll + } + + if (m_bound.is_anyaddr()) { + m_bound.set_in_addr(m_p_connected_dst_entry->get_src_addr()); + in_addr_t ip = m_bound.get_in_addr(); + tcp_bind(&m_pcb, (ip_addr_t*)(&ip), (ntohs(m_bound.get_in_port()))); + } + m_conn_state = TCP_CONN_CONNECTING; + bool success = attach_as_uc_receiver((role_t)NULL, true); + if (!success) { + setPassthrough(); + unlock_tcp_con(); + si_tcp_logdbg("non offloaded socket --> connect only via OS"); + return -1; + } + + in_addr_t peer_ip_addr = m_connected.get_in_addr(); + fit_rcv_wnd(true); + + int err = tcp_connect(&m_pcb, (ip_addr_t*)(&peer_ip_addr), ntohs(m_connected.get_in_port()), /*(tcp_connected_fn)*/sockinfo_tcp::connect_lwip_cb); + if (err != ERR_OK) { + //todo consider setPassthrough and go to OS + destructor_helper(); + errno = ECONNREFUSED; + si_tcp_logerr("bad connect, err=%d", err); + unlock_tcp_con(); + return -1; + } + + //Now we should register socket to TCP timer + register_timer(); + + if (!m_b_blocking) { + errno = EINPROGRESS; + m_error_status = EINPROGRESS; + m_sock_state = TCP_SOCK_ASYNC_CONNECT; + report_connected = true; + unlock_tcp_con(); + si_tcp_logdbg("NON blocking connect"); + return -1; + } + + // if (target_family == USE_VMA || target_family == USE_ULP || arget_family == USE_DEFAULT) + int rc = wait_for_conn_ready(); + // handle ret from async connect + if (rc < 0) { + //todo consider setPassthrough and go to OS + destructor_helper(); + unlock_tcp_con(); + // errno is set inside wait_for_conn_ready + return -1; + } + setPassthrough(false); + unlock_tcp_con(); + return 0; +} + +int sockinfo_tcp::bind(const sockaddr *__addr, socklen_t __addrlen) +{ + struct sockaddr tmp_sin; + socklen_t tmp_sin_len = 
sizeof(tmp_sin); + + si_tcp_logfuncall(""); + + if (m_sock_state == TCP_SOCK_BOUND) { + si_tcp_logfuncall("already bounded"); + errno = EINVAL; + return -1; + } + + if (m_sock_state != TCP_SOCK_INITED) { + // print error so we can better track apps not following our assumptions ;) + si_tcp_logdbg("socket is in wrong state for bind: %d", m_sock_state); + errno = EINVAL; //EADDRINUSE; //todo or EINVAL for RM BGATE 1545 case 1 + return -1; + } + + lock_tcp_con(); + + uint16_t bind_to_port = (__addr && __addrlen) ? ((struct sockaddr_in*)__addr)->sin_port : INPORT_ANY; //todo verify __addr length + bool disable_reuse_option = (bind_to_port == INPORT_ANY) && (m_pcb.so_options & SOF_REUSEADDR); + int reuse, ret; + + if (disable_reuse_option) { + reuse = 0; + ret = orig_os_api.setsockopt(m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret) { + si_tcp_logerr("Failed to disable SO_REUSEADDR option (ret=%d %m), connection will be handled by OS", ret); + setPassthrough(); + si_tcp_logdbg("socket bound only via OS"); + unlock_tcp_con(); + return ret; + } + BULLSEYE_EXCLUDE_BLOCK_END + } + + ret = orig_os_api.bind(m_fd, __addr, __addrlen); + + if (disable_reuse_option) { + reuse = 1; + int rv = orig_os_api.setsockopt(m_fd, SOL_SOCKET, SO_REUSEADDR, &reuse, sizeof(reuse)); + BULLSEYE_EXCLUDE_BLOCK_START + if (rv) { + si_tcp_logerr("Failed to enable SO_REUSEADDR option (ret=%d %m)", rv); + } + BULLSEYE_EXCLUDE_BLOCK_END + if (ret < 0) { + setPassthrough(); + si_tcp_logdbg("socket bound only via OS"); + } + } + + if (ret < 0) { + unlock_tcp_con(); + return ret; + } + + BULLSEYE_EXCLUDE_BLOCK_START + if(orig_os_api.getsockname(m_fd, &tmp_sin, &tmp_sin_len)) { + si_tcp_logerr("get sockname failed"); + unlock_tcp_con(); + return -1; //error + } + BULLSEYE_EXCLUDE_BLOCK_END + + // TODO: mark socket as accepting both os and offloaded connections + if (tmp_sin.sa_family != AF_INET) { + si_tcp_logdbg("Illegal family %d", tmp_sin.sa_family); + 
errno = EAFNOSUPPORT; + unlock_tcp_con(); + return -1; //error + } + m_bound.set(tmp_sin); + in_addr_t ip = m_bound.get_in_addr(); + + if (!m_bound.is_anyaddr() && !g_p_net_device_table_mgr->get_net_device_val(ip)) { //if socket is not bound to INADDR_ANY and not offloaded socket- only bind OS + setPassthrough(); + m_sock_state = TCP_SOCK_BOUND; + si_tcp_logdbg("socket bound only via OS"); + unlock_tcp_con(); + return 0; + } + + if (tcp_bind(&m_pcb, (ip_addr_t*)(&ip), ntohs(m_bound.get_in_port())) != ERR_OK) { + errno = EINVAL; + unlock_tcp_con(); + return -1; //error + } + + m_sock_state = TCP_SOCK_BOUND; + + m_bound.set(tmp_sin); + si_tcp_logdbg("socket bound"); + + m_p_socket_stats->bound_if = m_bound.get_in_addr(); + m_p_socket_stats->bound_port = m_bound.get_in_port(); + + unlock_tcp_con(); + return 0; +} + +int sockinfo_tcp::prepareListen(){ + transport_t target_family; + struct sockaddr_in tmp_sin; + socklen_t tmp_sin_len = sizeof(sockaddr_in); + si_tcp_logfuncall(""); + + if (m_sock_offload == TCP_SOCK_PASSTHROUGH) + return 1; //passthrough + + if (is_server()) + return 0; // listen had been called before... 
+ + if (m_sock_state != TCP_SOCK_BOUND) { + /*It is legal application behavior, listen was called without bind, + * therefore need to call for bind() to get a random port from the OS + */ + si_tcp_logdbg("listen was called without bind - calling for VMA bind" ); + + memset(&tmp_sin, 0, tmp_sin_len); + tmp_sin.sin_family = AF_INET; + tmp_sin.sin_port = 0; + tmp_sin.sin_addr.s_addr = INADDR_ANY; + if (bind((struct sockaddr *)&tmp_sin, tmp_sin_len) < 0) { + si_tcp_logdbg("bind failed"); + return 1; + } + } + + memset(&tmp_sin, 0, tmp_sin_len); + getsockname((struct sockaddr *)&tmp_sin, &tmp_sin_len); + lock_tcp_con(); + target_family = __vma_match_tcp_server(TRANS_VMA, safe_mce_sys().app_id, (struct sockaddr *) &tmp_sin, tmp_sin_len); + si_tcp_logdbg("TRANSPORT: %s, sock state = %d", __vma_get_transport_str(target_family), get_tcp_state(&m_pcb)); + + if (target_family == TRANS_OS || m_sock_offload == TCP_SOCK_PASSTHROUGH) { + setPassthrough(); + m_sock_state = TCP_SOCK_ACCEPT_READY; + } + else { + + // if (target_family == USE_VMA || target_family == USE_ULP || arget_family == USE_DEFAULT) + setPassthrough(false); + m_sock_state = TCP_SOCK_LISTEN_READY; + } + + unlock_tcp_con(); + return isPassthrough() ? 
1 : 0; +} + +int sockinfo_tcp::listen(int backlog) +{ + si_tcp_logfuncall(""); + + int orig_backlog = backlog; + + if (backlog > safe_mce_sys().sysctl_reader.get_listen_maxconn()) { + si_tcp_logdbg("truncating listen backlog=%d to the maximun=%d", backlog, safe_mce_sys().sysctl_reader.get_listen_maxconn()); + backlog = safe_mce_sys().sysctl_reader.get_listen_maxconn(); + } + else if (backlog <= 0) { + si_tcp_logdbg("changing listen backlog=%d to the minimum=%d", backlog, 1); + backlog = 1; + } + if (backlog >= 5) + backlog = 10 + 2 * backlog; // allow grace, inspired by Linux + + lock_tcp_con(); + + if (is_server()) { + // if listen is called again - only update the backlog + // TODO: check if need to drop item in existing queues + m_backlog = backlog; + unlock_tcp_con(); + return 0; + } + if (m_sock_state != TCP_SOCK_LISTEN_READY) { + // print error so we can better track bugs in VMA) + si_tcp_logerr("socket is in wrong state for listen: %d", m_sock_state); + errno = EINVAL; + unlock_tcp_con(); + return -1; + } + + m_backlog = backlog; + m_ready_conn_cnt = 0; + + if (get_tcp_state(&m_pcb) != LISTEN) { + + // Now we know that it is listen socket so we have to treat m_pcb as listen pcb + // and update the relevant fields of tcp_listen_pcb. 
+ struct tcp_pcb tmp_pcb; + memcpy(&tmp_pcb, &m_pcb, sizeof(struct tcp_pcb)); + tcp_listen((struct tcp_pcb_listen*)(&m_pcb), &tmp_pcb); + } + + m_sock_state = TCP_SOCK_ACCEPT_READY; + + tcp_accept(&m_pcb, sockinfo_tcp::accept_lwip_cb); + tcp_syn_handled((struct tcp_pcb_listen*)(&m_pcb), sockinfo_tcp::syn_received_lwip_cb); + tcp_clone_conn((struct tcp_pcb_listen*)(&m_pcb), sockinfo_tcp::clone_conn_cb); + + bool success = attach_as_uc_receiver(ROLE_TCP_SERVER); + + if (!success) { + /* we will get here if attach_as_uc_receiver failed */ + si_tcp_logdbg("Fallback the connection to os"); + setPassthrough(); + unlock_tcp_con(); + return orig_os_api.listen(m_fd, orig_backlog); + } + + // Calling to orig_listen() by default to monitor connection requests for not offloaded sockets + BULLSEYE_EXCLUDE_BLOCK_START + if (orig_os_api.listen(m_fd, orig_backlog)) { + si_tcp_logerr("orig_listen failed"); + unlock_tcp_con(); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + // Add the user's orig fd to the rx epfd handle + epoll_event ev = {0, {0}}; + ev.events = EPOLLIN; + ev.data.fd = m_fd; + int ret = orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev); + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely(ret)) { + if (errno == EEXIST) { + si_tcp_logdbg("failed to add user's fd to internal epfd errno=%d (%m)", errno); + } else { + si_tcp_logerr("failed to add user's fd to internal epfd errno=%d (%m)", errno); + si_tcp_logdbg("Fallback the connection to os"); + destructor_helper(); + setPassthrough(); + unlock_tcp_con(); + return 0; + } + } + BULLSEYE_EXCLUDE_BLOCK_END + + if (m_sysvar_tcp_ctl_thread > CTL_THREAD_DISABLE) + m_timer_handle = g_p_event_handler_manager->register_timer_event(safe_mce_sys().timer_resolution_msec , this, PERIODIC_TIMER, 0, NULL); + + unlock_tcp_con(); + return 0; + +} + +int sockinfo_tcp::rx_verify_available_data() +{ + int poll_count = 0; + + // Poll cq to verify the latest amount of ready bytes + int ret = rx_wait_helper(poll_count, false); + 
+ if (ret >= 0 || errno == EAGAIN) { + errno = 0; + ret = m_p_socket_stats->n_rx_ready_byte_count; + } + + return ret; +} + +int sockinfo_tcp::accept_helper(struct sockaddr *__addr, socklen_t *__addrlen, int __flags /* = 0 */) +{ + sockinfo_tcp *ns; + //todo do one CQ poll and go to sleep even if infinite polling was set + int poll_count = m_n_sysvar_rx_poll_num; //do one poll and go to sleep (if blocking) + int ret; + + si_tcp_logfuncall(""); + + // if in os pathrough just redirect to os + if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { + si_tcp_logdbg("passthrough - go to OS accept()"); + if (__flags) + return orig_os_api.accept4(m_fd, __addr, __addrlen, __flags); + else + return orig_os_api.accept(m_fd, __addr, __addrlen); + } + + si_tcp_logdbg("socket accept, __addr = %p, __addrlen = %p, *__addrlen = %d", __addr, __addrlen, __addrlen ? *__addrlen : 0); + + if (!is_server()) { + // print error so we can better track apps not following our assumptions ;) + si_tcp_logdbg("socket is in wrong state for accept: %d", m_sock_state); + errno = EINVAL; + return -1; + } + + lock_tcp_con(); + + si_tcp_logdbg("sock state = %d", get_tcp_state(&m_pcb)); + while (m_ready_conn_cnt == 0 && !g_b_exit) { + if (m_sock_state != TCP_SOCK_ACCEPT_READY) { + unlock_tcp_con(); + errno = EINVAL; + return -1; + } + + //todo instead of doing blind poll, check if waken-up by OS fd in rx_wait + // + // Always try OS accept() + + // Poll OS socket for pending connection + // smart bit to switch between the two + pollfd os_fd[1]; + os_fd[0].fd = m_fd; + os_fd[0].events = POLLIN; + ret = orig_os_api.poll(os_fd, 1, 0); // Zero timeout - just poll and return quickly + if (unlikely(ret == -1)) { + m_p_socket_stats->counters.n_rx_os_errors++; + si_tcp_logdbg("orig_os_api.poll returned with error (errno=%d %m)", errno); + unlock_tcp_con(); + return -1; + } + if (ret == 1) { + si_tcp_logdbg("orig_os_api.poll returned with packet"); + unlock_tcp_con(); + if (__flags) + return orig_os_api.accept4(m_fd, 
__addr, __addrlen, __flags); + else + return orig_os_api.accept(m_fd, __addr, __addrlen); + } + + if (rx_wait(poll_count, m_b_blocking) < 0) { + si_tcp_logdbg("interrupted accept"); + unlock_tcp_con(); + return -1; + } + } + if (g_b_exit) { + si_tcp_logdbg("interrupted accept"); + unlock_tcp_con(); + errno = EINTR; + return -1; + } + + si_tcp_logdbg("sock state = %d", get_tcp_state(&m_pcb)); + si_tcp_logdbg("socket accept - has some!!!"); + ns = m_accepted_conns.get_and_pop_front(); + BULLSEYE_EXCLUDE_BLOCK_START + if (!ns) { + si_tcp_logpanic("no socket in accepted queue!!! ready count = %d", m_ready_conn_cnt); + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_ready_conn_cnt--; + tcp_accepted(m_sock); + + class flow_tuple key; + sockinfo_tcp::create_flow_tuple_key_from_pcb(key, &(ns->m_pcb)); + + //Since the pcb is already contained in connected sockinfo_tcp no need to keep it listen's socket SYN list + if (!m_syn_received.erase(key)) { + //Should we worry about that? + __log_dbg("Can't find the established pcb in syn received list\n"); + } + else { + m_received_syn_num--; + } + + if (m_sysvar_tcp_ctl_thread == CTL_THREAD_WITH_WAKEUP && !m_rx_peer_packets.empty()) + g_p_event_handler_manager->wakeup_timer_event(this, m_timer_handle); + + unlock_tcp_con(); + + ns->lock_tcp_con(); + + if (__addr && __addrlen) { + if ((ret = ns->getpeername(__addr, __addrlen)) < 0) { + int errno_tmp = errno; + ns->unlock_tcp_con(); + close(ns->get_fd()); + errno = errno_tmp; + + /* According accept() man description ECONNABORTED is expected + * error value in case connection was aborted. 
+ */ + switch (errno) { + case ENOTCONN: + /* accept() expected result + * If connection was established in background and client + * closed connection forcibly (using RST) + */ + errno = ECONNABORTED; + break; + default: + break; + } + + return ret; + } + } + + ns->m_p_socket_stats->connected_ip = ns->m_connected.get_in_addr(); + ns->m_p_socket_stats->connected_port = ns->m_connected.get_in_port(); + + ns->m_p_socket_stats->bound_if = ns->m_bound.get_in_addr(); + ns->m_p_socket_stats->bound_port = ns->m_bound.get_in_port(); + + if (__flags & SOCK_NONBLOCK) + ns->fcntl(F_SETFL, O_NONBLOCK); + if (__flags & SOCK_CLOEXEC) + ns->fcntl(F_SETFD, FD_CLOEXEC); + + ns->unlock_tcp_con(); + + si_tcp_logdbg("CONN ACCEPTED: TCP PCB FLAGS: acceptor:0x%x newsock: fd=%d 0x%x new state: %d", m_pcb.flags, ns->m_fd, ns->m_pcb.flags, get_tcp_state(&ns->m_pcb)); + return ns->m_fd; +} + +int sockinfo_tcp::accept(struct sockaddr *__addr, socklen_t *__addrlen) +{ + si_tcp_logfuncall(""); + + return accept_helper(__addr, __addrlen); +} + +int sockinfo_tcp::accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags) +{ + si_tcp_logfuncall(""); + si_tcp_logdbg("socket accept4, flags=%d", __flags); + + return accept_helper(__addr, __addrlen, __flags); +} + +sockinfo_tcp *sockinfo_tcp::accept_clone() +{ + sockinfo_tcp *si; + int fd; + + // note that this will call socket() replacement!!! 
+ // and it will force proper socket creation
+ fd = socket_internal(AF_INET, SOCK_STREAM, 0);
+ if (fd < 0) {
+ return 0;
+ }
+
+ /* fix: template argument was lost in extraction — 'dynamic_cast(...)' is not
+ * valid C++; restore the downcast to the concrete socket type */
+ si = dynamic_cast<sockinfo_tcp *>(fd_collection_get_sockfd(fd));
+
+ if (!si) {
+ si_tcp_logwarn("can not get accept socket from FD collection");
+ close(fd);
+ return 0;
+ }
+
+ si->m_parent = this;
+
+ si->m_sock_state = TCP_SOCK_BOUND;
+ si->setPassthrough(false);
+
+ if (m_sysvar_tcp_ctl_thread > CTL_THREAD_DISABLE) {
+ tcp_ip_output(&si->m_pcb, sockinfo_tcp::ip_output_syn_ack);
+ }
+
+ return si;
+}
+
+//Must be taken under parent's tcp connection lock
+void sockinfo_tcp::auto_accept_connection(sockinfo_tcp *parent, sockinfo_tcp *child)
+{
+ tcp_accepted(parent->m_sock);
+
+ class flow_tuple key;
+ sockinfo_tcp::create_flow_tuple_key_from_pcb(key, &(child->m_pcb));
+
+ //Since pcb is already contained in connected sockinfo_tcp no need to keep it listen's socket SYN list
+ if (!parent->m_syn_received.erase(key)) {
+ //Should we worry about that?
+ __log_dbg("Can't find the established pcb in syn received list\n");
+ }
+ else {
+ parent->m_received_syn_num--;
+ }
+
+ parent->unlock_tcp_con();
+ child->lock_tcp_con();
+
+ child->m_p_socket_stats->connected_ip = child->m_connected.get_in_addr();
+ child->m_p_socket_stats->connected_port = child->m_connected.get_in_port();
+ child->m_p_socket_stats->bound_if = child->m_bound.get_in_addr();
+ child->m_p_socket_stats->bound_port = child->m_bound.get_in_port();
+ if (child->m_socketxtreme.completion) {
+ child->m_connected.get_sa(parent->m_socketxtreme.completion->src);
+ } else {
+ child->m_connected.get_sa(parent->m_socketxtreme.ec.completion.src);
+ }
+
+ /* Update vma_completion with
+ * VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED related data
+ */
+ if (likely(child->m_parent)) {
+ if (child->m_socketxtreme.completion) {
+ child->m_socketxtreme.completion->src = parent->m_socketxtreme.completion->src;
+ child->m_socketxtreme.completion->listen_fd = child->m_parent->get_fd();
+ } else {
+ child->m_socketxtreme.ec.completion.src = parent->m_socketxtreme.ec.completion.src;
+ child->m_socketxtreme.ec.completion.listen_fd = child->m_parent->get_fd();
+ }
+ child->set_events(VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED);
+ }
+ else {
+ /* fix: format string previously had only "[fd=%d]" while the call passed
+ * (__func__, __LINE__, fd): %d consumed the __func__ pointer (undefined
+ * behavior) and the fd was never printed. Prepend "%s:%d: " so every
+ * argument has a matching conversion. */
+ vlog_printf(VLOG_ERROR, "%s:%d: VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED: can't find listen socket for new connected socket with [fd=%d]\n",
+ __func__, __LINE__, child->get_fd());
+ }
+
+ child->unlock_tcp_con();
+ parent->lock_tcp_con();
+
+ __log_dbg("CONN AUTO ACCEPTED: TCP PCB FLAGS: acceptor:0x%x newsock: fd=%d 0x%x new state: %d\n", parent->m_pcb.flags, child->m_fd, child->m_pcb.flags, get_tcp_state(&child->m_pcb));
+}
+
+err_t sockinfo_tcp::accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t err)
+{
+ sockinfo_tcp *conn = (sockinfo_tcp *)(arg);
+ sockinfo_tcp *new_sock;
+ bool conn_nagle_disabled;
+
+ if (!conn || !child_pcb) {
+ return ERR_VAL;
+ }
+
+ __log_dbg("initial state=%x\n", get_tcp_state(&conn->m_pcb));
+ __log_dbg("accept cb: arg=%p, new pcb=%p err=%d\n", arg, child_pcb, err);
+ if (err != ERR_OK) {
+ /* fix: was "%s:d:" — missing '%', so __LINE__ was silently dropped */
+ vlog_printf(VLOG_ERROR, "%s:%d: accept cb failed\n", __func__, __LINE__);
+ return err;
+ }
+ if (conn->m_sock_state != TCP_SOCK_ACCEPT_READY) {
+ __log_dbg("socket is not accept ready!\n");
+ return ERR_RST;
+ }
+ // make new socket
+ __log_dbg("new stateb4clone=%x\n", get_tcp_state(child_pcb));
+ new_sock = (sockinfo_tcp*)child_pcb->my_container;
+
+ if (!new_sock) {
+ /* fix: was "%s:d:" — missing '%', so __LINE__ was silently dropped */
+ vlog_printf(VLOG_ERROR, "%s:%d: failed to clone socket\n", __func__, __LINE__);
+ return ERR_RST;
+ }
+
+ tcp_ip_output(&(new_sock->m_pcb), sockinfo_tcp::ip_output);
+ tcp_arg(&(new_sock->m_pcb), new_sock);
+ tcp_recv(&(new_sock->m_pcb), sockinfo_tcp::rx_lwip_cb);
+ tcp_err(&(new_sock->m_pcb), sockinfo_tcp::err_lwip_cb);
+
+ ASSERT_LOCKED(new_sock->m_tcp_con_lock);
+
+ new_sock->m_sock_state = TCP_SOCK_CONNECTED_RDWR;
+
+ __log_dbg("listen(fd=%d) state=%x: new sock(fd=%d) state=%x\n", conn->m_fd, get_tcp_state(&conn->m_pcb), new_sock->m_fd,
get_tcp_state(&new_sock->m_pcb)); + + /* Configure Nagle algorithm settings as they were set at the parent socket. + This can happened if VMA_TCP_NAGLE flag was set, but we disabled it for the parent socket. */ + if ((conn_nagle_disabled = tcp_nagle_disabled(&conn->m_pcb)) != tcp_nagle_disabled(&new_sock->m_pcb)) { + conn_nagle_disabled ? tcp_nagle_disable(&new_sock->m_pcb) : tcp_nagle_enable(&new_sock->m_pcb); + new_sock->fit_snd_bufs_to_nagle(conn_nagle_disabled); + } + + if (new_sock->m_conn_state == TCP_CONN_INIT) { //in case m_conn_state is not in one of the error states + new_sock->m_conn_state = TCP_CONN_CONNECTED; + } + + /* if attach failed, we should continue getting traffic through the listen socket */ + // todo register as 3-tuple rule for the case the listener is gone? + new_sock->attach_as_uc_receiver(role_t (NULL), true); + + if (new_sock->m_sysvar_tcp_ctl_thread > CTL_THREAD_DISABLE) { + new_sock->m_vma_thr = true; + + // Before handling packets from flow steering the child should process everything it got from parent + while (!new_sock->m_rx_ctl_packets_list.empty()) { + vma_desc_list_t temp_list; + new_sock->m_rx_ctl_packets_list_lock.lock(); + temp_list.splice_tail(new_sock->m_rx_ctl_packets_list); + new_sock->m_rx_ctl_packets_list_lock.unlock(); + + while (!temp_list.empty()) { + mem_buf_desc_t* desc = temp_list.get_and_pop_front(); + desc->inc_ref_count(); + L3_level_tcp_input((pbuf *)desc, &new_sock->m_pcb); + if (desc->dec_ref_count() <= 1) //todo reuse needed? + new_sock->m_rx_ctl_reuse_list.push_back(desc); + } + } + new_sock->m_vma_thr = false; + } + + new_sock->unlock_tcp_con(); + + conn->lock_tcp_con(); + + //todo check that listen socket was not closed by now ? 
(is_server()) + conn->m_ready_pcbs.erase(&new_sock->m_pcb); + + if (conn->is_socketxtreme()) { + auto_accept_connection(conn, new_sock); + } else { + conn->m_accepted_conns.push_back(new_sock); + conn->m_ready_conn_cnt++; + + NOTIFY_ON_EVENTS(conn, EPOLLIN); + } + + //OLG: Now we should wakeup all threads that are sleeping on this socket. + conn->do_wakeup(); + //Now we should register the child socket to TCP timer + + conn->unlock_tcp_con(); + + /* Do this after auto_accept_connection() call */ + new_sock->m_parent = NULL; + + new_sock->lock_tcp_con(); + + return ERR_OK; +} + +void sockinfo_tcp::create_flow_tuple_key_from_pcb(flow_tuple &key, struct tcp_pcb *pcb) +{ + key = flow_tuple(pcb->local_ip.addr, htons(pcb->local_port), pcb->remote_ip.addr, htons(pcb->remote_port), PROTO_TCP); +} + +mem_buf_desc_t* sockinfo_tcp::get_front_m_rx_pkt_ready_list(){ + return m_rx_pkt_ready_list.front(); +} + +size_t sockinfo_tcp::get_size_m_rx_pkt_ready_list(){ + return m_rx_pkt_ready_list.size(); +} + +void sockinfo_tcp::pop_front_m_rx_pkt_ready_list(){ + m_rx_pkt_ready_list.pop_front(); +} + +void sockinfo_tcp::push_back_m_rx_pkt_ready_list(mem_buf_desc_t* buff){ + m_rx_pkt_ready_list.push_back(buff); +} + + + +struct tcp_pcb* sockinfo_tcp::get_syn_received_pcb(const flow_tuple &key) const +{ + struct tcp_pcb* ret_val = NULL; + syn_received_map_t::const_iterator itr; + + itr = m_syn_received.find(key); + if (itr != m_syn_received.end()) { + ret_val = itr->second; + } + return ret_val; +} + +struct tcp_pcb* sockinfo_tcp::get_syn_received_pcb(in_addr_t peer_ip, in_port_t peer_port, in_addr_t local_ip, in_port_t local_port) +{ + flow_tuple key(local_ip, local_port, peer_ip, peer_port, PROTO_TCP); + return get_syn_received_pcb(key); +} + +err_t sockinfo_tcp::clone_conn_cb(void *arg, struct tcp_pcb **newpcb, err_t err) +{ + sockinfo_tcp *new_sock; + err_t ret_val = ERR_OK; + + sockinfo_tcp *conn = (sockinfo_tcp *)((arg)); + NOT_IN_USE(err); + + if (!conn || !newpcb) { + return 
ERR_VAL; + } + + ASSERT_LOCKED(conn->m_tcp_con_lock); + conn->m_tcp_con_lock.unlock(); + + new_sock = conn->accept_clone(); + + if (new_sock) { + /* cppcheck-suppress autoVariables */ + *newpcb = (struct tcp_pcb*)(&new_sock->m_pcb); + new_sock->m_pcb.my_container = (void*)new_sock; + } + else { + ret_val = ERR_MEM; + } + + conn->m_tcp_con_lock.lock(); + + return ret_val; +} + +err_t sockinfo_tcp::syn_received_lwip_cb(void *arg, struct tcp_pcb *newpcb, err_t err) +{ + sockinfo_tcp *listen_sock = (sockinfo_tcp *)((arg)); + + if (!listen_sock || !newpcb) { + return ERR_VAL; + } + + sockinfo_tcp *new_sock = (sockinfo_tcp *)((newpcb->my_container)); + + NOT_IN_USE(err); + + ASSERT_LOCKED(listen_sock->m_tcp_con_lock); + + /* Inherite properties from the parent */ + new_sock->set_conn_properties_from_pcb(); + + new_sock->m_rcvbuff_max = MAX(listen_sock->m_rcvbuff_max, 2 * new_sock->m_pcb.mss); + new_sock->fit_rcv_wnd(true); + + // Socket socket options + listen_sock->set_sock_options(new_sock); + + listen_sock->m_tcp_con_lock.unlock(); + + new_sock->create_dst_entry(); + bool is_new_offloaded = new_sock->m_p_connected_dst_entry && new_sock->prepare_dst_to_send(true); // pass true for passive socket to skip the transport rules checking + + /* this can happen if there is no route back to the syn sender. + * so we just need to ignore it. + * we set the state to close so we won't try to send fin when we don't + * have route. 
*/ + if (!is_new_offloaded) { + new_sock->setPassthrough(); + set_tcp_state(&new_sock->m_pcb, CLOSED); + close(new_sock->get_fd()); + listen_sock->m_tcp_con_lock.lock(); + return ERR_ABRT; + } + + new_sock->register_timer(); + + listen_sock->m_tcp_con_lock.lock(); + + flow_tuple key; + create_flow_tuple_key_from_pcb(key, newpcb); + + listen_sock->m_syn_received[key] = newpcb; + + listen_sock->m_received_syn_num++; + + return ERR_OK; +} + +err_t sockinfo_tcp::syn_received_drop_lwip_cb(void *arg, struct tcp_pcb *newpcb, err_t err) +{ + sockinfo_tcp *listen_sock = (sockinfo_tcp *)((arg)); + + if (!listen_sock || !newpcb) { + return ERR_VAL; + } + + sockinfo_tcp *new_sock = (sockinfo_tcp *)((newpcb->my_container)); + + NOT_IN_USE(err); + + ASSERT_LOCKED(listen_sock->m_tcp_con_lock); + listen_sock->m_tcp_con_lock.unlock(); + + new_sock->set_conn_properties_from_pcb(); + new_sock->create_dst_entry(); + if (new_sock->m_p_connected_dst_entry) { + new_sock->prepare_dst_to_send(true); // true for passive socket to skip the transport rules checking + tcp_arg(&(new_sock->m_pcb), new_sock); + new_sock->abort_connection(); + } + close(new_sock->get_fd()); + + listen_sock->m_tcp_con_lock.lock(); + + return ERR_ABRT; +} + +void sockinfo_tcp::set_conn_properties_from_pcb() +{ + // setup peer address and local address + + m_connected.set_in_addr(m_pcb.remote_ip.addr); + m_connected.set_in_port(htons(m_pcb.remote_port)); + m_connected.set_sa_family(AF_INET); + + m_bound.set_in_addr(m_pcb.local_ip.addr); + m_bound.set_in_port(htons(m_pcb.local_port)); + m_bound.set_sa_family(AF_INET); +} + +void sockinfo_tcp::set_sock_options(sockinfo_tcp *new_sock) +{ + si_tcp_logdbg("Applying all socket options on %p, fd %d", new_sock, new_sock->get_fd()); + + socket_options_list_t::iterator options_iter; + for (options_iter = m_socket_options_list.begin(); options_iter != m_socket_options_list.end(); options_iter++) { + socket_option_t* opt = *options_iter; + new_sock->setsockopt(opt->level, 
opt->optname, opt->optval, opt->optlen); + } + errno = 0; + + si_tcp_logdbg("set_sock_options completed"); +} + +err_t sockinfo_tcp::connect_lwip_cb(void *arg, struct tcp_pcb *tpcb, err_t err) +{ + sockinfo_tcp *conn = (sockinfo_tcp *)arg; + NOT_IN_USE(tpcb); + + __log_dbg("connect cb: arg=%p, pcp=%p err=%d\n", arg, tpcb, err); + + if (!conn || !tpcb) { + return ERR_VAL; + } + + conn->lock_tcp_con(); + + if (conn->m_conn_state == TCP_CONN_TIMEOUT) { + //tcp_si_logdbg("conn timeout"); + conn->m_error_status = ETIMEDOUT; + conn->unlock_tcp_con(); + return ERR_OK; + } + if (err == ERR_OK) { + conn->m_conn_state = TCP_CONN_CONNECTED; + conn->m_sock_state = TCP_SOCK_CONNECTED_RDWR; // async connect verification + conn->m_error_status = 0; + if (conn->m_rcvbuff_max < 2 * conn->m_pcb.mss) { + conn->m_rcvbuff_max = 2 * conn->m_pcb.mss; + } + conn->fit_rcv_wnd(false); + } + else { + conn->m_error_status = ECONNREFUSED; + conn->m_conn_state = TCP_CONN_FAILED; + } + + NOTIFY_ON_EVENTS(conn, EPOLLOUT); + //OLG: Now we should wakeup all threads that are sleeping on this socket. 
conn->do_wakeup();

	conn->m_p_socket_stats->connected_ip = conn->m_connected.get_in_addr();
	conn->m_p_socket_stats->connected_port = conn->m_connected.get_in_port();

	conn->unlock_tcp_con();

	return ERR_OK;
}

// Block (or poll, depending on m_b_blocking) until an in-progress connect()
// resolves. Returns 0 once the connection is established; -1 with errno set
// to EINTR (interrupted / process exiting), ECONNREFUSED (error callback or
// no listener) or ETIMEDOUT (connect timer expired).
int sockinfo_tcp::wait_for_conn_ready()
{
	int poll_count = 0;

	si_tcp_logfuncall("");

	while(m_conn_state == TCP_CONN_CONNECTING && m_sock_state != TCP_SOCK_INITED) {
		/* In case of connect error err_lwip_cb is called and not connect_lwip_cb,
		 * therefore in this case the m_conn_state will not be changed, only
		 * m_sock_state
		 */
		if (rx_wait(poll_count, m_b_blocking) < 0) {
			si_tcp_logdbg("connect interrupted");
			return -1;
		}

		if (unlikely(g_b_exit)) {
			errno = EINTR;
			return -1;
		}
	}
	if (m_sock_state == TCP_SOCK_INITED) {
		//we get here if err_lwip_cb() was called and set m_sock_state=TCP_SOCK_INITED
		m_conn_state = TCP_CONN_FAILED;
		errno = ECONNREFUSED;
		si_tcp_logdbg("got connection error");
		//if we got here, bind succeeded earlier (in connect()), so change m_sock_state back to TCP_SOCK_BOUND to avoid binding again in case of recalling connect()
		m_sock_state = TCP_SOCK_BOUND;
		return -1;

	}
	if (m_conn_state != TCP_CONN_CONNECTED) {
		if (m_conn_state == TCP_CONN_TIMEOUT) {
			m_conn_state = TCP_CONN_FAILED;
			errno = ETIMEDOUT;
		} else {
			errno = ECONNREFUSED;
		}
		si_tcp_logdbg("bad connect -> timeout or none listening");
		return -1;
	}
	si_tcp_logdbg("+++ CONNECT OK!!!! ++++");
	m_sock_state = TCP_SOCK_CONNECTED_RDWR;
	si_tcp_logdbg("TCP PCB FLAGS: 0x%x", m_pcb.flags);
	return 0;
}



// poll()/select() readability check. Listening socket: ready when an accepted
// connection is queued, or after shutdown(SHUT_RD). Data socket: reports
// whether a packet is ready, polling the rx ring(s) to drain completions
// first when nothing is queued. p_poll_sn / p_fd_array are forwarded to the
// ring poll; a NULL p_poll_sn disables ring polling.
bool sockinfo_tcp::is_readable(uint64_t *p_poll_sn, fd_array_t* p_fd_array)
{
	int ret;

	if (is_server()) {
		bool state;
		//tcp_si_logwarn("select on accept()");
		//m_conn_cond.lock();
		state = m_ready_conn_cnt == 0 ?
false : true;
		if (state) {
			si_tcp_logdbg("accept ready");
			return true;
		}

		// After shutdown(SHUT_RD) on a listener, accept() must wake up.
		if (m_sock_state == TCP_SOCK_ACCEPT_SHUT) return true;

		return false;
	}
	else if (m_sock_state == TCP_SOCK_ASYNC_CONNECT) {
		// socket is not ready to read in this state!!!
		return false;
	}

	if (!is_rtr()) {
		// unconnected tcp sock is always ready for read!
		// return its fd as ready
		si_tcp_logdbg("block check on unconnected socket");
		return true;
	}

	// Fast path: data already queued for the application.
	if (m_n_rx_pkt_ready_list_count)
		return true;

	if (!p_poll_sn)
		return false;

	consider_rings_migration();

	// Poll rx ring(s) until data shows up, a poll returns <= 0, or the
	// socket stops being ready-to-receive.
	m_rx_ring_map_lock.lock();
	while(!g_b_exit && is_rtr()) {
		if (likely(m_p_rx_ring)) {
			// likely scenario: rx socket bound to specific cq
			ret = m_p_rx_ring->poll_and_process_element_rx(p_poll_sn, p_fd_array);
			if (m_n_rx_pkt_ready_list_count || ret <= 0) {
				break;
			}
		} else if (!m_rx_ring_map.empty()) {
			rx_ring_map_t::iterator rx_ring_iter;
			for (rx_ring_iter = m_rx_ring_map.begin(); rx_ring_iter != m_rx_ring_map.end(); rx_ring_iter++) {
				// Skip rings that are being released.
				if (rx_ring_iter->second->refcnt <= 0) {
					continue;
				}
				ring* p_ring = rx_ring_iter->first;
				//g_p_lwip->do_timers();
				ret = p_ring->poll_and_process_element_rx(p_poll_sn, p_fd_array);
				if (m_n_rx_pkt_ready_list_count || ret <= 0) {
					break;
				}
			}
		} else {
			// No available rx rings, break loop.
			break;
		}
	}

	m_rx_ring_map_lock.unlock();
	if (!m_n_rx_pkt_ready_list_count) {
		return false;
	}

	return true;
}

// poll()/select() writability check: resolves pending async-connect state
// first, then reports ready when the socket is ready-to-send and lwIP's
// tcp_sndbuf() still has room.
bool sockinfo_tcp::is_writeable()
{
	if (m_sock_state == TCP_SOCK_ASYNC_CONNECT) {
		if (m_conn_state == TCP_CONN_CONNECTED) {
			si_tcp_logdbg("++++ async connect ready");
			m_sock_state = TCP_SOCK_CONNECTED_RDWR;
			goto noblock;
		}
		else if (m_conn_state != TCP_CONN_CONNECTING) {
			// async connect failed for some reason.
// Reset our state and return ready fd
			si_tcp_logerr("async connect failed");
			if(m_sock_state != TCP_SOCK_BOUND) { //Avoid binding twice
				m_sock_state = TCP_SOCK_INITED;
			}
			goto noblock;
		}
		return false;
	}
	if (!is_rts()) {
		// unconnected tcp sock is always ready for write! - TODO: verify!
		// return its fd as ready
		si_tcp_logdbg("block check on unconnected socket");
		goto noblock;
	}

	// Writable only while lwIP still has send-buffer space.
	if (tcp_sndbuf(&m_pcb) > 0)
		goto noblock;

	//g_p_lwip->do_timers(); //TODO: consider!
	return false;

noblock:
/*
	if (p_fd_array) {
		p_fd_array->fd_list[p_fd_array->fd_count] = m_fd;
		p_fd_array->fd_count++;
	}
*/
	__log_funcall("--->>> tcp_sndbuf(&m_pcb)=%d", tcp_sndbuf(&m_pcb));
	return true;
}

// poll()-style error check: fills *errors with POLLHUP/POLLERR bits derived
// from the connection state and returns true when any error bit is set.
bool sockinfo_tcp::is_errorable(int *errors)
{
	*errors = 0;

	if (m_conn_state == TCP_CONN_ERROR ||
		 m_conn_state == TCP_CONN_TIMEOUT ||
		 m_conn_state == TCP_CONN_RESETED ||
		 m_conn_state == TCP_CONN_FAILED) {
		*errors |= POLLHUP;
	}

	if (m_conn_state == TCP_CONN_ERROR) {
		*errors |= POLLERR;
	}

	return *errors;
}

/*
 * FIXME: need to split sock connected state in two: TCP_SOCK_CON_TX/RX
 */
// shutdown(2) implementation: advances the VMA socket state machine according
// to __how, notifies pollers of the affected direction(s), then asks lwIP to
// shut down rx/tx (or aborts / installs the SYN-drop callback, see below).
// Returns 0 on success, -1 with errno = ENOTCONN when the socket state does
// not admit the requested shutdown.
int sockinfo_tcp::shutdown(int __how)
{
	err_t err = ERR_OK;

	int shut_rx, shut_tx;

	// if in os pathrough just redirect to os
	if (m_sock_offload == TCP_SOCK_PASSTHROUGH) {
		si_tcp_logdbg("passthrough - go to OS shutdown()");
		return orig_os_api.shutdown(m_fd, __how);
	}

	lock_tcp_con();

	shut_tx = shut_rx = 0;
	switch (__how) {
	case SHUT_RD:
		if (is_connected()) {
			m_sock_state = TCP_SOCK_CONNECTED_WR;
			NOTIFY_ON_EVENTS(this, EPOLLIN);
		}
		else if (is_rtr()) {
			m_sock_state = TCP_SOCK_BOUND;
			NOTIFY_ON_EVENTS(this, EPOLLIN|EPOLLHUP);
		}
		else if (m_sock_state == TCP_SOCK_ACCEPT_READY) {
			m_sock_state = TCP_SOCK_ACCEPT_SHUT;
		}
		else goto bad_state;
		shut_rx = 1;
		break;
	case SHUT_WR:
		if (is_connected()) {
			m_sock_state = TCP_SOCK_CONNECTED_RD;
		}
		else if (is_rts()) {
			m_sock_state =
TCP_SOCK_BOUND;
			NOTIFY_ON_EVENTS(this, EPOLLHUP);
		}
		else if (is_server()) {
			//ignore SHUT_WR on listen socket
		}
		else goto bad_state;
		shut_tx = 1;
		break;
	case SHUT_RDWR:
		if (is_connected() || is_rts() || is_rtr()) {
			m_sock_state = TCP_SOCK_BOUND;
			NOTIFY_ON_EVENTS(this, EPOLLIN|EPOLLHUP);
		}
		else if (m_sock_state == TCP_SOCK_ACCEPT_READY) {
			m_sock_state = TCP_SOCK_ACCEPT_SHUT;
		}
		else goto bad_state;
		shut_rx = 1;
		shut_tx = 1;
		break;
	BULLSEYE_EXCLUDE_BLOCK_START
	default:
		// NOTE(review): an invalid __how only logs an error ("unknow" is a
		// typo in the message) and then proceeds to the lwIP shutdown below
		// with shut_rx == shut_tx == 0, instead of failing with EINVAL as
		// shutdown(2) specifies -- confirm whether this is intended.
		si_tcp_logerr("unknow shutdown option %d", __how);
		break;
	BULLSEYE_EXCLUDE_BLOCK_END
	}

	if (is_server()) {
		// Listening socket: stop accepting and install the callback that
		// refuses (aborts) any further incoming SYNs.
		if (shut_rx) {
			tcp_accept(&m_pcb, 0);
			tcp_syn_handled((struct tcp_pcb_listen*)(&m_pcb), sockinfo_tcp::syn_received_drop_lwip_cb);
		}
	} else {
		// Data socket: shutting down rx while unread data is still queued
		// aborts the connection; otherwise do a graceful lwIP shutdown.
		if (get_tcp_state(&m_pcb) != LISTEN && shut_rx && m_n_rx_pkt_ready_list_count) {
			abort_connection();
		} else {
			err = tcp_shutdown(&m_pcb, shut_rx, shut_tx);
		}
	}

	do_wakeup();

	if (err == ERR_OK) {
		unlock_tcp_con();
		return 0;
	}

bad_state:
	unlock_tcp_con();
	errno = ENOTCONN;
	return -1;
}

/*
 * TCP options from netinet/tcp.h
 * including file directly conflicts with lwipopts.h (TCP_MSS define)
 */
/*
 * User-settable options (used with setsockopt).
 */
#define TCP_NODELAY		1	/* Don't delay send to coalesce packets  */
#define TCP_MAXSEG		2	/* Set maximum segment size  */
#define TCP_CORK		3	/* Control sending of partial frames  */
#define TCP_KEEPIDLE		4	/* Start keeplives after this period */
#define TCP_KEEPINTVL		5	/* Interval between keepalives */
#define TCP_KEEPCNT		6	/* Number of keepalives before death */
#define TCP_SYNCNT		7	/* Number of SYN retransmits */
#define TCP_LINGER2		8	/* Life time of orphaned FIN-WAIT-2 state */
#define TCP_DEFER_ACCEPT	9	/* Wake up listener only when data arrive */
#define TCP_WINDOW_CLAMP	10	/* Bound advertised window */
#define TCP_INFO		11	/* Information about this connection.
*/
#define TCP_QUICKACK		12	/* Block/reenable quick ACKs.  */

// fcntl(2) handling. When avoid_sys_calls_on_tcp_fd is enabled and the socket
// is connected, F_SETFL/F_GETFL are served locally (only the O_NONBLOCK bit
// is tracked); all other commands are delegated to sockinfo::fcntl().
int sockinfo_tcp::fcntl(int __cmd, unsigned long int __arg)
{
	if (!safe_mce_sys().avoid_sys_calls_on_tcp_fd || !is_connected())
		return sockinfo::fcntl(__cmd, __arg);

	switch (__cmd) {
	case F_SETFL:		/* Set file status flags.  */
		{
			si_tcp_logdbg("cmd=F_SETFL, arg=%#x", __arg);
			if (__arg & O_NONBLOCK)
				set_blocking(false);
			else
				set_blocking(true);
			return 0;
		}
		break;
	case F_GETFL:		/* Get file status flags.  */
		{
			si_tcp_logdbg("cmd=F_GETFL");
			// Only O_NONBLOCK is reported; other status flags are not
			// tracked on this fast path.
			if (m_b_blocking)
				return 0;
			else
				return O_NONBLOCK;
		}
		break;
	default:
		break;
	}
	return sockinfo::fcntl(__cmd, __arg);
}

// ioctl(2) handling, mirroring fcntl() above: serve FIONBIO locally on a
// connected offloaded socket; delegate everything else to sockinfo::ioctl().
int sockinfo_tcp::ioctl(unsigned long int __request, unsigned long int __arg)
{
	if (!safe_mce_sys().avoid_sys_calls_on_tcp_fd || !is_connected())
		return sockinfo::ioctl(__request, __arg);

	int *p_arg = (int *)__arg;

	switch (__request) {
	case FIONBIO:
		{
			si_tcp_logdbg("request=FIONBIO, arg=%d", *p_arg);
			if (*p_arg)
				set_blocking(false);
			else
				set_blocking(true);
			return 0;
		}
		break;
	default:
		break;
	}
	return sockinfo::ioctl(__request, __arg);
}

// Adjust the pcb receive window toward the desired maximum, which is bounded
// by both the scaled TCP window and m_rcvbuff_max (SO_RCVBUF). With force_fit
// the window may also shrink (clamped at 0); without it the window is only
// ever grown.
void sockinfo_tcp::fit_rcv_wnd(bool force_fit)
{
	m_pcb.rcv_wnd_max_desired = MIN(TCP_WND_SCALED(&m_pcb), m_rcvbuff_max);

	if (force_fit) {
		int rcv_wnd_max_diff = m_pcb.rcv_wnd_max_desired - m_pcb.rcv_wnd_max;

		m_pcb.rcv_wnd_max = m_pcb.rcv_wnd_max_desired;
		m_pcb.rcv_wnd = MAX(0, (int)m_pcb.rcv_wnd + rcv_wnd_max_diff);
		m_pcb.rcv_ann_wnd = MAX(0, (int)m_pcb.rcv_ann_wnd + rcv_wnd_max_diff);

		if (!m_pcb.rcv_wnd) {
			// Window collapsed to zero. NOTE(review): presumably this
			// records how much must be acknowledged back to lwIP before
			// the window reopens -- confirm against the rx path.
			m_rcvbuff_non_tcp_recved = m_pcb.rcv_wnd_max;
		}
	} else if (m_pcb.rcv_wnd_max_desired > m_pcb.rcv_wnd_max) {
		uint32_t rcv_wnd_max_diff = m_pcb.rcv_wnd_max_desired - m_pcb.rcv_wnd_max;
		m_pcb.rcv_wnd_max = m_pcb.rcv_wnd_max_desired;
		m_pcb.rcv_wnd += rcv_wnd_max_diff;
		m_pcb.rcv_ann_wnd += rcv_wnd_max_diff;
	}

}

// Resize the pcb send buffer to new_max_snd_buff while preserving the amount
// currently in flight; also re-derives max_unsent_len from the MSS.
void sockinfo_tcp::fit_snd_bufs(unsigned int new_max_snd_buff)
{
uint32_t sent_buffs_num = 0;

	// Amount of the send buffer currently occupied (queued / in flight).
	sent_buffs_num = m_pcb.max_snd_buff - m_pcb.snd_buf;
	// Only resize when the occupied amount still fits in the new size;
	// otherwise leave the buffers untouched.
	if (sent_buffs_num <= new_max_snd_buff) {
		m_pcb.max_snd_buff = new_max_snd_buff;
		if (m_pcb.mss)
			m_pcb.max_unsent_len = (16 * (m_pcb.max_snd_buff)/m_pcb.mss);
		else
			m_pcb.max_unsent_len = (16 * (m_pcb.max_snd_buff)/536); /* should MSS be 0 use a const...very unlikely */
		/* make sure max_unsent_len is not 0 */
		m_pcb.max_unsent_len = MAX(m_pcb.max_unsent_len, 1);
		m_pcb.snd_buf = m_pcb.max_snd_buff - sent_buffs_num;
	}
}

// Pick the send-buffer size that matches the Nagle setting -- unless the user
// pinned an explicit size via SO_SNDBUF (m_sndbuff_max != 0), which wins.
void sockinfo_tcp::fit_snd_bufs_to_nagle(bool disable_nagle)
{
	if (m_sndbuff_max)
		return;

	if (disable_nagle) {
		fit_snd_bufs(TCP_SND_BUF_NO_NAGLE);
	} else {
		fit_snd_bufs(TCP_SND_BUF);
	}
}

////////////////////////////////////////////////////////////////////////////////
bool sockinfo_tcp::try_un_offloading() // un-offload the socket if possible
{
	// be conservative and avoid off-loading a socket after it started connecting
	return m_conn_state == TCP_CONN_INIT ?
sockinfo::try_un_offloading() : false; +} + +//////////////////////////////////////////////////////////////////////////////// +#define SOCKOPT_HANDLE_BY_OS -2 +int sockinfo_tcp::setsockopt(int __level, int __optname, + __const void *__optval, socklen_t __optlen) +{ + //todo check optlen and set proper errno on failure + si_tcp_logfunc("level=%d, optname=%d", __level, __optname); + + int val, ret = 0; + bool supported = true; + bool allow_privileged_sock_opt = false; + + if ((ret = sockinfo::setsockopt(__level, __optname, __optval, __optlen)) != SOCKOPT_PASS_TO_OS) { + if (ret == SOCKOPT_INTERNAL_VMA_SUPPORT && + m_sock_state <= TCP_SOCK_ACCEPT_READY && __optval != NULL && + is_inherited_option(__level, __optname)) + m_socket_options_list.push_back(new socket_option_t(__level, __optname,__optval, __optlen)); + return ret; + } + + ret = 0; + + if (__level == IPPROTO_IP) { + switch(__optname) { + case IP_TOS: /* might be missing ECN logic */ + ret = SOCKOPT_HANDLE_BY_OS; + if (__optlen == sizeof(int)) { + val = *(int *)__optval; + } else if (__optlen == sizeof(uint8_t)) { + val = *(uint8_t *)__optval; + } else { + break; + } + val &= ~INET_ECN_MASK; + val |= m_pcb.tos & INET_ECN_MASK; + if (m_pcb.tos != val) { + lock_tcp_con(); + m_pcb.tos = val; + header_tos_updater du(m_pcb.tos); + update_header_field(&du); + // lists.openwall.net/netdev/2009/12/21/59 + int new_prio = ip_tos2prio[IPTOS_TOS(m_pcb.tos) >> 1]; + set_sockopt_prio(&new_prio, sizeof(new_prio)); + unlock_tcp_con(); + + } + break; + default: + ret = SOCKOPT_HANDLE_BY_OS; + supported = false; + break; + } + } + if (__level == IPPROTO_TCP) { + switch(__optname) { + case TCP_NODELAY: + val = *(int *)__optval; + lock_tcp_con(); + if (val) + tcp_nagle_disable(&m_pcb); + else + tcp_nagle_enable(&m_pcb); + fit_snd_bufs_to_nagle(val); + unlock_tcp_con(); + si_tcp_logdbg("(TCP_NODELAY) nagle: %d", val); + break; + case TCP_QUICKACK: + val = *(int *)__optval; + lock_tcp_con(); + m_pcb.quickack = (uint8_t)(val > 0 ? 
val : 0); + unlock_tcp_con(); + si_tcp_logdbg("(TCP_QUICKACK) value: %d", val); + break; + default: + ret = SOCKOPT_HANDLE_BY_OS; + supported = false; + break; + } + } + if (__level == SOL_SOCKET) { + switch(__optname) { + case SO_REUSEADDR: + val = *(int *)__optval; + lock_tcp_con(); + if (val) + m_pcb.so_options |= SOF_REUSEADDR; + else + m_pcb.so_options &= ~SOF_REUSEADDR; + ret = SOCKOPT_HANDLE_BY_OS; //SO_REUSEADDR is also relevant on OS + unlock_tcp_con(); + si_tcp_logdbg("(SO_REUSEADDR) val: %d", val); + break; + case SO_KEEPALIVE: + val = *(int *)__optval; + lock_tcp_con(); + if (val) + m_pcb.so_options |= SOF_KEEPALIVE; + else + m_pcb.so_options &= ~SOF_KEEPALIVE; + unlock_tcp_con(); + si_tcp_logdbg("(SO_KEEPALIVE) val: %d", val); + break; + case SO_RCVBUF: + val = MIN(*(int *)__optval, safe_mce_sys().sysctl_reader.get_net_core_rmem_max()); + lock_tcp_con(); + // OS allocates double the size of memory requested by the application - not sure we need it. + m_rcvbuff_max = MAX(2 * m_pcb.mss, 2 * val); + + fit_rcv_wnd(!is_connected()); + unlock_tcp_con(); + si_tcp_logdbg("setsockopt SO_RCVBUF: %d", m_rcvbuff_max); + break; + case SO_SNDBUF: + val = MIN(*(int *)__optval, safe_mce_sys().sysctl_reader.get_net_core_wmem_max()); + lock_tcp_con(); + // OS allocates double the size of memory requested by the application - not sure we need it. 
+ m_sndbuff_max = MAX(2 * m_pcb.mss, 2 * val); + fit_snd_bufs(m_sndbuff_max); + unlock_tcp_con(); + si_tcp_logdbg("setsockopt SO_SNDBUF: %d", m_sndbuff_max); + break; + case SO_LINGER: + if (__optlen < sizeof(struct linger)) { + errno = EINVAL; + break; + } + m_linger = *(struct linger*)__optval; + si_tcp_logdbg("setsockopt SO_LINGER: l_onoff = %d, l_linger = %d", m_linger.l_onoff, m_linger.l_linger); + break; + case SO_RCVTIMEO: + { + if (__optlen < sizeof(struct timeval)) { + errno = EINVAL; + break; + } + struct timeval* tv = (struct timeval*)__optval; + if (tv->tv_sec || tv->tv_usec) + m_loops_timer.set_timeout_msec(tv->tv_sec*1000 + (tv->tv_usec ? tv->tv_usec/1000 : 0)); + else + m_loops_timer.set_timeout_msec(-1); + si_tcp_logdbg("SOL_SOCKET: SO_RCVTIMEO=%d", m_loops_timer.get_timeout_msec()); + break; + } + case SO_BINDTODEVICE: + struct sockaddr_in sockaddr; + allow_privileged_sock_opt = safe_mce_sys().allow_privileged_sock_opt; + if (__optlen == 0 || ((char*)__optval)[0] == '\0') { + m_so_bindtodevice_ip = INADDR_ANY; + } else if (get_ipv4_from_ifname((char*)__optval, &sockaddr)) { + si_tcp_logdbg("SOL_SOCKET, SO_BINDTODEVICE - NOT HANDLED, cannot find if_name"); + errno = EINVAL; + ret = -1; + break; + } else { + m_so_bindtodevice_ip = sockaddr.sin_addr.s_addr; + + if (!is_connected()) { + /* Current implementation allows to create separate rings for tx and rx. + * tx ring is created basing on destination ip during connect() call, + * SO_BINDTODEVICE and routing table information + * whereas rx ring creation can be based on bound (local) ip + * As a result there are limitations in using this capability. + * Also we can not have bound information as + * (!m_bound.is_anyaddr() && !m_bound.is_local_loopback()) + * and can not detect offload/non-offload socket + * Note: + * This inconsistency should be resolved. 
+ */ + ip_address local_ip(m_so_bindtodevice_ip); + + lock_tcp_con(); + /* We need to destroy this if attach/detach receiver is not called + * just reference counter for p_nd_resources is updated on attach/detach + */ + if (NULL == create_nd_resources((const ip_address)local_ip)) { + si_tcp_logdbg("Failed to get net device resources on ip %s", local_ip.to_str().c_str()); + } + unlock_tcp_con(); + } + } + // handle TX side + if (m_p_connected_dst_entry) { + if (m_p_connected_dst_entry->is_offloaded()) { + si_tcp_logdbg("SO_BINDTODEVICE will not work on already offloaded TCP socket"); + errno = EINVAL; + return -1; + } else { + m_p_connected_dst_entry->set_so_bindtodevice_addr(m_so_bindtodevice_ip); + } + } + // TODO handle RX side + si_tcp_logdbg("(SO_BINDTODEVICE) interface=%s", (char*)__optval); + break; + case SO_MAX_PACING_RATE: { + struct vma_rate_limit_t rate_limit; + + if (!__optval) { + errno = EINVAL; + break; + } + if (sizeof(struct vma_rate_limit_t) == __optlen) { + rate_limit = *(struct vma_rate_limit_t*)__optval; // value is in Kbits per second + } else if (sizeof(uint32_t) == __optlen) { + // value is in bytes per second + rate_limit.rate = BYTE_TO_KB(*(uint32_t*)__optval); // value is in bytes per second + rate_limit.max_burst_sz = 0; + rate_limit.typical_pkt_sz = 0; + } else { + errno = EINVAL; + break; + } + + lock_tcp_con(); + ret = modify_ratelimit(m_p_connected_dst_entry, rate_limit); + unlock_tcp_con(); + if (ret) { + si_tcp_logdbg("error setting setsockopt SO_MAX_PACING_RATE: %d bytes/second ", rate_limit.rate); + } else { + si_tcp_logdbg("setsockopt SO_MAX_PACING_RATE: %d bytes/second ", rate_limit.rate); + } + return ret; + } + case SO_PRIORITY: { + lock_tcp_con(); + if (set_sockopt_prio(__optval, __optlen)) { + unlock_tcp_con(); + return -1; + } + unlock_tcp_con(); + ret = SOCKOPT_HANDLE_BY_OS; + break; + } + default: + ret = SOCKOPT_HANDLE_BY_OS; + supported = false; + break; + } + } + + if (m_sock_state <= TCP_SOCK_ACCEPT_READY && __optval 
!= NULL && is_inherited_option(__level, __optname)) + m_socket_options_list.push_back(new socket_option_t(__level, __optname,__optval, __optlen)); + + if (safe_mce_sys().avoid_sys_calls_on_tcp_fd && ret != SOCKOPT_HANDLE_BY_OS && is_connected()) + return ret; + return setsockopt_kernel(__level, __optname, __optval, __optlen, supported, allow_privileged_sock_opt); +} + +int sockinfo_tcp::getsockopt_offload(int __level, int __optname, void *__optval, + socklen_t *__optlen) +{ + int ret = -1; + + if (!__optval || !__optlen) { + errno = EFAULT; + return ret; + } + + if (0 == sockinfo::getsockopt(__level, __optname, __optval, __optlen)) { + return 0; + } + + switch (__level) { + case IPPROTO_TCP: + switch(__optname) { + case TCP_NODELAY: + if (*__optlen >= sizeof(int)) { + *(int *)__optval = tcp_nagle_disabled(&m_pcb); + si_tcp_logdbg("(TCP_NODELAY) nagle: %d", *(int *)__optval); + ret = 0; + } else { + errno = EINVAL; + } + break; + case TCP_QUICKACK: + if (*__optlen >= sizeof(int)) { + *(int *)__optval = m_pcb.quickack; + si_tcp_logdbg("(TCP_QUICKACK) value: %d", *(int *)__optval); + ret = 0; + } else { + errno = EINVAL; + } + break; + default: + ret = SOCKOPT_HANDLE_BY_OS; + break; + } + break; + case SOL_SOCKET: + switch(__optname) { + case SO_ERROR: + if (*__optlen >= sizeof(int)) { + *(int *)__optval = m_error_status; + si_tcp_logdbg("(SO_ERROR) status: %d", m_error_status); + m_error_status = 0; + ret = 0; + } else { + errno = EINVAL; + } + break; + case SO_REUSEADDR: + if (*__optlen >= sizeof(int)) { + *(int *)__optval = m_pcb.so_options & SOF_REUSEADDR; + si_tcp_logdbg("(SO_REUSEADDR) reuse: %d", *(int *)__optval); + ret = 0; + } else { + errno = EINVAL; + } + break; + case SO_KEEPALIVE: + if (*__optlen >= sizeof(int)) { + *(int *)__optval = (bool)(m_pcb.so_options & SOF_KEEPALIVE); + si_tcp_logdbg("(SO_KEEPALIVE) keepalive: %d", *(int *)__optval); + ret = 0; + } else { + errno = EINVAL; + } + break; + case SO_RCVBUF: + if (*__optlen >= sizeof(int)) { + *(int 
*)__optval = m_rcvbuff_max; + si_tcp_logdbg("(SO_RCVBUF) rcvbuf=%d", m_rcvbuff_max); + ret = 0; + } else { + errno = EINVAL; + } + break; + case SO_SNDBUF: + if (*__optlen >= sizeof(int)) { + *(int *)__optval = m_sndbuff_max; + si_tcp_logdbg("(SO_SNDBUF) sndbuf=%d", m_sndbuff_max); + ret = 0; + } else { + errno = EINVAL; + } + break; + case SO_LINGER: + if (*__optlen > 0) { + memcpy(__optval, &m_linger, std::min(*__optlen , sizeof(struct linger))); + si_tcp_logdbg("(SO_LINGER) l_onoff = %d, l_linger = %d", m_linger.l_onoff, m_linger.l_linger); + ret = 0; + } else { + errno = EINVAL; + } + break; + case SO_RCVTIMEO: + if (*__optlen >= sizeof(struct timeval)) { + struct timeval* tv = (struct timeval*)__optval; + tv->tv_sec = m_loops_timer.get_timeout_msec() / 1000; + tv->tv_usec = (m_loops_timer.get_timeout_msec() % 1000) * 1000; + si_tcp_logdbg("(SO_RCVTIMEO) msec=%d", m_loops_timer.get_timeout_msec()); + ret = 0; + } else { + errno = EINVAL; + } + break; + + case SO_BINDTODEVICE: + //todo add support + errno = ENOPROTOOPT; + break; + case SO_MAX_PACING_RATE: + ret = sockinfo::getsockopt(__level, __optname, __optval, __optlen); + break; + default: + ret = SOCKOPT_HANDLE_BY_OS; + break; + } + break; + case IPPROTO_IP: + switch (__optname) { + default: + ret = SOCKOPT_HANDLE_BY_OS; + break; + } + break; + default: + ret = SOCKOPT_HANDLE_BY_OS; + break; + } + + BULLSEYE_EXCLUDE_BLOCK_START + if (ret && ret != SOCKOPT_HANDLE_BY_OS) { + si_tcp_logdbg("getsockopt failed (ret=%d %m)", ret); + } + BULLSEYE_EXCLUDE_BLOCK_END + return ret; +} + +int sockinfo_tcp::getsockopt(int __level, int __optname, void *__optval, + socklen_t *__optlen) +{ + int ret = getsockopt_offload(__level, __optname, __optval, __optlen); + if (ret != SOCKOPT_HANDLE_BY_OS) + return ret; + else { + char buf[256]; + snprintf(buf, sizeof(buf), "unimplemented getsockopt __level=%#x, __optname=%#x, __optlen=%d", (unsigned)__level, (unsigned)__optname, __optlen ? 
*__optlen : 0); + buf[ sizeof(buf)-1 ] = '\0'; + + VLOG_PRINTF_INFO(safe_mce_sys().exception_handling.get_log_severity(), "%s", buf); + int rc = handle_exception_flow(); + switch (rc) { + case -1: + return rc; + case -2: + vma_throw_object_with_msg(vma_unsupported_api, buf); + } + } + + ret = orig_os_api.getsockopt(m_fd, __level, __optname, __optval, __optlen); + + BULLSEYE_EXCLUDE_BLOCK_START + if (ret) { + si_tcp_logdbg("getsockopt failed (ret=%d %m)", ret); + } + BULLSEYE_EXCLUDE_BLOCK_END + return ret; +} + +int sockinfo_tcp::getsockname(sockaddr *__name, socklen_t *__namelen) +{ + __log_info_func(""); + + if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { + si_tcp_logdbg("passthrough - go to OS getsockname"); + return orig_os_api.getsockname(m_fd, __name, __namelen); + } + + // according to man address should be truncated if given struct is too small + if (__name && __namelen) { + if ((int)*__namelen < 0) { + si_tcp_logdbg("negative __namelen is not supported: %d", *__namelen); + errno = EINVAL; + return -1; + } + + if (*__namelen) { + m_bound.get_sa(__name, *__namelen); + } + + *__namelen = m_bound.get_socklen(); + } + + return 0; +} + +int sockinfo_tcp::getpeername(sockaddr *__name, socklen_t *__namelen) +{ + __log_info_func(""); + + if (m_sock_offload == TCP_SOCK_PASSTHROUGH) { + si_tcp_logdbg("passthrough - go to OS getpeername"); + return orig_os_api.getpeername(m_fd, __name, __namelen); + } + + if (m_conn_state != TCP_CONN_CONNECTED) { + errno = ENOTCONN; + return -1; + } + + // according to man address should be truncated if given struct is too small + if (__name && __namelen) { + if ((int)*__namelen < 0) { + si_tcp_logdbg("negative __namelen is not supported: %d", *__namelen); + errno = EINVAL; + return -1; + } + + if (*__namelen) { + m_connected.get_sa(__name, *__namelen); + } + + *__namelen = m_connected.get_socklen(); + } + + return 0; +} + +int sockinfo_tcp::rx_wait_helper(int &poll_count, bool is_blocking) +{ + int ret; + int n; + uint64_t poll_sn = 0; 
+ rx_ring_map_t::iterator rx_ring_iter; + epoll_event rx_epfd_events[SI_RX_EPFD_EVENT_MAX]; + + // poll for completion + __log_info_func(""); + + + poll_count++; + n = 0; + // if in listen state go directly to wait part + + consider_rings_migration(); + + // There's only one CQ + m_rx_ring_map_lock.lock(); + if (likely(m_p_rx_ring)) { + n = m_p_rx_ring->poll_and_process_element_rx(&poll_sn); + } + else { //There's more than one CQ, go over each one + for (rx_ring_iter = m_rx_ring_map.begin(); rx_ring_iter != m_rx_ring_map.end(); rx_ring_iter++) { + if (unlikely(rx_ring_iter->second->refcnt <= 0)) { + __log_err("Attempt to poll illegal cq"); + continue; + } + ring* p_ring = rx_ring_iter->first; + //g_p_lwip->do_timers(); + n += p_ring->poll_and_process_element_rx(&poll_sn); + } + } + m_rx_ring_map_lock.unlock(); + if (likely(n > 0)) { // got completions from CQ + __log_entry_funcall("got %d elements sn=%llu", n, (unsigned long long)poll_sn); + + if (m_n_rx_pkt_ready_list_count) + m_p_socket_stats->counters.n_rx_poll_hit++; + return n; + } + + // if in blocking accept state skip poll phase and go to sleep directly + if (m_loops_timer.is_timeout() || !is_blocking) { + errno = EAGAIN; + return -1; + } + + if (poll_count < m_n_sysvar_rx_poll_num || m_n_sysvar_rx_poll_num == -1) { + return 0; + } + + m_p_socket_stats->counters.n_rx_poll_miss++; + // if we polling too much - go to sleep + si_tcp_logfuncall("%d: too many polls without data blocking=%d", m_fd, is_blocking); + if (g_b_exit) { + errno = EINTR; + return -1; + } + + //arming CQs + m_rx_ring_map_lock.lock(); + if (likely(m_p_rx_ring)) { + ret = m_p_rx_ring->request_notification(CQT_RX, poll_sn); + if (ret != 0) { + m_rx_ring_map_lock.unlock(); + return 0; + } + } + else { + for (rx_ring_iter = m_rx_ring_map.begin(); rx_ring_iter != m_rx_ring_map.end(); rx_ring_iter++) { + if (rx_ring_iter->second->refcnt <= 0) { + continue; + } + ring* p_ring = rx_ring_iter->first; + if (p_ring) { + ret = 
p_ring->request_notification(CQT_RX, poll_sn); + if (ret != 0) { + m_rx_ring_map_lock.unlock(); + return 0; + } + } + } + } + m_rx_ring_map_lock.unlock(); + + //Check if we have a packet in receive queue before we going to sleep and + //update is_sleeping flag under the same lock to synchronize between + //this code and wakeup mechanism. + + lock_tcp_con(); + if (!m_n_rx_pkt_ready_list_count && !m_ready_conn_cnt) + { + going_to_sleep(); + unlock_tcp_con(); + } + else + { + unlock_tcp_con(); + return 0; + } + + //sleep on different CQs and OS listen socket + ret = orig_os_api.epoll_wait(m_rx_epfd, rx_epfd_events, SI_RX_EPFD_EVENT_MAX, m_loops_timer.time_left_msec()); + + lock_tcp_con(); + return_from_sleep(); + unlock_tcp_con(); + + if (ret <= 0) + return ret; + + //If there is a ready packet in a queue we want to return to user as quickest as possible + if(m_n_rx_pkt_ready_list_count) + return 0; + + for (int event_idx = 0; event_idx < ret; event_idx++) + { + int fd = rx_epfd_events[event_idx].data.fd; + if (is_wakeup_fd(fd)) + { // wakeup event + lock_tcp_con(); + remove_wakeup_fd(); + unlock_tcp_con(); + continue; + } + + // Check if OS fd is ready for reading + if (fd == m_fd) { + continue; + } + + // poll cq. fd == cq channel fd. + cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(fd); + if (p_cq_ch_info) { + ring* p_ring = p_cq_ch_info->get_ring(); + if (p_ring) { + p_ring->wait_for_notification_and_process_element(fd, &poll_sn); + } + } + } + return ret; +} + +mem_buf_desc_t* sockinfo_tcp::get_next_desc(mem_buf_desc_t *p_desc) +{ + m_rx_pkt_ready_list.pop_front(); + m_p_socket_stats->n_rx_ready_pkt_count--; + + m_n_rx_pkt_ready_list_count--; + if (p_desc->p_next_desc) { + //vlog_printf(VLOG_ERROR, "detected chained pbufs! 
// REF %u", p_desc->lwip_pbuf.pbuf.ref);
		// Continuation of get_next_desc() (head on the previous line): the
		// popped descriptor heads a pbuf chain, so promote the next fragment
		// to a stand-alone ready packet and recycle the consumed head.
		mem_buf_desc_t *prev = p_desc;
		p_desc = p_desc->p_next_desc;
		prev->rx.sz_payload = prev->lwip_pbuf.pbuf.len;
		p_desc->rx.sz_payload = p_desc->lwip_pbuf.pbuf.tot_len = prev->lwip_pbuf.pbuf.tot_len - prev->lwip_pbuf.pbuf.len;
		p_desc->rx.n_frags = --prev->rx.n_frags;
		p_desc->rx.src = prev->rx.src;
		p_desc->inc_ref_count();
		m_rx_pkt_ready_list.push_front(p_desc);
		m_n_rx_pkt_ready_list_count++;
		m_p_socket_stats->n_rx_ready_pkt_count++;
		// Detach the consumed head from the chain before recycling it.
		prev->lwip_pbuf.pbuf.next = NULL;
		prev->p_next_desc = NULL;
		prev->rx.n_frags = 1;
		reuse_buffer(prev);
	}
	else
		reuse_buffer(p_desc);
	if (m_n_rx_pkt_ready_list_count)
		return m_rx_pkt_ready_list.front();
	else
		return NULL;
}

// Non-consuming variant of get_next_desc(): walk the current pbuf chain
// first, then continue through the ready list at rx_pkt_ready_list_idx
// (advancing the caller's index); returns NULL when nothing is left to peek.
mem_buf_desc_t* sockinfo_tcp::get_next_desc_peek(mem_buf_desc_t *pdesc, int& rx_pkt_ready_list_idx)
{

	if (unlikely(pdesc->p_next_desc)) {
		pdesc = pdesc->p_next_desc;
	}else if (rx_pkt_ready_list_idx < m_n_rx_pkt_ready_list_count) {
		pdesc = m_rx_pkt_ready_list[rx_pkt_ready_list_idx];
		rx_pkt_ready_list_idx++;
	}else {
		pdesc = NULL;
	}

	return pdesc;
}

// Accessor for the timestamps of the most recently received data.
timestamps_t* sockinfo_tcp::get_socket_timestamps()
{
	return &m_rx_timestamps;
}

// Post-dequeue hook required by the sockinfo interface; a no-op for TCP.
void sockinfo_tcp::post_deqeue(bool release_buff)
{
	NOT_IN_USE(release_buff);
}

// Zero-copy receive: fill the user's first iovec with a vma_packets_t header
// followed by vma_packet_t entries whose iovecs point directly at the rx
// buffers (handed to the user until returned via free_packets()). Returns the
// number of payload bytes exposed, or -1 with errno = ENOBUFS when even the
// fixed header does not fit in p_iov[0].
int sockinfo_tcp::zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags) {
	NOT_IN_USE(p_flags);
	int total_rx = 0, offset = 0;
	// Space remaining in the user buffer after the worst-case fixed header.
	int len = p_iov[0].iov_len - sizeof(vma_packets_t) - sizeof(vma_packet_t) - sizeof(iovec);
	mem_buf_desc_t* p_desc_iter;
	mem_buf_desc_t* prev;

	// Make sure there is enough room for the header
	if (len < 0) {
		errno = ENOBUFS;
		return -1;
	}

	// Skip the part of the first fragment already consumed by earlier reads.
	pdesc->rx.frag.iov_base = (uint8_t*)pdesc->rx.frag.iov_base + m_rx_pkt_ready_offset;
	pdesc->rx.frag.iov_len -= m_rx_pkt_ready_offset;
	p_desc_iter = pdesc;

	// Copy iov pointers to user buffer
	vma_packets_t *p_packets = (vma_packets_t*)p_iov[0].iov_base;
	p_packets->n_packet_num = 0;

	offset +=
sizeof(p_packets->n_packet_num); // skip n_packet_num size + + while(len >= 0 && m_n_rx_pkt_ready_list_count) { + vma_packet_t *p_pkts = (vma_packet_t *)((char *)p_packets + offset); + p_packets->n_packet_num++; + p_pkts->packet_id = (void*)p_desc_iter; + p_pkts->sz_iov = 0; + while(len >= 0 && p_desc_iter) { + + p_pkts->iov[p_pkts->sz_iov++] = p_desc_iter->rx.frag; + total_rx += p_desc_iter->rx.frag.iov_len; + + prev = p_desc_iter; + p_desc_iter = p_desc_iter->p_next_desc; + if (p_desc_iter) { + p_desc_iter->lwip_pbuf.pbuf.tot_len = prev->lwip_pbuf.pbuf.tot_len - prev->lwip_pbuf.pbuf.len; + p_desc_iter->rx.n_frags = --prev->rx.n_frags; + p_desc_iter->rx.src = prev->rx.src; + p_desc_iter->inc_ref_count(); + prev->lwip_pbuf.pbuf.next = NULL; + prev->p_next_desc = NULL; + prev->rx.n_frags = 1; + } + len -= sizeof(iovec); + offset += sizeof(iovec); + } + + if (len < 0 && p_desc_iter){ + m_rx_pkt_ready_list.pop_front(); + m_rx_pkt_ready_list.push_front(p_desc_iter); + break; + } + m_rx_pkt_ready_list.pop_front(); + m_n_rx_pkt_ready_list_count--; + m_p_socket_stats->n_rx_ready_pkt_count--; + m_p_socket_stats->n_rx_zcopy_pkt_count++; + + if (m_n_rx_pkt_ready_list_count) + p_desc_iter = m_rx_pkt_ready_list.front(); + + len -= sizeof(vma_packet_t); + offset += sizeof(vma_packet_t); + } + + return total_rx; +} + +void sockinfo_tcp::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) +{ + const char * const tcp_sock_state_str[] = { + "NA", + "TCP_SOCK_INITED", + "TCP_SOCK_BOUND", + "TCP_SOCK_LISTEN_READY", + "TCP_SOCK_ACCEPT_READY", + "TCP_SOCK_CONNECTED_RD", + "TCP_SOCK_CONNECTED_WR", + "TCP_SOCK_CONNECTED_RDWR", + "TCP_SOCK_ASYNC_CONNECT", + "TCP_SOCK_ACCEPT_SHUT", + }; + + const char * const tcp_conn_state_str[] = { + "TCP_CONN_INIT", + "TCP_CONN_CONNECTING", + "TCP_CONN_CONNECTED", + "TCP_CONN_FAILED", + "TCP_CONN_TIMEOUT", + "TCP_CONN_ERROR", + "TCP_CONN_RESETED", + }; + struct tcp_pcb pcb; + tcp_sock_state_e sock_state; + tcp_conn_state_e conn_state; + u32_t 
last_unsent_seqno = 0 , last_unacked_seqno = 0, first_unsent_seqno = 0, first_unacked_seqno = 0; + u16_t last_unsent_len = 0 , last_unacked_len = 0, first_unsent_len = 0, first_unacked_len = 0; + int rcvbuff_max, rcvbuff_current, rcvbuff_non_tcp_recved, rx_pkt_ready_list_size, rx_ctl_packets_list_size, rx_ctl_reuse_list_size; + + sockinfo::statistics_print(log_level); + + // Prepare data + lock_tcp_con(); + + pcb = m_pcb; + + if (m_pcb.unsent) { + first_unsent_seqno = m_pcb.unsent->seqno; + first_unsent_len = m_pcb.unsent->len; + + if (m_pcb.last_unsent) { + last_unsent_seqno = m_pcb.last_unsent->seqno; + last_unsent_len = m_pcb.last_unsent->len; + } + } + + if (m_pcb.unacked) { + first_unacked_seqno = m_pcb.unacked->seqno; + first_unacked_len = m_pcb.unacked->len; + + if (m_pcb.last_unacked) { + last_unacked_seqno = m_pcb.last_unacked->seqno; + last_unacked_len = m_pcb.last_unacked->len; + } + } + + sock_state = m_sock_state; + conn_state = m_conn_state; + rcvbuff_max = m_rcvbuff_max; + rcvbuff_current = m_rcvbuff_current; + rcvbuff_non_tcp_recved = m_rcvbuff_non_tcp_recved; + rx_pkt_ready_list_size = m_rx_pkt_ready_list.size(); + rx_ctl_packets_list_size = m_rx_ctl_packets_list.size(); + rx_ctl_reuse_list_size = m_rx_ctl_reuse_list.size(); + + unlock_tcp_con(); + + // Socket data + vlog_printf(log_level, "Socket state : %s\n", tcp_sock_state_str[sock_state]); + vlog_printf(log_level, "Connection state : %s\n", tcp_conn_state_str[conn_state]); + vlog_printf(log_level, "Receive buffer : m_rcvbuff_current %d, m_rcvbuff_max %d, m_rcvbuff_non_tcp_recved %d\n", rcvbuff_current, rcvbuff_max, rcvbuff_non_tcp_recved); + vlog_printf(log_level, "Rx lists size : m_rx_pkt_ready_list %d, m_rx_ctl_packets_list %d, m_rx_ctl_reuse_list %d\n", rx_pkt_ready_list_size, rx_ctl_packets_list_size, rx_ctl_reuse_list_size); + + // PCB data + vlog_printf(log_level, "PCB state : %s\n", tcp_state_str[get_tcp_state(&pcb)]); + vlog_printf(log_level, "PCB flags : 0x%x\n", pcb.flags); + 
vlog_printf(log_level, "Segment size : mss %hu, advtsd_mss %hu\n", pcb.mss, pcb.advtsd_mss); + + // Window scaling + if (pcb.flags & TF_WND_SCALE) { + vlog_printf(log_level, "Window scaling : ENABLED, rcv_scale %u, snd_scale %u\n", pcb.rcv_scale, pcb.snd_scale); + + // Receive and send windows + vlog_printf(log_level, "Receive window : rcv_wnd %u (%u), rcv_ann_wnd %u (%u), rcv_wnd_max %u (%u), rcv_wnd_max_desired %u (%u)\n", + pcb.rcv_wnd, RCV_WND_SCALE(&pcb, pcb.rcv_wnd), pcb.rcv_ann_wnd, RCV_WND_SCALE(&pcb, pcb.rcv_ann_wnd), + pcb.rcv_wnd_max, RCV_WND_SCALE(&pcb, pcb.rcv_wnd_max), pcb.rcv_wnd_max_desired, RCV_WND_SCALE(&pcb, pcb.rcv_wnd_max_desired)); + + vlog_printf(log_level, "Send window : snd_wnd %u (%u), snd_wnd_max %u (%u)\n", + pcb.snd_wnd, (pcb.snd_wnd >> pcb.snd_scale), pcb.snd_wnd_max, (pcb.snd_wnd_max >> pcb.snd_scale)); + } else { + vlog_printf(log_level, "Window scaling : DISABLED\n"); + + // Receive and send windows + vlog_printf(log_level, "Receive window : rcv_wnd %u, rcv_ann_wnd %u, rcv_wnd_max %u, rcv_wnd_max_desired %u\n", + pcb.rcv_wnd, pcb.rcv_ann_wnd, pcb.rcv_wnd_max, pcb.rcv_wnd_max_desired); + + vlog_printf(log_level, "Send window : snd_wnd %u, snd_wnd_max %u\n", pcb.snd_wnd, pcb.snd_wnd_max); + } + + // Congestion variable + vlog_printf(log_level, "Congestion : cwnd %u\n", pcb.cwnd); + + // Receiver variables + vlog_printf(log_level, "Receiver data : rcv_nxt %u, rcv_ann_right_edge %u\n", pcb.rcv_nxt, pcb.rcv_ann_right_edge); + + // Sender variables + vlog_printf(log_level, "Sender data : snd_nxt %u, snd_wl1 %u, snd_wl2 %u\n", pcb.snd_nxt, pcb.snd_wl1, pcb.snd_wl2); + + // Send buffer + vlog_printf(log_level, "Send buffer : snd_buf %u, max_snd_buff %u\n", pcb.snd_buf, pcb.max_snd_buff); + + // Retransmission + vlog_printf(log_level, "Retransmission : rtime %hd, rto %u, nrtx %u\n", pcb.rtime, pcb.rto, pcb.nrtx); + + // RTT + vlog_printf(log_level, "RTT variables : rttest %u, rtseq %u\n", pcb.rttest, pcb.rtseq); + + // First unsent + if 
(first_unsent_seqno) { + vlog_printf(log_level, "First unsent : seqno %u, len %hu, seqno + len %u\n", first_unsent_seqno, first_unsent_len, first_unsent_seqno + first_unsent_len); + + // Last unsent + if (last_unsent_seqno) { + vlog_printf(log_level, "Last unsent : seqno %u, len %hu, seqno + len %u\n", last_unsent_seqno, last_unsent_len, last_unsent_seqno + last_unsent_len); + } + } else { + vlog_printf(log_level, "First unsent : NULL\n"); + } + + // First unsent + if (first_unacked_seqno) { + vlog_printf(log_level, "First unacked : seqno %u, len %hu, seqno + len %u\n", first_unacked_seqno, first_unacked_len, first_unacked_seqno + first_unacked_len); + + // Last unacked + if (last_unacked_seqno) { + vlog_printf(log_level, "Last unacked : seqno %u, len %hu, seqno + len %u\n", last_unacked_seqno, last_unacked_len, last_unacked_seqno + last_unacked_len); + } + } else { + vlog_printf(log_level, "First unacked : NULL\n"); + } + + // Acknowledge + vlog_printf(log_level, "Acknowledge : lastack %u\n", pcb.lastack); + + // TCP timestamp +#if LWIP_TCP_TIMESTAMPS + if (pcb.flags & TF_TIMESTAMP) { + vlog_printf(log_level, "Timestamp : ts_lastacksent %u, ts_recent %u\n", pcb.ts_lastacksent, pcb.ts_recent); + } +#endif +} + +int sockinfo_tcp::free_packets(struct vma_packet_t *pkts, size_t count) +{ + int ret = 0; + unsigned int index = 0; + int bytes_to_tcp_recved; + int total_rx = 0, offset = 0; + mem_buf_desc_t *buff; + char *buf = (char *)pkts; + + lock_tcp_con(); + for(index=0; index < count; index++){ + vma_packet_t *p_pkts = (vma_packet_t *)(buf + offset); + buff = (mem_buf_desc_t*)p_pkts->packet_id; + + if (m_p_rx_ring && !m_p_rx_ring->is_member(buff->p_desc_owner)) { + errno = ENOENT; + ret = -1; + break; + } + else if (m_rx_ring_map.find(buff->p_desc_owner->get_parent()) == m_rx_ring_map.end()) { + errno = ENOENT; + ret = -1; + break; + } + + total_rx += buff->rx.sz_payload; + reuse_buffer(buff); + m_p_socket_stats->n_rx_zcopy_pkt_count--; + + offset += p_pkts->sz_iov * 
sizeof(iovec) + sizeof(vma_packet_t);
	}

	if (total_rx > 0) {
		m_rcvbuff_current -= total_rx;
		// data that was not tcp_recved should do it now.
		if ( m_rcvbuff_non_tcp_recved > 0 ) {
			bytes_to_tcp_recved = min(m_rcvbuff_non_tcp_recved, total_rx);
			tcp_recved(&m_pcb, bytes_to_tcp_recved);
			m_rcvbuff_non_tcp_recved -= bytes_to_tcp_recved;
		}
	}

	unlock_tcp_con();
	return ret;
}

// Credit 'len' bytes back to the lwip receive window (socketxtreme buffer
// return path). Always returns 0.
// NOTE(review): unlike free_packets() above, this path calls tcp_recved()
// without taking the tcp connection lock — presumably the caller serializes;
// confirm against the socketxtreme call sites.
int sockinfo_tcp::free_buffs(uint16_t len)
{
	tcp_recved(&m_pcb, len);
	return 0;
}

// lwip TX-buffer allocation hook.
// Recovers the owning sockinfo_tcp from pcb->my_container and takes a buffer
// descriptor from its connected dst_entry. Returns NULL (as a pbuf*) when the
// socket has no dst_entry or the dst_entry has no buffer to give.
struct pbuf * sockinfo_tcp::tcp_tx_pbuf_alloc(void* p_conn)
{
	sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)(((struct tcp_pcb*)p_conn)->my_container);
	dst_entry_tcp *p_dst = (dst_entry_tcp *)(p_si_tcp->m_p_connected_dst_entry);
	mem_buf_desc_t* p_desc = NULL;
	if (likely(p_dst)) {
		p_desc = p_dst->get_buffer();
	}
	return (struct pbuf *)p_desc;
}

//single buffer only
// lwip TX-buffer release hook, the counterpart of tcp_tx_pbuf_alloc().
// Normal case: hand the buffer back to the connected dst_entry.
// Fallback (dst_entry already destroyed): drop the descriptor's pbuf ref
// count directly and, once it reaches zero, return the buffer to the global
// TX buffer pool.
void sockinfo_tcp::tcp_tx_pbuf_free(void* p_conn, struct pbuf *p_buff)
{
	sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)(((struct tcp_pcb*)p_conn)->my_container);
	dst_entry_tcp *p_dst = (dst_entry_tcp *)(p_si_tcp->m_p_connected_dst_entry);
	if (likely(p_dst)) {
		p_dst->put_buffer((mem_buf_desc_t *)p_buff);
	} else if (p_buff){
		mem_buf_desc_t * p_desc = (mem_buf_desc_t *)p_buff;

		//potential race, ref is protected here by tcp lock, and in ring by ring_tx lock
		if (likely(p_desc->lwip_pbuf_get_ref_count()))
			p_desc->lwip_pbuf_dec_ref_count();
		else
			__log_err("ref count of %p is already zero, double free??", p_desc);

		if (p_desc->lwip_pbuf.pbuf.ref == 0) {
			p_desc->p_next_desc = NULL;
			g_buffer_pool_tx->put_buffers_thread_safe(p_desc);
		}
	}
}

// lwip segment-allocation hook: forward to the owning socket's per-socket
// segment cache (get_tcp_seg()).
struct tcp_seg * sockinfo_tcp::tcp_seg_alloc(void* p_conn)
{
	sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)(((struct tcp_pcb*)p_conn)->my_container);
	return p_si_tcp->get_tcp_seg();
}

// lwip segment-release hook: forward to the owning socket's per-socket
// segment cache (put_tcp_seg()).
void sockinfo_tcp::tcp_seg_free(void* p_conn, struct tcp_seg * seg)
{
	sockinfo_tcp *p_si_tcp = (sockinfo_tcp *)(((struct tcp_pcb*)p_conn)->my_container);
	
p_si_tcp->put_tcp_seg(seg);
}

// Pop one tcp_seg from this socket's local free list.
// When the local list is empty it is refilled from the global pool in
// batches of TCP_SEG_COMPENSATION; returns NULL only if the global pool
// cannot supply a full batch.
struct tcp_seg * sockinfo_tcp::get_tcp_seg()
{
	struct tcp_seg * head = NULL;
	if (!m_tcp_seg_list) {
		m_tcp_seg_list = g_tcp_seg_pool->get_tcp_segs(TCP_SEG_COMPENSATION);
		if (unlikely(!m_tcp_seg_list)) return NULL;
		m_tcp_seg_count += TCP_SEG_COMPENSATION;
	}

	head = m_tcp_seg_list;
	m_tcp_seg_list = head->next;
	head->next = NULL;
	m_tcp_seg_in_use++;

	return head;
}

// Push one tcp_seg back onto this socket's local free list.
// When the cache has grown past 2*TCP_SEG_COMPENSATION and less than half of
// it is in use, half of the currently-free segments are detached from the
// head of the list and returned to the global pool, bounding per-socket
// hoarding.
void sockinfo_tcp::put_tcp_seg(struct tcp_seg * seg)
{
	if (unlikely(!seg)) return;

	seg->next = m_tcp_seg_list;
	m_tcp_seg_list = seg;
	m_tcp_seg_in_use--;
	if (m_tcp_seg_count > 2 * TCP_SEG_COMPENSATION && m_tcp_seg_in_use < m_tcp_seg_count / 2) {
		// count = half of the free segments; walk count-1 links so 'next'
		// lands on the last node of the sub-list being given back.
		int count = (m_tcp_seg_count - m_tcp_seg_in_use) / 2;
		struct tcp_seg * next = m_tcp_seg_list;
		for (int i = 0; i < count - 1; i++) {
			next = next->next;
		}
		struct tcp_seg * head = m_tcp_seg_list;
		m_tcp_seg_list = next->next;
		next->next = NULL;
		g_tcp_seg_pool->put_tcp_segs(head);
		m_tcp_seg_count -= count;
	}
	return;
}

//tcp_seg_pool

// Preallocate 'size' tcp_seg objects as one array and link them into a
// singly-linked free list headed by m_p_head.
// NOTE(review): the NULL check after plain operator new[] is dead code —
// new[] throws std::bad_alloc on failure rather than returning NULL (unless
// the project builds with -fno-exceptions / a nothrow allocator; confirm
// build flags before relying on the throw_vma_exception path).
tcp_seg_pool::tcp_seg_pool(int size) {
	m_tcp_segs_array = new struct tcp_seg[size];
	if (m_tcp_segs_array == NULL) {
		__log_dbg("TCP segments allocation failed");
		free_tsp_resources();
		throw_vma_exception("TCP segments allocation failed");
	}
	memset(m_tcp_segs_array, 0, sizeof(tcp_seg) * size);
	for (int i = 0; i < size - 1; i++) {
		m_tcp_segs_array[i].next = &m_tcp_segs_array[i + 1];
	}
	m_p_head = &m_tcp_segs_array[0];
}

tcp_seg_pool::~tcp_seg_pool() {
	free_tsp_resources();
}

// Release the backing array (segments are never freed individually).
void tcp_seg_pool::free_tsp_resources() {
	delete [] m_tcp_segs_array;
}

// Detach exactly 'amount' segments from the pool's free list under the pool
// lock. All-or-nothing: if fewer than 'amount' segments remain, nothing is
// taken and NULL is returned. Returns the head of the detached,
// NULL-terminated sub-list.
tcp_seg * tcp_seg_pool::get_tcp_segs(int amount) {
	tcp_seg *head, *next, *prev;
	if (unlikely(amount <= 0))
		return NULL;
	lock();
	head = next = m_p_head;
	prev = NULL;
	while (amount > 0 && next) {
		prev = next;
		next = next->next;
		amount--;
	}
	if (amount) {
		unlock();
		return NULL;
	}
	prev->next = NULL;
	m_p_head = next;
	unlock();
	return
head;
}

// Return a NULL-terminated list of segments to the pool: walk to the list's
// tail, then splice the whole list onto the pool head under the lock.
void tcp_seg_pool::put_tcp_segs(tcp_seg * seg_list) {
	tcp_seg * next = seg_list;
	if (unlikely(!seg_list))
		return;

	while (next->next) {
		next = next->next;
	}

	lock();
	next->next = m_p_head;
	m_p_head = seg_list;
	unlock();
}


// Timer wheel for TCP sockets: period/resolution buckets, one bucket fired
// per resolution tick (see handle_timer_expired()).
// NOTE(review): assumes resolution > 0 and presumably that it divides period;
// resolution == 0 would divide by zero here — confirm callers always pass
// sane values.
// NOTE(review): as in tcp_seg_pool, the NULL check after plain operator new[]
// is dead code unless the build disables exceptions.
tcp_timers_collection::tcp_timers_collection(int period, int resolution)
{
	m_n_period = period;
	m_n_resolution = resolution;
	m_n_intervals_size = period/resolution;
	m_timer_handle = NULL;
	m_p_intervals = new timer_node_t*[m_n_intervals_size];
	BULLSEYE_EXCLUDE_BLOCK_START
	if (!m_p_intervals) {
		__log_dbg("failed to allocate memory");
		free_tta_resources();
		throw_vma_exception("failed to allocate memory");
	}

	BULLSEYE_EXCLUDE_BLOCK_END
	memset(m_p_intervals, 0, sizeof(timer_node_t*) * m_n_intervals_size);
	m_n_location = 0;
	m_n_next_insert_bucket = 0;
	m_n_count = 0;
}

tcp_timers_collection::~tcp_timers_collection()
{
	free_tta_resources();
}

// Tear down the wheel: remove registered timer nodes, then free the bucket
// array.
// NOTE(review): remove_timer() is invoked once per non-empty bucket, so only
// the head node of each bucket is removed in this pass; any remaining nodes
// trigger the debug message below. Whether that residue is expected (nodes
// owned/freed elsewhere) or a leak needs confirming against timer_node_t
// ownership rules.
void tcp_timers_collection::free_tta_resources(void)
{
	if (m_n_count) {
		for (int i = 0; i < m_n_intervals_size; i++) {
			if (m_p_intervals[i]) {
				remove_timer(m_p_intervals[i]);
			}
		}

		if (m_n_count) {
			__log_dbg("not all TCP timers have been removed, count=%d", m_n_count);
		}
	}

	delete[] m_p_intervals;
}

// Deferred-destruction hook: when the event-handler manager is running,
// deletion is delegated to it (so it can unregister our periodic timer
// safely from its own thread); otherwise fall back to immediate cleanup.
void tcp_timers_collection::clean_obj()
{
	if (is_cleaned()) {
		return ;
	}

	set_cleaned();
	m_timer_handle = NULL;
	if (g_p_event_handler_manager->is_running()) {
		g_p_event_handler_manager->unregister_timers_event_and_delete(this);
	} else {
		cleanable_obj::clean_obj();
	}
}

// Periodic tick: fire every handler in the current bucket, then advance the
// wheel position round-robin. Each registered handler is thus called once
// per full period, while this callback itself runs once per resolution.
void tcp_timers_collection::handle_timer_expired(void* user_data)
{
	NOT_IN_USE(user_data);
	timer_node_t* iter = m_p_intervals[m_n_location];
	while (iter) {
		__log_funcall("timer expired on %p", iter->handler);
		iter->handler->handle_timer_expired(iter->user_data);
		iter = iter->next;
	}
	m_n_location = (m_n_location + 1) % m_n_intervals_size;

	/* Processing all messages for the daemon */
	
g_p_agent->progress(); +} + +void tcp_timers_collection::add_new_timer(timer_node_t* node, timer_handler* handler, void* user_data) +{ + node->handler = handler; + node->user_data = user_data; + node->group = this; + node->next = NULL; + node->prev = NULL; + if (m_p_intervals[m_n_next_insert_bucket] != NULL) { + m_p_intervals[m_n_next_insert_bucket]->prev = node; + node->next = m_p_intervals[m_n_next_insert_bucket]; + } + m_p_intervals[m_n_next_insert_bucket] = node; + m_n_next_insert_bucket = (m_n_next_insert_bucket + 1) % m_n_intervals_size; + + if (m_n_count == 0) { + m_timer_handle = g_p_event_handler_manager->register_timer_event(m_n_resolution , this, PERIODIC_TIMER, NULL); + } + m_n_count++; + + __log_dbg("new TCP timer handler [%p] was added", handler); +} + +void tcp_timers_collection::remove_timer(timer_node_t* node) +{ + if (!node) return; + + node->group = NULL; + + if (node->prev) { + node->prev->next = node->next; + } else { + for (int i = 0; i < m_n_intervals_size; i++) { + if (m_p_intervals[i] == node) { + m_p_intervals[i] = node->next; + break; + } + } + } + + if (node->next) { + node->next->prev = node->prev; + } + + m_n_count--; + if (m_n_count == 0) { + if (m_timer_handle) { + g_p_event_handler_manager->unregister_timer_event(this, m_timer_handle); + m_timer_handle = NULL; + } + } + + __log_dbg("TCP timer handler [%p] was removed", node->handler); + + free(node); +} + +void sockinfo_tcp::update_header_field(data_updater *updater) +{ + lock_tcp_con(); + + if (m_p_connected_dst_entry) { + updater->update_field(*m_p_connected_dst_entry); + } + + unlock_tcp_con(); +} diff --git a/src/vma/sock/sockinfo_tcp.h b/src/vma/sock/sockinfo_tcp.h new file mode 100644 index 0000000..e5e2289 --- /dev/null +++ b/src/vma/sock/sockinfo_tcp.h @@ -0,0 +1,512 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef TCP_SOCKINFO_H +#define TCP_SOCKINFO_H + +#include "utils/lock_wrapper.h" +#include "vma/proto/peer_key.h" +#include "vma/proto/mem_buf_desc.h" +#include "vma/sock/socket_fd_api.h" +#include "vma/dev/buffer_pool.h" +#include "vma/dev/cq_mgr.h" +#include "vma/vma_extra.h" + +// LWIP includes +#include "vma/lwip/opt.h" +#include "vma/lwip/tcp_impl.h" + +#include "sockinfo.h" + +/** + * Tcp socket states: rdma_offload or os_passthrough. 
in rdma_offload: + * init --/bind()/ --> bound -- /listen()/ --> accept_ready -- /accept()may go to connected/ --> connected + * init --(optional: bind()/ -- /connect()|async_connect/--> connected --/close()/--> init + * may need to handle bind before connect in the future + */ +enum tcp_sock_offload_e { + TCP_SOCK_PASSTHROUGH = 1, // OS handling this socket connection +// TCP_SOCK_RDMA_CM, // Offloaded, uses RDMA CM - SDP like connection + TCP_SOCK_LWIP // Offloaded, uses LWIP for wire compatible TCP impl +}; + +enum tcp_sock_state_e { + TCP_SOCK_INITED = 1, + TCP_SOCK_BOUND, + TCP_SOCK_LISTEN_READY, // internal state that indicate that prepareListen was called + TCP_SOCK_ACCEPT_READY, + TCP_SOCK_CONNECTED_RD, // ready to rcv + TCP_SOCK_CONNECTED_WR, // ready to send + TCP_SOCK_CONNECTED_RDWR, // full duplex op + TCP_SOCK_ASYNC_CONNECT, // async connect in progress + TCP_SOCK_ACCEPT_SHUT // after shutdown on TCP_SOCK_ACCEPT_READY socket +}; + +/** + * state machine for the connect() side connection establishment. 
Taken from VMS + */ +enum tcp_conn_state_e { + TCP_CONN_INIT = 0, + TCP_CONN_CONNECTING, + TCP_CONN_CONNECTED, + TCP_CONN_FAILED, + TCP_CONN_TIMEOUT, + TCP_CONN_ERROR, + TCP_CONN_RESETED +}; + +struct socket_option_t { + const int level; + const int optname; + const socklen_t optlen; + void *optval; + + socket_option_t(const int _level, const int _optname, const void *_optval, const socklen_t _optlen) : + level(_level), optname(_optname), optlen(_optlen), optval(malloc(optlen)) { + memcpy(optval, _optval, optlen); + } + + ~socket_option_t() { if (optval) free(optval); } +}; + +typedef std::deque socket_options_list_t; +typedef std::map ready_pcb_map_t; +typedef std::map syn_received_map_t; +typedef std::map peer_map_t; + +/* taken from inet_ecn.h in kernel */ +enum inet_ecns { + INET_ECN_NOT_ECT = 0, + INET_ECN_ECT_1 = 1, + INET_ECN_ECT_0 = 2, + INET_ECN_CE = 3, + INET_ECN_MASK = 3, +}; + +class sockinfo_tcp : public sockinfo, public timer_handler +{ +public: + static inline size_t accepted_conns_node_offset(void) {return NODE_OFFSET(sockinfo_tcp, accepted_conns_node);} + typedef vma_list_t sock_list_t; + sockinfo_tcp(int fd); + virtual ~sockinfo_tcp(); + + virtual void clean_obj(); + + void setPassthrough(bool _isPassthrough = true) { + m_sock_offload = _isPassthrough ? TCP_SOCK_PASSTHROUGH : TCP_SOCK_LWIP; + m_p_socket_stats->b_is_offloaded = ! _isPassthrough; + } + bool isPassthrough() {return m_sock_offload == TCP_SOCK_PASSTHROUGH;} + + int prepareListen(); + int shutdown(int __how); + + //Not always we can close immediately TCP socket: we can do that only after the TCP connection in closed. + //In this method we just kikstarting the TCP connection termination (empty the unsent/unacked, senf FIN...) 
+ //Return val: true is the socket is already closable and false otherwise + virtual bool prepare_to_close(bool process_shutdown = false); + void create_dst_entry(); + bool prepare_dst_to_send(bool is_accepted_socket = false); + + virtual int fcntl(int __cmd, unsigned long int __arg); + virtual int ioctl(unsigned long int __request, unsigned long int __arg); + virtual int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); + virtual int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); + int getsockopt_offload(int __level, int __optname, void *__optval, socklen_t *__optlen); + virtual int connect(const sockaddr*, socklen_t); + virtual int bind(const sockaddr *__addr, socklen_t __addrlen); + virtual int listen(int backlog); + virtual int accept(struct sockaddr *__addr, socklen_t *__addrlen); + virtual int accept4(struct sockaddr *__addr, socklen_t *__addrlen, int __flags); + virtual int getsockname(sockaddr *__name, socklen_t *__namelen); + virtual int getpeername(sockaddr *__name, socklen_t *__namelen); + + virtual int free_packets(struct vma_packet_t *pkts, size_t count); + + /* This function is used for socketxtreme mode */ + virtual int free_buffs(uint16_t len); + + virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); + + //Returns the connected pcb, with 5 tuple which matches the input arguments, + //in state "SYN Received" or NULL if pcb wasn't found + + struct tcp_pcb* get_syn_received_pcb(in_addr_t src_addr, in_port_t src_port, in_addr_t dest_addr, in_port_t dest_port); + + ssize_t tx(vma_tx_call_attr_t &tx_arg); + ssize_t rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, sockaddr *__from = NULL, socklen_t *__fromlen = NULL, struct msghdr *__msg = NULL); +#ifdef DEFINED_TSO + static err_t ip_output(struct pbuf *p, void* v_p_conn, uint16_t flags); + static err_t ip_output_syn_ack(struct pbuf *p, void* v_p_conn, uint16_t flags); +#else + static err_t 
ip_output(struct pbuf *p, void* v_p_conn, int is_rexmit, uint8_t is_dummy); + static err_t ip_output_syn_ack(struct pbuf *p, void* v_p_conn, int is_rexmit, uint8_t is_dummy); +#endif /* DEFINED_TSO */ + static void tcp_state_observer(void* pcb_container, enum tcp_state new_state); + static uint16_t get_route_mtu(struct tcp_pcb *pcb); + + virtual void update_header_field(data_updater *updater); + virtual bool rx_input_cb(mem_buf_desc_t* p_rx_pkt_mem_buf_desc_info, void* pv_fd_ready_array); + virtual void set_rx_packet_processor(void) { } + + static struct pbuf * tcp_tx_pbuf_alloc(void* p_conn); + static void tcp_tx_pbuf_free(void* p_conn, struct pbuf *p_buff); + static struct tcp_seg * tcp_seg_alloc(void* p_conn); + static void tcp_seg_free(void* p_conn, struct tcp_seg * seg); + + bool inline is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL); + bool inline is_writeable(); + bool inline is_errorable(int *errors); + bool is_closable() { return get_tcp_state(&m_pcb) == CLOSED && m_syn_received.empty() && m_accepted_conns.empty(); } + bool skip_os_select() + { + // calling os select on offloaded TCP sockets makes no sense unless it's a listen socket + // to make things worse, it returns that os fd is ready... 
+ return (m_sock_offload == TCP_SOCK_LWIP && !is_server() && m_conn_state != TCP_CONN_INIT); + } + + bool is_connected() + { + return m_sock_state == TCP_SOCK_CONNECTED_RDWR; + } + + inline bool is_rtr() + { + return (m_n_rx_pkt_ready_list_count || m_sock_state == TCP_SOCK_CONNECTED_RD || m_sock_state == TCP_SOCK_CONNECTED_RDWR); + } + + bool is_rts() + { + //ready to send + return m_sock_state == TCP_SOCK_CONNECTED_WR || m_sock_state == TCP_SOCK_CONNECTED_RDWR; + } + + bool is_server() + { + return m_sock_state == TCP_SOCK_ACCEPT_READY || m_sock_state == TCP_SOCK_ACCEPT_SHUT; + } + + virtual void update_socket_timestamps(timestamps_t * ts) + { + m_rx_timestamps = *ts; + } + + static const int CONNECT_DEFAULT_TIMEOUT_MS = 10000; + virtual inline fd_type_t get_type() + { + return FD_TYPE_SOCKET; + } + + void handle_timer_expired(void* user_data); + + list_node accepted_conns_node; + +protected: + virtual void lock_rx_q(); + virtual void unlock_rx_q(); + virtual bool try_un_offloading(); // un-offload the socket if possible + +private: + //lwip specific things + struct tcp_pcb m_pcb; + socket_options_list_t m_socket_options_list; + timestamps_t m_rx_timestamps; + tcp_sock_offload_e m_sock_offload; + tcp_sock_state_e m_sock_state; + sockinfo_tcp *m_parent; + //received packet source (true if its from internal thread) + bool m_vma_thr; + /* connection state machine */ + int m_conn_timeout; + /* SNDBUF acconting */ + int m_sndbuff_max; + /* RCVBUF acconting */ + int m_rcvbuff_max; + int m_rcvbuff_current; + int m_rcvbuff_non_tcp_recved; + tcp_conn_state_e m_conn_state; + fd_array_t* m_iomux_ready_fd_array; + struct linger m_linger; + + /* local & peer addresses */ +/* struct sockaddr *m_addr_local; + socklen_t m_local_alen; + struct sockaddr *m_addr_peer; + socklen_t m_peer_alen; +*/ + + //Relevant only for listen sockets: map connections in syn received state + //We need this map since for syn received connection no sockinfo is created yet! 
+ syn_received_map_t m_syn_received; + uint32_t m_received_syn_num; + + /* pending connections */ + sock_list_t m_accepted_conns; + + uint32_t m_ready_conn_cnt; + int m_backlog; + + void *m_timer_handle; + lock_spin_recursive m_tcp_con_lock; + bool m_timer_pending; + + bool report_connected; //used for reporting 'connected' on second non-blocking call to connect. + + int m_error_status; + + const buffer_batching_mode_t m_sysvar_buffer_batching_mode; + const tcp_ctl_thread_t m_sysvar_tcp_ctl_thread; + const internal_thread_tcp_timer_handling_t m_sysvar_internal_thread_tcp_timer_handling; + + struct tcp_seg * m_tcp_seg_list; + int m_tcp_seg_count; + int m_tcp_seg_in_use; + + vma_desc_list_t m_rx_pkt_ready_list; + vma_desc_list_t m_rx_cb_dropped_list; + + lock_spin_recursive m_rx_ctl_packets_list_lock; + tscval_t m_last_syn_tsc; + vma_desc_list_t m_rx_ctl_packets_list; + peer_map_t m_rx_peer_packets; + vma_desc_list_t m_rx_ctl_reuse_list; + ready_pcb_map_t m_ready_pcbs; + static const unsigned TX_CONSECUTIVE_EAGAIN_THREASHOLD = 10; + unsigned m_tx_consecutive_eagain_count; + bool m_sysvar_rx_poll_on_tx_tcp; + + inline void init_pbuf_custom(mem_buf_desc_t *p_desc); + + inline void lock_tcp_con(); + inline void unlock_tcp_con(); + void tcp_timer(); + + bool prepare_listen_to_close(); + + //Builds rfs key + static void create_flow_tuple_key_from_pcb(flow_tuple &key, struct tcp_pcb *pcb); + + //auto accept function + static void auto_accept_connection(sockinfo_tcp *parent, sockinfo_tcp *child); + + // accept cb func + static err_t accept_lwip_cb(void *arg, struct tcp_pcb *child_pcb, err_t err); + + //Called when legal syn is received in order to remember the new active pcb which + //is already created by lwip, but no sockinfo instance is created yet at this stage + static err_t syn_received_lwip_cb(void *arg, struct tcp_pcb *newpcb, err_t err); + + static err_t syn_received_drop_lwip_cb(void *arg, struct tcp_pcb *newpcb, err_t err); + + static err_t clone_conn_cb(void 
*arg, struct tcp_pcb **newpcb, err_t err); + + int accept_helper(struct sockaddr *__addr, socklen_t *__addrlen, int __flags = 0); + + // clone socket in accept call + sockinfo_tcp *accept_clone(); + // connect() helper & callback func + int wait_for_conn_ready(); + static err_t connect_lwip_cb(void *arg, struct tcp_pcb *tpcb, err_t err); + //tx + unsigned tx_wait(int & err, bool is_blocking); + + void abort_connection(); + int handle_child_FIN(sockinfo_tcp* child_conn); + + //rx + //int rx_wait(int &poll_count, bool is_blocking = true); + static err_t ack_recvd_lwip_cb(void *arg, struct tcp_pcb *tpcb, u16_t space); + static err_t rx_lwip_cb(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); + static err_t rx_drop_lwip_cb(void *arg, struct tcp_pcb *tpcb, struct pbuf *p, err_t err); + + // Be sure that m_pcb is initialized + void set_conn_properties_from_pcb(); + void set_sock_options(sockinfo_tcp *new_sock); + + //Register to timer + void register_timer(); + + void handle_socket_linger(); + + /* + * Supported only for UDP + */ + virtual void handle_ip_pktinfo(struct cmsg_state *) {}; + + int handle_rx_error(bool is_blocking); + + /** Function prototype for tcp error callback functions. Called when the pcb + * receives a RST or is unexpectedly closed for any other reason. + * + * @note The corresponding pcb is already freed when this callback is called! 
+ * + * @param arg Additional argument to pass to the callback function (@see tcp_arg()) + * @param err Error code to indicate why the pcb has been closed + * ERR_ABRT: aborted through tcp_abort or by a TCP timer + * ERR_RST: the connection was reset by the remote host + */ + static void err_lwip_cb(void *arg, err_t err); + + // TODO: it is misleading to declare inline in file that doesn't contain the implementation as it can't help callers + inline void return_pending_rx_buffs(); + inline void return_pending_tx_buffs(); + inline void reuse_buffer(mem_buf_desc_t *buff); + virtual mem_buf_desc_t *get_next_desc(mem_buf_desc_t *p_desc); + virtual mem_buf_desc_t* get_next_desc_peek(mem_buf_desc_t *p_desc, int& rx_pkt_ready_list_idx); + virtual timestamps_t* get_socket_timestamps(); + + inline void return_reuse_buffers_postponed() { + if (!m_rx_reuse_buf_postponed) + return; + + //for the parallel reclaim mechanism from internal thread, used for "silent" sockets + set_rx_reuse_pending(false); + + m_rx_reuse_buf_postponed = false; + + if (m_p_rx_ring) { + if (m_rx_reuse_buff.n_buff_num >= m_n_sysvar_rx_num_buffs_reuse) { + if (m_p_rx_ring->reclaim_recv_buffers(&m_rx_reuse_buff.rx_reuse)) { + m_rx_reuse_buff.n_buff_num = 0; + } else { + m_rx_reuse_buf_postponed = true; + } + } + } else { + rx_ring_map_t::iterator iter = m_rx_ring_map.begin(); + while (iter != m_rx_ring_map.end()) { + descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; + int& n_buff_num = iter->second->rx_reuse_info.n_buff_num; + if (n_buff_num >= m_n_sysvar_rx_num_buffs_reuse) { + if (iter->first->reclaim_recv_buffers(rx_reuse)) { + n_buff_num = 0; + } else { + m_rx_reuse_buf_postponed = true; + } + } + ++iter; + } + } + } + + virtual void post_deqeue(bool release_buff); + virtual int zero_copy_rx(iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags); + struct tcp_pcb* get_syn_received_pcb(const flow_tuple &key) const; + struct tcp_pcb* get_syn_received_pcb(in_addr_t src_addr, in_port_t src_port, 
in_addr_t dest_addr, + in_port_t dest_port, int protocol, in_addr_t local_addr); + + virtual mem_buf_desc_t* get_front_m_rx_pkt_ready_list(); + virtual size_t get_size_m_rx_pkt_ready_list(); + virtual void pop_front_m_rx_pkt_ready_list(); + virtual void push_back_m_rx_pkt_ready_list(mem_buf_desc_t* buff); + + // stats + uint64_t m_n_pbufs_rcvd; + uint64_t m_n_pbufs_freed; + + //lock_spin_recursive m_rx_cq_lck; + /* pick all cqs that match given address */ + virtual int rx_verify_available_data(); + inline int rx_wait(int & poll_count, bool is_blocking); + inline int rx_wait_lockless(int & poll_count, bool is_blocking); + int rx_wait_helper(int & poll_count, bool is_blocking); + void fit_rcv_wnd(bool force_fit); + void fit_snd_bufs(unsigned int new_max); + void fit_snd_bufs_to_nagle(bool disable_nagle); + + inline struct tcp_seg * get_tcp_seg(); + inline void put_tcp_seg(struct tcp_seg * seg); + + void queue_rx_ctl_packet(struct tcp_pcb* pcb, mem_buf_desc_t *p_desc); + bool process_peer_ctl_packets(vma_desc_list_t &peer_packets); + void process_my_ctl_packets(); + void process_children_ctl_packets(); + void process_reuse_ctl_packets(); + void process_rx_ctl_packets(); + bool check_dummy_send_conditions(const int flags, const iovec* p_iov, const ssize_t sz_iov); + static void put_agent_msg(void *arg); +}; +typedef struct tcp_seg tcp_seg; + +class tcp_seg_pool : lock_spin { +public: + tcp_seg_pool(int size); + virtual ~tcp_seg_pool(); + + tcp_seg * get_tcp_segs(int amount); + void put_tcp_segs(tcp_seg * seg_list); + +private: + tcp_seg * m_tcp_segs_array; + tcp_seg * m_p_head; + void free_tsp_resources(void); +}; + +extern tcp_seg_pool* g_tcp_seg_pool; + + +class tcp_timers_collection : public timers_group , public cleanable_obj { +public: + tcp_timers_collection(int period, int resolution); + virtual ~tcp_timers_collection(); + + void clean_obj(); + + virtual void handle_timer_expired(void* user_data); + +protected: + // add a new timer + void 
add_new_timer(timer_node_t* node, timer_handler* handler, void* user_data); + + // remove timer from list and free it. + // called for stopping (unregistering) a timer + void remove_timer(timer_node_t* node); + +private: + void* m_timer_handle; + timer_node_t** m_p_intervals; + + int m_n_period; + int m_n_resolution; + int m_n_intervals_size; + int m_n_location; + int m_n_count; + int m_n_next_insert_bucket; + + void free_tta_resources(); +}; + +extern tcp_timers_collection* g_tcp_timers_collection; + +#endif diff --git a/src/vma/sock/sockinfo_udp.cpp b/src/vma/sock/sockinfo_udp.cpp new file mode 100644 index 0000000..c776c43 --- /dev/null +++ b/src/vma/sock/sockinfo_udp.cpp @@ -0,0 +1,2621 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "sockinfo_udp.h" + +#include +#include +#include +#include "vma/util/if.h" +#include +#include +#include + +#include "utils/bullseye.h" +#include "utils/rdtsc.h" +#include "vma/util/libvma.h" +#include "vma/sock/sock-redirect.h" +#include "vma/sock/fd_collection.h" +#include "vma/event/event_handler_manager.h" +#include "vma/dev/buffer_pool.h" +#include "vma/dev/ring.h" +#include "vma/dev/ring_slave.h" +#include "vma/dev/ring_bond.h" +#include "vma/dev/ring_simple.h" +#include "vma/dev/ring_profile.h" +#include "vma/proto/route_table_mgr.h" +#include "vma/proto/rule_table_mgr.h" +#include "vma/proto/dst_entry_tcp.h" +#include "vma/proto/dst_entry_udp.h" +#include "vma/proto/dst_entry_udp_mc.h" +#include "vma/iomux/epfd_info.h" +#include "vma/iomux/io_mux_call.h" +#include "vma/util/instrumentation.h" +#include "vma/dev/ib_ctx_handler_collection.h" + +/* useful debugging macros */ + +#define MODULE_NAME "si_udp" +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[fd=%d]:%d:%s() " +#undef __INFO__ +#define __INFO__ m_fd + +#define si_udp_logpanic __log_info_panic +#define si_udp_logerr __log_info_err +#define si_udp_logwarn __log_info_warn +#define si_udp_loginfo __log_info_info +#define si_udp_logdbg __log_info_dbg +#define si_udp_logfunc __log_info_func +#define si_udp_logfuncall __log_info_funcall + +/* For MCD */ +#define UDP_MAP_ADD 101 +#define UDP_MAP_REMOVE 102 + +/**/ +/** inlining functions can only help if they are implemented before their usage **/ +/**/ + +inline void sockinfo_udp::reuse_buffer(mem_buf_desc_t *buff) +{ + if(buff->dec_ref_count() <= 1) { + buff->inc_ref_count(); + sockinfo::reuse_buffer(buff); + } +} + +inline int sockinfo_udp::poll_os() 
+{ + int ret; + uint64_t pending_data = 0; + + m_rx_udp_poll_os_ratio_counter = 0; + ret = orig_os_api.ioctl(m_fd, FIONREAD, &pending_data); + if (unlikely(ret == -1)) { + m_p_socket_stats->counters.n_rx_os_errors++; + si_udp_logdbg("orig_os_api.ioctl returned with error in polling loop (errno=%d %m)", errno); + return -1; + } + if (pending_data > 0) { + m_p_socket_stats->counters.n_rx_poll_os_hit++; + return 1; + } + return 0; +} + +inline int sockinfo_udp::rx_wait(bool blocking) +{ + ssize_t ret = 0; + int32_t loops = 0; + int32_t loops_to_go = blocking ? m_loops_to_go : 1; + epoll_event rx_epfd_events[SI_RX_EPFD_EVENT_MAX]; + uint64_t poll_sn = 0; + + m_loops_timer.start(); + + while (loops_to_go) { + + // Multi-thread polling support - let other threads have a go on this CPU + if ((m_n_sysvar_rx_poll_yield_loops > 0) && ((loops % m_n_sysvar_rx_poll_yield_loops) == (m_n_sysvar_rx_poll_yield_loops - 1))) { + sched_yield(); + } + + // Poll socket for OS ready packets... (at a ratio of the offloaded sockets as defined in m_n_sysvar_rx_udp_poll_os_ratio) + if ((m_n_sysvar_rx_udp_poll_os_ratio > 0) && (m_rx_udp_poll_os_ratio_counter >= m_n_sysvar_rx_udp_poll_os_ratio)) { + ret = poll_os(); + if ((ret == -1) || (ret == 1)) { + return ret; + } + } + + // Poll cq for offloaded ready packets ... 
+ m_rx_udp_poll_os_ratio_counter++; + if (is_readable(&poll_sn)) { + m_p_socket_stats->counters.n_rx_poll_hit++; + return 0; + } + + loops++; + if (!blocking || m_n_sysvar_rx_poll_num != -1) { + loops_to_go--; + } + if (m_loops_timer.is_timeout()) { + errno = EAGAIN; + return -1; + } + + if (unlikely(m_state == SOCKINFO_CLOSED)) { + errno = EBADFD; + si_udp_logdbg("returning with: EBADFD"); + return -1; + } + else if (unlikely(g_b_exit)) { + errno = EINTR; + si_udp_logdbg("returning with: EINTR"); + return -1; + } + } // End polling loop + m_p_socket_stats->counters.n_rx_poll_miss++; + + while (blocking) { + if (unlikely(m_state == SOCKINFO_CLOSED)) { + errno = EBADFD; + si_udp_logdbg("returning with: EBADFD"); + return -1; + } + else if (unlikely(g_b_exit)) { + errno = EINTR; + si_udp_logdbg("returning with: EINTR"); + return -1; + } + + if (rx_request_notification(poll_sn) > 0) { + // Check if a wce became available while arming the cq's notification channel + // A ready wce can be pending due to the drain logic + if (is_readable(&poll_sn)) { + return 0; + } + continue; // retry to arm cq notification channel in case there was no ready packet + } + else { + //Check if we have a packet in receive queue before we go to sleep + //(can happen if another thread was polling & processing the wce) + //and update is_sleeping flag under the same lock to synchronize between + //this code and wakeup mechanism. 
+ if (is_readable(NULL)) { + return 0; + } + } + + + // Block with epoll_wait() + // on all rx_cq's notification channels and the socket's OS fd until we get an ip packet + // release lock so other threads that wait on this socket will not consume CPU + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_rcv.lock(); + if (!m_n_rx_pkt_ready_list_count) { + going_to_sleep(); + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_rcv.unlock(); + } else { + m_lock_rcv.unlock(); + continue; + } + + ret = orig_os_api.epoll_wait(m_rx_epfd, rx_epfd_events, SI_RX_EPFD_EVENT_MAX, m_loops_timer.time_left_msec()); + + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_rcv.lock(); + return_from_sleep(); + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_rcv.unlock(); + + if ( ret == 0 ) { //timeout + errno = EAGAIN; + return -1; + } + + if (unlikely(ret == -1)) { + if (errno == EINTR) { + si_udp_logdbg("EINTR from blocked epoll_wait() (ret=%d, errno=%d %m)", ret, errno); + } + else { + si_udp_logdbg("error from blocked epoll_wait() (ret=%d, errno=%d %m)", ret, errno); + } + + m_p_socket_stats->counters.n_rx_os_errors++; + return -1; + } + + if (ret > 0) { + + /* Quick check for a ready rx datagram on this sockinfo + * (if some other sockinfo::rx might have added a rx ready packet to our pool + * + * This is the classical case of wakeup, but we don't want to + * waist time on removing wakeup fd, it will be done next time + */ + if (is_readable(NULL)) { + return 0; + } + + // Run through all ready fd's + for (int event_idx = 0; event_idx < ret; ++event_idx) { + int fd = rx_epfd_events[event_idx].data.fd; + if (is_wakeup_fd(fd)) { + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_rcv.lock(); + remove_wakeup_fd(); + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_rcv.unlock(); + continue; + } + + // Check if OS fd is ready for reading + if (fd == m_fd) { + m_rx_udp_poll_os_ratio_counter = 0; + return 1; + } + + // All that is left is our CQ offloading 
channel fd's + // poll cq. fd == cq channel fd. + // Process one wce on the relevant CQ + // The Rx CQ channel is non-blocking so this will always return quickly + cq_channel_info* p_cq_ch_info = g_p_fd_collection->get_cq_channel_fd(fd); + if (p_cq_ch_info) { + ring* p_ring = p_cq_ch_info->get_ring(); + if (p_ring) { + p_ring->wait_for_notification_and_process_element(fd, &poll_sn); + } + } + } + } + + // Check for ready datagrams on this sockinfo + // Our ring->poll_and_process_element might have got a ready rx datagram + // ..or some other sockinfo::rx might have added a ready rx datagram to our list + // In case of multiple frag we'de like to try and get all parts out of the corresponding + // ring, so we do want to poll the cq besides the select notification + if (is_readable(&poll_sn)) + return 0; + + } // while (blocking) + +/* ODEDS: No need for that as we always check if OS polling is needed in the first while loop + // If not blocking and we did not find any ready datagrams in our + // offloaded sockinfo then try the OS receive + // But try to skip this to reduce OS calls by user param + if (!blocking && unlikely(m_state != SOCKINFO_CLOSED)) { + m_n_num_skip_os_read++; + if (m_n_num_skip_os_read >= m_rx_skip_os_fd_check) { + m_n_num_skip_os_read = 0; + return 1; + } + } +*/ + errno = EAGAIN; + si_udp_logfunc("returning with: EAGAIN"); + return -1; +} + +const char * setsockopt_ip_opt_to_str(int opt) +{ + switch (opt) { + case IP_MULTICAST_IF: return "IP_MULTICAST_IF"; + case IP_MULTICAST_TTL: return "IP_MULTICAST_TTL"; + case IP_MULTICAST_LOOP: return "IP_MULTICAST_LOOP"; + case IP_ADD_MEMBERSHIP: return "IP_ADD_MEMBERSHIP"; + case IP_ADD_SOURCE_MEMBERSHIP: return "IP_ADD_SOURCE_MEMBERSHIP"; + case IP_DROP_MEMBERSHIP: return "IP_DROP_MEMBERSHIP"; + case IP_DROP_SOURCE_MEMBERSHIP: return "IP_DROP_SOURCE_MEMBERSHIP"; + default: break; + } + return "UNKNOWN IP opt"; +} + +// Throttle the amount of ring polling we do (remember last time we check for receive 
packets) +tscval_t g_si_tscv_last_poll = 0; + +sockinfo_udp::sockinfo_udp(int fd): + sockinfo(fd) + ,m_rx_packet_processor(&sockinfo_udp::rx_process_udp_packet_full) + ,m_mc_tx_if(INADDR_ANY) + ,m_b_mc_tx_loop(safe_mce_sys().tx_mc_loopback_default) // default value is 'true'. User can change this with config parameter SYS_VAR_TX_MC_LOOPBACK + ,m_n_mc_ttl(DEFAULT_MC_TTL) + ,m_loops_to_go(safe_mce_sys().rx_poll_num_init) // Start up with a init polling loops value + ,m_rx_udp_poll_os_ratio_counter(0) + ,m_sock_offload(true) + ,m_mc_num_grp_with_src_filter(0) + ,m_port_map_lock("sockinfo_udp::m_ports_map_lock") + ,m_port_map_index(0) + ,m_p_last_dst_entry(NULL) + ,m_tos(0) + ,m_n_sysvar_rx_poll_yield_loops(safe_mce_sys().rx_poll_yield_loops) + ,m_n_sysvar_rx_udp_poll_os_ratio(safe_mce_sys().rx_udp_poll_os_ratio) + ,m_n_sysvar_rx_ready_byte_min_limit(safe_mce_sys().rx_ready_byte_min_limit) + ,m_n_sysvar_rx_cq_drain_rate_nsec(safe_mce_sys().rx_cq_drain_rate_nsec) + ,m_n_sysvar_rx_delta_tsc_between_cq_polls(safe_mce_sys().rx_delta_tsc_between_cq_polls) + ,m_reuseaddr(false) + ,m_reuseport(false) + ,m_sockopt_mapped(false) + ,m_is_connected(false) + ,m_multicast(false) +{ + si_udp_logfunc(""); + + m_protocol = PROTO_UDP; + m_p_socket_stats->socket_type = SOCK_DGRAM; + m_p_socket_stats->b_is_offloaded = m_sock_offload; + + // Update MC related stats (default values) + m_p_socket_stats->mc_tx_if = m_mc_tx_if; + m_p_socket_stats->b_mc_loop = m_b_mc_tx_loop; + + int n_so_rcvbuf_bytes = 0; + socklen_t option_len = sizeof(n_so_rcvbuf_bytes); + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely(orig_os_api.getsockopt(m_fd, SOL_SOCKET, SO_RCVBUF, &n_so_rcvbuf_bytes, &option_len))) + si_udp_logdbg("Failure in getsockopt (errno=%d %m)", errno); + BULLSEYE_EXCLUDE_BLOCK_END + si_udp_logdbg("Sockets RCVBUF = %d bytes", n_so_rcvbuf_bytes); + rx_ready_byte_count_limit_update(n_so_rcvbuf_bytes); + + epoll_event ev = {0, {0}}; + + ev.events = EPOLLIN; + + // Add the user's orig fd to the rx 
epfd handle + ev.data.fd = m_fd; + + BULLSEYE_EXCLUDE_BLOCK_START + if (unlikely(orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_ADD, ev.data.fd, &ev))) + si_udp_logpanic("failed to add user's fd to internal epfd errno=%d (%m)", errno); + BULLSEYE_EXCLUDE_BLOCK_END + + si_udp_logfunc("done"); +} + +sockinfo_udp::~sockinfo_udp() +{ + si_udp_logfunc(""); + + // Remove all RX ready queue buffers (Push into reuse queue per ring) + si_udp_logdbg("Releasing %d ready rx packets (total of %d bytes)", m_n_rx_pkt_ready_list_count, m_p_socket_stats->n_rx_ready_byte_count); + rx_ready_byte_count_limit_update(0); + + + // Clear the dst_entry map + dst_entry_map_t::iterator dst_entry_iter = m_dst_entry_map.begin(); + while (dst_entry_iter != m_dst_entry_map.end()) { + delete dst_entry_iter->second; // TODO ALEXR - should we check and delete the udp_mc in MC cases? + m_dst_entry_map.erase(dst_entry_iter); + dst_entry_iter = m_dst_entry_map.begin(); + } + +/* AlexR: + We don't have to be nice and delete the fd. close() will do that any way. + This save us the problem when closing in the clean-up case - if we get closed be the nameserver socket 53. + if (unlikely( orig_os_api.epoll_ctl(m_rx_epfd, EPOLL_CTL_DEL, m_fd, NULL))) { + if (errno == ENOENT) + si_logfunc("failed to del users fd from internal epfd - probably clean up case (errno=%d %m)", errno); + else + si_logerr("failed to del users fd from internal epfd (errno=%d %m)", errno); + } +*/ + m_lock_rcv.lock(); + do_wakeup(); + + destructor_helper(); + + m_lock_rcv.unlock(); + + statistics_print(); + + if (m_n_rx_pkt_ready_list_count || m_rx_ready_byte_count || m_rx_pkt_ready_list.size() || m_rx_ring_map.size() || m_rx_reuse_buff.n_buff_num) + si_udp_logerr("not all buffers were freed. protocol=UDP. 
m_n_rx_pkt_ready_list_count=%d, m_rx_ready_byte_count=%d, m_rx_pkt_ready_list.size()=%d, m_rx_ring_map.size()=%d, m_rx_reuse_buff.n_buff_num=%d", + m_n_rx_pkt_ready_list_count, m_rx_ready_byte_count, (int)m_rx_pkt_ready_list.size() ,(int)m_rx_ring_map.size(), m_rx_reuse_buff.n_buff_num); + + si_udp_logfunc("done"); +} + +int sockinfo_udp::bind(const struct sockaddr *__addr, socklen_t __addrlen) +{ + si_udp_logfunc(""); + + + // We always call the orig_bind which will check sanity of the user socket api + // and the OS will also allocate a specific port that we can also use + int ret = orig_os_api.bind(m_fd, __addr, __addrlen); + if (ret) { + si_udp_logdbg("orig bind failed (ret=%d %m)", ret); + // TODO: Should we set errno again (maybe log write modified the orig.bind() errno)? + return ret; + } + if (unlikely(m_state == SOCKINFO_CLOSED) || unlikely(g_b_exit)) { + errno = EBUSY; + return -1; // zero returned from orig_bind() + } + + struct sockaddr_in bound_addr; + socklen_t boundlen = sizeof(struct sockaddr_in); + struct sockaddr *name = (struct sockaddr *)&bound_addr; + socklen_t *namelen = &boundlen; + + ret = getsockname(name, namelen); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret) { + si_udp_logdbg("getsockname failed (ret=%d %m)", ret); + return -1; + } + + BULLSEYE_EXCLUDE_BLOCK_END + // save the bound info and then attach to offload flows + on_sockname_change(name, *namelen); + si_udp_logdbg("bound to %s", m_bound.to_str()); + dst_entry_map_t::iterator dst_entry_iter = m_dst_entry_map.begin(); + while (dst_entry_iter != m_dst_entry_map.end()) { + if (!m_bound.is_anyaddr() && !m_bound.is_mc()) { + dst_entry_iter->second->set_bound_addr(m_bound.get_in_addr()); + } + dst_entry_iter++; + } + + return 0; +} + +int sockinfo_udp::connect(const struct sockaddr *__to, socklen_t __tolen) +{ + sock_addr connect_to((struct sockaddr*)__to); + si_udp_logdbg("to %s", connect_to.to_str()); + + // We always call the orig_connect which will check sanity of the user socket api + 
// and the OS will also allocate a specific bound port that we can also use + int ret = orig_os_api.connect(m_fd, __to, __tolen); + if (ret) { + si_udp_logdbg("orig connect failed (ret=%d, errno=%d %m)", ret, errno); + return ret; + } + if (unlikely(m_state == SOCKINFO_CLOSED) || unlikely(g_b_exit)) { + errno = EBUSY; + return -1; // zero returned from orig_connect() + } + + auto_unlocker _lock(m_lock_snd); + + // Dissolve the current connection setting if it's not AF_INET + // (this also support the default dissolve by AF_UNSPEC) + if (connect_to.get_sa_family() == AF_INET) { + m_connected.set_sa_family(AF_INET); + m_connected.set_in_addr(INADDR_ANY); + m_p_socket_stats->connected_ip = m_connected.get_in_addr(); + + m_connected.set_in_port(INPORT_ANY); + m_p_socket_stats->connected_port = m_connected.get_in_port(); + +/* TODO ALEXR REMOVE ME - DONE IN DST_ENTRY + + if (ZERONET_N(connect_to.get_in_addr())) { + si_udp_logdbg("VMA does not offload zero net IP address"); + si_udp_logdbg("'connect()' to zero net address [%s] will be handled by the OS", connect_to.to_str()); + return 0; // zero returned from orig_connect() + } + + if (LOOPBACK_N(connect_to.get_in_addr())) { + si_udp_logdbg("VMA does not offload local loopback IP address"); + si_udp_logdbg("'connect()' to local loopback address [%s] will be handled by the OS", connect_to.to_str()); + return 0; // zero returned from orig_connect() + } +*/ + + in_addr_t dst_ip = connect_to.get_in_addr(); + in_port_t dst_port = connect_to.get_in_port(); + + // Check & Save connect ip info + if (dst_ip != INADDR_ANY && m_connected.get_in_addr() != dst_ip) { + si_udp_logdbg("connected ip changed (%s -> %s)", m_connected.to_str_in_addr(), connect_to.to_str_in_addr()); + } + m_connected.set_in_addr(dst_ip); + m_p_socket_stats->connected_ip = dst_ip; + + // Check & Save connect port info + if (dst_port != INPORT_ANY && m_connected.get_in_port() != dst_port) { + si_udp_logdbg("connected port changed (%s -> %s)", 
m_connected.to_str_in_port(), connect_to.to_str_in_port()); + } + m_connected.set_in_port(dst_port); + m_p_socket_stats->connected_port = dst_port; + + + // Connect can change the OS bound address, + // lets check it and update our bound ip & port + // Call on_sockname_change (this will save the bind information and attach to unicast flow) + struct sockaddr_in bound_addr; + socklen_t boundlen = sizeof(struct sockaddr_in); + struct sockaddr *name = (struct sockaddr *)&bound_addr; + socklen_t *namelen = &boundlen; + + ret = getsockname(name, namelen); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret) { + si_udp_logerr("getsockname failed (ret=%d %m)", ret); + return 0; // zero returned from orig_connect() + } + BULLSEYE_EXCLUDE_BLOCK_END + + m_is_connected = true; // will inspect for SRC + + on_sockname_change(name, *namelen); + + si_udp_logdbg("bound to %s", m_bound.to_str()); + in_port_t src_port = m_bound.get_in_port(); + + if (TRANS_VMA != find_target_family(ROLE_UDP_CONNECT, m_connected.get_p_sa(), m_bound.get_p_sa())) { + setPassthrough(); + return 0; + } + // Create the new dst_entry + if (IN_MULTICAST_N(dst_ip)) { + socket_data data = { m_fd, m_n_mc_ttl, m_tos, m_pcp }; + m_p_connected_dst_entry = new dst_entry_udp_mc(dst_ip, dst_port, src_port, + m_mc_tx_if ? 
m_mc_tx_if : m_bound.get_in_addr(), + m_b_mc_tx_loop, data, m_ring_alloc_log_tx); + } + else { + socket_data data = { m_fd, m_n_uc_ttl, m_tos, m_pcp }; + m_p_connected_dst_entry = new dst_entry_udp(dst_ip, dst_port, + src_port, data, m_ring_alloc_log_tx); + } + + BULLSEYE_EXCLUDE_BLOCK_START + if (!m_p_connected_dst_entry) { + si_udp_logerr("Failed to create dst_entry(dst_ip:%s, dst_port:%d, src_port:%d)", NIPQUAD(dst_ip), ntohs(dst_port), ntohs(src_port)); + m_connected.set_in_addr(INADDR_ANY); + m_p_socket_stats->connected_ip = INADDR_ANY; + m_connected.set_in_port(INPORT_ANY); + m_p_socket_stats->connected_port = INPORT_ANY; + m_is_connected = false; // will skip inspection for SRC + return 0; + } + BULLSEYE_EXCLUDE_BLOCK_END + if (!m_bound.is_anyaddr() && !m_bound.is_mc()) { + m_p_connected_dst_entry->set_bound_addr(m_bound.get_in_addr()); + } + if (m_so_bindtodevice_ip) { + m_p_connected_dst_entry->set_so_bindtodevice_addr(m_so_bindtodevice_ip); + } + m_p_connected_dst_entry->prepare_to_send(m_so_ratelimit, false, true); + return 0; + } + return 0; +} + +int sockinfo_udp::getsockname(struct sockaddr *__name, socklen_t *__namelen) +{ + si_udp_logdbg(""); + + if (unlikely(m_state == SOCKINFO_CLOSED) || unlikely(g_b_exit)) { + errno = EINTR; + return -1; + } + + return orig_os_api.getsockname(m_fd, __name, __namelen); +} + +int sockinfo_udp::on_sockname_change(struct sockaddr *__name, socklen_t __namelen) +{ + NOT_IN_USE(__namelen); /* TODO use __namelen for IPV6 */ + + BULLSEYE_EXCLUDE_BLOCK_START + if (__name == NULL) { + si_udp_logerr("invalid NULL __name"); + errno = EFAULT; + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + sock_addr bindname(__name); + + sa_family_t sin_family = bindname.get_sa_family(); + if (sin_family != AF_INET) { + si_udp_logfunc("not AF_INET family (%d)", sin_family); + return 0; + } + + bool is_bound_modified = false; + in_addr_t bound_if = bindname.get_in_addr(); + in_port_t bound_port = bindname.get_in_port(); + + auto_unlocker 
_lock(m_lock_rcv); + + // Check & Save bind port info + if (m_bound.get_in_port() != bound_port) { + si_udp_logdbg("bound port defined (%s -> %d)", m_bound.to_str_in_port(), ntohs(bound_port)); + m_bound.set_in_port(bound_port); + m_p_socket_stats->bound_port = bound_port; + is_bound_modified = true; + } + + // Check & Save bind if info + if (m_bound.get_in_addr() != bound_if) { + si_udp_logdbg("bound if changed (%s -> %d.%d.%d.%d)", m_bound.to_str_in_addr(), NIPQUAD(bound_if)); + m_bound.set_in_addr(bound_if); + m_p_socket_stats->bound_if = bound_if; + } + + // Check if this is the new 'name' (local port) of the socket + if (is_bound_modified && bound_port != INPORT_ANY) { + + // Attach UDP unicast port to offloaded interface + // 1. Check if local_if is offloadable OR is on INADDR_ANY which means attach to ALL + // 2. Verify not binding to MC address in the UC case + // 3. if not offloaded then set a PassThrough + if ((m_bound.is_anyaddr() || g_p_net_device_table_mgr->get_net_device_val(m_bound.get_in_addr()))) { + attach_as_uc_receiver(ROLE_UDP_RECEIVER); // if failed, we will get RX from OS + } + else if (m_bound.is_mc()) { + // MC address binding will happen later as part of the ADD_MEMBERSHIP in handle_pending_mreq() + si_udp_logdbg("bound to MC address, no need to attach to UC address as offloaded"); + } + else { + si_udp_logdbg("will be passed to OS for handling - not offload interface (%s)", m_bound.to_str()); + setPassthrough(); + } + + // Attach UDP port pending MC groups to offloaded interface (set by ADD_MEMBERSHIP before bind() was called) + handle_pending_mreq(); + } + + return 0; +} + +//////////////////////////////////////////////////////////////////////////////// +int sockinfo_udp::setsockopt(int __level, int __optname, __const void *__optval, socklen_t __optlen) +{ + si_udp_logfunc("level=%d, optname=%d", __level, __optname); + + int ret = 0; + + if (unlikely(m_state == SOCKINFO_CLOSED) || unlikely(g_b_exit)) + return orig_os_api.setsockopt(m_fd, 
__level, __optname, __optval, __optlen); + + auto_unlocker lock_tx(m_lock_snd); + auto_unlocker lock_rx(m_lock_rcv); + + if ((ret = sockinfo::setsockopt(__level, __optname, __optval, __optlen)) != SOCKOPT_PASS_TO_OS) { + return ret; + } + + bool supported = true; + switch (__level) { + + case SOL_SOCKET: + { + switch (__optname) { + + case SO_REUSEADDR: + set_reuseaddr(*(bool*)__optval); + si_udp_logdbg("SOL_SOCKET, %s=%s", setsockopt_so_opt_to_str(__optname), (*(bool*)__optval ? "true" : "false")); + break; + + case SO_REUSEPORT: + set_reuseport(*(bool*)__optval); + si_udp_logdbg("SOL_SOCKET, %s=%s", setsockopt_so_opt_to_str(__optname), (*(bool*)__optval ? "true" : "false")); + break; + + case SO_BROADCAST: + si_udp_logdbg("SOL_SOCKET, %s=%s", setsockopt_so_opt_to_str(__optname), (*(bool*)__optval ? "true" : "false")); + break; + + case SO_RCVBUF: + { + int n_so_rcvbuf_bytes = *(int*)__optval; + // OS allocates double the size of memory requested by the application + n_so_rcvbuf_bytes = n_so_rcvbuf_bytes * 2; + + si_udp_logdbg("SOL_SOCKET, %s=%d (x2)", setsockopt_so_opt_to_str(__optname), n_so_rcvbuf_bytes); + rx_ready_byte_count_limit_update(n_so_rcvbuf_bytes); + } + break; + + case SO_SNDBUF: + si_udp_logdbg("SOL_SOCKET, %s=%d", setsockopt_so_opt_to_str(__optname), *(int*)__optval); + // this is supported without doing something special because VMA send immediately without buffering + break; + + case SO_RCVTIMEO: + if (__optval) { + struct timeval* tv = (struct timeval*)__optval; + if (tv->tv_sec || tv->tv_usec) + m_loops_timer.set_timeout_msec(tv->tv_sec*1000 + (tv->tv_usec ? 
tv->tv_usec/1000 : 0)); + else + m_loops_timer.set_timeout_msec(-1); + si_udp_logdbg("SOL_SOCKET: SO_RCVTIMEO=%d", m_loops_timer.get_timeout_msec()); + } + else { + si_udp_logdbg("SOL_SOCKET, %s=\"???\" - NOT HANDLED, optval == NULL", setsockopt_so_opt_to_str(__optname)); + } + break; + + case SO_BINDTODEVICE: + if (__optval) { + struct sockaddr_in sockaddr; + if (__optlen == 0 || ((char*)__optval)[0] == '\0') { + m_so_bindtodevice_ip = INADDR_ANY; + } else if (get_ipv4_from_ifname((char*)__optval, &sockaddr)) { + si_udp_logdbg("SOL_SOCKET, %s=\"???\" - NOT HANDLED, cannot find if_name", setsockopt_so_opt_to_str(__optname)); + break; + } else { + m_so_bindtodevice_ip = sockaddr.sin_addr.s_addr; + } + si_udp_logdbg("SOL_SOCKET, %s='%s' (%d.%d.%d.%d)", setsockopt_so_opt_to_str(__optname), (char*)__optval, NIPQUAD(m_so_bindtodevice_ip)); + + // handle TX side + if (m_p_connected_dst_entry) { + m_p_connected_dst_entry->set_so_bindtodevice_addr(m_so_bindtodevice_ip); + } else { + dst_entry_map_t::iterator dst_entry_iter = m_dst_entry_map.begin(); + while (dst_entry_iter != m_dst_entry_map.end()) { + dst_entry_iter->second->set_so_bindtodevice_addr(m_so_bindtodevice_ip); + dst_entry_iter++; + } + } + + // handle RX side - TODO + } + else { + si_udp_logdbg("SOL_SOCKET, %s=\"???\" - NOT HANDLED, optval == NULL", setsockopt_so_opt_to_str(__optname)); + } + break; + case SO_MAX_PACING_RATE: + if (__optval) { + struct vma_rate_limit_t val; + + if (sizeof(struct vma_rate_limit_t) == __optlen) { + val = *(struct vma_rate_limit_t*)__optval; // value is in Kbits per second + } else if (sizeof(uint32_t) == __optlen) { + // value is in bytes per second + val.rate = BYTE_TO_KB(*(uint32_t*)__optval); // value is in bytes per second + val.max_burst_sz = 0; + val.typical_pkt_sz = 0; + } else { + si_udp_logdbg("SOL_SOCKET, %s=\"???\" - bad length got %d", + setsockopt_so_opt_to_str(__optname), __optlen); + return -1; + } + + if (modify_ratelimit(m_p_connected_dst_entry, val) < 0) { + 
si_udp_logdbg("error setting setsockopt SO_MAX_PACING_RATE for connected dst_entry %p: %d bytes/second ", m_p_connected_dst_entry, val.rate); + + // Do not fall back to kernel in this case. + // The kernel's support for packet pacing is of no consequence + // to the VMA user and may only confuse the calling application. + return -1; + } + + size_t dst_entries_not_modified = 0; + dst_entry_map_t::iterator dst_entry_iter ; + for (dst_entry_iter = m_dst_entry_map.begin(); + dst_entry_iter != m_dst_entry_map.end(); + ++dst_entry_iter) { + dst_entry* p_dst_entry = dst_entry_iter->second; + if (modify_ratelimit(p_dst_entry, val) < 0) { + si_udp_logdbg("error setting setsockopt SO_MAX_PACING_RATE " + "for dst_entry %p: %d bytes/second ", + p_dst_entry, val.rate); + dst_entries_not_modified++; + } + } + // It is possible that the user has a setup with some NICs that support + // packet pacing and some that don't. + // Setting packet pacing fails only if all NICs do not support it. + if (m_dst_entry_map.size() && + (dst_entries_not_modified == m_dst_entry_map.size())) { + return -1; + } + return 0; + } + else { + si_udp_logdbg("SOL_SOCKET, %s=\"???\" - NOT HANDLED, optval == NULL", setsockopt_so_opt_to_str(__optname)); + } + break; + case SO_PRIORITY: + if (set_sockopt_prio(__optval, __optlen)) { + return -1; + } + break; + default: + si_udp_logdbg("SOL_SOCKET, optname=%s (%d)", setsockopt_so_opt_to_str(__optname), __optname); + supported = false; + break; + } + } // case SOL_SOCKET + break; + + case IPPROTO_IP: + { + switch (__optname) { + + case IP_MULTICAST_IF: + { + struct ip_mreqn mreqn; + memset(&mreqn, 0, sizeof(mreqn)); + + if (!__optval || __optlen < sizeof(struct in_addr)) { + si_udp_loginfo("IPPROTO_IP, %s=\"???\", optlen:%d", setsockopt_ip_opt_to_str(__optname), (int)__optlen); + break; + } + + if (__optlen >= sizeof(struct ip_mreqn)) { + memcpy(&mreqn, __optval, sizeof(struct ip_mreqn)); + } else if (__optlen >= sizeof(struct ip_mreq)) { + memcpy(&mreqn, 
__optval, sizeof(struct ip_mreq)); + } else { + memcpy(&mreqn.imr_address, __optval, sizeof(struct in_addr)); + } + + if (mreqn.imr_ifindex) { + local_ip_list_t lip_offloaded_list = g_p_net_device_table_mgr->get_ip_list(mreqn.imr_ifindex); + if (!lip_offloaded_list.empty()) { + mreqn.imr_address.s_addr = lip_offloaded_list.front().local_addr; + } else { + struct sockaddr_in src_addr; + if (get_ipv4_from_ifindex(mreqn.imr_ifindex, &src_addr) == 0) { + mreqn.imr_address.s_addr = src_addr.sin_addr.s_addr; + } else { + si_udp_logdbg("setsockopt(%s) will be passed to OS for handling, can't get address of interface index %d ", setsockopt_ip_opt_to_str(__optname), mreqn.imr_ifindex); + break; + } + } + } + + m_mc_tx_if = mreqn.imr_address.s_addr; + + si_udp_logdbg("IPPROTO_IP, %s=%d.%d.%d.%d", setsockopt_ip_opt_to_str(__optname), NIPQUAD(m_mc_tx_if)); + m_p_socket_stats->mc_tx_if = m_mc_tx_if; + } + break; + + case IP_MULTICAST_TTL: + { + int n_mc_ttl = -1; + if (__optlen == sizeof(m_n_mc_ttl)) + n_mc_ttl = *(char*)__optval; + else if (__optlen == sizeof(int)) + n_mc_ttl = *(int*)__optval; + else { + break; + } + if (n_mc_ttl == -1) { + n_mc_ttl = 1; + } + if (n_mc_ttl >= 0 && n_mc_ttl <= 255) { + m_n_mc_ttl = n_mc_ttl; + header_ttl_updater du(m_n_mc_ttl, true); + update_header_field(&du); + si_udp_logdbg("IPPROTO_IP, %s=%d", setsockopt_ip_opt_to_str(__optname), m_n_mc_ttl); + } + else { + si_udp_loginfo("IPPROTO_IP, %s=\"???\"", setsockopt_ip_opt_to_str(__optname)); + } + } + break; + + case IP_MULTICAST_LOOP: + { + if (__optval) { + bool b_mc_loop = *(bool*)__optval; + m_b_mc_tx_loop = b_mc_loop ? true : false; + m_p_socket_stats->b_mc_loop = m_b_mc_tx_loop; + si_udp_logdbg("IPPROTO_IP, %s=%s", setsockopt_ip_opt_to_str(__optname), (m_b_mc_tx_loop ? 
"true" : "false")); + } + else { + si_udp_loginfo("IPPROTO_IP, %s=\"???\"", setsockopt_ip_opt_to_str(__optname)); + } + } + break; + + case IP_ADD_MEMBERSHIP: + case IP_DROP_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + { + if (!m_sock_offload) { + si_udp_logdbg("VMA Rx Offload is Disabled! calling OS setsockopt() for IPPROTO_IP, %s", setsockopt_ip_opt_to_str(__optname)); + break; + } + + if (NULL == __optval) { + si_udp_logdbg("IPPROTO_IP, %s; Bad optval! calling OS setsockopt()", setsockopt_ip_opt_to_str(__optname)); + break; + } + + // There are 3 types of structs that we can receive, ip_mreq(2 members), ip_mreqn(3 members), ip_mreq_source(3 members) + // ip_mreq struct type and size depend on command type, let verify all possibilities and continue + // below with safe logic. + + // NOTE: The ip_mreqn structure is available only since Linux 2.2. For compatibility, the old ip_mreq + // structure (present since Linux 1.2) is still supported; it differs from ip_mreqn only by not + // including the imr_ifindex field. + if (__optlen < sizeof(struct ip_mreq)) { + si_udp_logdbg("IPPROTO_IP, %s; Bad optlen! calling OS setsockopt() with optlen=%d (required optlen=%d)", + setsockopt_ip_opt_to_str(__optname), __optlen, sizeof(struct ip_mreq)); + break; + } + // IP_ADD_SOURCE_MEMBERSHIP (and DROP) used ip_mreq_source which is same size struct as ip_mreqn, + // but fields have different meaning + if (((IP_ADD_SOURCE_MEMBERSHIP == __optname) || (IP_DROP_SOURCE_MEMBERSHIP == __optname)) && + (__optlen < sizeof(struct ip_mreq_source))) { + si_udp_logdbg("IPPROTO_IP, %s; Bad optlen! 
calling OS setsockopt() with optlen=%d (required optlen=%d)", + setsockopt_ip_opt_to_str(__optname), __optlen, sizeof(struct ip_mreq_source)); + break; + } + + // Use local variable for easy access + in_addr_t mc_grp = ((struct ip_mreq*)__optval)->imr_multiaddr.s_addr; + in_addr_t mc_if = ((struct ip_mreq*)__optval)->imr_interface.s_addr; + + // In case interface address is undefined[INADDR_ANY] we need to find the ip address to use + struct ip_mreq_source mreqprm = {{mc_grp}, {mc_if}, {0}}; + if ((IP_ADD_MEMBERSHIP == __optname) || (IP_DROP_MEMBERSHIP == __optname)) { + if (__optlen >= sizeof(struct ip_mreqn)) { + struct ip_mreqn* p_mreqn = (struct ip_mreqn*)__optval; + if (p_mreqn->imr_ifindex) { + local_ip_list_t lip_offloaded_list = g_p_net_device_table_mgr->get_ip_list(p_mreqn->imr_ifindex); + if (!lip_offloaded_list.empty()) { + mreqprm.imr_interface.s_addr = lip_offloaded_list.front().local_addr; + } else { + struct sockaddr_in src_addr; + if (get_ipv4_from_ifindex(p_mreqn->imr_ifindex, &src_addr) == 0) { + mreqprm.imr_interface.s_addr = src_addr.sin_addr.s_addr; + } else { + si_udp_logdbg("setsockopt(%s) will be passed to OS for handling, can't get address of interface index %d ", + setsockopt_ip_opt_to_str(__optname), p_mreqn->imr_ifindex); + break; + } + } + } + } + } + else { + // Save and use the user provided source address filter in case of IP_ADD_SOURCE_MEMBERSHIP or IP_DROP_SOURCE_MEMBERSHIP + mreqprm.imr_sourceaddr.s_addr = ((struct ip_mreq_source*)__optval)->imr_sourceaddr.s_addr; + } + + // Update interface IP in case it was changed above + mc_if = mreqprm.imr_interface.s_addr; + + if (!IN_MULTICAST_N(mc_grp)) { + si_udp_logdbg("setsockopt(%s) will be passed to OS for handling, IP %d.%d.%d.%d is not MC ", + setsockopt_ip_opt_to_str(__optname), NIPQUAD(mc_grp)); + break; + } + + // Find local interface IP address + if (INADDR_ANY == mc_if) { + in_addr_t dst_ip = mc_grp; + in_addr_t src_ip = 0; + + if ((!m_bound.is_anyaddr()) && (!m_bound.is_mc())) 
{ + src_ip = m_bound.get_in_addr(); + } else if (m_so_bindtodevice_ip) { + src_ip = m_so_bindtodevice_ip; + } + // Find local if for this MC ADD/DROP + route_result res; + g_p_route_table_mgr->route_resolve(route_rule_table_key(dst_ip, src_ip, m_tos), res); + mc_if = res.p_src; + si_udp_logdbg("IPPROTO_IP, %s=%d.%d.%d.%d, mc_if:INADDR_ANY (resolved to: %d.%d.%d.%d)", setsockopt_ip_opt_to_str(__optname), NIPQUAD(mc_grp), NIPQUAD(mc_if)); + } + else { + si_udp_logdbg("IPPROTO_IP, %s=%d.%d.%d.%d, mc_if:%d.%d.%d.%d mc_src:%d.%d.%d.%d", setsockopt_ip_opt_to_str(__optname), NIPQUAD(mc_grp), NIPQUAD(mc_if), NIPQUAD(mreqprm.imr_sourceaddr.s_addr)); + } + + // Add multicast group membership + if (mc_change_membership_start_helper(mc_grp, __optname)) { + return -1; + } + + bool goto_os = false; + // Check MC rules for not offloading + sock_addr tmp_grp_addr(AF_INET, mc_grp, m_bound.get_in_port()); + mc_pending_pram mcpram = {mreqprm.imr_multiaddr, mreqprm.imr_interface, mreqprm.imr_sourceaddr, __optname}; + + if (TRANS_OS == __vma_match_udp_receiver(TRANS_VMA, safe_mce_sys().app_id, tmp_grp_addr.get_p_sa(), tmp_grp_addr.get_socklen())) { + // call orig setsockopt() and don't try to offlaod + si_udp_logdbg("setsockopt(%s) will be passed to OS for handling due to rule matching", setsockopt_ip_opt_to_str(__optname)); + goto_os = true; + } + // Check if local_if is not offloadable + else if (!g_p_net_device_table_mgr->get_net_device_val(mc_if)) { + // call orig setsockopt() and don't try to offlaod + si_udp_logdbg("setsockopt(%s) will be passed to OS for handling - not offload interface (%d.%d.%d.%d)", setsockopt_ip_opt_to_str(__optname), NIPQUAD(mc_if)); + goto_os = true; + } + // offloaded, check if need to pend + else if (INPORT_ANY == m_bound.get_in_port()) { + // Delay attaching to this MC group until we have bound UDP port + ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + if (ret) return ret; + mc_change_pending_mreq(&mcpram); + } + // Handle 
attach to this MC group now + else if (mc_change_membership( &mcpram )) { + // Opps, failed in attaching??? call orig setsockopt() + goto_os = true; + } + + if (goto_os) { + ret = orig_os_api.setsockopt(m_fd, __level, __optname, __optval, __optlen); + if (ret) return ret; + } + + mc_change_membership_end_helper(mc_grp, __optname, mreqprm.imr_sourceaddr.s_addr); + return 0; + } + break; + case IP_PKTINFO: + if (__optval) { + if(*(int*)__optval) + m_b_pktinfo = true; + else + m_b_pktinfo = false; + } + break; + case IP_TOS: + { + int val; + if (__optlen == sizeof(int)) { + val = *(int *)__optval; + } else if (__optlen == sizeof(uint8_t)) { + val = *(uint8_t *)__optval; + } else { + break; + } + m_tos =(uint8_t)val; + header_tos_updater du(m_tos); + update_header_field(&du); + // lists.openwall.net/netdev/2009/12/21/59 + int new_prio = ip_tos2prio[IPTOS_TOS(m_tos) >> 1]; + set_sockopt_prio(&new_prio, sizeof(new_prio)); + } + break; + default: + { + si_udp_logdbg("IPPROTO_IP, optname=%s (%d)", setsockopt_ip_opt_to_str(__optname), __optname); + supported = false; + } + break; + } + } // case IPPROTO_IP + break; + + case IPPROTO_UDP: + switch (__optname) { + case UDP_MAP_ADD: + { + if (! __optval) { + si_udp_loginfo("UDP_MAP_ADD __optval = NULL"); + break; + } + struct port_socket_t port_socket; + port_socket.port = *(in_port_t *)__optval; + m_port_map_lock.lock(); + if (std::find(m_port_map.begin(), m_port_map.end(), port_socket.port) == m_port_map.end()) { + port_socket.fd = get_sock_by_L3_L4(PROTO_UDP, m_bound.get_in_addr(), port_socket.port); + if (port_socket.fd == -1) { + si_udp_logdbg("could not find UDP_MAP_ADD socket for port %d", ntohs(port_socket.port)); + m_port_map_lock.unlock(); + return -1; + } + if (m_port_map.empty()) { + m_sockopt_mapped = true; + // set full versus partial RX UDP handling due to updates in m_socket_mapped + set_rx_packet_processor(); + } + si_udp_logdbg("found UDP_MAP_ADD socket fd for port %d. 
fd is %d", ntohs(port_socket.port), port_socket.fd); + m_port_map.push_back(port_socket); + } + m_port_map_lock.unlock(); + return 0; + } + case UDP_MAP_REMOVE: + { + if (! __optval) { + si_udp_loginfo("UDP_MAP_REMOVE __optval = NULL"); + break; + } + in_port_t port = *(in_port_t *)__optval; + si_udp_logdbg("stopping de-muxing packets to port %d", ntohs(port)); + m_port_map_lock.lock(); + std::vector::iterator iter = std::find(m_port_map.begin(), m_port_map.end(), port); + if (iter != m_port_map.end()) { + m_port_map.erase(iter); + if (m_port_map.empty()) { + m_sockopt_mapped = false; + // set full versus partial RX UDP handling due to updates in m_socket_mapped + set_rx_packet_processor(); + } + } + m_port_map_lock.unlock(); + return 0; + } + default: + si_udp_logdbg("IPPROTO_UDP, optname=%s (%d)", setsockopt_ip_opt_to_str(__optname), __optname); + supported = false; + break; + } // case IPPROTO_UDP + break; + + default: + { + si_udp_logdbg("level = %d, optname = %d", __level, __optname); + supported = false; + } + break; + } + return setsockopt_kernel(__level, __optname, __optval, __optlen, supported, false); +} + +int sockinfo_udp::getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen) +{ + si_udp_logfunc("level=%d, optname=%d", __level, __optname); + + int ret = orig_os_api.getsockopt(m_fd, __level, __optname, __optval, __optlen); + + if (unlikely(m_state == SOCKINFO_CLOSED) || unlikely(g_b_exit)) + return ret; + + if (0 == sockinfo::getsockopt(__level, __optname, __optval, __optlen)) { + return 0; + } + + auto_unlocker lock_tx(m_lock_snd); + auto_unlocker lock_rx(m_lock_rcv); + + bool supported = true; + switch (__level) { + case SOL_SOCKET: + { + switch (__optname) { + + case SO_RCVBUF: + { + uint32_t n_so_rcvbuf_bytes = *(int*)__optval; + si_udp_logdbg("SOL_SOCKET, SO_RCVBUF=%d", n_so_rcvbuf_bytes); + + if (m_p_socket_stats->n_rx_ready_byte_count > n_so_rcvbuf_bytes) + si_udp_logdbg("Releasing at least %d bytes from ready rx packets 
queue", m_p_socket_stats->n_rx_ready_byte_count - n_so_rcvbuf_bytes); + + rx_ready_byte_count_limit_update(n_so_rcvbuf_bytes); + } + break; + + case SO_SNDBUF: + si_udp_logdbg("SOL_SOCKET, SO_SNDBUF=%d", *(int*)__optval); + break; + + case SO_MAX_PACING_RATE: + ret = sockinfo::getsockopt(__level, __optname, __optval, __optlen); + break; + + default: + si_udp_logdbg("SOL_SOCKET, optname=%d", __optname); + supported = false; + break; + } + + } // case SOL_SOCKET + break; + + default: + { + si_udp_logdbg("level = %d, optname = %d", __level, __optname); + supported = false; + } + break; + } + + if (! supported) { + char buf[256]; + snprintf(buf, sizeof(buf), "unimplemented getsockopt __level=%#x, __optname=%#x, __optlen=%d", (unsigned)__level, (unsigned)__optname, __optlen ? *__optlen : 0); + buf[ sizeof(buf)-1 ] = '\0'; + + VLOG_PRINTF_INFO(safe_mce_sys().exception_handling.get_log_severity(), "%s", buf); + int rc = handle_exception_flow(); + switch (rc) { + case -1: + return rc; + case -2: + vma_throw_object_with_msg(vma_unsupported_api, buf); + } + } + + return ret; +} + +// Drop rx ready packets from head of queue +void sockinfo_udp::rx_ready_byte_count_limit_update(size_t n_rx_ready_bytes_limit_new) +{ + si_udp_logfunc("new limit: %d Bytes (old: %d Bytes, min value %d Bytes)", n_rx_ready_bytes_limit_new, m_p_socket_stats->n_rx_ready_byte_limit, m_n_sysvar_rx_ready_byte_min_limit); + if (n_rx_ready_bytes_limit_new > 0 && n_rx_ready_bytes_limit_new < m_n_sysvar_rx_ready_byte_min_limit) + n_rx_ready_bytes_limit_new = m_n_sysvar_rx_ready_byte_min_limit; + m_p_socket_stats->n_rx_ready_byte_limit = n_rx_ready_bytes_limit_new; + + m_lock_rcv.lock(); + while (m_p_socket_stats->n_rx_ready_byte_count > m_p_socket_stats->n_rx_ready_byte_limit) { + if (m_n_rx_pkt_ready_list_count) { + mem_buf_desc_t* p_rx_pkt_desc = m_rx_pkt_ready_list.get_and_pop_front(); + m_n_rx_pkt_ready_list_count--; + m_rx_ready_byte_count -= p_rx_pkt_desc->rx.sz_payload; + 
m_p_socket_stats->n_rx_ready_pkt_count--; + m_p_socket_stats->n_rx_ready_byte_count -= p_rx_pkt_desc->rx.sz_payload; + + reuse_buffer(p_rx_pkt_desc); + return_reuse_buffers_postponed(); + } + else + break; + } + m_lock_rcv.unlock(); + + return; +} + +ssize_t sockinfo_udp::rx(const rx_call_t call_type, iovec* p_iov,ssize_t sz_iov, + int* p_flags, sockaddr *__from ,socklen_t *__fromlen, struct msghdr *__msg) +{ + int errno_tmp = errno; + int ret; + uint64_t poll_sn = 0; + int out_flags = 0; + int in_flags = *p_flags; + + si_udp_logfunc(""); + + m_lock_rcv.lock(); + + if (unlikely(m_state == SOCKINFO_CLOSED)) { + errno = EBADFD; + ret = -1; + goto out; + } + else if (unlikely(g_b_exit)) { + errno = EINTR; + ret = -1; + goto out; + } + +#ifdef VMA_TIME_MEASURE + TAKE_T_RX_START; +#endif + save_stats_threadid_rx(); + + int rx_wait_ret; + + return_reuse_buffers_postponed(); + + // Drop lock to not starve other threads + m_lock_rcv.unlock(); + + // Poll socket for OS ready packets... (at a ratio of the offloaded sockets as defined in m_n_sysvar_rx_udp_poll_os_ratio) + if ((m_n_sysvar_rx_udp_poll_os_ratio > 0) && (m_rx_udp_poll_os_ratio_counter >= m_n_sysvar_rx_udp_poll_os_ratio)) { + ret = poll_os(); + if (ret == -1) { + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_rcv.lock(); + goto out; + } + if (ret == 1) { + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_rcv.lock(); + goto os; + } + } + + // First check if we have a packet in the ready list + if ((m_n_rx_pkt_ready_list_count > 0 && m_n_sysvar_rx_cq_drain_rate_nsec == MCE_RX_CQ_DRAIN_RATE_DISABLED) + || is_readable(&poll_sn)) { + /* coverity[double_lock] TODO: RM#1049980 */ + m_lock_rcv.lock(); + m_rx_udp_poll_os_ratio_counter++; + if (m_n_rx_pkt_ready_list_count > 0) { + // Found a ready packet in the list + if (__msg) handle_cmsg(__msg); + ret = dequeue_packet(p_iov, sz_iov, (sockaddr_in *)__from, __fromlen, in_flags, &out_flags); + goto out; + } + /* coverity[double_unlock] TODO: RM#1049980 */ + 
m_lock_rcv.unlock(); + } + +wait: + /* + * We (probably) do not have a ready packet. + * Wait for RX to become ready. + */ + rx_wait_ret = rx_wait(m_b_blocking && !(in_flags & MSG_DONTWAIT)); + + m_lock_rcv.lock(); + + if (likely(rx_wait_ret == 0)) { + // Got 0, means we might have a ready packet + if (m_n_rx_pkt_ready_list_count > 0) { + if (__msg) handle_cmsg(__msg); + ret = dequeue_packet(p_iov, sz_iov, (sockaddr_in *)__from, __fromlen, in_flags, &out_flags); + goto out; + } else { + m_lock_rcv.unlock(); + goto wait; + } + } + else if (unlikely(rx_wait_ret < 0)) { + // Got < 0, means an error occurred + ret = rx_wait_ret; + goto out; + } // else - packet in OS + + /* + * If we got here, either the socket is not offloaded or rx_wait() returned 1. + */ +os: + if (in_flags & MSG_VMA_ZCOPY_FORCE) { + // Enable the next non-blocked read to check the OS + m_rx_udp_poll_os_ratio_counter = m_n_sysvar_rx_udp_poll_os_ratio; + errno = EIO; + ret = -1; + goto out; + } + +#ifdef VMA_TIME_MEASURE + INC_GO_TO_OS_RX_COUNT; +#endif + + in_flags &= ~MSG_VMA_ZCOPY; + ret = socket_fd_api::rx_os(call_type, p_iov, sz_iov, in_flags, __from, __fromlen, __msg); + *p_flags = in_flags; + save_stats_rx_os(ret); + if (ret > 0) { + // This will cause the next non-blocked read to check the OS again. + // We do this only after a successful read. 
+ m_rx_udp_poll_os_ratio_counter = m_n_sysvar_rx_udp_poll_os_ratio; + } + +out: + /* coverity[double_unlock] TODO: RM#1049980 */ + m_lock_rcv.unlock(); + + if (__msg) + __msg->msg_flags |= out_flags & MSG_TRUNC; + + if (ret < 0) { +#ifdef VMA_TIME_MEASURE + INC_ERR_RX_COUNT; +#endif + si_udp_logfunc("returning with: %d (errno=%d %m)", ret, errno); + } + else { +#ifdef VMA_TIME_MEASURE + TAKE_T_RX_END; +#endif + /* Restore errno on function entry in case success */ + errno = errno_tmp; + + si_udp_logfunc("returning with: %d", ret); + } + return ret; +} + +void sockinfo_udp::handle_ip_pktinfo(struct cmsg_state * cm_state) +{ + struct in_pktinfo in_pktinfo; + mem_buf_desc_t* p_desc = m_rx_pkt_ready_list.front(); + + rx_net_device_map_t::iterator iter = m_rx_nd_map.find(p_desc->rx.udp.local_if); + if (iter == m_rx_nd_map.end()) { + si_udp_logerr("could not find net device for ip %d.%d.%d.%d", NIPQUAD(p_desc->rx.udp.local_if)); + return; + } + in_pktinfo.ipi_ifindex = iter->second.p_ndv->get_if_idx(); + in_pktinfo.ipi_addr = p_desc->rx.dst.sin_addr; + in_pktinfo.ipi_spec_dst.s_addr = p_desc->rx.udp.local_if; + insert_cmsg(cm_state, IPPROTO_IP, IP_PKTINFO, &in_pktinfo, sizeof(struct in_pktinfo)); +} + +// This function is relevant only for non-blocking socket +void sockinfo_udp::set_immediate_os_sample() +{ + m_rx_udp_poll_os_ratio_counter = m_n_sysvar_rx_udp_poll_os_ratio; +} + +// This function is relevant only for non-blocking socket +void sockinfo_udp::unset_immediate_os_sample() +{ + m_rx_udp_poll_os_ratio_counter = 0; +} + +bool sockinfo_udp::is_readable(uint64_t *p_poll_sn, fd_array_t* p_fd_ready_array) +{ + si_udp_logfuncall(""); + + // Check local list of ready rx packets + // This is the quickest way back to the user with a ready packet (which will happen if we don't force draining of the CQ) + if (m_n_rx_pkt_ready_list_count > 0) { + + if (m_n_sysvar_rx_cq_drain_rate_nsec == MCE_RX_CQ_DRAIN_RATE_DISABLED) { + si_udp_logfunc("=> true (ready count = %d packets / 
%d bytes)", m_n_rx_pkt_ready_list_count, m_p_socket_stats->n_rx_ready_byte_count); + return true; + } + else { + tscval_t tsc_now = TSCVAL_INITIALIZER; + gettimeoftsc(&tsc_now); + if (tsc_now - g_si_tscv_last_poll < m_n_sysvar_rx_delta_tsc_between_cq_polls) { + si_udp_logfunc("=> true (ready count = %d packets / %d bytes)", m_n_rx_pkt_ready_list_count, m_p_socket_stats->n_rx_ready_byte_count); + return true; + } + + // Getting here means that although socket has rx + // ready packets we still want to poll the CQ + g_si_tscv_last_poll = tsc_now; + } + } + + + // Loop on rx cq_list and process waiting wce (non blocking! polling only from this context) + // AlexR todo: would be nice to start after the last cq_pos for better cq coverage + if (p_poll_sn) { + consider_rings_migration(); + si_udp_logfuncall("try poll rx cq's"); + rx_ring_map_t::iterator rx_ring_iter; + m_rx_ring_map_lock.lock(); + for (rx_ring_iter = m_rx_ring_map.begin(); rx_ring_iter != m_rx_ring_map.end(); rx_ring_iter++) { + if (rx_ring_iter->second->refcnt <= 0) + continue; + + ring* p_ring = rx_ring_iter->first; + while(1) { + int ret = p_ring->poll_and_process_element_rx(p_poll_sn, p_fd_ready_array); + + if (ret <= 0) { + break; // Get out of the CQ polling while loop (no wce or error case) + } + + /* else (ret > 0) - at least one processed wce */ + if (m_n_rx_pkt_ready_list_count) { + // Get out of the CQ polling loop + si_udp_logfunc("=> polled true (ready count = %d packets / %d bytes)", m_n_rx_pkt_ready_list_count, m_p_socket_stats->n_rx_ready_byte_count); + m_rx_ring_map_lock.unlock(); + return true; + } + } + } + m_rx_ring_map_lock.unlock(); + } + + // Check local list of ready rx packets + // This check is added in case we processed all wce and drained the cq + //TODO: handle the scenario of 2 thread accessing the same socket - might need to lock m_n_rx_pkt_ready_list_count + if (m_n_rx_pkt_ready_list_count) { + si_udp_logfunc("=> true (ready count = %d packets / %d bytes)", 
m_n_rx_pkt_ready_list_count, m_p_socket_stats->n_rx_ready_byte_count); + return true; + } + + // Not ready packets in ready queue, return false + si_udp_logfuncall("=> false (ready count = %d packets / %d bytes)", m_n_rx_pkt_ready_list_count, m_p_socket_stats->n_rx_ready_byte_count); + return false; +} + +int sockinfo_udp::rx_request_notification(uint64_t poll_sn) +{ + si_udp_logfuncall(""); + int ring_ready_count = 0, ring_armed_count = 0; + rx_ring_map_t::iterator rx_ring_iter; + m_rx_ring_map_lock.lock(); + for (rx_ring_iter = m_rx_ring_map.begin(); rx_ring_iter != m_rx_ring_map.end(); rx_ring_iter++) { + ring* p_ring = rx_ring_iter->first; + int ret = p_ring->request_notification(CQT_RX, poll_sn); + if (ret > 0) { + // cq not armed and might have ready completions for processing + ring_ready_count++; + } + else if (ret == 0) { + // cq armed + ring_armed_count++; + } + else { //if (ret < 0) + si_udp_logerr("failure from ring[%p]->request_notification() (errno=%d %m)", p_ring, errno); + } + } + m_rx_ring_map_lock.unlock(); + + si_udp_logfunc("armed or busy %d ring(s) and %d ring are pending processing", ring_armed_count, ring_ready_count); + return ring_ready_count; +} + +ssize_t sockinfo_udp::tx(vma_tx_call_attr_t &tx_arg) +{ + const iovec* p_iov = tx_arg.attr.msg.iov; + const ssize_t sz_iov = tx_arg.attr.msg.sz_iov; + const int __flags = tx_arg.attr.msg.flags; + const struct sockaddr *__dst = tx_arg.attr.msg.addr; + const socklen_t __dstlen = tx_arg.attr.msg.len; + int errno_tmp = errno; + int ret = 0; + bool is_dummy = IS_DUMMY_PACKET(__flags); + dst_entry* p_dst_entry = m_p_connected_dst_entry; // Default for connected() socket but we'll update it on a specific sendTO(__to) call + + si_udp_logfunc(""); + + m_lock_snd.lock(); + + save_stats_threadid_tx(); + + /* Let allow OS to process all invalid scenarios to avoid any + * inconsistencies in setting errno values. 
+ * Note: The field size sets a theoretical limit of 65,535 bytes + * (8 byte header + 65,527 bytes of data) for a UDP datagram. + * However the actual limit for the data length, which is imposed by + * the underlying IPv4 protocol, is 65,507 bytes + * (65,535 - 8 byte UDP header - 20 byte IP header). + */ + if (unlikely((m_state == SOCKINFO_CLOSED) || (g_b_exit) || + (NULL == p_iov) || + (0 >= sz_iov) || + (NULL == p_iov[0].iov_base) || + (65507 < p_iov[0].iov_len))) { + goto tx_packet_to_os; + } + + if (unlikely(__flags & MSG_OOB)) { + si_udp_logdbg("MSG_OOB not supported in UDP (tx-ing to os)"); + goto tx_packet_to_os; + } +#ifdef VMA_TIME_MEASURE + TAKE_T_TX_START; +#endif + if (__dst != NULL) { + if (unlikely(__dstlen < sizeof(struct sockaddr_in))) { + si_udp_logdbg("going to os, dstlen < sizeof(struct sockaddr_in), dstlen = %d", __dstlen); + goto tx_packet_to_os; + } + if (unlikely(get_sa_family(__dst) != AF_INET)) { + si_udp_logdbg("to->sin_family != AF_INET (tx-ing to os)"); + goto tx_packet_to_os; + } + + sock_addr dst((struct sockaddr*)__dst); + + if (dst == m_last_sock_addr && m_p_last_dst_entry) { + p_dst_entry = m_p_last_dst_entry; + } else { + + // Find dst_entry in map (create one if needed) + dst_entry_map_t::iterator dst_entry_iter = m_dst_entry_map.find(dst); + + if (likely(dst_entry_iter != m_dst_entry_map.end())) { + + // Fast path + // We found our target dst_entry object + m_p_last_dst_entry = p_dst_entry = dst_entry_iter->second; + m_last_sock_addr = dst; + } + else { + // Slow path + // We do not have the correct dst_entry in the map and need to create a one + + // Verify we are bounded (got a local port) + // can happen in UDP sendto() directly after socket(DATAGRAM) + if (m_bound.get_in_port() == INPORT_ANY) { + struct sockaddr addr = {AF_INET, {0}}; + if (bind(&addr, sizeof(struct sockaddr))) { +#ifdef VMA_TIME_MEASURE + INC_ERR_TX_COUNT; +#endif + errno = EAGAIN; + m_lock_snd.unlock(); + return -1; + } + } + in_port_t src_port = 
m_bound.get_in_port(); + // Create the new dst_entry + if (dst.is_mc()) { + socket_data data = { m_fd, m_n_mc_ttl, m_tos, m_pcp }; + p_dst_entry = new dst_entry_udp_mc( + dst.get_in_addr(), + dst.get_in_port(), + src_port, + m_mc_tx_if ? m_mc_tx_if : m_bound.get_in_addr(), + m_b_mc_tx_loop, + data, + m_ring_alloc_log_tx); + } + else { + socket_data data = { m_fd, m_n_uc_ttl, m_tos, m_pcp }; + p_dst_entry = new dst_entry_udp( + dst.get_in_addr(), + dst.get_in_port(), + src_port, + data, + m_ring_alloc_log_tx); + } + BULLSEYE_EXCLUDE_BLOCK_START + if (!p_dst_entry) { + si_udp_logerr("Failed to create dst_entry(dst_ip:%s, dst_port:%d, src_port:%d)", dst.to_str_in_addr(), dst.to_str_in_port(), ntohs(src_port)); + goto tx_packet_to_os; + } + BULLSEYE_EXCLUDE_BLOCK_END + if (!m_bound.is_anyaddr() && !m_bound.is_mc()) { + p_dst_entry->set_bound_addr(m_bound.get_in_addr()); + } + if (m_so_bindtodevice_ip) { + p_dst_entry->set_so_bindtodevice_addr(m_so_bindtodevice_ip); + } + // Save new dst_entry in map + m_dst_entry_map[dst] = p_dst_entry; + /* ADD logging + si_udp_logfunc("Address %d.%d.%d.%d failed resolving as Tx on supported devices for interfaces %d.%d.%d.%d (tx-ing to os)", NIPQUAD(to_ip), NIPQUAD(local_if)); + */ + } + } + } else if (unlikely(!p_dst_entry)) { + si_udp_logdbg("going to os, __dst = %p, m_p_connected_dst_entry = %p", __dst, m_p_connected_dst_entry); + goto tx_packet_to_os; + } + + { +#ifdef DEFINED_TSO + vma_send_attr attr = {(vma_wr_tx_packet_attr)0, 0}; + bool b_blocking = m_b_blocking; + if (unlikely(__flags & MSG_DONTWAIT)) + b_blocking = false; + + attr.flags = (vma_wr_tx_packet_attr)((b_blocking * VMA_TX_PACKET_BLOCK) | (is_dummy * VMA_TX_PACKET_DUMMY)); + if (likely(p_dst_entry->is_valid())) { + // All set for fast path packet sending - this is our best performance flow + ret = p_dst_entry->fast_send((iovec*)p_iov, sz_iov, attr); + } + else { + // updates the dst_entry internal information and packet headers + ret = 
p_dst_entry->slow_send(p_iov, sz_iov, attr, m_so_ratelimit, __flags, this, tx_arg.opcode); + } +#else + bool b_blocking = m_b_blocking; + if (unlikely(__flags & MSG_DONTWAIT)) + b_blocking = false; + + if (likely(p_dst_entry->is_valid())) { + // All set for fast path packet sending - this is our best performance flow + ret = p_dst_entry->fast_send((iovec*)p_iov, sz_iov, is_dummy, b_blocking); + } + else { + // updates the dst_entry internal information and packet headers + ret = p_dst_entry->slow_send(p_iov, sz_iov, is_dummy, m_so_ratelimit, b_blocking, false, __flags, this, tx_arg.opcode); + } +#endif /* DEFINED_TSO */ + + if (unlikely(p_dst_entry->try_migrate_ring(m_lock_snd))) { + m_p_socket_stats->counters.n_tx_migrations++; + } + + // TODO ALEXR - still need to handle "is_dropped" in send path + // For now we removed the support of this feature (AlexV & AlexR) + } + + if (likely(p_dst_entry->is_offloaded())) { + + // MNY: Problematic in cases where packet was dropped because no tx buffers were available.. + // Yet we need to add this code to avoid deadlocks in case of EPOLLOUT ET. 
+ NOTIFY_ON_EVENTS(this, EPOLLOUT); + + save_stats_tx_offload(ret, is_dummy); + +#ifdef VMA_TIME_MEASURE + TAKE_T_TX_END; +#endif + m_lock_snd.unlock(); + + /* Restore errno on function entry in case success */ + if (ret >= 0) { + errno = errno_tmp; + } + + return ret; + } + else { + goto tx_packet_to_os_stats; + } + +tx_packet_to_os: +#ifdef VMA_TIME_MEASURE + INC_GO_TO_OS_TX_COUNT; +#endif + // Calling OS transmit + ret = socket_fd_api::tx_os(tx_arg.opcode, p_iov, sz_iov, __flags, __dst, __dstlen); + +tx_packet_to_os_stats: + save_stats_tx_os(ret); + m_lock_snd.unlock(); + return ret; +} + +int sockinfo_udp::rx_verify_available_data() +{ + int ret; + + // Don't poll cq if offloaded data is ready + if (!m_rx_pkt_ready_list.empty()) { + auto_unlocker locker(m_lock_rcv); + if (!m_rx_pkt_ready_list.empty()) { + return m_rx_pkt_ready_list.front()->rx.sz_payload; + } + } + + ret = rx_wait(false); + + if (ret == 0) { + // Got 0, means we might have a ready packet + auto_unlocker locker(m_lock_rcv); + if (!m_rx_pkt_ready_list.empty()) { + ret = m_rx_pkt_ready_list.front()->rx.sz_payload; + } + } + else if (ret == 1) { + // Got 1, means we have a ready packet in OS + uint64_t pending_data = 0; + ret = orig_os_api.ioctl(m_fd, FIONREAD, &pending_data); + if (ret >= 0) { + // This will cause the next non-blocked read to check the OS again. + // We do this only after a successful read. + m_rx_udp_poll_os_ratio_counter = m_n_sysvar_rx_udp_poll_os_ratio; + ret = pending_data; + } + } else if (errno == EAGAIN) { + errno = 0; + ret = 0; + } + + return ret; +} + +/** + * sockinfo_udp::inspect_uc_packet inspects the input packet for basic rules, + * common for all cases. Its applicable for UC case as well. + */ +inline bool sockinfo_udp::inspect_uc_packet(mem_buf_desc_t* p_desc) +{ + // Check that sockinfo is bound to the packets dest port + // This protects the case where a socket is closed and a new one is rapidly opened + // receiving the same socket id. 
+ // In this case packets arriving for the old sockets should be dropped. + // This distinction assumes that the OS guarantees the old and new sockets to receive different + // port numbers from bind(). + // If the user requests to bind the new socket to the same port number as the old one it will be + // impossible to identify packets designated for the old socket in this way. + if (unlikely(p_desc->rx.dst.sin_port != m_bound.get_in_port())) { + si_udp_logfunc("rx packet discarded - not socket's bound port (pkt: %d, sock:%s)", + ntohs(p_desc->rx.dst.sin_port), m_bound.to_str_in_port()); + return false; + } + + // Check if sockinfo rx byte quato reached - then disregard this packet + if (unlikely(m_p_socket_stats->n_rx_ready_byte_count >= m_p_socket_stats->n_rx_ready_byte_limit)) { + si_udp_logfunc("rx packet discarded - socket limit reached (%d bytes)", m_p_socket_stats->n_rx_ready_byte_limit); + m_p_socket_stats->counters.n_rx_ready_byte_drop += p_desc->rx.sz_payload; + m_p_socket_stats->counters.n_rx_ready_pkt_drop++; + return false; + } + + if (unlikely(m_state == SOCKINFO_CLOSED) || unlikely(g_b_exit)) { + si_udp_logfunc("rx packet discarded - fd closed"); + return false; + } + return true; +} + +/** + * Inspects UDP packets in case socket was connected + * + */ +inline bool sockinfo_udp::inspect_connected(mem_buf_desc_t* p_desc) +{ + if ((m_connected.get_in_port() != INPORT_ANY) && (m_connected.get_in_addr() != INADDR_ANY)) { + if (unlikely(m_connected.get_in_port() != p_desc->rx.src.sin_port)) { + si_udp_logfunc("rx packet discarded - not socket's connected port (pkt: %d, sock:%s)", + ntohs(p_desc->rx.src.sin_port), m_connected.to_str_in_port()); + return false; + } + + if (unlikely(m_connected.get_in_addr() != p_desc->rx.src.sin_addr.s_addr)) { + si_udp_logfunc("rx packet discarded - not socket's connected port (pkt: [%d:%d:%d:%d], sock:[%s])", + NIPQUAD(p_desc->rx.src.sin_addr.s_addr), m_connected.to_str_in_addr()); + return false; + } + } + return true; 
+} + +/** + * Inspects multicast packets + * + */ +inline bool sockinfo_udp::inspect_mc_packet(mem_buf_desc_t* p_desc) +{ + // if loopback is disabled, discard loopback packets. + // in linux, loopback control (set by setsockopt) is done in TX flow. + // since we currently can't control it in TX, we behave like windows, which filter on RX + if (unlikely(!m_b_mc_tx_loop && p_desc->rx.udp.local_if == p_desc->rx.src.sin_addr.s_addr)) { + si_udp_logfunc("rx packet discarded - loopback is disabled (pkt: [%d:%d:%d:%d], sock:%s)", + NIPQUAD(p_desc->rx.src.sin_addr.s_addr), m_bound.to_str_in_addr()); + return false; + } + if (m_mc_num_grp_with_src_filter) { + in_addr_t mc_grp = p_desc->rx.dst.sin_addr.s_addr; + if (IN_MULTICAST_N(mc_grp)) { + in_addr_t mc_src = p_desc->rx.src.sin_addr.s_addr; + + if ((m_mc_memberships_map.find(mc_grp) == m_mc_memberships_map.end()) || + ((0 < m_mc_memberships_map[mc_grp].size()) && + (m_mc_memberships_map[mc_grp].find(mc_src) == m_mc_memberships_map[mc_grp].end()))) { + si_udp_logfunc("rx packet discarded - multicast source mismatch"); + return false; + } + } + } + return true; +} + +/** + * Performs inspection by registered user callback + * + */ +inline vma_recv_callback_retval_t sockinfo_udp::inspect_by_user_cb(mem_buf_desc_t* p_desc) +{ + vma_info_t pkt_info; + + pkt_info.struct_sz = sizeof(pkt_info); + pkt_info.packet_id = (void*)p_desc; + pkt_info.src = &p_desc->rx.src; + pkt_info.dst = &p_desc->rx.dst; + pkt_info.socket_ready_queue_pkt_count = m_p_socket_stats->n_rx_ready_pkt_count; + pkt_info.socket_ready_queue_byte_count = m_p_socket_stats->n_rx_ready_byte_count; + + if (m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { + pkt_info.hw_timestamp = p_desc->rx.timestamps.hw; + } + if (p_desc->rx.timestamps.sw.tv_sec) { + pkt_info.sw_timestamp = p_desc->rx.timestamps.sw; + } + + // fill io vector array with data buffer pointers + iovec iov[p_desc->rx.n_frags]; + int nr_frags = 0; + + for (mem_buf_desc_t *tmp = p_desc; tmp; tmp = 
tmp->p_next_desc) { + iov[nr_frags++] = tmp->rx.frag; + } + + // call user callback + return m_rx_callback(m_fd, nr_frags, iov, &pkt_info, m_rx_callback_context); +} + +/* Update vma_completion with + * VMA_SOCKETXTREME_PACKET related data + */ +inline void sockinfo_udp::fill_completion(mem_buf_desc_t* p_desc) +{ + struct vma_completion_t *completion; + + /* Try to process socketxtreme_poll() completion directly */ + m_socketxtreme.completion = m_p_rx_ring->get_comp(); + + if (m_socketxtreme.completion) { + completion = m_socketxtreme.completion; + } else { + completion = &m_socketxtreme.ec.completion; + } + + completion->packet.num_bufs = p_desc->rx.n_frags; + completion->packet.total_len = 0; + completion->src = p_desc->rx.src; + + if (m_n_tsing_flags & SOF_TIMESTAMPING_RAW_HARDWARE) { + completion->packet.hw_timestamp = p_desc->rx.timestamps.hw; + } + + for(mem_buf_desc_t *tmp_p = p_desc; tmp_p; tmp_p = tmp_p->p_next_desc) { + completion->packet.total_len += tmp_p->rx.sz_payload; + completion->packet.buff_lst = (struct vma_buff_t*)tmp_p; + completion->packet.buff_lst->next = (struct vma_buff_t*)tmp_p->p_next_desc; + completion->packet.buff_lst->payload = p_desc->rx.frag.iov_base; + completion->packet.buff_lst->len = p_desc->rx.frag.iov_len; + } + + NOTIFY_ON_EVENTS(this, VMA_SOCKETXTREME_PACKET); + + save_stats_rx_offload(completion->packet.total_len); + m_socketxtreme.completion = NULL; + m_socketxtreme.last_buff_lst = NULL; +} + +/** + * Performs packet processing for NON-SOCKETXTREME cases and store packet + * in ready queue. 
+ */ +inline void sockinfo_udp::update_ready(mem_buf_desc_t* p_desc, void* pv_fd_ready_array, vma_recv_callback_retval_t cb_ret) +{ + // In ZERO COPY case we let the user's application manage the ready queue + if (cb_ret != VMA_PACKET_HOLD) { + m_lock_rcv.lock(); + // Save rx packet info in our ready list + m_rx_pkt_ready_list.push_back(p_desc); + m_n_rx_pkt_ready_list_count++; + m_rx_ready_byte_count += p_desc->rx.sz_payload; + m_p_socket_stats->n_rx_ready_pkt_count++; + m_p_socket_stats->n_rx_ready_byte_count += p_desc->rx.sz_payload; + m_p_socket_stats->counters.n_rx_ready_pkt_max = max((uint32_t)m_p_socket_stats->n_rx_ready_pkt_count, + m_p_socket_stats->counters.n_rx_ready_pkt_max); + m_p_socket_stats->counters.n_rx_ready_byte_max = max((uint32_t)m_p_socket_stats->n_rx_ready_byte_count, + m_p_socket_stats->counters.n_rx_ready_byte_max); + do_wakeup(); + m_lock_rcv.unlock(); + } else { + m_p_socket_stats->n_rx_zcopy_pkt_count++; + } + + NOTIFY_ON_EVENTS(this, EPOLLIN); + + // Add this fd to the ready fd list + /* + * Note: No issue is expected in case socketxtreme_poll() usage because 'pv_fd_ready_array' is null + * in such case and as a result update_fd_array() call means nothing + */ + io_mux_call::update_fd_array((fd_array_t*)pv_fd_ready_array, m_fd); + + si_udp_logfunc("rx ready count = %d packets / %d bytes", m_n_rx_pkt_ready_list_count, m_p_socket_stats->n_rx_ready_byte_count); +} + +/** + * Performs full inspection and processing for generic UDP + * It will be bypassing some inspections if appropriate flags were + * not set. 
+ */ +inline bool sockinfo_udp::rx_process_udp_packet_full(mem_buf_desc_t* p_desc, void* pv_fd_ready_array) +{ + if (!inspect_uc_packet(p_desc)) + return false; + + if (m_is_connected && !inspect_connected(p_desc)) + return false; + + if (m_multicast && !inspect_mc_packet(p_desc)) + return false; + + if (m_sockopt_mapped) { + // Check port mapping - redirecting packets to another socket + while (!m_port_map.empty()) { + m_port_map_lock.lock(); + if (m_port_map.empty()) { + m_port_map_lock.unlock(); + break; + } + m_port_map_index = ((m_port_map_index + 1) >= m_port_map.size() ? 0 : (m_port_map_index + 1)); + int new_port = m_port_map[m_port_map_index].port; + socket_fd_api* sock_api = g_p_fd_collection->get_sockfd(m_port_map[m_port_map_index].fd); + if (!sock_api || sock_api->get_type()!=FD_TYPE_SOCKET) { + m_port_map.erase(std::remove(m_port_map.begin(), m_port_map.end(), m_port_map[m_port_map_index].port)); + if (m_port_map_index) + m_port_map_index--; + m_port_map_lock.unlock(); + continue; + } + m_port_map_lock.unlock(); + p_desc->rx.dst.sin_port = new_port; + return ((sockinfo_udp*)sock_api)->rx_process_udp_packet_full(p_desc, pv_fd_ready_array); + } + } + + process_timestamps(p_desc); + + vma_recv_callback_retval_t cb_ret = VMA_PACKET_RECV; + if (m_rx_callback && ((cb_ret = inspect_by_user_cb(p_desc)) == VMA_PACKET_DROP)) { + si_udp_logfunc("rx packet discarded - by user callback"); + return false; + } + // Yes, we want to keep this packet! 
+ // And we must increment ref_counter before pushing this packet into the ready queue + // to prevent race condition with the 'if( (--ref_count) <= 0)' in ib_comm_mgr + p_desc->inc_ref_count(); + + if (p_desc->rx.socketxtreme_polled) { + fill_completion(p_desc); + p_desc->rx.socketxtreme_polled = false; + } else { + update_ready(p_desc, pv_fd_ready_array, cb_ret); + } + return true; +} + +/** + * Performs inspection and processing for simple UC UDP case + * bypassing all other inspections + */ +inline bool sockinfo_udp::rx_process_udp_packet_partial(mem_buf_desc_t* p_desc, void* pv_fd_ready_array) +{ + if (!inspect_uc_packet(p_desc)) + return false; + + process_timestamps(p_desc); + + vma_recv_callback_retval_t cb_ret = VMA_PACKET_RECV; + if (m_rx_callback && ((cb_ret = inspect_by_user_cb(p_desc)) == VMA_PACKET_DROP)) { + si_udp_logfunc("rx packet discarded - by user callback"); + return false; + } + // Yes, we want to keep this packet! + // And we must increment ref_counter before pushing this packet into the ready queue + // to prevent race condition with the 'if( (--ref_count) <= 0)' in ib_comm_mgr + p_desc->inc_ref_count(); + + if (p_desc->rx.socketxtreme_polled) { + fill_completion(p_desc); + p_desc->rx.socketxtreme_polled = false; + } else { + update_ready(p_desc, pv_fd_ready_array, cb_ret); + } + return true; +} + +/** + * set packet inspector and processor + */ +inline void sockinfo_udp::set_rx_packet_processor(void) +{ + si_udp_logdbg("is_connected: %d mapped: %d multicast: %d", + m_is_connected, m_sockopt_mapped, m_multicast); + // Select partial or full packet processing. + // Full packet processing is selected in case of: + // - connect() was done on the UDP socket + // In this case the UDP 3-tuple is not sufficient for the packet matching. + // - In the case that socket mapping is enabled extra processing is required + // - Multicast packets + // For simple UC traffic reduced packet processing is selected. 
+	if (m_is_connected || m_sockopt_mapped || m_multicast) {
+		m_rx_packet_processor = &sockinfo_udp::rx_process_udp_packet_full;
+	} else {
+		m_rx_packet_processor = &sockinfo_udp::rx_process_udp_packet_partial;
+	}
+}
+
+void sockinfo_udp::rx_add_ring_cb(flow_tuple_with_local_if &flow_key, ring* p_ring, bool is_migration /* = false */)
+{
+	si_udp_logdbg("");
+	sockinfo::rx_add_ring_cb(flow_key, p_ring, is_migration);
+
+	// Now that we got at least 1 CQ attached enable the skip os mechanism.
+	m_rx_udp_poll_os_ratio_counter = m_n_sysvar_rx_udp_poll_os_ratio;
+
+	// Now that we got at least 1 CQ attached start polling the CQs
+	if (m_b_blocking) {
+		m_loops_to_go = m_n_sysvar_rx_poll_num;
+	}
+	else {
+		m_loops_to_go = 1; // Force single CQ poll in case of non-blocking socket
+	}
+}
+
+void sockinfo_udp::rx_del_ring_cb(flow_tuple_with_local_if &flow_key, ring* p_ring, bool is_migration /* = false */)
+{
+	si_udp_logdbg("");
+
+	sockinfo::rx_del_ring_cb(flow_key, p_ring, is_migration);
+
+	// If no more CQ's are attached on this socket, return CQ polling loops to init state
+	if (m_rx_ring_map.size() <= 0) {
+		if (m_b_blocking) {
+			m_loops_to_go = safe_mce_sys().rx_poll_num_init;
+		}
+		else {
+			m_loops_to_go = 1;
+		}
+	}
+}
+
+void sockinfo_udp::set_blocking(bool is_blocked)
+{
+	sockinfo::set_blocking(is_blocked);
+
+	if (m_b_blocking) {
+		// Set the high CQ polling RX_POLL value
+		// depending on where we have mapped offloaded MC groups
+		if (m_rx_ring_map.size() > 0)
+			m_loops_to_go = m_n_sysvar_rx_poll_num;
+		else
+			m_loops_to_go = safe_mce_sys().rx_poll_num_init;
+	}
+	else {
+		// Force single CQ poll in case of non-blocking socket
+		m_loops_to_go = 1;
+	}
+}
+
+void sockinfo_udp::handle_pending_mreq()
+{
+	si_udp_logdbg("Attaching to pending multicast groups");
+	mc_pram_list_t::iterator mreq_iter, mreq_iter_temp;
+	for (mreq_iter = m_pending_mreqs.begin(); mreq_iter != m_pending_mreqs.end();) {
+		if (m_sock_offload) {
+			mc_change_membership(&(*mreq_iter));
+		}
+ mreq_iter_temp = mreq_iter; + ++mreq_iter; + m_pending_mreqs.erase(mreq_iter_temp); + } +} + +int sockinfo_udp::mc_change_pending_mreq(const mc_pending_pram *p_mc_pram) +{ + si_udp_logdbg("setsockopt(%s) will be pending until bound to UDP port", setsockopt_ip_opt_to_str(p_mc_pram->optname)); + + mc_pram_list_t::iterator mc_pram_iter, mreq_iter_temp; + switch (p_mc_pram->optname) { + case IP_ADD_MEMBERSHIP: + case IP_ADD_SOURCE_MEMBERSHIP: + m_pending_mreqs.push_back(*p_mc_pram); + break; + case IP_DROP_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + for (mc_pram_iter = m_pending_mreqs.begin(); mc_pram_iter != m_pending_mreqs.end();) { + if ((mc_pram_iter->imr_multiaddr.s_addr == p_mc_pram->imr_multiaddr.s_addr) && + ((IP_DROP_MEMBERSHIP == p_mc_pram->optname) || // In case of a IP_DROP_SOURCE_MEMBERSHIP we should check source address too + (mc_pram_iter->imr_sourceaddr.s_addr == p_mc_pram->imr_sourceaddr.s_addr))) { + // We found the group, erase it + mreq_iter_temp = mc_pram_iter; + ++mc_pram_iter; + m_pending_mreqs.erase(mreq_iter_temp); + } else { + ++mc_pram_iter; + } + } + break; + BULLSEYE_EXCLUDE_BLOCK_START + default: + si_udp_logerr("setsockopt(%s) illegal", setsockopt_ip_opt_to_str(p_mc_pram->optname)); + return -1; + BULLSEYE_EXCLUDE_BLOCK_END + } + return 0; +} + +int sockinfo_udp::mc_change_membership_start_helper(in_addr_t mc_grp, int optname) +{ + switch (optname) { + case IP_ADD_MEMBERSHIP: + if (m_mc_memberships_map.find(mc_grp) == m_mc_memberships_map.end() + && m_mc_memberships_map.size() >= (size_t)safe_mce_sys().sysctl_reader.get_igmp_max_membership()) { + errno = ENOBUFS; + return -1; + } + break; + case IP_ADD_SOURCE_MEMBERSHIP: + if (m_mc_memberships_map.find(mc_grp) != m_mc_memberships_map.end()) {//This group is exist + if (m_mc_memberships_map[mc_grp].size() >= (size_t)safe_mce_sys().sysctl_reader.get_igmp_max_source_membership()) { + errno = ENOBUFS; + return -1; + } + } + else {//This group is not exist + if 
(m_mc_memberships_map.size() >= (size_t)safe_mce_sys().sysctl_reader.get_igmp_max_membership()) { + errno = ENOBUFS; + return -1; + } + } + break; + case IP_DROP_MEMBERSHIP: + case IP_DROP_SOURCE_MEMBERSHIP: + break; + BULLSEYE_EXCLUDE_BLOCK_START + default: + si_udp_logerr("setsockopt(%s) will be passed to OS for handling", setsockopt_ip_opt_to_str(optname)); + return -1; + BULLSEYE_EXCLUDE_BLOCK_END + } + return 0; +} + +int sockinfo_udp::mc_change_membership_end_helper(in_addr_t mc_grp, int optname, in_addr_t mc_src /*=0*/) +{ + switch (optname) { + case IP_ADD_MEMBERSHIP: + m_mc_memberships_map[mc_grp]; + break; + case IP_ADD_SOURCE_MEMBERSHIP: + m_mc_memberships_map[mc_grp][mc_src] = 1; + if (1 == m_mc_memberships_map[mc_grp].size()) { + ++m_mc_num_grp_with_src_filter; + } + break; + case IP_DROP_MEMBERSHIP: + m_mc_memberships_map.erase(mc_grp); + break; + case IP_DROP_SOURCE_MEMBERSHIP: + if ((m_mc_memberships_map.find(mc_grp) != m_mc_memberships_map.end())) { + m_mc_memberships_map[mc_grp].erase(mc_src); + if (0 == m_mc_memberships_map[mc_grp].size()) { + m_mc_memberships_map.erase(mc_grp); + --m_mc_num_grp_with_src_filter; + } + } + break; + BULLSEYE_EXCLUDE_BLOCK_START + default: + si_udp_logerr("setsockopt(%s) will be passed to OS for handling", setsockopt_ip_opt_to_str(optname)); + return -1; + BULLSEYE_EXCLUDE_BLOCK_END + } + + return 0; +} + +int sockinfo_udp::mc_change_membership(const mc_pending_pram *p_mc_pram) +{ + in_addr_t mc_grp = p_mc_pram->imr_multiaddr.s_addr; + in_addr_t mc_if = p_mc_pram->imr_interface.s_addr; + + BULLSEYE_EXCLUDE_BLOCK_START + if (IN_MULTICAST_N(mc_grp) == false) { + si_udp_logerr("%s for non multicast (%d.%d.%d.%d) %#x", setsockopt_ip_opt_to_str(p_mc_pram->optname), NIPQUAD(mc_grp), mc_grp); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + sock_addr tmp_grp_addr(AF_INET, mc_grp, m_bound.get_in_port()); + if (__vma_match_udp_receiver(TRANS_VMA, safe_mce_sys().app_id, tmp_grp_addr.get_p_sa(), tmp_grp_addr.get_socklen()) == 
TRANS_OS) {
+		// Break so we call orig setsockopt() and don't try to offload
+		si_udp_logdbg("setsockopt(%s) will be passed to OS for handling due to rule matching", setsockopt_ip_opt_to_str(p_mc_pram->optname));
+		return -1;
+	}
+
+	if (mc_if == INADDR_ANY) {
+		in_addr_t dst_ip = mc_grp;
+		in_addr_t src_ip = 0;
+
+		if (!m_bound.is_anyaddr() && !m_bound.is_mc()) {
+			src_ip = m_bound.get_in_addr();
+		}else if (m_so_bindtodevice_ip) {
+			src_ip = m_so_bindtodevice_ip;
+		}
+		// Find local if for this MC ADD/DROP
+		route_result res;
+		g_p_route_table_mgr->route_resolve(route_rule_table_key(dst_ip, src_ip, m_tos), res);
+		mc_if = res.p_src;
+	}
+
+	// MNY: TODO: Check rules for local_if (blacklist interface feature)
+	/*sock_addr tmp_if_addr(AF_INET, mc_if, m_bound.get_in_port());
+	if (__vma_match_udp_receiver(TRANS_VMA, tmp_if_addr.get_p_sa(), tmp_if_addr.get_socklen(), safe_mce_sys().app_id) == TRANS_OS) {
+		// Break so we call orig setsockopt() and don't try to offload
+		si_udp_logdbg("setsockopt(%s) will be passed to OS for handling due to rule matching", setsockopt_ip_opt_to_str(optname));
+		return -1;
+	}*/
+
+	// Check if local_if is offloadable
+	if (!g_p_net_device_table_mgr->get_net_device_val(mc_if)) {
+		// Break so we call orig setsockopt() and don't try to offload
+		si_udp_logdbg("setsockopt(%s) will be passed to OS for handling - not offload interface (%d.%d.%d.%d)", setsockopt_ip_opt_to_str(p_mc_pram->optname), NIPQUAD(mc_if));
+		return -1;
+	}
+
+	int pram_size = sizeof(ip_mreq);
+	struct ip_mreq_source mreq_src;
+	mreq_src.imr_multiaddr.s_addr = p_mc_pram->imr_multiaddr.s_addr;
+	mreq_src.imr_interface.s_addr = p_mc_pram->imr_interface.s_addr;
+	mreq_src.imr_sourceaddr.s_addr = p_mc_pram->imr_sourceaddr.s_addr;
+
+	switch (p_mc_pram->optname) {
+	case IP_ADD_MEMBERSHIP:
+	{
+		if ((m_mc_memberships_map.find(mc_grp) != m_mc_memberships_map.end()) && (0 < m_mc_memberships_map[mc_grp].size())) {
+			return -1; // Same group with source filtering already exists
+
} + + // The address specified in bind() has a filtering role. + // i.e. sockets should discard datagrams which sent to an unbound ip address. + if (!m_bound.is_anyaddr() && mc_grp != m_bound.get_in_addr()) { + // Ignore for socketXtreme because m_bound is used as part of the legacy implementation + if (!safe_mce_sys().enable_socketxtreme) { + return -1; // Socket was bound to a different ip address + } + } + + flow_tuple_with_local_if flow_key(mc_grp, m_bound.get_in_port(), m_connected.get_in_addr(), m_connected.get_in_port(), PROTO_UDP, mc_if); + if (!attach_receiver(flow_key)) { + // we will get RX from OS + return -1; + } + vma_stats_mc_group_add(mc_grp, m_p_socket_stats); + original_os_setsockopt_helper( &mreq_src, pram_size, p_mc_pram->optname); + m_multicast = true; + break; + } + case IP_ADD_SOURCE_MEMBERSHIP: + { + flow_tuple_with_local_if flow_key(mc_grp, m_bound.get_in_port(), 0, 0, PROTO_UDP, mc_if); + if (!attach_receiver(flow_key)) { + // we will get RX from OS + return -1; + } + vma_stats_mc_group_add(mc_grp, m_p_socket_stats); + pram_size = sizeof(ip_mreq_source); + original_os_setsockopt_helper( &mreq_src, pram_size, p_mc_pram->optname); + m_multicast = true; + break; + } + case IP_DROP_MEMBERSHIP: + { + flow_tuple_with_local_if flow_key(mc_grp, m_bound.get_in_port(), m_connected.get_in_addr(), m_connected.get_in_port(), PROTO_UDP, mc_if); + original_os_setsockopt_helper( &mreq_src, pram_size, p_mc_pram->optname); + if (!detach_receiver(flow_key)) { + return -1; + } + vma_stats_mc_group_remove(mc_grp, m_p_socket_stats); + m_multicast = false; + break; + } + case IP_DROP_SOURCE_MEMBERSHIP: + { + flow_tuple_with_local_if flow_key(mc_grp, m_bound.get_in_port(), 0, 0, PROTO_UDP, mc_if); + pram_size = sizeof(ip_mreq_source); + original_os_setsockopt_helper( &mreq_src, pram_size, p_mc_pram->optname); + if (1 == m_mc_memberships_map[mc_grp].size()) { //Last source in the group + if (!detach_receiver(flow_key)) { + return -1; + } + 
vma_stats_mc_group_remove(mc_grp, m_p_socket_stats); + m_multicast = false; // get out from MC group + } + break; + } + BULLSEYE_EXCLUDE_BLOCK_START + default: + si_udp_logerr("setsockopt(%s) will be passed to OS for handling", setsockopt_ip_opt_to_str(p_mc_pram->optname)); + return -1; + BULLSEYE_EXCLUDE_BLOCK_END + } + + // set full versus partial RX UDP handling due to potential updates in m_multicast + set_rx_packet_processor(); + return 0; +} + +void sockinfo_udp::original_os_setsockopt_helper( void* pram, int pram_size, int optname) +{ + si_udp_logdbg("calling orig_setsockopt(%s) for igmp support by OS", setsockopt_ip_opt_to_str(optname)); + if (orig_os_api.setsockopt(m_fd, IPPROTO_IP, optname, pram, pram_size)) { + si_udp_logdbg("orig setsockopt(%s) failed (errno=%d %m)",setsockopt_ip_opt_to_str(optname), errno); + } +} + +void sockinfo_udp::statistics_print(vlog_levels_t log_level /* = VLOG_DEBUG */) +{ + sockinfo::statistics_print(log_level); + + // Socket data + vlog_printf(log_level, "Rx ready list size : %u\n", m_rx_pkt_ready_list.size()); + + vlog_printf(log_level, "Socket timestamp : m_b_rcvtstamp %s, m_b_rcvtstampns %s, m_n_tsing_flags %u\n", + m_b_rcvtstamp ? "true" : "false" , m_b_rcvtstampns ? 
"true" : "false", m_n_tsing_flags); +} + +void sockinfo_udp::save_stats_threadid_rx() +{ + // Save Thread Id for statistics module + if (g_vlogger_level >= VLOG_DEBUG) + m_p_socket_stats->threadid_last_rx = gettid(); +} + +void sockinfo_udp::save_stats_threadid_tx() +{ + // Save Thread Id for statistics module + if (g_vlogger_level >= VLOG_DEBUG) + m_p_socket_stats->threadid_last_tx = gettid(); +} + +void sockinfo_udp::save_stats_tx_offload(int bytes, bool is_dummy) +{ + if (unlikely(is_dummy)) { + m_p_socket_stats->counters.n_tx_dummy++; + } else { + if (bytes >= 0) { + m_p_socket_stats->counters.n_tx_sent_byte_count += bytes; + m_p_socket_stats->counters.n_tx_sent_pkt_count++; + } + else if (errno == EAGAIN) { + m_p_socket_stats->counters.n_rx_os_eagain++; + } + else { + m_p_socket_stats->counters.n_tx_errors++; + } + } +} + +int sockinfo_udp::free_packets(struct vma_packet_t *pkts, size_t count) +{ + int ret = 0; + unsigned int index = 0; + mem_buf_desc_t *buff; + + m_lock_rcv.lock(); + for(index=0; index < count; index++){ + buff = (mem_buf_desc_t*)pkts[index].packet_id; + if (m_rx_ring_map.find(buff->p_desc_owner->get_parent()) == m_rx_ring_map.end()) { + errno = ENOENT; + ret = -1; + break; + } + reuse_buffer(buff); + m_p_socket_stats->n_rx_zcopy_pkt_count--; + } + m_lock_rcv.unlock(); + return ret; +} + +mem_buf_desc_t* sockinfo_udp::get_next_desc(mem_buf_desc_t *p_desc) +{ + return p_desc->p_next_desc; +} + +mem_buf_desc_t* sockinfo_udp::get_next_desc_peek(mem_buf_desc_t *p_desc, int& rx_pkt_ready_list_idx) +{ + NOT_IN_USE(rx_pkt_ready_list_idx); + return p_desc->p_next_desc; +} + +timestamps_t* sockinfo_udp::get_socket_timestamps() +{ + if (unlikely(m_rx_pkt_ready_list.empty())) { + si_udp_logdbg("m_rx_pkt_ready_list empty"); + return NULL; + } + return &m_rx_pkt_ready_list.front()->rx.timestamps; +} + +void sockinfo_udp::post_deqeue(bool release_buff) +{ + mem_buf_desc_t *to_resue = m_rx_pkt_ready_list.get_and_pop_front(); + 
m_p_socket_stats->n_rx_ready_pkt_count--; + m_n_rx_pkt_ready_list_count--; + if (release_buff) + reuse_buffer(to_resue); + m_rx_pkt_ready_offset = 0; +} + +int sockinfo_udp::zero_copy_rx(iovec *p_iov, mem_buf_desc_t *p_desc, int *p_flags) +{ + mem_buf_desc_t* p_desc_iter; + int total_rx = 0; + int len = p_iov[0].iov_len - sizeof(vma_packets_t) - sizeof(vma_packet_t); + + // Make sure there is enough room for the header + if (len < 0) { + errno = ENOBUFS; + return -1; + } + + // Copy iov pointers to user buffer + vma_packets_t *p_packets = (vma_packets_t*)p_iov[0].iov_base; + p_packets->n_packet_num = 1; + p_packets->pkts[0].packet_id = (void*)p_desc; + p_packets->pkts[0].sz_iov = 0; + for (p_desc_iter = p_desc; p_desc_iter; p_desc_iter = p_desc_iter->p_next_desc) { + len -= sizeof(p_packets->pkts[0].iov[0]); + if (len < 0) { + *p_flags = MSG_TRUNC; + break; + } + p_packets->pkts[0].iov[p_packets->pkts[0].sz_iov++] = p_desc_iter->rx.frag; + total_rx += p_desc_iter->rx.frag.iov_len; + } + + m_p_socket_stats->n_rx_zcopy_pkt_count++; + + si_udp_logfunc("copied pointers to %d bytes to user buffer", total_rx); + return total_rx; +} + +size_t sockinfo_udp::handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, int* p_out_flags) +{ + if (payload_size > total_rx) { + m_rx_ready_byte_count -= (payload_size-total_rx); + m_p_socket_stats->n_rx_ready_byte_count -= (payload_size-total_rx); + *p_out_flags |= MSG_TRUNC; + if (in_flags & MSG_TRUNC) + return payload_size; + } + + return total_rx; +} + +int sockinfo_udp::get_socket_tx_ring_fd(struct sockaddr *to, socklen_t tolen) +{ + NOT_IN_USE(tolen); + si_udp_logfunc("get_socket_tx_ring_fd fd %d to %p tolen %d", m_fd, to ,tolen); + + if (!to) { + si_udp_logdbg("got invalid to addr null for fd %d", m_fd); + errno = EINVAL; + return -1; + } + sock_addr dst(to); + ring *ring = NULL; + + if (m_p_connected_dst_entry && m_connected == dst) { + ring = m_p_connected_dst_entry->get_ring(); + } else { + 
dst_entry_map_t::iterator it = m_dst_entry_map.find(dst); + if (it != m_dst_entry_map.end()) { + ring = it->second->get_ring(); + } + } + if (!ring) { + si_udp_logdbg("could not find TX ring for fd %d addr %s", + m_fd, dst.to_str()); + errno = ENODATA; + return -1; + } + int res = ring->get_tx_channel_fd(); + si_udp_logdbg("Returning TX ring fd %d for sock fd %d adrr %s", + res, m_fd, dst.to_str()); + return res; +} + +mem_buf_desc_t* sockinfo_udp::get_front_m_rx_pkt_ready_list(){ + return m_rx_pkt_ready_list.front(); +} + +size_t sockinfo_udp::get_size_m_rx_pkt_ready_list(){ + return m_rx_pkt_ready_list.size(); +} + +void sockinfo_udp::pop_front_m_rx_pkt_ready_list(){ + m_rx_pkt_ready_list.pop_front(); +} + +void sockinfo_udp::push_back_m_rx_pkt_ready_list(mem_buf_desc_t* buff){ + m_rx_pkt_ready_list.push_back(buff); +} + +bool sockinfo_udp::prepare_to_close(bool process_shutdown) { + m_lock_rcv.lock(); + do_wakeup(); + + if (m_econtext) { + m_econtext->fd_closed(m_fd); + } + + m_lock_rcv.unlock(); + + NOT_IN_USE(process_shutdown); + m_state = SOCKINFO_CLOSING; + return is_closable(); +} + +void sockinfo_udp::update_header_field(data_updater *updater) +{ + dst_entry_map_t::iterator dst_entry_iter = m_dst_entry_map.begin(); + for (; dst_entry_iter != m_dst_entry_map.end(); dst_entry_iter++) { + updater->update_field(*dst_entry_iter->second); + + } + if (m_p_connected_dst_entry) { + updater->update_field(*m_p_connected_dst_entry); + } +} diff --git a/src/vma/sock/sockinfo_udp.h b/src/vma/sock/sockinfo_udp.h new file mode 100644 index 0000000..4ffc0bf --- /dev/null +++ b/src/vma/sock/sockinfo_udp.h @@ -0,0 +1,311 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef SOCKINFO_H +#define SOCKINFO_H + +#include +#include +#include +#include + +#include "config.h" +#include "vlogger/vlogger.h" +#include "utils/lock_wrapper.h" + +#include "vma/vma_extra.h" +#include "vma/util/chunk_list.h" +#include "vma/util/vma_stats.h" +#include "vma/util/sys_vars.h" +#include "vma/proto/mem_buf_desc.h" +#include "vma/proto/dst_entry_udp.h" + +#include "pkt_rcvr_sink.h" +#include "pkt_sndr_source.h" +#include "sock-redirect.h" +#include "sockinfo.h" + +// Send flow dst_entry map +namespace std { + namespace tr1 { + template<> + class hash + { + public: + size_t operator()(const sock_addr &key) const + { + sock_addr* tmp_key = (sock_addr*)&key; + return tmp_key->hash(); + } + }; + } +} +typedef std::tr1::unordered_map dst_entry_map_t; + + +struct mc_pending_pram +{ + struct in_addr imr_multiaddr; + struct in_addr imr_interface; + struct in_addr imr_sourceaddr; + int optname; +}; + +// Multicast pending list +typedef std::list mc_pram_list_t; +typedef std::tr1::unordered_map > mc_memberships_map_t; + +/** + * @class udp sockinfo + * Represents an udp socket. + */ +class sockinfo_udp : public sockinfo +{ +public: + sockinfo_udp(int fd); + virtual ~sockinfo_udp(); + + void setPassthrough() { m_p_socket_stats->b_is_offloaded = m_sock_offload = false;} + bool isPassthrough() { return ! 
m_sock_offload;} + + int prepare_to_connect(const sockaddr *__to, socklen_t __tolen); + + int bind(const struct sockaddr *__addr, socklen_t __addrlen); + int connect(const struct sockaddr *__to, socklen_t __tolen); + int getsockname(struct sockaddr *__name, socklen_t *__namelen); + int setsockopt(int __level, int __optname, const void *__optval, socklen_t __optlen); + int getsockopt(int __level, int __optname, void *__optval, socklen_t *__optlen); + + inline void set_reuseaddr(bool reuseaddr) { m_reuseaddr = reuseaddr; } + inline void set_reuseport(bool reuseport) { m_reuseport = reuseport; } + virtual bool flow_in_reuse(void) { return m_reuseaddr | m_reuseport; } + + /** + * Sampling the OS immediately by matching the rx_skip_os counter (m_rx_udp_poll_os_ratio_counter) to the limit (safe_mce_sys().rx_udp_poll_os_ratio) + */ + void set_immediate_os_sample(); + /** + * Reseting rx_skip_os counter to prevent sampling OS immediately + */ + void unset_immediate_os_sample(); + /** + * Process a Rx request, we might have a ready packet, or we might block until + * we have one (if sockinfo::m_b_blocking == true) + */ + ssize_t rx(const rx_call_t call_type, iovec *p_iov, ssize_t sz_iov, int *p_flags, sockaddr *__from = NULL, socklen_t *__fromlen = NULL, struct msghdr *__msg = NULL); + /** + * Check that a call to this sockinfo rx() will not block + * -> meaning, we got an offloaded ready rx datagram + * Return 'true' if would not block, 'false' if might block. 
+	 *
+	 * While polling the CQ, the fd_array is filled with a list of the FDs of newly queued packets
+	 */
+	bool is_readable(uint64_t *p_poll_sn, fd_array_t *p_fd_array = NULL);
+	/**
+	 * Arm the event channel(s) associated with this sockinfo
+	 * Fill the fd_set (p_rxfds) with the correct fd channel values and the p_nfds with the (max_fd + 1)
+	 * Fill the p_cq_mgr_fd_map with the pointer to the cq_mgr associated with the fd
+	 * Return count of channels (fds) that were mapped
+	 */
+	int rx_request_notification(uint64_t poll_sn);
+	/**
+	 * Process a Tx request, handle all that is needed to send the packet, we might block
+	 * until the connection info is ready or a tx buffer is released (if sockinfo::m_b_blocking == true)
+	 */
+	ssize_t tx(vma_tx_call_attr_t &tx_arg);
+	/**
+	 * Check that a call to this sockinfo rx() will not block
+	 * -> meaning, we got a ready rx packet
+	 */
+	void rx_add_ring_cb(flow_tuple_with_local_if& flow_key, ring* p_ring, bool is_migration = false);
+	void rx_del_ring_cb(flow_tuple_with_local_if& flow_key, ring* p_ring, bool is_migration = false);
+	virtual int rx_verify_available_data();
+
+	// This callback will handle ready rx packet notification from any ib_conn_mgr
+	/**
+	 * Method sockinfo_udp::rx_process_packet runs the packet processor
+	 * with inspection; in case the packet is OK, a completion for SOCKETXTREME mode
+	 * will be filled, or in other cases the packet goes to the ready queue.
+	 * If the packet is to be discarded, its ref. counter will not be
+	 * incremented and the method returns false.
+	 * Normally it is the single point from sockinfo to be called from ring level.
+ */ + inline bool rx_input_cb(mem_buf_desc_t* p_rx_wc_buf_desc, void* pv_fd_ready_array) + { + return (this->*m_rx_packet_processor)(p_rx_wc_buf_desc, pv_fd_ready_array); + } + inline void set_rx_packet_processor(void); + + // This call will handle all rdma related events (bind->listen->connect_req->accept) + virtual void statistics_print(vlog_levels_t log_level = VLOG_DEBUG); + virtual int free_packets(struct vma_packet_t *pkts, size_t count); + virtual inline fd_type_t get_type() + { + return FD_TYPE_SOCKET; + } + + virtual bool prepare_to_close(bool process_shutdown = false); + virtual int get_socket_tx_ring_fd(struct sockaddr *to, socklen_t tolen); + virtual void update_header_field(data_updater *updater); + +private: + + struct port_socket_t { + + int port; + int fd; + + bool operator== (const int& r_port) + { + return port == r_port; + } + }; + +/* in_addr_t m_bound_if; + in_port_t m_bound_port; + in_addr_t m_connected_ip; + in_port_t m_connected_port; +*/ + typedef bool (sockinfo_udp::* udp_rx_packet_processor_t)(mem_buf_desc_t* p_desc, void* pv_fd_ready_array); + + udp_rx_packet_processor_t m_rx_packet_processor; // to inspect and process incoming packet + + in_addr_t m_mc_tx_if; + bool m_b_mc_tx_loop; + uint8_t m_n_mc_ttl; + + int32_t m_loops_to_go; // local param for polling loop on this socket + uint32_t m_rx_udp_poll_os_ratio_counter; // Data member which sets how many offloaded polls on the cq + // we want to do before doing an OS poll, on this socket + bool m_sock_offload; + + mc_pram_list_t m_pending_mreqs; + mc_memberships_map_t m_mc_memberships_map; + uint32_t m_mc_num_grp_with_src_filter; + + lock_spin m_port_map_lock; + std::vector m_port_map; + unsigned m_port_map_index; + + dst_entry_map_t m_dst_entry_map; + dst_entry* m_p_last_dst_entry; + sock_addr m_last_sock_addr; + + chunk_list_t m_rx_pkt_ready_list; + + uint8_t m_tos; + + const uint32_t m_n_sysvar_rx_poll_yield_loops; + const uint32_t m_n_sysvar_rx_udp_poll_os_ratio; + const uint32_t 
m_n_sysvar_rx_ready_byte_min_limit; + const uint32_t m_n_sysvar_rx_cq_drain_rate_nsec; + const uint32_t m_n_sysvar_rx_delta_tsc_between_cq_polls; + + bool m_reuseaddr; // to track setsockopt with SO_REUSEADDR + bool m_reuseport; // to track setsockopt with SO_REUSEPORT + bool m_sockopt_mapped; // setsockopt IPPROTO_UDP UDP_MAP_ADD + bool m_is_connected; // to inspect for in_addr.src + bool m_multicast; // true when socket set MC rule + + int mc_change_membership(const mc_pending_pram *p_mc_pram); + int mc_change_membership_start_helper(in_addr_t mc_grp, int optname); + int mc_change_membership_end_helper(in_addr_t mc_grp, int optname, in_addr_t mc_src = 0); + int mc_change_pending_mreq(const mc_pending_pram *p_mc_pram); + int on_sockname_change(struct sockaddr *__name, socklen_t __namelen); + void handle_pending_mreq(); + void original_os_setsockopt_helper( void* pram, int pram_size, int optname); + /* helper functions */ + void set_blocking(bool is_blocked); + + void rx_ready_byte_count_limit_update(size_t n_rx_ready_bytes_limit); // Drop rx ready packets from head of queue + + void save_stats_threadid_rx(); // ThreadId will only saved if logger is at least in DEBUG(4) level + void save_stats_threadid_tx(); // ThreadId will only saved if logger is at least in DEBUG(4) level + + void save_stats_tx_offload(int bytes, bool is_dummy); + + int rx_wait_helper(int &poll_count, bool is_blocking); + + inline int rx_wait(bool blocking); + inline int poll_os(); + + virtual inline void reuse_buffer(mem_buf_desc_t *buff); + virtual mem_buf_desc_t* get_next_desc (mem_buf_desc_t *p_desc); + virtual mem_buf_desc_t* get_next_desc_peek(mem_buf_desc_t *p_desc, int& rx_pkt_ready_list_idx); + virtual timestamps_t* get_socket_timestamps(); + virtual void update_socket_timestamps(timestamps_t *) {}; + + inline void return_reuse_buffers_postponed() { + if (!m_rx_reuse_buf_postponed) + return; + + //for the parallel reclaim mechanism from internal thread, used for "silent" sockets + 
set_rx_reuse_pending(false); + + m_rx_reuse_buf_postponed = false; + + rx_ring_map_t::iterator iter = m_rx_ring_map.begin(); + while (iter != m_rx_ring_map.end()) { + descq_t *rx_reuse = &iter->second->rx_reuse_info.rx_reuse; + int& n_buff_num = iter->second->rx_reuse_info.n_buff_num; + if (n_buff_num >= m_n_sysvar_rx_num_buffs_reuse) { + if (iter->first->reclaim_recv_buffers(rx_reuse)) { + n_buff_num = 0; + } else { + m_rx_reuse_buf_postponed = true; + } + } + ++iter; + } + } + + inline bool rx_process_udp_packet_full(mem_buf_desc_t* p_desc, void* pv_fd_ready_array); + inline bool rx_process_udp_packet_partial(mem_buf_desc_t* p_desc, void* pv_fd_ready_array); + inline bool inspect_uc_packet(mem_buf_desc_t* p_desc); + inline bool inspect_connected(mem_buf_desc_t* p_desc); + inline bool inspect_mc_packet(mem_buf_desc_t* p_desc); + inline vma_recv_callback_retval_t inspect_by_user_cb(mem_buf_desc_t* p_desc); + inline void fill_completion(mem_buf_desc_t* p_desc); + inline void update_ready(mem_buf_desc_t* p_rx_wc_buf_desc, void* pv_fd_ready_array, vma_recv_callback_retval_t cb_ret); + + virtual void post_deqeue (bool release_buff); + virtual int zero_copy_rx (iovec *p_iov, mem_buf_desc_t *pdesc, int *p_flags); + virtual size_t handle_msg_trunc(size_t total_rx, size_t payload_size, int in_flags, int* p_out_flags); + virtual void handle_ip_pktinfo(struct cmsg_state *cm_state); + + virtual mem_buf_desc_t* get_front_m_rx_pkt_ready_list(); + virtual size_t get_size_m_rx_pkt_ready_list(); + virtual void pop_front_m_rx_pkt_ready_list(); + virtual void push_back_m_rx_pkt_ready_list(mem_buf_desc_t* buff); +}; +#endif diff --git a/src/vma/util/30-libvma-limits.conf b/src/vma/util/30-libvma-limits.conf new file mode 100644 index 0000000..9c0d8bc --- /dev/null +++ b/src/vma/util/30-libvma-limits.conf @@ -0,0 +1,6 @@ +# Default limits that are needed for proper work of libvma +# Read more about this topic in the VMA's User Manual + +* - memlock unlimited +* soft memlock unlimited 
+* hard memlock unlimited diff --git a/src/vma/util/agent.cpp b/src/vma/util/agent.cpp new file mode 100644 index 0000000..4f2ada1 --- /dev/null +++ b/src/vma/util/agent.cpp @@ -0,0 +1,711 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include + +#include "vlogger/vlogger.h" +#include "utils/lock_wrapper.h" +#include "vma/sock/sock-redirect.h" +#include "vma/util/list.h" +#include "vma/util/agent.h" + +#undef MODULE_NAME +#define MODULE_NAME "agent:" +#undef MODULE_HDR +#define MODULE_HDR MODULE_NAME "%d:%s() " + +#define AGENT_DEFAULT_MSG_NUM (512) +#define AGENT_DEFAULT_MSG_GROW (16) /* number of messages to grow */ +#define AGENT_DEFAULT_INACTIVE (10) /* periodic time for establishment connection attempts (in sec) */ +#define AGENT_DEFAULT_ALIVE (1) /* periodic time for alive check (in sec) */ + + +/* Force system call */ +#define sys_call(_result, _func, ...) \ + do { \ + if (orig_os_api._func) \ + _result = orig_os_api._func(__VA_ARGS__); \ + else \ + _result = ::_func(__VA_ARGS__); \ + } while (0) + +/* Print user notification */ +#define output_fatal() \ + do { \ + vlog_levels_t _level = (mce_sys_var::HYPER_MSHV == safe_mce_sys().hypervisor ? \ + VLOG_WARNING : VLOG_DEBUG); \ + vlog_printf(_level, "*************************************************************\n"); \ + if (rc == -EPROTONOSUPPORT) \ + vlog_printf(_level, "* Protocol version mismatch was found between vma and vmad. *\n"); \ + else \ + vlog_printf(_level, "* Can not establish connection with the daemon (vmad). *\n"); \ + vlog_printf(_level, "* UDP/TCP connections are likely to be limited. 
*\n"); \ + vlog_printf(_level, "*************************************************************\n"); \ + } while (0) + +agent* g_p_agent = NULL; + + +agent::agent() : + m_state(AGENT_CLOSED), m_sock_fd(-1), m_pid_fd(-1), + m_msg_num(AGENT_DEFAULT_MSG_NUM) +{ + int rc = 0; + agent_msg_t *msg = NULL; + int i = 0; + + INIT_LIST_HEAD(&m_cb_queue); + INIT_LIST_HEAD(&m_free_queue); + INIT_LIST_HEAD(&m_wait_queue); + + /* Fill free queue with empty messages */ + i = m_msg_num; + m_msg_num = 0; + const char *path = safe_mce_sys().vmad_notify_dir; + while (i--) { + /* coverity[overwrite_var] */ + msg = (agent_msg_t *)calloc(1, sizeof(*msg)); + if (NULL == msg) { + rc = -ENOMEM; + __log_dbg("failed queue creation (rc = %d)", rc); + goto err; + } + msg->length = 0; + msg->tag = AGENT_MSG_TAG_INVALID; + list_add_tail(&msg->item, &m_free_queue); + m_msg_num++; + } + + if ((mkdir(path, 0777) != 0) && (errno != EEXIST)) { + rc = -errno; + __log_dbg("failed create folder %s (rc = %d)", path, rc); + goto err; + } + + rc = snprintf(m_sock_file, sizeof(m_sock_file) - 1, + "%s/%s.%d.sock", path, VMA_AGENT_BASE_NAME, getpid()); + if ((rc < 0 ) || (rc == (sizeof(m_sock_file) - 1) )) { + rc = -ENOMEM; + __log_dbg("failed allocate sock file (rc = %d)", rc); + goto err; + } + + rc = snprintf(m_pid_file, sizeof(m_pid_file) - 1, + "%s/%s.%d.pid", path, VMA_AGENT_BASE_NAME, getpid()); + if ((rc < 0 ) || (rc == (sizeof(m_pid_file) - 1) )) { + rc = -ENOMEM; + __log_dbg("failed allocate pid file (rc = %d)", rc); + goto err; + } + + sys_call(m_pid_fd, open, m_pid_file, + O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP); + if (m_pid_fd < 0) { + rc = -errno; + __log_dbg("failed open pid file (rc = %d)", rc); + goto err; + } + + rc = create_agent_socket(); + if (rc < 0) { + __log_dbg("failed open sock file (rc = %d)", rc); + goto err; + } + + /* Initialization is mostly completed + * At the moment it does not matter if connection with + * daemon can be done here or later + */ + m_state = AGENT_INACTIVE; 
+ + rc = send_msg_init(); + if (rc < 0) { + __log_dbg("failed establish connection with daemon (rc = %d)", rc); + goto err; + } + + /* coverity[leaked_storage] */ + return ; + +err: + /* There is no chance to establish connection with daemon + * because of internal problems or communication protocol + * variance + * So do not try anymore + */ + m_state = AGENT_CLOSED; + + output_fatal(); + + while (!list_empty(&m_free_queue)) { + /* coverity[overwrite_var] */ + msg = list_first_entry(&m_free_queue, agent_msg_t, item); + list_del_init(&msg->item); + free(msg); + } + + if (m_pid_fd > 0) { + int ret = 0; + NOT_IN_USE(ret); + sys_call(ret, close, m_pid_fd); + m_pid_fd = -1; + unlink(m_pid_file); + } + + if (m_sock_fd > 0) { + int ret = 0; + NOT_IN_USE(ret); + sys_call(ret, close, m_sock_fd); + m_sock_fd = -1; + unlink(m_sock_file); + } + + /* coverity[leaked_storage] */ + return ; +} + +agent::~agent() +{ + agent_msg_t *msg = NULL; + agent_callback_t *cb = NULL; + + if (AGENT_CLOSED == m_state) { + return ; + } + + progress(); + send_msg_exit(); + + m_state = AGENT_CLOSED; + + /* This delay is needed to allow process EXIT message + * before event from system file monitor is raised + */ + usleep(1000); + + while (!list_empty(&m_cb_queue)) { + cb = list_first_entry(&m_cb_queue, agent_callback_t, item); + list_del_init(&cb->item); + free(cb); + } + + while (!list_empty(&m_free_queue)) { + msg = list_first_entry(&m_free_queue, agent_msg_t, item); + list_del_init(&msg->item); + free(msg); + } + + if (m_sock_fd > 0) { + int ret = 0; + NOT_IN_USE(ret); + sys_call(ret, close, m_sock_fd); + unlink(m_sock_file); + } + + if (m_pid_fd > 0) { + int ret = 0; + NOT_IN_USE(ret); + sys_call(ret, close, m_pid_fd); + unlink(m_pid_file); + } +} + +void agent::register_cb(agent_cb_t fn, void *arg) +{ + agent_callback_t *cb = NULL; + struct list_head *entry = NULL; + + if (AGENT_CLOSED == m_state) { + return ; + } + + if (NULL == fn) { + return ; + } + + m_cb_lock.lock(); + /* check if it 
exists in the queue */ + list_for_each(entry, &m_cb_queue) { + cb = list_entry(entry, agent_callback_t, item); + if ((cb->cb == fn) && (cb->arg == arg)) { + m_cb_lock.unlock(); + return ; + } + } + /* allocate new callback element and add to the queue */ + cb = (agent_callback_t *)calloc(1, sizeof(*cb)); + if (cb) { + cb->cb = fn; + cb->arg = arg; + list_add_tail(&cb->item, &m_cb_queue); + } + m_cb_lock.unlock(); + /* coverity[leaked_storage] */ +} + +void agent::unregister_cb(agent_cb_t fn, void *arg) +{ + agent_callback_t *cb = NULL; + struct list_head *entry = NULL; + + if (AGENT_CLOSED == m_state) { + return ; + } + + m_cb_lock.lock(); + /* find element in the queue and remove one */ + list_for_each(entry, &m_cb_queue) { + cb = list_entry(entry, agent_callback_t, item); + if ((cb->cb == fn) && (cb->arg == arg)) { + list_del_init(&cb->item); + free(cb); + m_cb_lock.unlock(); + return ; + } + } + m_cb_lock.unlock(); +} + +int agent::put(const void *data, size_t length, intptr_t tag) +{ + agent_msg_t *msg = NULL; + int i = 0; + + if (AGENT_CLOSED == m_state) { + return 0; + } + + if (m_sock_fd < 0) { + return -EBADF; + } + + if (length > sizeof(msg->data)) { + return -EINVAL; + } + + m_msg_lock.lock(); + + /* put any message in case agent is active to avoid queue uncontrolled grow + * progress() function is able to call registered callbacks in case + * it detects that link with daemon is up + */ + if (AGENT_ACTIVE == m_state) { + /* allocate new message in case free queue is empty */ + if (list_empty(&m_free_queue)) { + for (i = 0; i < AGENT_DEFAULT_MSG_GROW; i++) { + /* coverity[overwrite_var] */ + msg = (agent_msg_t *)malloc(sizeof(*msg)); + if (NULL == msg) { + break; + } + msg->length = 0; + msg->tag = AGENT_MSG_TAG_INVALID; + list_add_tail(&msg->item, &m_free_queue); + m_msg_num++; + } + } + /* get message from free queue */ + /* coverity[overwrite_var] */ + msg = list_first_entry(&m_free_queue, agent_msg_t, item); + list_del_init(&msg->item); + + /* put 
message into wait queue */ + list_add_tail(&msg->item, &m_wait_queue); + } + + /* update message */ + if (msg) { + memcpy(&msg->data, data, length); + msg->length = length; + msg->tag = tag; + } + + m_msg_lock.unlock(); + + return 0; +} + +void agent::progress(void) +{ + agent_msg_t* msg = NULL; + struct timeval tv_now = TIMEVAL_INITIALIZER; + static struct timeval tv_inactive_elapsed = TIMEVAL_INITIALIZER; + static struct timeval tv_alive_elapsed = TIMEVAL_INITIALIZER; + + if (AGENT_CLOSED == m_state) { + return ; + } + + gettime(&tv_now); + + /* Attempt to establish connection with daemon */ + if (AGENT_INACTIVE == m_state) { + /* Attempt can be done less often than progress in active state */ + if (tv_cmp(&tv_inactive_elapsed, &tv_now, <)) { + tv_inactive_elapsed = tv_now; + tv_inactive_elapsed.tv_sec += AGENT_DEFAULT_INACTIVE; + if (0 <= send_msg_init()) { + progress_cb(); + goto go; + } + } + return ; + } + +go: + /* Check connection with daemon during active state */ + if (list_empty(&m_wait_queue)) { + if (tv_cmp(&tv_alive_elapsed, &tv_now, <)) { + check_link(); + } + } else { + tv_alive_elapsed = tv_now; + tv_alive_elapsed.tv_sec += AGENT_DEFAULT_ALIVE; + + /* Process all messages that are in wait queue */ + m_msg_lock.lock(); + while (!list_empty(&m_wait_queue)) { + msg = list_first_entry(&m_wait_queue, agent_msg_t, item); + if (0 > send(msg)) { + break; + } + list_del_init(&msg->item); + msg->length = 0; + msg->tag = AGENT_MSG_TAG_INVALID; + list_add_tail(&msg->item, &m_free_queue); + } + m_msg_lock.unlock(); + } +} + +void agent::progress_cb(void) +{ + agent_callback_t *cb = NULL; + struct list_head *entry = NULL; + + m_cb_lock.lock(); + list_for_each(entry, &m_cb_queue) { + cb = list_entry(entry, agent_callback_t, item); + cb->cb(cb->arg); + } + m_cb_lock.unlock(); +} + +int agent::send(agent_msg_t *msg) +{ + int rc = 0; + + if (AGENT_ACTIVE != m_state) { + return -ENODEV; + } + + if (m_sock_fd < 0) { + return -EBADF; + } + + if (NULL == msg) { + return 
-EINVAL; + } + + /* send() in blocking manner */ + sys_call(rc, send, m_sock_fd, (void *)&msg->data, msg->length, 0); + if (rc < 0) { + __log_dbg("Failed to send() errno %d (%s)", + errno, strerror(errno)); + rc = -errno; + m_state = AGENT_INACTIVE; + __log_dbg("Agent is inactivated. state = %d", m_state); + goto err; + } + +err: + return rc; +} + +int agent::send_msg_init(void) +{ + int rc = 0; + struct sockaddr_un server_addr; + struct vma_msg_init data; + uint8_t *version; + + if (AGENT_ACTIVE == m_state) { + return 0; + } + + if (m_sock_fd < 0) { + return -EBADF; + } + + /* Set server address */ + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sun_family = AF_UNIX; + strncpy(server_addr.sun_path, VMA_AGENT_ADDR, sizeof(server_addr.sun_path) - 1); + + sys_call(rc, connect, m_sock_fd, (struct sockaddr *)&server_addr, + sizeof(struct sockaddr_un)); + if (rc < 0) { + __log_dbg("Failed to connect() errno %d (%s)", + errno, strerror(errno)); + rc = -ECONNREFUSED; + goto err; + } + + memset(&data, 0, sizeof(data)); + data.hdr.code = VMA_MSG_INIT; + data.hdr.ver = VMA_AGENT_VER; + data.hdr.pid = getpid(); + version = (uint8_t *)&data.ver; + version[0] = VMA_LIBRARY_MAJOR; + version[1] = VMA_LIBRARY_MINOR; + version[2] = VMA_LIBRARY_RELEASE; + version[3] = VMA_LIBRARY_REVISION; + + /* send(VMA_MSG_INIT) in blocking manner */ + sys_call(rc, send, m_sock_fd, &data, sizeof(data), 0); + if (rc < 0) { + __log_dbg("Failed to send(VMA_MSG_INIT) errno %d (%s)", + errno, strerror(errno)); + rc = -ECONNREFUSED; + goto err; + } + + /* recv(VMA_MSG_INIT|ACK) in blocking manner */ + memset(&data, 0, sizeof(data)); + sys_call(rc, recv, m_sock_fd, &data, sizeof(data), 0); + if (rc < (int)sizeof(data)) { + __log_dbg("Failed to recv(VMA_MSG_INIT) errno %d (%s)", + errno, strerror(errno)); + rc = -ECONNREFUSED; + goto err; + } + + if (data.hdr.code != (VMA_MSG_INIT | VMA_MSG_ACK) || + data.hdr.pid != getpid()) { + __log_dbg("Protocol is not supported: code = 0x%X pid = %d", 
+ data.hdr.code, data.hdr.pid); + rc = -EPROTO; + goto err; + } + + if (data.hdr.ver < VMA_AGENT_VER) { + __log_dbg("Protocol version mismatch: agent ver = 0x%X vmad ver = 0x%X", + VMA_AGENT_VER, data.hdr.ver); + rc = -EPROTONOSUPPORT; + goto err; + } + + m_state = AGENT_ACTIVE; + __log_dbg("Agent is activated. state = %d", m_state); + +err: + return rc; +} + +int agent::send_msg_exit(void) +{ + int rc = 0; + struct vma_msg_exit data; + + if (AGENT_ACTIVE != m_state) { + return -ENODEV; + } + + if (m_sock_fd < 0) { + return -EBADF; + } + + m_state = AGENT_INACTIVE; + __log_dbg("Agent is inactivated. state = %d", m_state); + + memset(&data, 0, sizeof(data)); + data.hdr.code = VMA_MSG_EXIT; + data.hdr.ver = VMA_AGENT_VER; + data.hdr.pid = getpid(); + + /* send(VMA_MSG_EXIT) in blocking manner */ + sys_call(rc, send, m_sock_fd, &data, sizeof(data), 0); + if (rc < 0) { + __log_dbg("Failed to send(VMA_MSG_EXIT) errno %d (%s)", + errno, strerror(errno)); + rc = -errno; + goto err; + } + + return 0; +err: + return rc; +} + +int agent::send_msg_flow(struct vma_msg_flow *data) +{ + int rc = 0; + struct vma_msg_flow answer; + + if (AGENT_ACTIVE != m_state) { + return -ENODEV; + } + + if (m_sock_fd < 0) { + return -EBADF; + } + + /* wait answer */ + data->hdr.status = 1; + + /* send(VMA_MSG_TC) in blocking manner */ + sys_call(rc, send, m_sock_fd, data, sizeof(*data), 0); + if (rc < 0) { + __log_dbg("Failed to send(VMA_MSG_TC) errno %d (%s)", + errno, strerror(errno)); + rc = -errno; + goto err; + } + + /* recv(VMA_MSG_TC|ACK) in blocking manner */ + memset(&answer, 0, sizeof(answer)); + sys_call(rc, recv, m_sock_fd, &answer.hdr, sizeof(answer.hdr), 0); + if (rc < (int)sizeof(answer.hdr)) { + __log_dbg("Failed to recv(VMA_MSG_TC) errno %d (%s)", + errno, strerror(errno)); + rc = -ECONNREFUSED; + goto err; + } + + /* reply sanity check */ + if (!(answer.hdr.code == (data->hdr.code | VMA_MSG_ACK) && + answer.hdr.ver == data->hdr.ver && + answer.hdr.pid == data->hdr.pid)) { + 
__log_dbg("Protocol version mismatch: code = 0x%X ver = 0x%X pid = %d", + answer.hdr.code, answer.hdr.ver, answer.hdr.pid); + rc = -EPROTO; + goto err; + } + + rc = answer.hdr.status; +err: + return rc; +} + +int agent::create_agent_socket(void) +{ + int rc = 0; + int optval = 1; + struct timeval opttv; + struct sockaddr_un sock_addr; + + /* Create UNIX UDP socket to receive data from VMA processes */ + memset(&sock_addr, 0, sizeof(sock_addr)); + sock_addr.sun_family = AF_UNIX; + strncpy(sock_addr.sun_path, m_sock_file, sizeof(sock_addr.sun_path) - 1); + /* remove possible old socket */ + unlink(m_sock_file); + + sys_call(m_sock_fd, socket, AF_UNIX, SOCK_DGRAM, 0); + if (m_sock_fd < 0) { + __log_dbg("Failed to call socket() errno %d (%s)", + errno, strerror(errno)); + rc = -errno; + goto err; + } + + optval = 1; + sys_call(rc, setsockopt, m_sock_fd, SOL_SOCKET, SO_REUSEADDR, + (const void *)&optval, sizeof(optval)); + if (rc < 0) { + __log_dbg("Failed to call setsockopt(SO_REUSEADDR) errno %d (%s)", + errno, strerror(errno)); + rc = -errno; + goto err; + } + + /* Sets the timeout value as 3 sec that specifies the maximum amount of time + * an input function waits until it completes. 
+ */ + opttv.tv_sec = 3; + opttv.tv_usec = 0; + sys_call(rc, setsockopt, m_sock_fd, SOL_SOCKET, SO_RCVTIMEO, + (const void *)&opttv, sizeof(opttv)); + if (rc < 0) { + __log_dbg("Failed to call setsockopt(SO_RCVTIMEO) errno %d (%s)", + errno, strerror(errno)); + rc = -errno; + goto err; + } + + /* bind created socket */ + sys_call(rc, bind, m_sock_fd, (struct sockaddr *)&sock_addr, + sizeof(sock_addr)); + if (rc < 0) { + __log_dbg("Failed to call bind() errno %d (%s)", + errno, strerror(errno)); + rc = -errno; + goto err; + } + +err: + return rc; +} + +void agent::check_link(void) +{ + int rc = 0; + static struct sockaddr_un server_addr; + static int flag = 0; + + /* Set server address */ + if (!flag) { + flag = 1; + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sun_family = AF_UNIX; + strncpy(server_addr.sun_path, VMA_AGENT_ADDR, sizeof(server_addr.sun_path) - 1); + } + + sys_call(rc, connect, m_sock_fd, (struct sockaddr *)&server_addr, + sizeof(struct sockaddr_un)); + if (rc < 0) { + __log_dbg("Failed to connect() errno %d (%s)", + errno, strerror(errno)); + m_state = AGENT_INACTIVE; + __log_dbg("Agent is inactivated. state = %d", m_state); + } +} diff --git a/src/vma/util/agent.h b/src/vma/util/agent.h new file mode 100644 index 0000000..f72e719 --- /dev/null +++ b/src/vma/util/agent.h @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRC_VMA_UTIL_AGENT_H_ +#define SRC_VMA_UTIL_AGENT_H_ + +#include "vma/util/agent_def.h" + +/** + * @struct agent_msg_t + * @brief Agent message resource descriptor. + * + * This structure describes a internal message object. + */ +typedef struct agent_msg { + struct list_head item; /**< link element */ + int length; /**< actual length of valuable data */ + intptr_t tag; /**< unique identifier of the message */ + union { + struct vma_msg_state state; + char raw[1]; + } data; /**< data to be sent to daemon */ +} agent_msg_t; + +#define AGENT_MSG_TAG_INVALID (-1) + +/** + * @enum agent_state_t + * @brief List of possible Agent states. 
+ */ +typedef enum { + AGENT_INACTIVE, + AGENT_ACTIVE, + AGENT_CLOSED +} agent_state_t; + +typedef void (*agent_cb_t)(void *arg); + +/** + * @struct agent_msg_t + * @brief Callback queue element. + * + * This structure describes function call that is + * done in case Agent change the state + */ +typedef struct agent_callback { + struct list_head item; /**< link element */ + agent_cb_t cb; /**< Callback function */ + void *arg; /**< Function argument */ +} agent_callback_t; + + +class agent { +public: + agent(); + virtual ~agent(); + + inline agent_state_t state(void) const + { + return m_state; + } + + void register_cb(agent_cb_t fn, void *arg); + void unregister_cb(agent_cb_t fn, void *arg); + int put(const void *data, size_t length, intptr_t tag); + void progress(void); + int send_msg_flow(struct vma_msg_flow *data); + +private: + /* state of this object */ + agent_state_t m_state; + + /* socket used for communication with daemon */ + int m_sock_fd; + + /* file descriptor that is tracked by daemon */ + int m_pid_fd; + + /* unix socket name + * size should be less than sockaddr_un.sun_path + */ + char m_sock_file[100]; + + /* name of pid file */ + char m_pid_file[100]; + + /* queue of callback elements + * this queue stores function calls activated during + * state change + */ + struct list_head m_cb_queue; + + /* thread-safe lock to protect operations + * under the callback queue + */ + lock_spin m_cb_lock; + + /* queue of message elements + * this queue stores unused messages + */ + struct list_head m_free_queue; + + /* queue of message elements + * this queue stores messages from different sockets + */ + struct list_head m_wait_queue; + + /* thread-safe lock to protect operations + * under the message wait and free queues + */ + lock_spin m_msg_lock; + + /* total number of allocated messages + * some amount of messages are allocated during initialization + * but total number can grow during run-time + */ + int m_msg_num; + + int create_agent_socket(void); + int 
send(agent_msg_t *msg); + int send_msg_init(void); + int send_msg_exit(void); + void progress_cb(void); + void check_link(void); +}; + +extern agent* g_p_agent; + +#endif /* SRC_VMA_UTIL_AGENT_H_ */ diff --git a/src/vma/util/agent_def.h b/src/vma/util/agent_def.h new file mode 100644 index 0000000..3e11098 --- /dev/null +++ b/src/vma/util/agent_def.h @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef SRC_VMA_UTIL_AGENT_DEF_H_ +#define SRC_VMA_UTIL_AGENT_DEF_H_ + +#ifndef offsetof +#define offsetof(type, member) ((uintptr_t) &((type *)0)->member) +#endif + +#ifndef container_of +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. + * + */ +#define container_of(ptr, type, member) (type *)((char *)(ptr) - offsetof(type,member)) +#endif + +/* List of supported messages in range 0..63 + * Two bits as 6-7 are reserved. + * 6-bit is reserved + * 7-bit in message code is for ACK flag in case specific + * message requires the confirmation + */ +#define VMA_MSG_INIT 0x01 +#define VMA_MSG_STATE 0x02 +#define VMA_MSG_EXIT 0x03 +#define VMA_MSG_FLOW 0x04 + +#define VMA_MSG_ACK 0x80 + +#define VMA_AGENT_VER 0x03 + +#define VMA_AGENT_BASE_NAME "vma_agent" +#define VMA_AGENT_ADDR "/var/run/" VMA_AGENT_BASE_NAME ".sock" +#define VMA_AGENT_PATH "/tmp/vma" + + +#pragma pack(push, 1) +struct vma_hdr { + uint8_t code; /* code of message */ + uint8_t ver; /* format version */ + uint8_t status; /* status (require answer or return code for reply message) */ + uint8_t reserve[1]; /* unused */ + int32_t pid; /* process id */ + +}; + +struct vma_msg_init { + struct vma_hdr hdr; + uint32_t ver; +}; + +struct vma_msg_exit { + struct vma_hdr hdr; +}; + +struct vma_msg_state { + struct vma_hdr hdr; + uint32_t fid; + uint32_t src_ip; + uint32_t dst_ip; + uint16_t src_port; + uint16_t dst_port; + uint8_t type; + uint8_t state; +}; + +enum { + VMA_MSG_FLOW_EGRESS = 0, + VMA_MSG_FLOW_UDP_5T = 1, + VMA_MSG_FLOW_UDP_3T = 2, + VMA_MSG_FLOW_TCP_5T = 3, + VMA_MSG_FLOW_TCP_3T = 4 +}; + +typedef enum { + VMA_MSG_FLOW_ADD = 1, + VMA_MSG_FLOW_DEL = 2 +} msg_flow_t; + +struct vma_msg_flow { + struct vma_hdr hdr; + uint8_t type; /* format of tc rule command */ + uint8_t action; /* add, del */ + uint32_t 
if_id; /* interface index */ + uint32_t tap_id; /* tap device index */ + struct { + uint32_t dst_ip; + uint16_t dst_port; + struct { + uint32_t src_ip; + uint16_t src_port; + } t5; + } flow; +}; + +#pragma pack( pop ) + +#endif /* SRC_VMA_UTIL_AGENT_DEF_H_ */ diff --git a/src/vma/util/chunk_list.h b/src/vma/util/chunk_list.h new file mode 100644 index 0000000..0a93b3b --- /dev/null +++ b/src/vma/util/chunk_list.h @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef CHUNK_LIST_H_ +#define CHUNK_LIST_H_ + +#include +#include "vma/util/vma_list.h" + +#define CHUNK_LIST_CONTAINER_SIZE 64 // Amount of T elements of each container. +#define CHUNK_LIST_CONTAINER_INIT 4 // Initial number of containers. +#define CHUNK_LIST_CONTIANER_THRESHOLD 15 // Maximum number of containers before free. + +#define clist_logfunc(log_fmt, log_args...) vlog_printf(VLOG_FUNC, "clist[%p]:%d:%s() " log_fmt "\n", this, __LINE__, __FUNCTION__, ##log_args) +#define clist_logwarn(log_fmt, log_args...) vlog_printf(VLOG_WARNING, "clist[%p]:%d:%s() " log_fmt "\n", this, __LINE__, __FUNCTION__, ##log_args) +#define clist_logerr(log_fmt, log_args...) vlog_printf(VLOG_ERROR, "clist[%p]:%d:%s() " log_fmt "\n", this, __LINE__, __FUNCTION__, ##log_args) + +template +class chunk_list_t { + + struct container { + static inline size_t node_offset(void) {return NODE_OFFSET(container, m_node);} + list_node m_node; + T* m_p_buffer; + + container(T* buffer) : m_p_buffer(buffer) {} + + ~container() { + free(m_p_buffer); + m_p_buffer = NULL; + } + }; + + typedef vma_list_t container_list; + +private: + + container_list m_free_containers; // Contains available containers. + container_list m_used_containers; // Contains used containers. + size_t m_size; // The amount of T element in the list. + int m_front; // Index of the first element. + int m_back; // Index of the last element. 
+ + size_t allocate(int containers = 1) { + clist_logfunc("Allocating %d containers of %d bytes each", containers, CHUNK_LIST_CONTAINER_SIZE * sizeof(T)); + + container* cont; + T* data; + for (int i = 0 ; i < containers ; i++) { + data = (T*)calloc(CHUNK_LIST_CONTAINER_SIZE, sizeof(T)); + if (!data || !(cont = new container(data))) { + // Memory allocation error + if (data) free(data); + clist_logerr("Failed to allocate memory"); + goto out; + } + m_free_containers.push_back(cont); + } + + out: + return m_free_containers.size(); + } + + void initialize() { + m_free_containers.set_id("chunk_list_t (%p), m_free_containers", this); + m_used_containers.set_id("chunk_list_t (%p), m_used_containers", this); + + m_front = 0; + m_back = -1; + m_size = 0; + + if (allocate(CHUNK_LIST_CONTAINER_INIT)) { + m_used_containers.push_back(m_free_containers.get_and_pop_front()); + } + } + +public: + + chunk_list_t() { + clist_logfunc("Constructor has been called"); + initialize(); + } + + chunk_list_t(const chunk_list_t &other) { + clist_logwarn("Copy constructor is not supported! other=%p", &other); + initialize(); + } + + ~chunk_list_t() { + clist_logfunc("Destructor has been called! m_size=%zu, m_free_containers=%zu, m_used_containers=%zu", m_size, m_free_containers.size(), m_used_containers.size()); + + if (empty()) { + while (!m_used_containers.empty()) { + delete(m_used_containers.get_and_pop_back()); + } + } else { + clist_logwarn("Not all buffers were freed. size=%zu\n", m_size); + } + + while (!m_free_containers.empty()) { + delete(m_free_containers.get_and_pop_back()); + } + } + + inline bool empty() const { + return m_size == 0; + } + + inline size_t size() const { + return m_size; + } + + inline T front() const { + // Check if the list is empty. + if (unlikely(empty())) + return NULL; + return m_used_containers.front()->m_p_buffer[m_front]; + } + + inline void pop_front() { + // Check if the list is empty. 
+ if (unlikely(empty())) { + return; + } + + // Container is empty, move it to the free list or delete it if necessary. + if (unlikely(++m_front == CHUNK_LIST_CONTAINER_SIZE)) { + m_front = 0; + container* cont = m_used_containers.get_and_pop_front(); + unlikely(m_free_containers.size() > CHUNK_LIST_CONTIANER_THRESHOLD) ? delete(cont) : m_free_containers.push_back(cont); + } + + m_size--; + } + + inline T get_and_pop_front() { + T list_front = front(); + pop_front(); + return list_front; + } + + inline void push_back(T obj) { + // Container is full, request a free one or allocate if necessary. + if (unlikely(++m_back == CHUNK_LIST_CONTAINER_SIZE)) { + if (unlikely(m_free_containers.empty()) && !allocate()) { + clist_logerr("Failed to push back obj %p", obj); + return; + } + m_back = 0; + m_used_containers.push_back(m_free_containers.get_and_pop_back()); + } + + m_used_containers.back()->m_p_buffer[m_back] = obj; + m_size++; + } +}; + +#endif /* CHUNK_LIST_H_ */ diff --git a/src/vma/util/config_parser.y b/src/vma/util/config_parser.y new file mode 100644 index 0000000..1294cdd --- /dev/null +++ b/src/vma/util/config_parser.y @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + * + * $Id: config_parser.y 1.5 2005/06/29 11:39:27 eitan Exp $ + */ + + +/* + +*/ +%{ + +/* header section */ +#include +#include +#include +#include +#include +#include +#include +#include + +typedef enum +{ + CONF_RULE +} configuration_t; + +#define YYERROR_VERBOSE 1 + +extern int yyerror(const char *msg); +extern int yylex(void); +static int parse_err = 0; + +struct dbl_lst __instance_list; + +/* some globals to store intermidiate parser state */ +static struct use_family_rule __vma_rule; +static struct address_port_rule *__vma_address_port_rule = NULL; +static int __vma_rule_push_head = 0; +static int current_role = 0; +static configuration_t current_conf_type = CONF_RULE; +static struct instance *curr_instance = NULL; + +int __vma_config_empty(void) +{ + return ((__instance_list.head == NULL) && (__instance_list.tail == NULL)); +} + +/* define the address by 4 integers */ +static void __vma_set_ipv4_addr(short a0, short a1, short a2, short a3) +{ + char buf[16]; + struct in_addr *p_ipv4 = NULL; + + p_ipv4 = &(__vma_address_port_rule->ipv4); + + sprintf(buf,"%d.%d.%d.%d", a0, a1, a2, a3); + if (1 != inet_pton(AF_INET, (const char*)buf, p_ipv4)) { + parse_err = 1; + yyerror("provided address is not legal"); + } +} + +static void 
__vma_set_inet_addr_prefix_len(unsigned char prefixlen) +{ + if (prefixlen > 32) + prefixlen = 32; + + __vma_address_port_rule->prefixlen = prefixlen; +} + +// SM: log part is not used... +int __vma_min_level = 9; + +void __vma_dump_address_port_rule_config_state(char *buf) { + if (__vma_address_port_rule->match_by_addr) { + char str_addr[INET_ADDRSTRLEN]; + + inet_ntop(AF_INET, &(__vma_address_port_rule->ipv4), str_addr, sizeof(str_addr)); + if ( __vma_address_port_rule->prefixlen != 32 ) { + sprintf(buf+strlen(buf), " %s/%d", str_addr, + __vma_address_port_rule->prefixlen); + } else { + sprintf(buf+strlen(buf), " %s", str_addr); + } + } else { + sprintf(buf+strlen(buf), " *"); + } + + if (__vma_address_port_rule->match_by_port) { + sprintf(buf+strlen(buf), ":%d",__vma_address_port_rule->sport); + if (__vma_address_port_rule->eport > __vma_address_port_rule->sport) + sprintf(buf+strlen(buf), "-%d",__vma_address_port_rule->eport); + } + else + sprintf(buf+strlen(buf), ":*"); +} + +/* dump the current state in readable format */ +static void __vma_dump_rule_config_state() { + char buf[1024]; + sprintf(buf, "\tACCESS CONFIG: use %s %s %s ", + __vma_get_transport_str(__vma_rule.target_transport), + __vma_get_role_str(current_role), + __vma_get_protocol_str(__vma_rule.protocol)); + __vma_address_port_rule = &(__vma_rule.first); + __vma_dump_address_port_rule_config_state(buf); + if (__vma_rule.use_second) { + __vma_address_port_rule = &(__vma_rule.second); + __vma_dump_address_port_rule_config_state(buf); + } + sprintf(buf+strlen(buf), "\n"); + __vma_log(1, "%s", buf); +} + +/* dump configuration properites of new instance */ +static void __vma_dump_instance() { + char buf[1024]; + + if (curr_instance) { + sprintf(buf, "CONFIGURATION OF INSTANCE "); + if (curr_instance->id.prog_name_expr) + sprintf(buf+strlen(buf), "%s ", curr_instance->id.prog_name_expr); + if (curr_instance->id.user_defined_id) + sprintf(buf+strlen(buf), "%s", curr_instance->id.user_defined_id); + 
sprintf(buf+strlen(buf), ":\n"); + __vma_log(1, "%s", buf); + } +} + +static void __vma_add_dbl_lst_node_head(struct dbl_lst *lst, struct dbl_lst_node *node) +{ + if (node && lst) { + + node->prev = NULL; + node->next = lst->head; + + if (!lst->head) + lst->tail = node; + else + lst->head->prev = node; + + lst->head = node; + } +} + +static void __vma_add_dbl_lst_node(struct dbl_lst *lst, struct dbl_lst_node *node) +{ + if (node && lst) { + node->prev = lst->tail; + + if (!lst->head) + lst->head = node; + else + lst->tail->next = node; + lst->tail = node; + } +} + +static struct dbl_lst_node* __vma_allocate_dbl_lst_node() +{ + struct dbl_lst_node *ret_val = NULL; + + ret_val = (struct dbl_lst_node*) malloc(sizeof(struct dbl_lst_node)); + if (!ret_val) { + yyerror("fail to allocate new node"); + parse_err = 1; + } + else + memset((void*) ret_val, 0, sizeof(struct dbl_lst_node)); + return ret_val; +} + +/* use the above state for adding a new instance */ +static void __vma_add_instance(char *prog_name_expr, char *user_defined_id) { + struct dbl_lst_node *curr, *new_node; + struct instance *new_instance; + + curr = __instance_list.head; + while (curr) { + struct instance *instance = (struct instance*)curr->data; + if (!strcmp(prog_name_expr, instance->id.prog_name_expr) && !strcmp(user_defined_id, instance->id.user_defined_id)) { + curr_instance = (struct instance*)curr->data; + if (__vma_min_level <= 1) __vma_dump_instance(); + return; + } + curr = curr->next; + } + + if (!(new_node = __vma_allocate_dbl_lst_node())) + return; + + new_instance = (struct instance*) malloc(sizeof(struct instance)); + if (!new_instance) { + free(new_node); + yyerror("fail to allocate new instance"); + parse_err = 1; + return; + } + + memset((void*) new_instance, 0, sizeof(struct instance)); + new_instance->id.prog_name_expr = strdup(prog_name_expr); + new_instance->id.user_defined_id = strdup(user_defined_id); + + if (!new_instance->id.prog_name_expr || !new_instance->id.user_defined_id) 
{ + yyerror("failed to allocate memory"); + parse_err = 1; + if (new_instance->id.prog_name_expr) + free(new_instance->id.prog_name_expr); + if (new_instance->id.user_defined_id) + free(new_instance->id.user_defined_id); + free(new_instance); + return; + } + new_node->data = (void*)new_instance; + __vma_add_dbl_lst_node(&__instance_list, new_node); + curr_instance = new_instance; + if (__vma_min_level <= 1) __vma_dump_instance(); +} + +static void __vma_add_inst_with_int_uid(char *prog_name_expr, int user_defined_id) { + char str_id[50]; + sprintf(str_id, "%d", user_defined_id); + __vma_add_instance(prog_name_expr, str_id); +} + +/* use the above state for making a new rule */ +static void __vma_add_rule() { + struct dbl_lst *p_lst; + struct use_family_rule *rule; + struct dbl_lst_node *new_node; + + if (!curr_instance) + __vma_add_instance("*", "*"); + if (!curr_instance) + return; + + if (__vma_min_level <= 1) __vma_dump_rule_config_state(); + switch (current_role) { + case ROLE_TCP_SERVER: + p_lst = &curr_instance->tcp_srv_rules_lst; + break; + case ROLE_TCP_CLIENT: + p_lst = &curr_instance->tcp_clt_rules_lst; + break; + case ROLE_UDP_SENDER: + p_lst = &curr_instance->udp_snd_rules_lst; + break; + case ROLE_UDP_RECEIVER: + p_lst = &curr_instance->udp_rcv_rules_lst; + break; + case ROLE_UDP_CONNECT: + p_lst = &curr_instance->udp_con_rules_lst; + break; + default: + yyerror("ignoring unknown role"); + parse_err = 1; + return; + break; + } + + if (!(new_node = __vma_allocate_dbl_lst_node())) + return; + + rule = (struct use_family_rule *)malloc(sizeof(*rule)); + if (!rule) { + free(new_node); + yyerror("fail to allocate new rule"); + parse_err = 1; + return; + } + memset(rule, 0, sizeof(*rule)); + new_node->data = (void*)rule; + *((struct use_family_rule *)new_node->data) = __vma_rule; + if (__vma_rule_push_head) + __vma_add_dbl_lst_node_head(p_lst, new_node); + else + __vma_add_dbl_lst_node(p_lst, new_node); +} + +%} + + +%union { + int ival; + char *sval; +} + 
+%token USE "use" +%token TCP_CLIENT "tcp client" +%token TCP_SERVER "tcp server" +%token UDP_SENDER "udp sender" +%token UDP_RECEIVER "udp receiver" +%token UDP_CONNECT "udp connect" +%token TCP "tcp" +%token UDP "udp" +%token OS "os" +%token VMA "vma" +%token SDP "sdp" +%token SA "sa" +%token INT "integer value" +%token APP_ID "application id" +%token PROGRAM "program name" +%token USER_DEFINED_ID_STR "userdefined id str" +%token LOG "log statement" +%token DEST "destination" +%token STDERR "ystderr" +%token SYSLOG "syslog" +%token FILENAME "yfile" +%token NAME "a name" +%token LEVEL "min-level" +%token LINE "new line" +%type NAME PROGRAM USER_DEFINED_ID_STR +%type INT LOG DEST STDERR SYSLOG FILENAME APP_ID USE OS VMA SDP TCP UDP TCP_CLIENT TCP_SERVER UDP_SENDER UDP_RECEIVER UDP_CONNECT LEVEL LINE +%start config + +%{ + long __vma_config_line_num; +%} +%% + +NL: + LINE + | NL LINE + |; + +ONL: + | NL; + +config: + ONL statements + ; + +statements: + | statements statement + ; + +statement: + log_statement + | app_id_statement + | socket_statement + ; + +log_statement: + LOG log_opts NL + ; + +log_opts: + | log_opts log_dest + | log_opts verbosity + ; + +log_dest: + DEST STDERR { __vma_log_set_log_stderr(); } + | DEST SYSLOG { __vma_log_set_log_syslog(); } + | DEST FILENAME NAME { __vma_log_set_log_file($3); } + ; + +verbosity: + LEVEL INT { __vma_log_set_min_level($2); } + ; + +app_id_statement: + app_id NL + ; + +app_id: + APP_ID PROGRAM USER_DEFINED_ID_STR {__vma_add_instance($2, $3); if ($2) free($2); if ($3) free($3); } + | APP_ID PROGRAM INT {__vma_add_inst_with_int_uid($2, $3); if ($2) free($2); } + ; + + +socket_statement: + use transport role tuple NL { __vma_add_rule(); } + ; + +use: + USE { current_conf_type = CONF_RULE; } + ; + +transport: + OS { __vma_rule.target_transport = TRANS_OS; } + | VMA { __vma_rule.target_transport = TRANS_VMA; } + | SDP { __vma_rule.target_transport = TRANS_SDP; } + | SA { __vma_rule.target_transport = TRANS_SA; } + | '*' { 
__vma_rule.target_transport = TRANS_ULP; } + ; + + +role: + TCP_SERVER { current_role = ROLE_TCP_SERVER; __vma_rule.protocol = PROTO_TCP; } + | TCP_CLIENT { current_role = ROLE_TCP_CLIENT; __vma_rule.protocol = PROTO_TCP; } + | UDP_RECEIVER { current_role = ROLE_UDP_RECEIVER; __vma_rule.protocol = PROTO_UDP; } + | UDP_SENDER { current_role = ROLE_UDP_SENDER; __vma_rule.protocol = PROTO_UDP; } + | UDP_CONNECT { current_role = ROLE_UDP_CONNECT; __vma_rule.protocol = PROTO_UDP; } + ; + +tuple: + three_tuple + | five_tuple + ; + +three_tuple: + address_first ':' ports + ; + +five_tuple: + address_first ':' ports ':' address_second ':' ports + ; + +address_first: + { __vma_address_port_rule = &(__vma_rule.first); __vma_rule.use_second = 0; } address + ; + +address_second: + { __vma_address_port_rule = &(__vma_rule.second); __vma_rule.use_second = 1; } address + ; + +address: + ipv4 { if (current_conf_type == CONF_RULE) __vma_address_port_rule->match_by_addr = 1; __vma_set_inet_addr_prefix_len(32); } + | ipv4 '/' INT { if (current_conf_type == CONF_RULE) __vma_address_port_rule->match_by_addr = 1; __vma_set_inet_addr_prefix_len($3); } + | '*' { if (current_conf_type == CONF_RULE) __vma_address_port_rule->match_by_addr = 0; __vma_set_inet_addr_prefix_len(32); } + ; + +ipv4: + INT '.' INT '.' INT '.' 
INT { __vma_set_ipv4_addr($1,$3,$5,$7); } + ; + +ports: + INT { __vma_address_port_rule->match_by_port = 1; __vma_address_port_rule->sport= $1; __vma_address_port_rule->eport= $1; } + | INT '-' INT { __vma_address_port_rule->match_by_port = 1; __vma_address_port_rule->sport= $1; __vma_address_port_rule->eport= $3; } + | '*' { __vma_address_port_rule->match_by_port = 0; __vma_address_port_rule->sport= 0; __vma_address_port_rule->eport= 0; } + ; + +%% + +int yyerror(const char *msg) +{ + /* replace the $undefined and $end if exists */ + char *orig_msg = (char*)malloc(strlen(msg)+25); + char *final_msg = (char*)malloc(strlen(msg)+25); + + strcpy(orig_msg, msg); + + char *word = strtok(orig_msg, " "); + final_msg[0] = '\0'; + while (word != NULL) { + if (!strncmp(word, "$undefined", 10)) { + strcat(final_msg, "unrecognized-token "); + } else if (!strncmp(word, "$end",4)) { + strcat(final_msg, "end-of-file "); + } else { + strcat(final_msg, word); + strcat(final_msg, " "); + } + word = strtok(NULL, " "); + } + + __vma_log(9, "Error (line:%ld) : %s\n", __vma_config_line_num, final_msg); + parse_err = 1; + + free(orig_msg); + free(final_msg); + return 1; +} + +#include +#include + +/* parse apollo route dump file */ +int __vma_parse_config_file (const char *fileName) { + extern FILE * libvma_yyin; + + /* open the file */ + if (access(fileName, R_OK)) { + printf("libvma Error: No access to open File:%s %s\n", + fileName, strerror(errno)); + return(1); + } + + libvma_yyin = fopen(fileName,"r"); + if (!libvma_yyin) { + printf("libvma Error: Fail to open File:%s\n", fileName); + return(1); + } + __instance_list.head = NULL; + __instance_list.tail = NULL; + parse_err = 0; + __vma_config_line_num = 1; + + /* parse it */ + yyparse(); + + fclose(libvma_yyin); + return(parse_err); +} + +int __vma_parse_config_line (char *line) { + extern FILE * libvma_yyin; + + __vma_rule_push_head = 1; + + libvma_yyin = fmemopen(line, strlen(line), "r"); + + if (!libvma_yyin) { + printf("libvma 
Error: Fail to parse line:%s\n", line); + return(1); + } + + parse_err = 0; + yyparse(); + + fclose(libvma_yyin); + + return(parse_err); +} diff --git a/src/vma/util/config_scanner.l b/src/vma/util/config_scanner.l new file mode 100644 index 0000000..9d02366 --- /dev/null +++ b/src/vma/util/config_scanner.l @@ -0,0 +1,283 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * OpenIB.org BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ * + * $Id: ibnl_scanner.ll,v 1.4 2005/02/23 21:08:37 eitan Exp $ + */ + +%{ + +//#define DEBUG 1 + +#define yyparse libvma_yyparse +#define yylex libvma_yylex +#define yyerror libvma_yyerror +#define yylval libvma_yylval +#define yychar libvma_yychar +#define yydebug libvma_yydebug +#define yynerrs libvma_yynerrs + +#define yywrap libvma_yywrap + +#include +#include +#include "config_parser.h" +extern long __vma_config_line_num; +%} +%option nounput +%option noinput +%s CANNAME APP_ID_S1 APP_ID_S2 +%% + +^[ \t]*#.* {} + +application-id { + yylval.ival = APP_ID; +#ifdef DEBUG + printf("APP_ID\n"); +#endif + BEGIN(APP_ID_S1); + return APP_ID; +} + +[^ \t\n]+ { + yylval.sval = (char *)malloc(strlen(yytext) + 1); + strcpy(yylval.sval, yytext); +#ifdef DEBUG + printf("PROGRAM:%s\n",yylval.sval); +#endif + BEGIN(APP_ID_S2); + return (PROGRAM); +} + +[^ \t\n]+ { + yylval.sval = (char *)malloc(strlen(yytext) + 1); + strcpy(yylval.sval, yytext); +#ifdef DEBUG + printf("USER_DEFINED_ID_STR:%s\n",yylval.sval); +#endif + BEGIN(0); + return (USER_DEFINED_ID_STR); +} + +([1-9][0-9]*|0) { + yylval.ival = atoi(yytext); +#ifdef DEBUG + printf("INT:%d\n",yylval.ival); +#endif + return INT; +} + +log { + yylval.ival = LOG; +#ifdef DEBUG + printf("LOG\n"); +#endif + return LOG; +} + +destination { + yylval.ival = DEST; +#ifdef DEBUG + printf("DEST\n"); +#endif + return DEST; +} + +min-level { + yylval.ival = LEVEL; +#ifdef DEBUG + printf("LEVEL\n"); +#endif + return LEVEL; +} + +ystderr { + yylval.ival = STDERR; +#ifdef DEBUG + printf("STDERR\n"); +#endif + return STDERR; +} + +syslog { + yylval.ival = SYSLOG; +#ifdef DEBUG + printf("SYSLOG\n"); +#endif + return SYSLOG; +} + +yfile { + yylval.ival = FILENAME; +#ifdef DEBUG + printf("FILENAME\n"); +#endif + BEGIN(CANNAME); + return FILENAME; +} + + + +use { + yylval.ival = USE; +#ifdef DEBUG + printf("USE\n"); +#endif + return USE; +} + +tcp { + yylval.ival = TCP; +#ifdef DEBUG + printf("TCP\n"); +#endif + return TCP; +} + +udp { + 
yylval.ival = UDP; +#ifdef DEBUG + printf("UDP\n"); +#endif + return UDP; +} + +os { + yylval.ival = OS; +#ifdef DEBUG + printf("OS\n"); +#endif + return OS; +} + +vma { + yylval.ival = VMA; +#ifdef DEBUG + printf("VMA\n"); +#endif + return VMA; +} + +sdp { + yylval.ival = SDP; +#ifdef DEBUG + printf("SDP\n"); +#endif + return SDP; +} + +sa { + yylval.ival = SA; +#ifdef DEBUG + printf("SA\n"); +#endif + return SA; +} + +tcp_client { + yylval.ival = TCP_CLIENT; +#ifdef DEBUG + printf("TCP CLIENT\n"); +#endif + return TCP_CLIENT; +} + +tcp_server { + yylval.ival = TCP_SERVER; +#ifdef DEBUG + printf("TCP SERVER\n"); +#endif + return TCP_SERVER; +} + +udp_sender { + yylval.ival = UDP_SENDER; +#ifdef DEBUG + printf("UDP SENDER\n"); +#endif + return UDP_SENDER; +} + +udp_receiver { + yylval.ival = UDP_RECEIVER; +#ifdef DEBUG + printf("UDP RECEIVER\n"); +#endif + return UDP_RECEIVER; +} + +udp_connect { + yylval.ival = UDP_CONNECT; +#ifdef DEBUG + printf("UDP CONNECT\n"); +#endif + return UDP_CONNECT; +} + +[^ \t\n]+ { + yylval.sval = (char *)malloc(strlen(yytext) + 1); + strcpy(yylval.sval, yytext); +#ifdef DEBUG + printf("NAME:%s\n",yylval.sval); +#endif + BEGIN(0); + return (NAME); +} + +\n { + __vma_config_line_num++; +#ifdef DEBUG + printf("LINE\n"); +#endif + yylval.ival = LINE; + return(LINE); +} + +[#][^\n]* { + __vma_config_line_num++; +} + +[ \t]+ {} + +. { +#ifdef DEBUG + printf("CHAR:%c\n",yytext[0]); +#endif + return(yytext[0]); +} + +%% + +int yywrap () +{ + return (1); +} + diff --git a/src/vma/util/data_updater.cpp b/src/vma/util/data_updater.cpp new file mode 100644 index 0000000..479c5ec --- /dev/null +++ b/src/vma/util/data_updater.cpp @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "data_updater.h" + +data_updater::~data_updater() +{ + +} + +header_ttl_updater::header_ttl_updater(uint8_t ttl, bool is_multicast) + : data_updater() + , m_ttl(ttl) + , m_is_multicast(is_multicast) +{ + +} + +bool header_ttl_updater::update_field(dst_entry &dst) +{ + if ((IN_MULTICAST_N(dst.get_dst_addr()) && m_is_multicast) || + (!IN_MULTICAST_N(dst.get_dst_addr()) && !m_is_multicast)) { + dst.set_ip_ttl(m_ttl); + } + return true; +} + +header_pcp_updater::header_pcp_updater(uint8_t pcp) + : data_updater() + , m_pcp(pcp) +{ + +} + +bool header_pcp_updater::update_field(dst_entry &dst) +{ + return dst.set_pcp(m_pcp); +} + +header_tos_updater::header_tos_updater(uint8_t tos) + : data_updater() + , m_tos(tos) +{ + +} + +bool header_tos_updater::update_field(dst_entry &dst) +{ + dst.set_ip_tos(m_tos); + return true; +} + +ring_alloc_logic_updater::ring_alloc_logic_updater(int fd, lock_base & socket_lock, + resource_allocation_key & ring_alloc_logic, + socket_stats_t* socket_stats) + : data_updater() + , m_fd(fd) + , m_socket_lock(socket_lock) + , m_key(ring_alloc_logic) + , m_sock_stats(socket_stats) +{ + +} + +bool ring_alloc_logic_updater::update_field(dst_entry &dst) +{ + if (dst.update_ring_alloc_logic(m_fd, m_socket_lock, m_key)) + m_sock_stats->counters.n_tx_migrations++; + + return true; +} diff --git a/src/vma/util/data_updater.h b/src/vma/util/data_updater.h new file mode 100644 index 0000000..61a1829 --- /dev/null +++ b/src/vma/util/data_updater.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef SRC_VMA_UTIL_DATA_UPDATER_H_ +#define SRC_VMA_UTIL_DATA_UPDATER_H_ + +#include "vma/proto/dst_entry.h" + +class data_updater { +public: + data_updater() {}; + virtual ~data_updater() = 0; + virtual bool update_field(dst_entry &dst) = 0; +}; + +class header_ttl_updater: public data_updater { +public: + header_ttl_updater(uint8_t ttl, bool is_unicast); + virtual ~header_ttl_updater() {}; + virtual bool update_field(dst_entry &hdr); +private: + uint8_t m_ttl; + bool m_is_multicast; +}; + +class header_pcp_updater: public data_updater { +public: + header_pcp_updater(uint8_t pcp); + virtual ~header_pcp_updater() {}; + virtual bool update_field(dst_entry &hdr); +private: + uint32_t m_pcp; +}; + +class header_tos_updater: public data_updater { +public: + header_tos_updater(uint8_t pcp); + virtual ~header_tos_updater() {}; + virtual bool update_field(dst_entry &hdr); +private: + uint8_t m_tos; +}; + +class ring_alloc_logic_updater: public data_updater { +public: + ring_alloc_logic_updater(int fd, lock_base & socket_lock, + resource_allocation_key & ring_alloc_logic, + socket_stats_t* socket_stats); + virtual ~ring_alloc_logic_updater() {}; + virtual bool update_field(dst_entry &hdr); +private: + int m_fd; + lock_base & m_socket_lock; + resource_allocation_key & m_key; + socket_stats_t* m_sock_stats; +}; +#endif /* SRC_VMA_UTIL_DATA_UPDATER_H_ */ diff --git a/src/vma/util/hash_map.h b/src/vma/util/hash_map.h new file mode 100644 index 0000000..21502f0 --- /dev/null +++ b/src/vma/util/hash_map.h @@ -0,0 +1,198 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef HASH_MAP_H +#define HASH_MAP_H + +#include "utils/lock_wrapper.h" + + +/** + * Map keys to values (K -> V). + * The map supports SET and GET operations. + * The map does not do any kind of locking, however it is + * guaranteed that SET does not interfere with GET. + * + * In order to perform find/add new operation, do: + * if GET: + * return elem + * lock() + * if not GET: + * SET(new_elem) + * unlock() + * return GET + * + * This is correct because there are no DELETE operations. + * hash_map + * @param K key type + * @param V value type + * @param MAP_SIZE hash table size (better be a power of 2) + * @param NULL_VALUE invalid (sentinel) value for type V, i.e NULL for pointers. 
+ */ + +#define HASH_MAP_SIZE 4096 + +template +class hash_map { +public: + hash_map(); + virtual ~hash_map(); + +public: + struct map_node { + K key; + V value; + map_node *next; + }; + + struct pair { + // coverity[member_decl] + K first; + // coverity[member_decl] + V second; + }; + + class iterator { + public: + iterator() : + m_index(HASH_MAP_SIZE), m_node(NULL), m_hash_table(NULL) { + } + + pair *operator->() { + if (m_node) { + m_pair.first = m_node->key; + m_pair.second = m_node->value; + } + return &m_pair; + } + + iterator& operator++() { + if (m_node) { + m_node = m_node->next; + } + advance(); + return *this; + } + + bool operator!=(const iterator &other) const { + return m_node != other.m_node; + } + + private: + iterator(int index, map_node *node, map_node* const *hash_table) : + m_index(index), m_node(node), m_hash_table(hash_table) { + advance(); + } + + // Skip empty nodes + void advance() { + while (!m_node && m_index < HASH_MAP_SIZE) { + m_node = m_hash_table[++m_index]; + } + if (m_index >= HASH_MAP_SIZE) { + m_node = NULL; + } + } + + int m_index; + map_node *m_node; + map_node * const *m_hash_table; + pair m_pair; + + friend class hash_map; + }; + + /** + * Adds a (key,value) pair to the map. + * If the key already there, the value is updated. + */ + void set(const K &key, V value); + + /** + * Adds a (key,value) pair to the map. + * If the key already there, the value is updated. + * + * If a mapping with null_value is found, it is replaced + * with the new mapping (instead of allocating more room). + * This way mappings can be deleted in a GET-safe manner, + * and not wasting too much memory (There will be at most + * one empty item for each bucket). + */ + void set_replace(const K &key, V value, V null_value); + + /** + * Retrieves a value for a given key. + * + * @param key Key to find. + * @param default_value Return this if not found. + * @return Value for key, of defaultValue if not found. 
+ */ + inline V get(const K &key, V default_value); + + /** + * Removes a mapping from the map. + * NOTE: This is not synchronized with GET. In order to be safe, delete + * items by replacing the mapping with some NULL value, and set items + * with set_replace to replace the empty mappings. + * + * @param key Key to delete. + * @return true if deleted, false if not found. + */ + inline bool del(const K &key); + + iterator begin() const { + return iterator(0, m_hash_table[0], m_hash_table); + } + + iterator end() const { + return iterator(HASH_MAP_SIZE, NULL, m_hash_table); + } + +private: + /** + * Calculate key bucket number by it's hash (XOR of all bytes) + * @param key Key to hash. + * @return Bucket number. + */ + inline int calc_hash(const K &key); + + /// holds the hash table + map_node *m_hash_table[HASH_MAP_SIZE]; + + /// last used element optimization + map_node *m_last; +}; + +#include "hash_map.inl" + +#endif diff --git a/src/vma/util/hash_map.inl b/src/vma/util/hash_map.inl new file mode 100644 index 0000000..4387afa --- /dev/null +++ b/src/vma/util/hash_map.inl @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +template +hash_map::hash_map() : m_last(NULL) { + int i; + + for (i = 0; i < HASH_MAP_SIZE; ++i) { + m_hash_table[i] = NULL; + } +} + +template +hash_map::~hash_map() { + map_node *head, *tmp; + int i; + + // Release all map nodes + for (i = 0; i < HASH_MAP_SIZE; ++i) { + head = m_hash_table[i]; + while (head) { + tmp = head->next; + delete head; + head = tmp; + } + } +} + +template +inline V hash_map::get(const K &key, V default_value) { + map_node *node, *last; + + last = m_last; // Copy last before it changes + if (last && last->key == key) { + return last->value; + } + + node = m_hash_table[calc_hash(key)]; + while (node) { + if (node->key == key) { + m_last = node; + return node->value; + } + node = node->next; + } + return default_value; +} + +template +void hash_map::set(const K &key, V value) { + map_node **pptail, *new_node; + + // find last pointer + pptail = &( m_hash_table[calc_hash(key)] ); + while (*pptail) { + if ((*pptail)->key == key) { + (*pptail)->value = value; + return; + } + pptail = &( (*pptail)->next ); + } + + // create new node + new_node = new map_node(); + new_node->key = key; + new_node->value = value; + new_node->next = NULL; + + // add + *pptail = new_node; +} + +template +void hash_map::set_replace(const K &key, V 
value, V null_value) { + map_node **pptail, *new_node = NULL; + + // find last pointer + pptail = &( m_hash_table[calc_hash(key)] ); + while (*pptail) { + if ((*pptail)->key == key) { + (*pptail)->value = value; + return; + } else if ((*pptail)->key == null_value) { + new_node = *pptail; + break; + } + pptail = &( (*pptail)->next ); + } + + if (!new_node) { + // create new node + new_node = new map_node(); + new_node->next = NULL; + } + + new_node->key = key; + new_node->value = value; + + // add + *pptail = new_node; +} + +template +inline int hash_map::calc_hash(const K &key) { + uint8_t *pval, *csum8; + uint16_t csum; + size_t i, j; + + // uint32_t-size checksum on key + csum = 0; + csum8 = (uint8_t*)&csum; + pval = (uint8_t*)&key; + // start toggle from 1, as the keys are usually succeders, and gone through htons + for (i = 0, j = 1; i < sizeof(K); ++i, j ^= 1) { + csum8[j] ^= *pval; + ++pval; + } + // to 12 bit + csum = (csum8[0] ^ csum8[1]) | ((((csum >> 4) ^ (csum >> 8)) & 0xf) << 8); + // or modolu prime close to 4096 + //csum %= 4093; + return csum; +} + +template +bool hash_map::del(const K &key) { + map_node **pprev, *tmp; + + // find last pointer + pprev = &( m_hash_table[calc_hash(key)] ); + while (*pprev) { + if ((*pprev)->key == key) { + tmp = *pprev; + *pprev = (*pprev)->next; + if (m_last == tmp) + m_last = NULL; + delete tmp; + return true; + } + pprev = &( (*pprev)->next ); + } + return false; +} diff --git a/src/vma/util/if.h b/src/vma/util/if.h new file mode 100644 index 0000000..0d71b94 --- /dev/null +++ b/src/vma/util/if.h @@ -0,0 +1,44 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef VMA_IF_H_ +#define VMA_IF_H_ + +#include +#include + +/* defined in net/if.h but that conflicts with linux/if.h... */ +extern "C" unsigned int if_nametoindex (__const char *__ifname) __THROW; +extern "C" char *if_indextoname (unsigned int __ifindex, char *__ifname) __THROW; + +#endif diff --git a/src/vma/util/instrumentation.cpp b/src/vma/util/instrumentation.cpp new file mode 100644 index 0000000..8b272d1 --- /dev/null +++ b/src/vma/util/instrumentation.cpp @@ -0,0 +1,383 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "config.h" +#include "instrumentation.h" +#include + +#ifdef RDTSC_MEASURE +uint16_t g_rdtsc_cost = 0; +instr_info g_rdtsc_instr_info_arr[RDTSC_FLOW_MAX]; +char g_rdtsc_flow_names[RDTSC_FLOW_MAX][256] = { + {"RDTSC_FLOW_TX_SENDTO_TO_AFTER_POST_SEND"}, + {"RDTSC_FLOW_RX_CQE_RECEIVEFROM"}, + {"RDTSC_FLOW_TX_VERBS_POST_SEND"}, + {"RDTSC_FLOW_RX_VERBS_IDLE_POLL"}, + {"RDTSC_FLOW_MEASURE_RECEIVEFROM_TO_SENDTO"}, + {"RDTSC_FLOW_RX_LWIP"}, + {"RDTSC_FLOW_MEASURE_RX_DISPATCH_PACKET"}, + {"RDTSC_FLOW_PROCCESS_AFTER_BUFFER_TO_RECIVEFROM "}, + {"RDTSC_FLOW_RX_VMA_TCP_IDLE_POLL"}, + {"RDTSC_FLOW_RX_READY_POLL_TO_LWIP"}, + {"RDTSC_FLOW_RX_LWIP_TO_RECEVEFROM"}, + {"RDTSC_FLOW_RX_VERBS_READY_POLL"} + +}; + +void init_rdtsc() +{ + tscval_t start, end, curr; + + gettimeoftsc(&start); + for(int i = 0; i < 1000000; i++) { + gettimeoftsc(&curr); + gettimeoftsc(&curr); + } + gettimeoftsc(&end); + g_rdtsc_cost = (end - start)/1000000; + vlog_printf(VLOG_ERROR,"RDTSC cost is: %u\n", g_rdtsc_cost); + + for(int i = 0; i < RDTSC_FLOW_MAX; i++) { + memset((void*)(&g_rdtsc_instr_info_arr[i]), 0, sizeof(instr_info)); + g_rdtsc_instr_info_arr[i].print_ratio = RDTSC_PRINT_RATIO; + g_rdtsc_instr_info_arr[i].trace_log_idx = i; + } + +} + +void print_rdtsc_summary() +{ + uint64_t avg; + + vlog_printf(VLOG_ERROR,"*********** RDTSC Summary ************ \n"); + for(int i = 0; i < RDTSC_FLOW_MAX; i++) { + if (g_rdtsc_instr_info_arr[i].counter) { + avg = g_rdtsc_instr_info_arr[i].cycles/g_rdtsc_instr_info_arr[i].counter; + vlog_printf(VLOG_ERROR,"%s: %" PRIu64 " \n", g_rdtsc_flow_names[g_rdtsc_instr_info_arr[i].trace_log_idx], avg); + } + + } +} + + +#endif //RDTSC_MEASURE + +#ifdef VMA_TIME_MEASURE + +#include +#include +#include +#include +#include +#include "sys_vars.h" +#include "utils/clock.h" +#include "utils/rdtsc.h" + +struct timespec g_inst[INST_SIZE][INST_SAMPLS]; +uint32_t g_inst_nsec[INST_SIZE][INST_SAMPLS+INST_SUMS]; +uint32_t g_poll_cnt[INST_SIZE]; +uint32_t g_inst_cnt; 
+uint32_t g_tx_err_counter; +uint32_t g_rx_err_counter; +uint32_t g_poll_err_counter; +uint32_t g_tx_go_to_os; +uint32_t g_rx_go_to_os; +uint32_t g_dump_cnt = 1; + +void init_instrumentation() +{ + memset(g_inst, 0, sizeof(struct timespec)*INST_SIZE*INST_SAMPLS); + memset(g_inst_nsec, 0, sizeof(uint32_t)*INST_SIZE*(INST_SAMPLS+INST_SUMS)); + memset(g_poll_cnt, 0, sizeof(uint32_t)*INST_SIZE); + g_inst_cnt = 0; + g_tx_err_counter = 0; + g_rx_err_counter = 0; + g_poll_err_counter = 0; + g_tx_go_to_os = 0; + g_rx_go_to_os = 0; +} +void finit_instrumentation(char* dump_file_name) +{ + if(dump_file_name == NULL) + return; + + if (0 >= g_inst_cnt) + return; + + if(g_inst_cnt > INST_SIZE){ + g_inst_cnt = INST_SIZE; + } + + std::ofstream dump_file; + + uint32_t poll_start_to_poll_cq_min=VMA_TIME_DEFAULT_MIN_VAL, poll_start_to_poll_cq_max=0; + double poll_start_to_poll_cq_avg = 0; + uint32_t poll_cq_to_end_poll_min=VMA_TIME_DEFAULT_MIN_VAL, poll_cq_to_end_poll_max=0; + double poll_cq_to_end_poll_avg = 0; + uint32_t poll_delta_max=0, poll_delta_min=VMA_TIME_DEFAULT_MIN_VAL; + double poll_delta_avg = 0; + uint32_t rx_delta_max=0, rx_delta_min=VMA_TIME_DEFAULT_MIN_VAL; + double rx_delta_avg = 0; + + uint32_t rx_start_to_poll_cq_min=VMA_TIME_DEFAULT_MIN_VAL, rx_start_to_poll_cq_max=0; + double rx_start_to_poll_cq_avg = 0; + uint32_t poll_cq_to_end_rx_min=VMA_TIME_DEFAULT_MIN_VAL, poll_cq_to_end_rx_max=0; + double poll_cq_to_end_rx_avg = 0; + + uint32_t tx_start_to_post_snd_s_min=VMA_TIME_DEFAULT_MIN_VAL, tx_start_to_post_snd_s_max=0; + double tx_start_to_post_snd_s_avg = 0; + uint32_t tx_post_snd_s_to_e_min=VMA_TIME_DEFAULT_MIN_VAL, tx_post_snd_s_to_e_max=0; + double tx_post_snd_s_to_e_avg = 0; + uint32_t tx_post_snd_e_to_tx_end_min=VMA_TIME_DEFAULT_MIN_VAL, tx_post_snd_e_to_tx_end_max=0; + double tx_post_snd_e_to_tx_end_avg = 0; + uint32_t max_poll_count = 0; + uint32_t poll_start_to_poll_cq = 0; + uint32_t poll_cq_to_end_poll = 0; + uint32_t poll_delta = 0; + uint32_t 
rx_start_to_poll_cq = 0; + uint32_t poll_cq_to_end_rx = 0; + uint32_t rx_delta = 0; + uint32_t tx_start_to_post_snd_s = 0; + uint32_t tx_post_snd_e_to_tx_end = 0; + uint32_t tx_post_snd_s_to_e = 0; + + char* dumpFileName = (char*)malloc(sizeof(char)*(FILENAME_MAX+10)); + sprintf(dumpFileName, "%s.%d.%d", dump_file_name, getpid(), g_dump_cnt); + + dump_file.open (dumpFileName); + + dump_file << "INVALID:" << VMA_TIME_INVALID <<"\n"; + dump_file << "TOTAL SAMPLES: " << g_inst_cnt << "\n"; + dump_file << "TX ERRORS:" << g_tx_err_counter << "\n"; + dump_file << "RX ERRORS:" << g_rx_err_counter << "\n"; + dump_file << "TX GO TO OS:" << g_tx_go_to_os << "\n"; + dump_file << "RX GO TO OS:" << g_rx_go_to_os << "\n"; + dump_file << "POLL ERRORS:" << g_poll_err_counter << "\n"; + + for (uint32_t i=0; i poll_start_to_poll_cq_max ) + poll_start_to_poll_cq_max = poll_start_to_poll_cq; + + poll_cq_to_end_poll = g_inst_nsec[i][POLL_END] - g_inst_nsec[i][CQ_IN_START]; + poll_cq_to_end_poll_avg += poll_cq_to_end_poll; + if ( poll_cq_to_end_poll < poll_cq_to_end_poll_min ) + poll_cq_to_end_poll_min = poll_cq_to_end_poll; + if ( poll_cq_to_end_poll > poll_cq_to_end_poll_max ) + poll_cq_to_end_poll_max = poll_cq_to_end_poll; + } + else { + poll_start_to_poll_cq = VMA_TIME_INVALID; + poll_cq_to_end_poll = VMA_TIME_INVALID; + } + + poll_delta = g_inst_nsec[i][POLL_END] - g_inst_nsec[i][POLL_START]; + poll_delta_avg += poll_delta; + if ( poll_delta < poll_delta_min ) + poll_delta_min = poll_delta; + if ( poll_delta > poll_delta_max ) + poll_delta_max = poll_delta; + + } + else { + poll_start_to_poll_cq = VMA_TIME_INVALID; + poll_cq_to_end_poll = VMA_TIME_INVALID; + poll_delta = VMA_TIME_INVALID; + } + + if (VMA_TIME_IS_LEGAL(g_inst_nsec[i][RX_START], g_inst_nsec[i][RX_END])) { + rx_delta = g_inst_nsec[i][RX_END] - g_inst_nsec[i][RX_START]; + rx_delta_avg += rx_delta; + if ( rx_delta < rx_delta_min ) + rx_delta_min = rx_delta; + if ( rx_delta > rx_delta_max ) + rx_delta_max = rx_delta; + 
+ if (VMA_TIME_INVALID == poll_delta) { + if ((VMA_TIME_IS_LEGAL(g_inst_nsec[i][RX_START], g_inst_nsec[i][CQ_IN_START])) && + (VMA_TIME_IS_LEGAL(g_inst_nsec[i][CQ_IN_START], g_inst_nsec[i][RX_END]))) + { + rx_start_to_poll_cq = g_inst_nsec[i][CQ_IN_START] - g_inst_nsec[i][RX_START]; + rx_start_to_poll_cq_avg += rx_start_to_poll_cq; + if ( rx_start_to_poll_cq < rx_start_to_poll_cq_min ) + rx_start_to_poll_cq_min = rx_start_to_poll_cq; + if ( rx_start_to_poll_cq > rx_start_to_poll_cq_max ) + rx_start_to_poll_cq_max = rx_start_to_poll_cq; + + poll_cq_to_end_rx = g_inst_nsec[i][RX_END] - g_inst_nsec[i][CQ_IN_START]; + poll_cq_to_end_rx_avg += poll_cq_to_end_rx; + if ( poll_cq_to_end_rx < poll_cq_to_end_rx_min ) + poll_cq_to_end_rx_min = poll_cq_to_end_rx; + if ( poll_cq_to_end_rx > poll_cq_to_end_rx_max ) + poll_cq_to_end_rx_max = poll_cq_to_end_rx; + } + else { + rx_start_to_poll_cq = VMA_TIME_INVALID; + poll_cq_to_end_rx = VMA_TIME_INVALID; + } + } + } + else { + rx_delta = VMA_TIME_INVALID; + } + + if (VMA_TIME_IS_LEGAL(g_inst_nsec[i][TX_START], g_inst_nsec[i][TX_POST_SEND_START])) + { + tx_start_to_post_snd_s = g_inst_nsec[i][TX_POST_SEND_START] - g_inst_nsec[i][TX_START]; + tx_start_to_post_snd_s_avg += tx_start_to_post_snd_s; + if ( tx_start_to_post_snd_s < tx_start_to_post_snd_s_min ) + tx_start_to_post_snd_s_min = tx_start_to_post_snd_s; + if ( tx_start_to_post_snd_s > tx_start_to_post_snd_s_max ) + tx_start_to_post_snd_s_max = tx_start_to_post_snd_s; + } + else { + tx_start_to_post_snd_s = VMA_TIME_INVALID; + } + + if (VMA_TIME_IS_LEGAL(g_inst_nsec[i][TX_POST_SEND_START], g_inst_nsec[i][TX_POST_SEND_END])) + { + tx_post_snd_s_to_e = g_inst_nsec[i][TX_POST_SEND_END] - g_inst_nsec[i][TX_POST_SEND_START]; + tx_post_snd_s_to_e_avg += tx_post_snd_s_to_e; + if ( tx_post_snd_s_to_e < tx_post_snd_s_to_e_min ) + tx_post_snd_s_to_e_min = tx_post_snd_s_to_e; + if ( tx_post_snd_s_to_e > tx_post_snd_s_to_e_max ) + tx_post_snd_s_to_e_max = tx_post_snd_s_to_e; + } + else { + 
tx_post_snd_s_to_e = VMA_TIME_INVALID; + } + + if (VMA_TIME_IS_LEGAL( g_inst_nsec[i][TX_POST_SEND_END], g_inst_nsec[i][TX_END])) { + tx_post_snd_e_to_tx_end = g_inst_nsec[i][TX_END] - g_inst_nsec[i][TX_POST_SEND_END]; + tx_post_snd_e_to_tx_end_avg += tx_post_snd_e_to_tx_end; + if ( tx_post_snd_e_to_tx_end < tx_post_snd_e_to_tx_end_min ) + tx_post_snd_e_to_tx_end_min = tx_post_snd_e_to_tx_end; + if ( tx_post_snd_e_to_tx_end > tx_post_snd_e_to_tx_end_max ) + tx_post_snd_e_to_tx_end_max = tx_post_snd_e_to_tx_end; + } + else { + tx_post_snd_e_to_tx_end = VMA_TIME_INVALID; + } + + g_inst_nsec[i][POLL_START_TO_CQ_IN] = poll_start_to_poll_cq; + g_inst_nsec[i][POLL_CQ_IN_TO_POLL_END] = poll_cq_to_end_poll; + g_inst_nsec[i][POLL_DELTA] = poll_delta; + g_inst_nsec[i][RX_START_TO_CQ_IN] = rx_start_to_poll_cq; + g_inst_nsec[i][RX_CQ_IN_TO_POLL_END] = poll_cq_to_end_rx; + g_inst_nsec[i][RX_DELTA] = rx_delta; + g_inst_nsec[i][TX_START_TO_POST_SND_S] = tx_start_to_post_snd_s; + g_inst_nsec[i][TX_POST_SND_S_TO_E] = tx_post_snd_s_to_e; + g_inst_nsec[i][TX_POST_SND_E_TO_TX_END] = tx_post_snd_e_to_tx_end; + + if (g_poll_cnt[i] > max_poll_count) + max_poll_count = g_poll_cnt[i]; + } + + poll_start_to_poll_cq_avg = poll_start_to_poll_cq_avg/g_inst_cnt; + poll_cq_to_end_poll_avg = poll_cq_to_end_poll_avg/g_inst_cnt; + poll_delta_avg = poll_delta_avg/g_inst_cnt; + rx_delta_avg = rx_delta_avg/g_inst_cnt; + rx_start_to_poll_cq_avg = rx_start_to_poll_cq_avg/g_inst_cnt; + poll_cq_to_end_rx_avg = poll_cq_to_end_rx_avg/g_inst_cnt; + tx_start_to_post_snd_s_avg = tx_start_to_post_snd_s_avg/g_inst_cnt; + tx_post_snd_s_to_e_avg = tx_post_snd_s_to_e_avg/g_inst_cnt; + tx_post_snd_e_to_tx_end_avg = tx_post_snd_e_to_tx_end_avg/g_inst_cnt; + + if (VMA_TIME_DEFAULT_MIN_VAL == poll_start_to_poll_cq_min) + poll_start_to_poll_cq_min = 0; + if (VMA_TIME_DEFAULT_MIN_VAL == poll_cq_to_end_poll_min) + poll_cq_to_end_poll_min = 0; + if (VMA_TIME_DEFAULT_MIN_VAL == poll_delta_min) + poll_delta_min = 0; + if 
(VMA_TIME_DEFAULT_MIN_VAL == rx_start_to_poll_cq_min) + rx_start_to_poll_cq_min = 0; + if (VMA_TIME_DEFAULT_MIN_VAL == poll_cq_to_end_rx_min) + poll_cq_to_end_rx_min = 0; + if (VMA_TIME_DEFAULT_MIN_VAL == rx_delta_min) + rx_delta_min = 0; + if (VMA_TIME_DEFAULT_MIN_VAL == rx_delta_max) + rx_delta_max = 0; + if (VMA_TIME_DEFAULT_MIN_VAL == tx_start_to_post_snd_s_min) + tx_start_to_post_snd_s_min = 0; + if (VMA_TIME_DEFAULT_MIN_VAL == tx_post_snd_s_to_e_min) + tx_post_snd_s_to_e_min = 0; + if (VMA_TIME_DEFAULT_MIN_VAL == tx_post_snd_e_to_tx_end_min) + tx_post_snd_e_to_tx_end_min = 0; + + dump_file << "poll_start_to_poll_cq: min=" << poll_start_to_poll_cq_min << " max=" << poll_start_to_poll_cq_max << " avg=" << poll_start_to_poll_cq_avg << "\n"; + dump_file << "poll_cq_to_end_poll: min=" << poll_cq_to_end_poll_min << " max=" << poll_cq_to_end_poll_max << " avg=" << poll_cq_to_end_poll_avg << "\n"; + dump_file << "poll_delta: min=" << poll_delta_min << " max=" << poll_delta_max << " avg=" << poll_delta_avg << "\n"; + dump_file << "rx_start_to_poll_cq: min=" << rx_start_to_poll_cq_min << " max=" << rx_start_to_poll_cq_max << " avg=" << rx_start_to_poll_cq_avg << "\n"; + dump_file << "rx_cq_to_end_poll: min=" << poll_cq_to_end_rx_min << " max=" << poll_cq_to_end_rx_max << " avg=" << poll_cq_to_end_rx_avg << "\n"; + dump_file << "rx_delta: min=" << rx_delta_min << " max=" << rx_delta_max << " avg=" << rx_delta_avg << "\n"; + dump_file << "tx_start_to_post_snd: min=" << tx_start_to_post_snd_s_min << " max=" << tx_start_to_post_snd_s_max << " avg=" << tx_start_to_post_snd_s_avg << "\n"; + dump_file << "tx_post_snd_s_to_e: min=" << tx_post_snd_s_to_e_min << " max=" << tx_post_snd_s_to_e_max << " avg=" << tx_post_snd_s_to_e_avg << "\n"; + dump_file << "tx_post_snd_e_to_tx_end: min=" << tx_post_snd_e_to_tx_end_min << " max=" << tx_post_snd_e_to_tx_end_max << " avg=" << tx_post_snd_e_to_tx_end_avg << "\n"; + + dump_file << "MAX_POLL_COUNT: " << max_poll_count << "\n"; + + + 
dump_file << " poll_in cq_poll poll_out rx_in rx_out tx_in post_snd_s pos_snd_e tx_out poll_start_to_poll_cq poll_cq_to_end_poll poll_delta rx_start_to_poll_cq poll_cq_to_end_rx rx_delta tx_start_to_post_snd_s tx_pos_snd_s_to_e tx_post_snd_e_to_tx_end g_poll_cnt\n"; + + for (uint32_t i=0; i +#include +#include "utils/rdtsc.h" +#define __STDC_FORMAT_MACROS +#include +#include "vlogger/vlogger.h" + +#ifdef RDTSC_MEASURE +void init_rdtsc(); +void print_rdtsc_summary(); + +#define RDTSC_PRINT_RATIO 100000 +#define RDTSC_TAKE_START(instr) gettimeoftsc(&instr.start) +#define RDTSC_TAKE_END(instr) gettimeoftsc(&instr.end); \ + instr.cycles += (instr.end < instr.start - g_rdtsc_cost)?0:(instr.end - instr.start - g_rdtsc_cost); \ + instr.counter++; \ + if (instr.print_ratio && instr.counter%instr.print_ratio == 0) { \ + uint64_t avg = instr.cycles/instr.counter; \ + vlog_printf(VLOG_ERROR,"%s: %" PRIu64 " \n", g_rdtsc_flow_names[instr.trace_log_idx], avg); \ + } \ + +enum rdtsc_flow_type { + RDTSC_FLOW_SENDTO_TO_AFTER_POST_SEND = 0, + RDTSC_FLOW_RX_CQE_TO_RECEIVEFROM = 1, + RDTSC_FLOW_TX_VERBS_POST_SEND = 2, + RDTSC_FLOW_RX_VERBS_IDLE_POLL = 3, + RDTSC_FLOW_RECEIVEFROM_TO_SENDTO = 4, + RDTSC_FLOW_MEASURE_RX_LWIP = 5, + RDTSC_FLOW_RX_DISPATCH_PACKET = 6, + RDTSC_FLOW_PROCCESS_RX_BUFFER_TO_RECIVEFROM = 7, + RDTSC_FLOW_RX_VMA_TCP_IDLE_POLL = 8, + RDTSC_FLOW_RX_READY_POLL_TO_LWIP = 9, + RDTSC_FLOW_RX_LWIP_TO_RECEVEFROM = 10, + RDTSC_FLOW_RX_VERBS_READY_POLL = 11, + RDTSC_FLOW_MAX = 12 +}; + +typedef struct instr_info { + tscval_t start; + tscval_t end; + uint64_t cycles; + uint64_t counter; + uint64_t print_ratio; + uint16_t trace_log_idx; +} instr_info; + +extern uint16_t g_rdtsc_cost; +extern char g_rdtsc_flow_names[RDTSC_FLOW_MAX][256]; +extern instr_info g_rdtsc_instr_info_arr[RDTSC_FLOW_MAX]; + +#endif //RDTS_MEASURE + +//#define VMA_TIME_MEASURE 1 +#ifdef VMA_TIME_MEASURE + +#define POLL_START 0 +#define CQ_IN_START 1 +#define POLL_END 2 +#define RX_START 3 +#define 
RX_END 4 +#define TX_START 5 +#define TX_POST_SEND_START 6 +#define TX_POST_SEND_END 7 +#define TX_END 8 + +#define POLL_START_TO_CQ_IN 9 +#define POLL_CQ_IN_TO_POLL_END 10 +#define POLL_DELTA 11 +#define RX_START_TO_CQ_IN 12 +#define RX_CQ_IN_TO_POLL_END 13 +#define RX_DELTA 14 +#define TX_START_TO_POST_SND_S 15 +#define TX_POST_SND_S_TO_E 16 +#define TX_POST_SND_E_TO_TX_END 17 + +#define INST_SIZE 2000000 + +#define INST_SAMPLS (TX_END - POLL_START + 1) +#define INST_SUMS (TX_POST_SND_E_TO_TX_END - POLL_START_TO_CQ_IN + 1) + + +#define TAKE_TIME_2M(__i__) do {if (g_inst_cnt end) || (0== start)) ? false : true) + +#define INC_POLL_COUNT do {if (g_inst_cnt +# +# field: +# Defines the program name (not including the path) the statments, which +# appears below that directive, apply to. +# Wildcards with same semantics as "ls" are supported (* and ?). +# So db2* would match on any program with a name starting with db2. +# t?cp would match on ttcp, etc. +# If not provided (default) the statement matches all programs. +# +# field: +# Specify the proccess ID the statments which appears below that directive, +# apply to. +# In order to configure user-defined-id besides specifying it in +# configuration files, also needed to set the +# VMA_APPLICATION_ID environment variable to the same value which +# appears in user-defined-id filed of the application-id statment. +# +############################################################################## +# +# LOG CONFIGURATION: NOT FULLY SUPPORTED YET ! +# ------------------ +# The log directive allows the user to specify which and where debug and error +# messages get sent. The log statement format is: +# log [destination stderr|syslog|file ] [min-level <1-9>] +# +# destination - defines the destination of the log messages: +# stderr - messages will be forwarded to the stderr +# syslog - messages sent to the syslog service +# file - messages will be written to the file /var/log/ for root. 
+# for regular user, if full path is requsted . +# or /tmp/. if no path is requested +# +# min-level - defines the verbosity of the log: +# 9 - only errors are printed +# 8 - warnings +# 7 - connect and listen summary (useful for tracking SDP usage) +# 4 - positive match summary (useful for config file debug) +# 3 - negative match summary (useful for config file debug) +# 2 - function calls and return values +# 1 - debug messages +# +############################################################################## +# +# SOCKET TRANSPORT CONTROL: +# ------------------------------ +# The socket control statements allows the user to specify when libvma will +# offload AF_INET/SOCK_STREAM or AF_INET/SOCK_DATAGRAM sockets. +# Each control statement specifies a matching rule that all its +# subexpressions must evaluate as true (logical and) to apply. +# +# The statements that control which type transport to use are made +# of the following: +# use :[::] +# +# can be one of: +# "vma" - for specifying when VMA should be used. +# "os" - for specifying when socket should handled by os network stack. +# +# can be one of: +# 1. "tcp_server". +# 2. "tcp_client" +# 3. "udp_sender" +# 4. "udp_receiver" +# 5. "udp_connect" +# +# means: +# Either the local address the server is bind to or the remote server +# address the client connects to. Syntax for address matching is: +# [/]|* +# IPv4 address = [0-9]+\.[0-9]+\.[0-9]+\.[0-9]+ each sub number < 255 +# prefix_length = [0-9]+ and with value <= 32. A prefix_length of 24 +# matches the subnet mask 255.255.255.0 . A prefix_length of 32 +# requires matching of the exact IP. +# +# is: +# start-port[-end-port] where port numbers are >0 and < 65536 +# +# Rules are evaluated in order of definition. So the first match wins. +# If no match is made vma will default to "vma". 
+# +# Detailed explanation: +# +# udp_receiver - local_ip:local_port +# udp_sender - remote_ip:remote_port +# udp_connect - remote_ip:remote_port:local_ip:local_port +# tcp_server - local_ip:local_port +# tcp_client - remote_ip:remote_port:local_ip:local_port +# +# For TCP, the socket must be either all (recv & send / listen) offloaded or all not offloaded. +# For UDP, the recv and send from/to each address are independent, and can be separately defined as offloaded or not. +# +# For TCP connected socket - use tcp_client with 5 tuple specification starting from remote and ending with local address. +# For TCP listen socket - use tcp_server with the local address to listen on. +# For TCP accepted sockets - offloading is determined by the listen socket it was accepted through. +# If the listen socket is offloaded, all accepted sockets are offloaded and if it is not offloaded, all accepted sockets are not offloaded. +# Therefore, no special rule is available for accepted sockets. +# +# For UDP receive - use udp_receiver with local address. For MC, use MC ip and port. +# For UDP send - use udp_sender with remote address. For MC, use MC ip and port. +# For UDP connect - use 5 tuple specification starting from remote and ending with local address. +# UDP connect rule will only affect UDP sockets which connect() have been called on. +# A "use os udp_connect" rule will pass the socket to be handled exclusively by OS (no more offloaded traffic). +# A "use vma udp_connect" rule still leave the socket to be impacted by udp_sender and udp_receiver rules (which makes it practically useless). 
+# +############################################################################# +# +# Examples: +#------------------------------- +# Apply the rules to program tcp_lat with ID B1, also don't forget to set +# the VMA_APPLICATION_ID: export VMA_APPLICATION_ID=B1 +# application user defined id +# application-id tcp_lat B1 +# +# Use VMA by TCP clients on any local interface connecting to machines that belongs to subnet 192.168.1.* +# transport role address:port[-range] +# use vma tcp_client 192.168.1.0/24:*:*:* +# +# Use OS by when TCP server listens to port 5001 of any machine +# family role address:port[-range] +# use os tcp_server *:5001 +# +############################################################################### diff --git a/src/vma/util/libvma.h b/src/vma/util/libvma.h new file mode 100644 index 0000000..bcd2c9c --- /dev/null +++ b/src/vma/util/libvma.h @@ -0,0 +1,303 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef _LIBVMA_H +#define _LIBVMA_H + +#include + +#include "vtypes.h" +#include "vma/lwip/opt.h" + +/* --------------------------------------------------------------------- */ +/* library static and global variables */ +/* --------------------------------------------------------------------- */ + +/* max string length to store any IPv4/IPv6 address */ +#define MAX_ADDR_STR_LEN 49 +#define MAX_IF_NAME_LEN 10 +#define MAX_CONF_FILE_ENTRY_STR_LEN 512 +// from lwip +#define NETIF_MAX_HWADDR_LEN 6U + +#ifdef __cplusplus +extern "C" { +#endif + +typedef enum +{ + ROLE_TCP_SERVER, + ROLE_TCP_CLIENT, + ROLE_UDP_RECEIVER, + ROLE_UDP_SENDER, + ROLE_UDP_CONNECT +} role_t; + +typedef enum { + TRANS_OS = 1, + TRANS_VMA, + TRANS_SDP, + TRANS_SA, + TRANS_ULP, + TRANS_DEFAULT +} transport_t; + +typedef enum { + PROTO_UNDEFINED, + PROTO_UDP, + PROTO_TCP, + PROTO_ALL +} in_protocol_t; + +typedef enum { + DEV_CLONE, + DEV_REPLACE +} dev_conf_mode_t; + +typedef enum { + IN_ADDR_DHCP, + IN_ADDR_STATIC +} in_addr_alloc_mode_t; + +typedef enum { + MAC_AUTO_GEN, + MAC_MANUAL +} mac_alloc_mode_t; + + +/* some state to string functions */ +static inline const char *__vma_get_transport_str(transport_t transport ) +{ + switch (transport) { + case TRANS_OS: + return "OS"; + break; + case TRANS_VMA: + return "VMA"; + break; + case TRANS_SDP: + return "SDP"; + break; + case TRANS_SA: + return "SA"; + break; + case TRANS_ULP: + return "ULP"; + break; + case TRANS_DEFAULT: + return "DEFAULT"; + 
break; + } + return ( "UNKNOWN-TRANSPORT" ); +} + +/* some state to string functions */ +static inline const char *__vma_get_protocol_str(in_protocol_t protocol) +{ + switch (protocol) { + case PROTO_UNDEFINED: return "UNDEFINED"; + case PROTO_UDP: return "UDP"; + case PROTO_TCP: return "TCP"; + case PROTO_ALL: return "*"; + default: + break; + } + return ("unknown-protocol"); +} + +static inline const char *__vma_get_role_str(int role) +{ + switch (role) { + case ROLE_TCP_CLIENT: + return("tcp_client"); + break; + case ROLE_TCP_SERVER: + return("tcp_server"); + break; + case ROLE_UDP_RECEIVER: + return("udp_receiver"); + break; + case ROLE_UDP_SENDER: + return("udp_sender"); + break; + case ROLE_UDP_CONNECT: + return("udp_connect"); + break; + default: + break; + } + return("unknown role"); +} + +struct dbl_lst_node +{ + struct dbl_lst_node *prev, *next; + void *data; +}; + +struct dbl_lst +{ + struct dbl_lst_node *head; + struct dbl_lst_node *tail; +}; + +struct address_port_rule +{ + int match_by_addr; /* if 0 ignore address match */ + struct in_addr ipv4; /* IPv4 address for mapping */ + unsigned char prefixlen; /* length of CIDR prefix (ie /24) */ + int match_by_port; /* if 0 ignore port match */ + unsigned short sport, eport; /* start port - end port, inclusive */ +}; + +/* data structure for holding address family mapping rules */ +/* note we filter non relevant programs during parsing ... 
*/ +struct use_family_rule +{ + struct address_port_rule first; + struct address_port_rule second; + unsigned char use_second; + transport_t target_transport; /* if match - use this transport */ + in_protocol_t protocol; /* protocol family for mapping */ +}; + +/* data structure for holding the devices vma will handle */ +struct vma_device +{ + dev_conf_mode_t conf_mode; /* clone or replace insterface */ + u8_t hw_addr[NETIF_MAX_HWADDR_LEN]; /* interface physical address */ + u8_t hw_addr_len; /* interface physical address length */ + in_addr_alloc_mode_t in_addr_alloc_mode;/* static or dhcp */ + mac_alloc_mode_t mac_alloc_mode; /* manual or autogen */ + struct in_addr ipv4; /* interface IPv4 address */ + unsigned char prefixlen; /* prefix len of interface IPv4 address */ + char if_name[MAX_IF_NAME_LEN + 1]; /*base interface name */ +}; + +struct instance_id +{ + char *prog_name_expr; + char *user_defined_id; +}; + +/* data structure for holding the instances descriptors */ +struct instance +{ + struct instance_id id; /* instance id */ + struct dbl_lst tcp_clt_rules_lst; /* tcp client's rules list */ + struct dbl_lst tcp_srv_rules_lst; /* tcp server's rules list */ + struct dbl_lst udp_snd_rules_lst; /* udp sender rules list */ + struct dbl_lst udp_rcv_rules_lst; /* udp receiver rules list */ + struct dbl_lst udp_con_rules_lst; /* udp connect rules list */ +}; + +extern struct dbl_lst __instance_list; +extern int __vma_min_level; + +#define VMA_NETMASK(n) ((n == 0) ? 
0 : ~((1UL<<(32 - n)) - 1)) +#define IF_NAME_LEN 10 + +/* match.cpp */ +transport_t __vma_match_tcp_client(transport_t my_transport, const char *app_id, const struct sockaddr *sin_first, const socklen_t sin_addrlen_first, const struct sockaddr *sin_second, const socklen_t sin_addrlen_second); + +transport_t __vma_match_tcp_server(transport_t my_transport, const char *app_id, const struct sockaddr *sin, const socklen_t addrlen); + +transport_t __vma_match_udp_sender(transport_t my_transport, const char *app_id, const struct sockaddr * sin, const socklen_t addrlen); + +transport_t __vma_match_udp_receiver(transport_t my_transport, const char *app_id, const struct sockaddr * sin, const socklen_t addrlen); + +transport_t __vma_match_udp_connect(transport_t my_transport, const char *app_id, const struct sockaddr *sin_first, const socklen_t sin_addrlen_first, const struct sockaddr *sin_second, const socklen_t sin_addrlen_second); + +/* config.c */ +int __vma_config_empty(void); + +int __vma_parse_config_file(const char *config_file); + +int __vma_parse_config_line(const char *config_line); + +void __vma_print_conf_file(struct dbl_lst conf_lst); + +void __vma_free_resources(void); + +int __vma_match_program_name(struct instance *instance); + +int __vma_match_user_defined_id(struct instance *instance, const char *app_id); + +transport_t __vma_match_by_program(in_protocol_t my_protocol, const char *app_id); + +/* log.c */ +#if 0 +static inline +void __vma_log( + int level, + char *format, + ... ) +{ + NOT_IN_USE(level) + vlog_ +}; +#endif + +#define __vma_log(level, format, args...) 
\ + printf(format, ##args) + +static inline int __vma_log_get_level(void) +{ + return __vma_min_level; +} + +static inline void __vma_log_set_min_level(int level ) +{ + __vma_min_level= level; +}; + +//TODO AlexV: implement this function +static inline int __vma_log_set_log_stderr(void) {return 0;}; + +//TODO AlexV: implement this function +static inline int __vma_log_set_log_syslog(void) {return 0;}; + +//TODO AlexV: implement this function +static inline int __vma_log_set_log_file(char *filename ) +{ + NOT_IN_USE(filename); + return 0; +}; + +int __vma_sockaddr_to_vma(const struct sockaddr *addr_in, socklen_t addrlen, struct sockaddr_in *addr_out, int *was_ipv6 ); + +#ifdef __cplusplus +}; +#endif + +#endif diff --git a/src/vma/util/list.h b/src/vma/util/list.h new file mode 100644 index 0000000..1c7380c --- /dev/null +++ b/src/vma/util/list.h @@ -0,0 +1,567 @@ +#ifndef _LINUX_LIST_H +#define _LINUX_LIST_H + +/* + * Simple doubly linked list implementation. + * + * Some of the internal functions ("__xxx") are useful when + * manipulating whole lists rather than single entries, as + * sometimes we already know the next/prev entries and we can + * generate better code by using them directly rather than + * using the generic single-entry routines. + */ + +/* This is a modified version of Linux kernel list.h */ + +struct list_head { + struct list_head *next, *prev; +}; + +#define LIST_HEAD_INIT(name) { &(name), &(name) } + +#define LIST_HEAD(name) \ + struct list_head name = LIST_HEAD_INIT(name) + +static inline void INIT_LIST_HEAD(struct list_head *list) +{ + list->next = list; + list->prev = list; +} + +/* + * Insert a new entry between two known consecutive entries. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! 
+ */ +#ifndef CONFIG_DEBUG_LIST +static inline void __list_add(struct list_head *new_item, + struct list_head *prev, + struct list_head *next) +{ + next->prev = new_item; + new_item->next = next; + new_item->prev = prev; + prev->next = new_item; +} +#else +extern void __list_add(struct list_head *new_item, + struct list_head *prev, + struct list_head *next); +#endif + +/** + * list_add - add a new entry + * @new_item: new entry to be added + * @head: list head to add it after + * + * Insert a new entry after the specified head. + * This is good for implementing stacks. + */ +static inline void list_add(struct list_head *new_item, struct list_head *head) +{ + __list_add(new_item, head, head->next); +} + + +/** + * list_add_tail - add a new entry + * @new_item: new entry to be added + * @head: list head to add it before + * + * Insert a new entry before the specified head. + * This is useful for implementing queues. + */ +static inline void list_add_tail(struct list_head *new_item, struct list_head *head) +{ + __list_add(new_item, head->prev, head); +} + +/* + * Delete a list entry by making the prev/next entries + * point to each other. + * + * This is only for internal list manipulation where we know + * the prev/next entries already! + */ +static inline void __list_del(struct list_head * prev, struct list_head * next) +{ + next->prev = prev; + prev->next = next; +} + +/** + * list_del - deletes entry from list. + * @entry: the element to delete from the list. + * Note: list_empty() on entry does not return true after this, the entry is + * in an undefined state. 
+ */ +#ifndef CONFIG_DEBUG_LIST +static inline void list_del(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + entry->next = 0; + entry->prev = 0; +} +#else +extern void list_del(struct list_head *entry); +#endif + +/** + * list_replace - replace old entry by new one + * @old : the element to be replaced + * @new_item : the new element to insert + * + * If @old was empty, it will be overwritten. + */ +static inline void list_replace(struct list_head *old, + struct list_head *new_item) +{ + new_item->next = old->next; + new_item->next->prev = new_item; + new_item->prev = old->prev; + new_item->prev->next = new_item; +} + +static inline void list_replace_init(struct list_head *old, + struct list_head *new_item) +{ + list_replace(old, new_item); + INIT_LIST_HEAD(old); +} + +/** + * list_del_init - deletes entry from list and reinitialize it. + * @entry: the element to delete from the list. + */ +static inline void list_del_init(struct list_head *entry) +{ + __list_del(entry->prev, entry->next); + INIT_LIST_HEAD(entry); +} + +/** + * list_move - delete from one list and add as another's head + * @list: the entry to move + * @head: the head that will precede our entry + */ +static inline void list_move(struct list_head *list, struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add(list, head); +} + +/** + * list_move_tail - delete from one list and add as another's tail + * @list: the entry to move + * @head: the head that will follow our entry + */ +static inline void list_move_tail(struct list_head *list, + struct list_head *head) +{ + __list_del(list->prev, list->next); + list_add_tail(list, head); +} + +/** + * list_is_last - tests whether @list is the last entry in list @head + * @list: the entry to test + * @head: the head of the list + */ +static inline int list_is_last(const struct list_head *list, + const struct list_head *head) +{ + return list->next == head; +} + +/** + * list_empty - tests whether a list is empty + * 
@head: the list to test. + */ +static inline int list_empty(const struct list_head *head) +{ + return head->next == head; +} + +/** + * list_empty_careful - tests whether a list is empty and not being modified + * @head: the list to test + * + * Description: + * tests whether a list is empty _and_ checks that no other CPU might be + * in the process of modifying either member (next or prev) + * + * NOTE: using list_empty_careful() without synchronization + * can only be safe if the only activity that can happen + * to the list entry is list_del_init(). Eg. it cannot be used + * if another CPU could re-list_add() it. + */ +static inline int list_empty_careful(const struct list_head *head) +{ + struct list_head *next = head->next; + return (next == head) && (next == head->prev); +} + +/** + * list_rotate_left - rotate the list to the left + * @head: the head of the list + */ +static inline void list_rotate_left(struct list_head *head) +{ + struct list_head *first; + + if (!list_empty(head)) { + first = head->next; + list_move_tail(first, head); + } +} + +/** + * list_is_singular - tests whether a list has just one entry. + * @head: the list to test. + */ +static inline int list_is_singular(const struct list_head *head) +{ + return !list_empty(head) && (head->next == head->prev); +} + +static inline void __list_cut_position(struct list_head *list, + struct list_head *head, struct list_head *entry) +{ + struct list_head *new_first = entry->next; + list->next = head->next; + list->next->prev = list; + list->prev = entry; + entry->next = list; + head->next = new_first; + new_first->prev = head; +} + +/** + * list_cut_position - cut a list into two + * @list: a new list to add all removed entries + * @head: a list with entries + * @entry: an entry within head, could be the head itself + * and if so we won't cut the list + * + * This helper moves the initial part of @head, up to and + * including @entry, from @head to @list. 
You should + * pass on @entry an element you know is on @head. @list + * should be an empty list or a list you do not care about + * losing its data. + * + */ +static inline void list_cut_position(struct list_head *list, + struct list_head *head, struct list_head *entry) +{ + if (list_empty(head)) + return; + if (list_is_singular(head) && + (head->next != entry && head != entry)) + return; + if (entry == head) + INIT_LIST_HEAD(list); + else + __list_cut_position(list, head, entry); +} + +static inline void __list_splice(const struct list_head *list, + struct list_head *prev, + struct list_head *next) +{ + struct list_head *first = list->next; + struct list_head *last = list->prev; + + first->prev = prev; + prev->next = first; + + last->next = next; + next->prev = last; +} + +/** + * list_splice - join two lists, this is designed for stacks + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice(const struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head, head->next); +} + +/** + * list_splice_tail - join two lists, each list being a queue + * @list: the new list to add. + * @head: the place to add it in the first list. + */ +static inline void list_splice_tail(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) + __list_splice(list, head->prev, head); +} + +/** + * list_splice_init - join two lists and reinitialise the emptied list. + * @list: the new list to add. + * @head: the place to add it in the first list. + * + * The list at @list is reinitialised + */ +static inline void list_splice_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head, head->next); + INIT_LIST_HEAD(list); + } +} + +/** + * list_splice_tail_init - join two lists and reinitialise the emptied list + * @list: the new list to add. + * @head: the place to add it in the first list. 
+ * + * Each of the lists is a queue. + * The list at @list is reinitialised + */ +static inline void list_splice_tail_init(struct list_head *list, + struct list_head *head) +{ + if (!list_empty(list)) { + __list_splice(list, head->prev, head); + INIT_LIST_HEAD(list); + } +} + +/** + * list_entry - get the struct for this entry + * @ptr: the &struct list_head pointer. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + */ +#ifdef __cplusplus +#define list_entry(ptr, type, member) \ + (reinterpret_cast((char *)(ptr)-(char *)(&(reinterpret_cast(1)->member))+1)) +#else +#define list_entry(ptr, type, member) \ + container_of(ptr, type, member) +#endif + +/** + * list_first_entry - get the first element from a list + * @ptr: the list head to take the element from. + * @type: the type of the struct this is embedded in. + * @member: the name of the list_struct within the struct. + * + * Note, that list is expected to be not empty. + */ +#define list_first_entry(ptr, type, member) \ + list_entry((ptr)->next, type, member) + +/** + * list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + */ +#define list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * __list_for_each - iterate over a list + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. + * + * This variant doesn't differ from list_for_each() any more. + * We don't do prefetching in either case. + */ +#define __list_for_each(pos, head) \ + for (pos = (head)->next; pos != (head); pos = pos->next) + +/** + * list_for_each_prev - iterate over a list backwards + * @pos: the &struct list_head to use as a loop cursor. + * @head: the head for your list. 
+ */ +#define list_for_each_prev(pos, head) \ + for (pos = (head)->prev; pos != (head); pos = pos->prev) + +/** + * list_for_each_safe - iterate over a list safe against removal of list entry + * @pos: the &struct list_head to use as a loop cursor. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_safe(pos, n, head) \ + for (pos = (head)->next, n = pos->next; pos != (head); \ + pos = n, n = pos->next) + +/** + * list_for_each_prev_safe - iterate over a list backwards safe against removal of list entry + * @pos: the &struct list_head to use as a loop cursor. + * @n: another &struct list_head to use as temporary storage + * @head: the head for your list. + */ +#define list_for_each_prev_safe(pos, n, head) \ + for (pos = (head)->prev, n = pos->prev; \ + pos != (head); \ + pos = n, n = pos->prev) + +/** + * list_for_each_entry - iterate over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry(pos, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_reverse - iterate backwards over list of given type. + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_reverse(pos, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +/** + * list_prepare_entry - prepare a pos entry for use in list_for_each_entry_continue() + * @pos: the type * to use as a start point + * @head: the head of the list + * @member: the name of the list_struct within the struct. 
+ * + * Prepares a pos entry for use as a start point in list_for_each_entry_continue(). + */ +#define list_prepare_entry(pos, head, member) \ + ((pos) ? : list_entry(head, typeof(*pos), member)) + +/** + * list_for_each_entry_continue - continue iteration over list of given type + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Continue to iterate over list of given type, continuing after + * the current position. + */ +#define list_for_each_entry_continue(pos, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_continue_reverse - iterate backwards from the given point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Start to iterate over list of given type backwards, continuing after + * the current position. + */ +#define list_for_each_entry_continue_reverse(pos, head, member) \ + for (pos = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = list_entry(pos->member.prev, typeof(*pos), member)) + +/** + * list_for_each_entry_from - iterate over list of given type from the current point + * @pos: the type * to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Iterate over list of given type, continuing from current position. + */ +#define list_for_each_entry_from(pos, head, member) \ + for (; &pos->member != (head); \ + pos = list_entry(pos->member.next, typeof(*pos), member)) + +/** + * list_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. 
+ * @member: the name of the list_struct within the struct. + */ +#define list_for_each_entry_safe(pos, n, head, member) \ + for (pos = list_entry((head)->next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * list_for_each_entry_safe_continue + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Iterate over list of given type, continuing after current point, + * safe against removal of list entry. + */ +#define list_for_each_entry_safe_continue(pos, n, head, member) \ + for (pos = list_entry(pos->member.next, typeof(*pos), member), \ + n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * list_for_each_entry_safe_from + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Iterate over list of given type from current point, safe against + * removal of list entry. + */ +#define list_for_each_entry_safe_from(pos, n, head, member) \ + for (n = list_entry(pos->member.next, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.next, typeof(*n), member)) + +/** + * list_for_each_entry_safe_reverse + * @pos: the type * to use as a loop cursor. + * @n: another type * to use as temporary storage + * @head: the head for your list. + * @member: the name of the list_struct within the struct. + * + * Iterate backwards over list of given type, safe against removal + * of list entry. 
+ */ +#define list_for_each_entry_safe_reverse(pos, n, head, member) \ + for (pos = list_entry((head)->prev, typeof(*pos), member), \ + n = list_entry(pos->member.prev, typeof(*pos), member); \ + &pos->member != (head); \ + pos = n, n = list_entry(n->member.prev, typeof(*n), member)) + +/** + * list_safe_reset_next - reset a stale list_for_each_entry_safe loop + * @pos: the loop cursor used in the list_for_each_entry_safe loop + * @n: temporary storage used in list_for_each_entry_safe + * @member: the name of the list_struct within the struct. + * + * list_safe_reset_next is not safe to use in general if the list may be + * modified concurrently (eg. the lock is dropped in the loop body). An + * exception to this is if the cursor element (pos) is pinned in the list, + * and list_safe_reset_next is called after re-taking the lock and before + * completing the current iteration of the loop body. + */ +#define list_safe_reset_next(pos, n, member) \ + n = list_entry(pos->member.next, typeof(*pos), member) + +/* + * Double linked lists with a single pointer list head. + * Mostly useful for hash tables where the two pointer list head is + * too wasteful. + * You lose the ability to access the tail in O(1). + */ + +#endif /* _LINUX_LIST_H */ diff --git a/src/vma/util/match.cpp b/src/vma/util/match.cpp new file mode 100644 index 0000000..73bc385 --- /dev/null +++ b/src/vma/util/match.cpp @@ -0,0 +1,713 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +/* + * system includes + */ +#if HAVE_CONFIG_H +# include +#endif /* HAVE_CONFIG_H */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * VMA specific includes + */ +#include "libvma.h" + +// debugging macros +#define MODULE_NAME "match:" + +#define match_logpanic __log_panic +#define match_logerr __log_err +#define match_logwarn __log_warn +#define match_loginfo __log_info +#define match_logdbg __log_dbg +#define match_logfunc __log_func +#define match_logfuncall __log_funcall + +/* --------------------------------------------------------------------- */ +/* library static and global variables */ +/* --------------------------------------------------------------------- */ +extern char *program_invocation_name, *program_invocation_short_name; + +static void free_dbl_lst(struct dbl_lst *dbl_lst) +{ + struct dbl_lst_node *node, *tmp; + + node = dbl_lst->head; + while (node) { + tmp = node->next; + if (node->data) + free(node->data); + + free(node); + node = tmp; + } + dbl_lst->head = NULL; + dbl_lst->tail = NULL; +} + +static void free_instance_content(struct instance *instance) +{ + if (!instance) + return; + + /* free srever's rules */ + free_dbl_lst(&instance->tcp_srv_rules_lst); + + /*free client's rules */ + free_dbl_lst(&instance->tcp_clt_rules_lst); + + /* free the instance id content*/ + if (instance->id.prog_name_expr) + free(instance->id.prog_name_expr); + + if (instance->id.user_defined_id) + free(instance->id.user_defined_id); + free(instance); +} + +void __vma_free_resources(void) +{ + struct dbl_lst_node *node, *tmp; + + /* free the instances */ + node = __instance_list.head; + while (node) { + tmp = node->next; + free_instance_content((struct instance *)node->data); + free(node); + node = tmp; + } + __instance_list.head = NULL; + __instance_list.tail = NULL; +} + +void get_address_port_rule_str(char *addr_buf, char *ports_buf, struct address_port_rule *rule) +{ + char 
str_addr[INET_ADDRSTRLEN]; + + /* TODO: handle IPv6 in rule */ + if (rule->match_by_addr) { + inet_ntop(AF_INET, &(rule->ipv4), str_addr, sizeof(str_addr)); + if (rule->prefixlen != 32) { + sprintf(addr_buf, "%s/%d", str_addr, rule->prefixlen ); + } else { + sprintf(addr_buf, "%s", str_addr); + } + } else { + sprintf(addr_buf, "%s" ,"*"); + } + + if (rule->match_by_port) + if (rule->eport > rule->sport) + sprintf(ports_buf, "%d-%d", rule->sport, rule->eport); + else + sprintf(ports_buf, "%d", rule->sport); + else + sprintf(ports_buf, "*"); +} + +static void get_rule_str(struct use_family_rule *rule, char *buf, size_t len) +{ + if (!rule) { + snprintf(buf, len, " "); + return; + } + + char addr_buf_first[MAX_ADDR_STR_LEN]; + char ports_buf_first[16]; + char addr_buf_second[MAX_ADDR_STR_LEN]; + char ports_buf_second[16]; + const char *target = __vma_get_transport_str(rule->target_transport); + const char *protocol = __vma_get_protocol_str(rule->protocol); + + get_address_port_rule_str(addr_buf_first, ports_buf_first, &(rule->first)); + if (rule->use_second) { + get_address_port_rule_str(addr_buf_second, ports_buf_second, &(rule->second)); + snprintf(buf, len, "use %s %s %s:%s:%s:%s", target, protocol, addr_buf_first, ports_buf_first, addr_buf_second, ports_buf_second); + } else { + snprintf(buf, len, "use %s %s %s:%s", target, protocol, addr_buf_first, ports_buf_first); + } +} + +static void get_instance_id_str(struct instance *instance, char *buf, size_t len) +{ + if (instance) + snprintf(buf, len, "application-id %s %s", instance->id.prog_name_expr, instance->id.user_defined_id); + else + snprintf(buf, len, " "); +} + +static void print_rule(struct use_family_rule *rule) +{ + char rule_str[MAX_CONF_FILE_ENTRY_STR_LEN] = " "; + + if(rule) { + get_rule_str(rule, rule_str, MAX_CONF_FILE_ENTRY_STR_LEN); + } + match_logdbg("\t\t\t%s", rule_str); +} + +static void print_instance_id_str(struct instance *instance) +{ + char instance_str[MAX_CONF_FILE_ENTRY_STR_LEN] = " "; 
+ + if(instance) { + get_instance_id_str(instance, instance_str, MAX_CONF_FILE_ENTRY_STR_LEN); + } + match_logdbg("%s:", instance_str); +} + +static void print_rules_lst(struct dbl_lst_node *curr_node) +{ + while (curr_node) { + struct use_family_rule *rule = (struct use_family_rule *)curr_node->data; + print_rule(rule); + curr_node = curr_node->next; + } +} + +static void print_instance_conf(struct instance *instance) +{ + if (!instance) { + match_logdbg("\tinstance is empty"); + } else { + print_instance_id_str(instance); + + struct dbl_lst_node *node = instance->tcp_srv_rules_lst.head; + match_logdbg("\ttcp_server's rules:"); + print_rules_lst(node); + + node = instance->tcp_clt_rules_lst.head; + match_logdbg("\ttcp_clinet's rules:"); + print_rules_lst(node); + + node = instance->udp_rcv_rules_lst.head; + match_logdbg("\tudp receiver rules:"); + print_rules_lst(node); + + node = instance->udp_snd_rules_lst.head; + match_logdbg("\tudp sender rules:"); + print_rules_lst(node); + + node = instance->udp_con_rules_lst.head; + match_logdbg("\tudp connect rules:"); + print_rules_lst(node); + + match_logdbg(" "); + } +} + +void __vma_print_conf_file(struct dbl_lst conf_lst) +{ + struct dbl_lst_node *node = conf_lst.head; + + match_logdbg("Configuration File:"); + while (node) { + struct instance *instance = (struct instance *)node->data; + print_instance_conf(instance); + node = node->next; + } +} + +/* return 0 if the addresses match */ +static inline int match_ipv4_addr(struct address_port_rule *rule, const struct sockaddr_in *sin) +{ + // Added netmask on rule side to avoid user mistake when configuring ip rule: 1.1.1.x/24 instead of 1.1.1.0/24 + match_logdbg("rule ip address:%d.%d.%d.%d, socket ip address:%d.%d.%d.%d ", NIPQUAD(rule->ipv4.s_addr & htonl(VMA_NETMASK(rule->prefixlen))), NIPQUAD(sin->sin_addr.s_addr & htonl(VMA_NETMASK(rule->prefixlen)))); + return ( (rule->ipv4.s_addr & htonl(VMA_NETMASK(rule->prefixlen))) != (sin->sin_addr.s_addr & 
htonl(VMA_NETMASK(rule->prefixlen)))); +} + +static int match_ip_addr_and_port(transport_t my_transport, struct use_family_rule *rule, const struct sockaddr *addr_in_first, const socklen_t addrlen_first, const struct sockaddr *addr_in_second = NULL, const socklen_t addrlen_second = 0) +{ + const struct sockaddr_in *sin_first = ( const struct sockaddr_in * )addr_in_first; + const struct sockaddr_in *sin_second = ( const struct sockaddr_in * )addr_in_second; + const struct sockaddr_in6 *sin6_first = ( const struct sockaddr_in6 * )addr_in_first; + const struct sockaddr_in6 *sin6_second = ( const struct sockaddr_in6 * )addr_in_second; + struct sockaddr_in tmp_sin_first; + struct sockaddr_in tmp_sin_second; + unsigned short port_first; + unsigned short port_second; + int match = 1; + char addr_buf_first[MAX_ADDR_STR_LEN]; + const char *addr_str_first; + char addr_buf_second[MAX_ADDR_STR_LEN]; + const char *addr_str_second; + char rule_str[512]; + + if ( g_vlogger_level >= VLOG_DEBUG ){ + + get_rule_str(rule, rule_str, sizeof(rule_str)); + + if ( sin6_first->sin6_family == AF_INET6 ) { + addr_str_first = inet_ntop( AF_INET6, (void *)&(sin6_first->sin6_addr), addr_buf_first, MAX_ADDR_STR_LEN); + port_first = ntohs(sin6_first->sin6_port); + } else { + addr_str_first = inet_ntop( AF_INET, (void *)&(sin_first->sin_addr), addr_buf_first, MAX_ADDR_STR_LEN); + port_first = ntohs(sin_first->sin_port); + } + if (addr_str_first == NULL) + addr_str_first = "INVALID_ADDR"; + + if (addr_in_second) { + if ( sin6_second->sin6_family == AF_INET6 ) { + addr_str_second = inet_ntop( AF_INET6, (void *)&(sin6_second->sin6_addr), addr_buf_second, MAX_ADDR_STR_LEN); + port_second = ntohs(sin6_second->sin6_port); + } else { + addr_str_second = inet_ntop( AF_INET, (void *)&(sin_second->sin_addr), addr_buf_second, MAX_ADDR_STR_LEN); + port_second = ntohs(sin_second->sin_port); + } + if (addr_str_second == NULL) + addr_str_second = "INVALID_ADDR"; + + match_logdbg("MATCH: matching %s:%d:%s:%d to 
%s => ", addr_str_first, port_first, addr_str_second, port_second, rule_str); + + } else { + match_logdbg("MATCH: matching %s:%d to %s => ", addr_str_first, port_first, rule_str); + } + + } + + /* We currently only support IPv4 and IPv4 embedded in IPv6 */ + if ( rule->first.match_by_port ) { + if ( sin6_first->sin6_family == AF_INET6 ) + port_first = ntohs( sin6_first->sin6_port ); + else + port_first = ntohs( sin_first->sin_port ); + + if ((port_first < rule->first.sport) || (port_first > rule->first.eport)) { + match_logdbg("NEGATIVE MATCH by port range" ); + match = 0; + } + } + + if ( match && rule->first.match_by_addr ) { + if ( __vma_sockaddr_to_vma( addr_in_first, addrlen_first, &tmp_sin_first, NULL ) || + match_ipv4_addr(&(rule->first), &tmp_sin_first)) { + match_logdbg("NEGATIVE MATCH by address" ); + match = 0; + } + } + + if (match && rule->use_second && addr_in_second) { + if ( rule->second.match_by_port ) { + if ( sin6_second->sin6_family == AF_INET6 ) + port_second = ntohs( sin6_second->sin6_port ); + else + port_second = ntohs( sin_second->sin_port ); + + if ((port_second < rule->second.sport) || (port_second > rule->second.eport)) { + match_logdbg("NEGATIVE MATCH by port range" ); + match = 0; + } + } + + if ( match && rule->second.match_by_addr ) { + if ( __vma_sockaddr_to_vma( addr_in_second, addrlen_second, &tmp_sin_second, NULL ) || + match_ipv4_addr(&(rule->second), &tmp_sin_second)) { + match_logdbg("NEGATIVE MATCH by address" ); + match = 0; + } + } + } + + if (match) { + if (!(rule->target_transport == TRANS_OS || rule->target_transport == TRANS_ULP || rule->target_transport == my_transport)) { + match_logdbg("NEGATIVE MATCH by transport" ); + match = 0; + } + else { + match_logdbg("POSITIVE MATCH"); + } + } + + return match; +} + +/* return 1 on match */ +int __vma_match_program_name(struct instance *instance) +{ + if (!instance) + return 1; + + return !fnmatch( instance->id.prog_name_expr, program_invocation_short_name, 0); +} + +/* 
return 1 on match */ +int __vma_match_user_defined_id(struct instance *instance, const char *app_id) +{ + int ret_val = 0; + + if (!instance || !instance->id.user_defined_id || !app_id ) + ret_val = 1; + else if (!strcmp(app_id, "*")) + ret_val = 1; + else if (!strcmp(instance->id.user_defined_id, "*")) + ret_val = 1; + else + ret_val = !strcmp(app_id, instance->id.user_defined_id); + + return ret_val; +} + +static transport_t get_family_by_first_matching_rule(transport_t my_transport, struct dbl_lst rules_lst, const struct sockaddr *sin_first, const socklen_t addrlen_first, const struct sockaddr *sin_second = NULL, const socklen_t addrlen_second = 0) +{ + struct dbl_lst_node *node; + + for (node = rules_lst.head; node != NULL; node = node->next) { + /* first rule wins */ + struct use_family_rule *rule = (struct use_family_rule *)node->data; + if (rule) + if (match_ip_addr_and_port(my_transport, rule, sin_first, addrlen_first, sin_second, addrlen_second)) + return rule->target_transport; + } + + match_logdbg("No matching rule. Using VMA (default)" ); + return TRANS_VMA; //No matching rule or no rule at all. 
Don't continue to next application-id
+}
+
+/* Resolve the target transport for one socket operation: walk the
+ * per-application "instance" list, and for the first instance whose program
+ * name and user-defined application-id both match, apply the rule list that
+ * corresponds to 'role'.  An empty configuration, or exhausting the list
+ * without any instance match, yields TRANS_VMA.  sin_second/addrlen_second
+ * are only consumed by the 5-tuple roles (TCP client / UDP connect). */
+static transport_t get_family_by_instance_first_matching_rule(transport_t my_transport, role_t role, const char *app_id, const struct sockaddr *sin_first, const socklen_t addrlen_first, const struct sockaddr *sin_second = NULL, const socklen_t addrlen_second = 0)
+{
+	transport_t target_family = TRANS_DEFAULT;
+
+	/* if we do not have any rules we use vma */
+	if ( __vma_config_empty()){
+		target_family = TRANS_VMA;
+	}
+	else{
+		struct dbl_lst_node *curr = __instance_list.head;
+
+		while (curr && target_family == TRANS_DEFAULT) {
+			struct instance *curr_instance = (struct instance *)curr->data;
+			if (curr_instance) {
+				/* skip if not our program */
+				if (__vma_match_program_name(curr_instance) && __vma_match_user_defined_id(curr_instance, app_id)) {
+					match_logdbg("MATCHING program name: %s, application-id: %s",curr_instance->id.prog_name_expr, curr_instance->id.user_defined_id);
+					/* dispatch on the socket role to this instance's rule list */
+					switch (role) {
+					case ROLE_TCP_SERVER:
+						target_family = get_family_by_first_matching_rule(my_transport, curr_instance->tcp_srv_rules_lst, sin_first, addrlen_first);
+						break;
+					case ROLE_TCP_CLIENT:
+						target_family = get_family_by_first_matching_rule(my_transport, curr_instance->tcp_clt_rules_lst, sin_first, addrlen_first, sin_second, addrlen_second);
+						break;
+					case ROLE_UDP_SENDER:
+						target_family = get_family_by_first_matching_rule(my_transport, curr_instance->udp_snd_rules_lst, sin_first, addrlen_first);
+						break;
+					case ROLE_UDP_RECEIVER:
+						target_family = get_family_by_first_matching_rule(my_transport, curr_instance->udp_rcv_rules_lst, sin_first, addrlen_first);
+						break;
+					case ROLE_UDP_CONNECT:
+						target_family = get_family_by_first_matching_rule(my_transport, curr_instance->udp_con_rules_lst, sin_first, addrlen_first, sin_second, addrlen_second);
+						break;
+					BULLSEYE_EXCLUDE_BLOCK_START
+					default:
+						break;
+					BULLSEYE_EXCLUDE_BLOCK_END
+					}
+				}
+			}
+			curr = curr->next;
+		}
+		/* walked the whole list without any rule deciding: default to VMA */
+		if(!curr && target_family == TRANS_DEFAULT) {
+			target_family = TRANS_VMA;
+		}
+	}
+	return target_family;
+}
+
+/* return the result of the first matching rule found */
+transport_t __vma_match_tcp_server(transport_t my_transport, const char *app_id, const struct sockaddr * sin, const socklen_t addrlen)
+{
+	transport_t target_family;
+
+	target_family = get_family_by_instance_first_matching_rule(my_transport, ROLE_TCP_SERVER, app_id, sin, addrlen);
+
+	match_logdbg("MATCH TCP SERVER (LISTEN): => %s", __vma_get_transport_str(target_family));
+
+	return target_family;
+}
+
+/* same as __vma_match_tcp_server but for connect(): matches on both the
+ * local (first) and remote (second) halves of the 5-tuple */
+transport_t __vma_match_tcp_client(transport_t my_transport, const char *app_id, const struct sockaddr * sin_first, const socklen_t addrlen_first, const struct sockaddr * sin_second, const socklen_t addrlen_second)
+{
+	transport_t target_family;
+
+	target_family = get_family_by_instance_first_matching_rule(my_transport, ROLE_TCP_CLIENT, app_id, sin_first, addrlen_first, sin_second, addrlen_second);
+
+	match_logdbg("MATCH TCP CLIENT (CONNECT): => %s", __vma_get_transport_str(target_family));
+
+	return target_family;
+}
+
+/* return the result of the first matching rule found */
+transport_t __vma_match_udp_sender(transport_t my_transport, const char *app_id, const struct sockaddr * sin, const socklen_t addrlen)
+{
+	transport_t target_family;
+
+	target_family = get_family_by_instance_first_matching_rule(my_transport, ROLE_UDP_SENDER, app_id, sin, addrlen);
+
+	match_logdbg("MATCH UDP SENDER: => %s", __vma_get_transport_str(target_family));
+
+	return target_family;
+}
+
+transport_t __vma_match_udp_receiver(transport_t my_transport, const char *app_id, const struct sockaddr * sin, const socklen_t addrlen)
+{
+	transport_t target_family;
+
+	target_family = get_family_by_instance_first_matching_rule(my_transport, ROLE_UDP_RECEIVER, app_id, sin, addrlen);
+
+	match_logdbg("MATCH UDP RECEIVER: => %s", __vma_get_transport_str(target_family));
+
+	return target_family;
+}
+
+transport_t __vma_match_udp_connect(transport_t my_transport, const char *app_id, const struct sockaddr * sin_first, const socklen_t addrlen_first, const struct sockaddr * sin_second, const socklen_t addrlen_second)
+{
+	transport_t target_family;
+
+	target_family = get_family_by_instance_first_matching_rule(my_transport, ROLE_UDP_CONNECT, app_id, sin_first, addrlen_first, sin_second, addrlen_second);
+
+	match_logdbg("MATCH UDP CONNECT: => %s", __vma_get_transport_str(target_family));
+
+	return target_family;
+}
+
+/* given a set of rules see if there is a global match for current program */
+static transport_t match_by_all_rules_program(in_protocol_t my_protocol, struct dbl_lst rules_lst)
+{
+	int any_vma = 0;
+	int any_os = 0;
+	int any_sdp = 0; /* NOTE(review): counted in the OS check below but never incremented -- looks like libsdp heritage; confirm intent */
+	transport_t target_family = TRANS_DEFAULT;
+	struct dbl_lst_node *node;
+	struct use_family_rule *rule;
+
+	for (node = rules_lst.head; (node != NULL) && (target_family == TRANS_DEFAULT) ; node = node->next ) {
+		/*
+		 * to declare a dont care we either have a dont care address and port
+		 * or the previous non global rules use the same target family as the
+		 * global rule
+		 */
+		rule = (struct use_family_rule *)node->data;
+
+		if (!rule)
+			continue;
+		if ((rule->protocol == my_protocol || my_protocol == PROTO_ALL) &&
+		    (rule->first.match_by_addr || rule->first.match_by_port || (rule->use_second && (rule->second.match_by_addr || rule->second.match_by_port )))) {
+			/* not a global match rule - just track the target family */
+			if (rule->target_transport == TRANS_VMA || rule->target_transport == TRANS_ULP)
+				any_vma++;
+			else if (rule->target_transport == TRANS_OS)
+				any_os++;
+		} else if (rule->protocol == my_protocol && !(rule->first.match_by_addr || rule->first.match_by_port || (rule->use_second && (rule->second.match_by_addr || rule->second.match_by_port )))){
+			/* a global match so we can declare a match by program */
+			if ((rule->target_transport == TRANS_VMA || rule->target_transport == TRANS_ULP) && (any_os == 0))
+				target_family = TRANS_VMA;
+			else if ((rule->target_transport == TRANS_OS) && (any_vma == 0) && (any_sdp == 0))
+				target_family = TRANS_OS;
+		}
+	}
+	if (target_family == TRANS_DEFAULT) {// no matching rules under application-id. use VMA. Don't continue to next application-id
+		target_family = TRANS_VMA;
+	}
+	return target_family;
+}
+
+/* return tcp or vma if the port and role are don't cares */
+transport_t __vma_match_by_program(in_protocol_t my_protocol, const char *app_id)
+{
+	transport_t server_target_family = TRANS_DEFAULT;
+	transport_t client_target_family = TRANS_DEFAULT;
+	transport_t target_family = TRANS_DEFAULT;
+	bool b_found_app_id_match = false;
+
+	if ( __vma_config_empty() ){
+		match_logdbg("Configuration file is empty. Using VMA (default)" );
+		target_family = TRANS_VMA;
+	}
+	else{
+		struct dbl_lst_node *node = __instance_list.head;
+
+		while (node && target_family == TRANS_DEFAULT) {
+			/* need to try both server and client rules */
+			struct instance* instance;
+			instance = (struct instance *)node->data;
+			if (instance && __vma_match_program_name(instance) && __vma_match_user_defined_id(instance, app_id)) {
+				b_found_app_id_match = true;
+				if (my_protocol == PROTO_TCP)
+				{
+					/* TCP */
+					server_target_family =
+						match_by_all_rules_program(my_protocol, instance->tcp_srv_rules_lst);
+					client_target_family =
+						match_by_all_rules_program(my_protocol, instance->tcp_clt_rules_lst);
+				}
+				else if(my_protocol == PROTO_UDP){
+					/* UDP */
+					server_target_family =
+						match_by_all_rules_program(my_protocol, instance->udp_rcv_rules_lst);
+					client_target_family =
+						match_by_all_rules_program(my_protocol, instance->udp_snd_rules_lst);
+				}
+
+				/* only if both agree */
+				if (server_target_family == client_target_family)
+					target_family = server_target_family;
+			}
+			node = node->next;
+		}
+	}
+
+	/* warn only when the user asked for a non-default application-id that was never found */
+	if (strcmp("VMA_DEFAULT_APPLICATION_ID", app_id) && !b_found_app_id_match)
+		match_logwarn("requested VMA_APPLICATION_ID does not exist in the configuration file");
+
+	return target_family;
+}
+
+/* is_ipv4_embedded_in_ipv6 -- return 1 if the
given ipv6 address is ipv4 */
+static int is_ipv4_embedded_in_ipv6(const struct sockaddr_in6 *sin6)
+{
+	/* all-zero reference used to compare the leading 10 bytes */
+	static struct in6_addr ipv4_embedded_addr = {{{0}}};
+
+	/* 10 first bytes must be 0 */
+	if (memcmp(&ipv4_embedded_addr.s6_addr[0], &sin6->sin6_addr.s6_addr[0], 10))
+		return 0;
+
+	/* next two must be all zeros or all ones */
+	if (((sin6->sin6_addr.s6_addr[10] == 0) &&
+		(sin6->sin6_addr.s6_addr[11] == 0)) ||
+		((sin6->sin6_addr.s6_addr[10] == 0xff) &&
+		(sin6->sin6_addr.s6_addr[11] == 0xff)))
+		return 1;
+
+	return 0;
+}
+
+#define IPV6_ADDR_IN_MIN_LEN 24
+/* Convert a caller-supplied sockaddr (IPv4, IPv4-embedded IPv6, or an
+ * address with family 0) into a plain sockaddr_in for VMA's IPv4-only
+ * data path.  Returns 0 on success; -1 with errno set to EINVAL or
+ * EAFNOSUPPORT on failure.  When non-NULL, *was_ipv6 reports whether
+ * the input address was IPv6. */
+int __vma_sockaddr_to_vma(const struct sockaddr *addr_in, socklen_t addrlen, struct sockaddr_in *addr_out, int *was_ipv6)
+{
+	const struct sockaddr_in *sin = (const struct sockaddr_in *) addr_in;
+	const struct sockaddr_in6 *sin6 = (const struct sockaddr_in6 *) addr_in;
+	char buf[MAX_ADDR_STR_LEN];
+
+	/* currently VMA supports only IPv4 ... */
+	if (!addr_in) {
+		match_logdbg("Error __vma_sockaddr_to_vma: "
+			"provided NULL input pointer");
+		errno = EINVAL;
+		return -1;
+	}
+	if (!addr_out) {
+		match_logdbg("Error __vma_sockaddr_to_vma: "
+			"provided NULL output pointer");
+		errno = EINVAL;
+		return -1;
+	}
+
+	if (sin->sin_family == AF_INET) {
+		match_logdbg("__vma_sockaddr_to_vma: Given IPv4");
+		if (addrlen < sizeof(struct sockaddr_in)) {
+			match_logdbg("Error __vma_sockaddr_to_vma: "
+				"provided address length:%u < IPv4 length %d",
+				(unsigned)addrlen, (int)sizeof(struct sockaddr_in));
+			errno = EINVAL;
+			return -1;
+		}
+
+		memcpy(addr_out, sin, sizeof(*addr_out));
+		if (was_ipv6)
+			*was_ipv6 = 0;
+	} else if (sin6->sin6_family == AF_INET6) {
+		if (addrlen < IPV6_ADDR_IN_MIN_LEN) {
+			match_logdbg("Error __vma_sockaddr_to_vma: "
+				"provided address length:%d < IPv6 length %d",
+				addrlen, IPV6_ADDR_IN_MIN_LEN);
+			errno = EINVAL;
+			return -1;
+		}
+
+		/* cannot convert IPv6 that is not IPv4 embedding */
+		if (!is_ipv4_embedded_in_ipv6(sin6)) {
+			match_logdbg("Error __vma_sockaddr_to_vma: "
+				"Given IPv6 address not an embedded IPv4");
+			errno = EINVAL;
+			return -1;
+		}
+		memset(addr_out, 0, sizeof(*addr_out));
+		/* the embedded IPv4 address occupies the last 4 bytes of the IPv6 address */
+		memcpy(&addr_out->sin_addr, &(sin6->sin6_addr.s6_addr[12]), 4);
+
+		/* NOTE(review): ntohl(1) is used here as the network-order constant 1
+		 * (matching ::1); it equals htonl(1) wherever byte-swap is symmetric */
+		if (addr_out->sin_addr.s_addr == ntohl(1)) {
+			addr_out->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
+			match_logdbg("__vma_sockaddr_to_vma: Given IPv6 loopback address");
+		} else
+			match_logdbg("__vma_sockaddr_to_vma: Given IPv4 embedded in IPv6");
+
+		addr_out->sin_family = AF_INET;
+		addr_out->sin_port = sin6->sin6_port;
+
+
+		if (inet_ntop (addr_out->sin_family, (void *) &(addr_out->sin_addr), buf,
+			MAX_ADDR_STR_LEN) == NULL) {
+			match_logdbg("__vma_sockaddr_to_vma: Converted IPv4 address is illegal");
+		} else {
+			match_logdbg("__vma_sockaddr_to_vma: Converted IPv4 is:%s", buf);
+		}
+		if (was_ipv6)
+			*was_ipv6 = 1;
+
+	} else if (sin->sin_family == 0) {
+
+		match_logdbg("__vma_sockaddr_to_vma: Converted NULL address");
+		/* NOTE(review): copies addrlen bytes into a sockaddr_in without
+		 * checking addrlen <= sizeof(*addr_out) -- potential overflow if a
+		 * caller passes a larger length with family 0; verify all call sites */
+		memcpy(addr_out, addr_in, addrlen);
+	} else {
+		match_logdbg("Error __vma_sockaddr_to_vma: "
+			"address family <%d> is unknown", sin->sin_family);
+		errno = EAFNOSUPPORT;
+		return -1;
+	}
+
+	return 0;
+}
diff --git a/src/vma/util/sg_array.h b/src/vma/util/sg_array.h
new file mode 100644
index 0000000..4ef97fb
--- /dev/null
+++ b/src/vma/util/sg_array.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef SG_ARRAY_H
+#define SG_ARRAY_H
+
+/* NOTE(review): the target of this #include was lost by text extraction
+ * (everything between '<' and '>' was stripped) -- restore from upstream */
+#include
+
+#include "vma/ib/base/verbs_extra.h"
+
+//! sg_array - helper class on top of scatter/gather elements array.
+//Represent it like a virtual one dimension vector/array.
+
+class sg_array {
+public:
+	sg_array(ibv_sge *sg_, int num_sge_):
+		m_sg(sg_)
+		,m_current(sg_)
+		,m_num_sge(num_sge_)
+		,m_length(0)
+		,m_index(0)
+		,m_pos(0)
+	{
+	}
+//! Index operator
+#if 0 //TODO: testing
+	inline uint8_t* operator[](int ind_)
+	{
+		int index = -1;
+		int pos = 0;
+		if (unlikely(m_sg == NULL))
+			return NULL;
+		while (index++ <= m_num_sge) {
+
+			if (pos+(int)m_sg[index].length > ind_) {
+				return (uint8_t*)m_sg[index].addr+(ind_-pos);
+			} else {
+				pos += m_sg[index].length;
+			}
+		}
+		return NULL;
+	}
+#endif //0
+//! Get pointer to data for get_len size from current position.
+//In case there is no full requested range in current SGE returns
+//the rest in current sge. Next call will start from the beginning
+//of next SGE
+	inline uint8_t* get_data(int* get_len)
+	{
+		if (likely(m_index < m_num_sge)) {
+
+			m_current = m_sg + m_index;
+
+			/* requested range fits inside the current SGE: advance the cursor */
+			if (likely((m_pos+*get_len) < (int)m_current->length)) {
+				uint8_t* old_p = (uint8_t*)m_sg[m_index].addr+m_pos;
+				m_pos += *get_len;
+				if (unlikely(m_pos < 0))
+					return NULL;
+				return old_p;
+			} else {
+				/* shrink *get_len to what remains in this SGE and step to the next */
+				*get_len = m_current->length - m_pos;
+
+				if (unlikely(m_pos < 0))
+					return NULL;
+				uint8_t* old_p = (uint8_t*)m_sg[m_index++].addr+m_pos;
+				// moving to next sge
+				m_pos = 0;
+				return old_p;
+			}
+		}
+		return NULL;
+	}
+
+	inline int get_num_sge(void) { return m_sg ? m_num_sge : -1; }
+	/* NOTE(review): the following lines were damaged by text extraction --
+	 * the loop bound/body between "i" and "lkey" was lost (presumably
+	 * "i < m_num_sge" summing m_sg[i].length, followed by an accessor
+	 * returning m_sg->lkey).  Preserved verbatim; restore from upstream. */
+	inline int length(void)
+	{
+		if (unlikely(m_sg==NULL || m_num_sge==0) )
+			return 0;
+		for (int i=0; ilkey; }
+
+private:
+	struct ibv_sge* m_sg;
+	struct ibv_sge* m_current;
+	int m_num_sge;
+	int m_length;
+	int m_index;
+	int m_pos;
+
+};
+
+#endif // SG_ARRAY_H
diff --git a/src/vma/util/sock_addr.h b/src/vma/util/sock_addr.h
new file mode 100644
index 0000000..4484367
--- /dev/null
+++ b/src/vma/util/sock_addr.h
@@ -0,0 +1,205 @@
+/*
+ * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#ifndef SOCK_ADDR_H
+#define SOCK_ADDR_H
+
+/* NOTE(review): the targets of the next three #include lines were lost by
+ * text extraction ('<...>' stripped) -- restore from upstream sources */
+#include
+#include
+#include
+#include "vma/util/vtypes.h"
+
+/* Small value-type wrapper around a sockaddr/sockaddr_in union with cached
+ * string representations.  IPv4-only: every accessor below reinterprets the
+ * storage as sockaddr_in. */
+class sock_addr
+{
+public:
+	sock_addr() : m_p_sa_in((struct sockaddr_in*)&m_sa) { memset(m_p_sa_in, 0, get_socklen()); m_str[0]='\0'; m_str_in_addr[0]='\0'; m_str_in_port[0]='\0'; };
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage off
+#endif
+	sock_addr(struct sockaddr& other) : m_sa(other), m_p_sa_in((struct sockaddr_in*)&m_sa) { m_str[0]='\0'; m_str_in_addr[0]='\0'; m_str_in_port[0]='\0'; };
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage on
+#endif
+	// coverity[uninit_member]
+	sock_addr(const struct sockaddr* other) : m_sa(*other), m_p_sa_in((struct sockaddr_in*)&m_sa) { m_str[0]='\0'; };
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage off
+#endif
+	sock_addr(struct sockaddr_in other) : m_sa_in(other), m_p_sa_in((struct sockaddr_in*)&m_sa) { m_str[0]='\0'; m_str_in_addr[0]='\0'; m_str_in_port[0]='\0'; };
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage on
+#endif
+	sock_addr(sa_family_t f, in_addr_t a, in_port_t p) : m_p_sa_in((struct sockaddr_in*)&m_sa)
+	{ memset(m_p_sa_in, 0, get_socklen()); set_sa_family(f); set_in_addr(a); set_in_port(p); m_str[0]='\0'; m_str_in_addr[0]='\0'; m_str_in_port[0]='\0'; };
+	~sock_addr() {};
+
+	struct sockaddr* get_p_sa() { return &m_sa; }
+	/* copies at most min(sizeof(sockaddr), size) bytes into the caller's buffer */
+	void get_sa(struct sockaddr* p_sa, size_t size) { memcpy(p_sa, &m_sa, std::min(get_socklen(), size)); }
+	void get_sa(struct sockaddr_in& r_sa_in) { memcpy(&r_sa_in, &m_sa, get_socklen()); }
+
+	sa_family_t get_sa_family() { struct sockaddr_in* p_sa_in = (struct sockaddr_in*)&m_sa; return p_sa_in->sin_family; }
+	in_addr_t get_in_addr() { struct sockaddr_in* p_sa_in = (struct sockaddr_in*)&m_sa; return p_sa_in->sin_addr.s_addr; }
+	in_port_t get_in_port() { struct sockaddr_in* p_sa_in = (struct sockaddr_in*)&m_sa; return p_sa_in->sin_port; }
+	socklen_t get_socklen() {return sizeof(struct sockaddr); };
+
+	bool is_anyaddr() { return (INADDR_ANY == m_p_sa_in->sin_addr.s_addr); };
+	bool is_mc() { return (IN_MULTICAST_N(m_p_sa_in->sin_addr.s_addr)); };
+
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage off
+#endif
+	bool is_bc() { return (IS_BROADCAST_N(m_p_sa_in->sin_addr.s_addr)); };
+	bool is_local_loopback() { return (LOOPBACK_N(m_p_sa_in->sin_addr.s_addr)); };
+	bool is_anyport() { return (INPORT_ANY == m_p_sa_in->sin_port); };
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage on
+#endif
+
+
+	void set(struct sockaddr& sa) { m_sa = sa; }
+	void set_sa_family(sa_family_t family) { m_sa_in.sin_family = family; }
+	void set_in_addr(in_addr_t in_addr) { m_sa_in.sin_addr.s_addr = in_addr;}
+	void set_in_port(in_port_t in_port) { m_sa_in.sin_port = in_port;}
+
+	/* assignment re-anchors m_p_sa_in to this object's own storage and
+	 * invalidates the cached string forms */
+	sock_addr& operator=(const sock_addr& other) {
+		m_sa = other.m_sa;
+		m_p_sa_in = (struct sockaddr_in*)&m_sa;
+		m_str[0]='\0';
+		m_str_in_addr[0]='\0';
+		m_str_in_port[0]='\0';
+		return *this;
+	}
+
+	/* equality over (family, address, port) only -- cached strings ignored */
+	bool operator==(sock_addr const& other) const
+	{
+		struct sockaddr_in* p_sa_in = (struct sockaddr_in*)&m_sa;
+		struct sockaddr_in* p_sa_in_other = (struct sockaddr_in*)&other.m_sa;
+
+		return (p_sa_in->sin_port == p_sa_in_other->sin_port) &&
+			(p_sa_in->sin_addr.s_addr == p_sa_in_other->sin_addr.s_addr) &&
+			(p_sa_in->sin_family == p_sa_in_other->sin_family);
+	}
+
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage off
+#endif
+	/* lexicographic ordering: port, then address, then family */
+	bool operator <(sock_addr const& other) const
+	{
+		struct sockaddr_in* p_sa_in = (struct sockaddr_in*)&m_sa;
+		struct sockaddr_in* p_sa_in_other = (struct sockaddr_in*)&other.m_sa;
+
+		if (p_sa_in->sin_port < p_sa_in_other->sin_port) return true;
+		if (p_sa_in->sin_port > p_sa_in_other->sin_port) return false;
+		if (p_sa_in->sin_addr.s_addr < p_sa_in_other->sin_addr.s_addr) return true;
+		if (p_sa_in->sin_addr.s_addr > p_sa_in_other->sin_addr.s_addr) return false;
+		if (p_sa_in->sin_family < p_sa_in_other->sin_family) return true;
+		return false;
+	}
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage on
+#endif
+
+	/* XOR checksum over the first sizeof(struct sockaddr) bytes of the object,
+	 * i.e. the address union only (it is the first member) */
+	size_t hash(void)
+	{
+		uint8_t csum = 0;
+		uint8_t* pval = (uint8_t*)this;
+		for (size_t i = 0; i < (sizeof(struct sockaddr)); ++i, ++pval) { csum ^= *pval; }
+		return csum;
+	}
+
+	char* to_str_in_addr() { set_str_in_addr(); return m_str_in_addr; };
+	char* to_str_in_port() { set_str_in_port(); return m_str_in_port; };
+	char* to_str() { set_str_in_addr(); set_str_in_port(); set_str(); return m_str; };
+
+private:
+	union {
+		struct sockaddr m_sa;
+		struct sockaddr_in m_sa_in;
+	};
+
+	struct sockaddr_in* m_p_sa_in;
+
+	/* "255.255.255.255" + NUL */
+	char m_str_in_addr[16];
+	/* "65535" + NUL */
+	char m_str_in_port[6];
+	/* "255.255.255.255:65535" + NUL -- exactly 22 bytes */
+	char m_str[22];
+
+	/* cppcheck-suppress wrongPrintfScanfArgNum */
+	void set_str_in_addr() { sprintf(m_str_in_addr, "%d.%d.%d.%d", NIPQUAD(get_in_addr())); set_str(); }
+	void set_str_in_port() { sprintf(m_str_in_port, "%d", ntohs(get_in_port())); set_str(); }
+	/* cppcheck-suppress wrongPrintfScanfArgNum */
+	void set_str() { sprintf(m_str, "%d.%d.%d.%d:%d", NIPQUAD(get_in_addr()), ntohs(get_in_port())); };
+};
+
+/* free helpers: read IPv4 fields straight out of a generic sockaddr */
+static inline sa_family_t get_sa_family(const struct sockaddr* addr)
+{
+	return ((struct sockaddr_in*)addr)->sin_family;
+}
+
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage off
+#endif
+static inline sa_family_t get_sa_family(const struct sockaddr& addr)
+{
+	return ((struct sockaddr_in*)&addr)->sin_family;
+}
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage on
+#endif
+
+static inline in_addr_t get_sa_ipv4_addr(const struct sockaddr* addr)
+{
+	return ((struct sockaddr_in*)addr)->sin_addr.s_addr;
+}
+
+static inline in_addr_t get_sa_ipv4_addr(const struct sockaddr& addr)
+{
+	return ((struct sockaddr_in*)&addr)->sin_addr.s_addr;
+}
+
+static inline in_port_t get_sa_port(const struct sockaddr* addr)
+{
+	return ((struct sockaddr_in*)addr)->sin_port;
+}
+
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage off
+#endif
+static inline in_port_t get_sa_port(const struct sockaddr& addr)
+{
+	return ((struct sockaddr_in*)&addr)->sin_port;
+}
+#if _BullseyeCoverage
+	#pragma BullseyeCoverage on
+#endif
+
+#endif /*SOCK_ADDR_H*/
diff --git a/src/vma/util/sys_vars.cpp b/src/vma/util/sys_vars.cpp
new file mode 100644
index 0000000..01c9f55
--- /dev/null
+++ b/src/vma/util/sys_vars.cpp
@@ -0,0 +1,1334 @@
+/*
+ * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ *
+ * - Redistributions in binary form must reproduce the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer in the documentation and/or other materials
+ * provided with the distribution.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+ * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+ * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+ * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS
+ * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN
+ * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "main.h"
+
+/* NOTE(review): the targets of the following nine system #include lines were
+ * lost by text extraction ('<...>' stripped) -- restore from upstream */
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+#include
+
+#include "vlogger/vlogger.h"
+#include "utils/rdtsc.h"
+#include "vma/util/vma_stats.h"
+#include "vma/util/utils.h"
+#include "vma/event/event_handler_manager.h"
+#include "vma/event/vlogger_timer_handler.h"
+#include "vma/dev/buffer_pool.h"
+#include "vma/dev/ib_ctx_handler_collection.h"
+#include "vma/dev/net_device_table_mgr.h"
+#include "vma/proto/ip_frag.h"
+#include "vma/proto/vma_lwip.h"
+#include "vma/proto/route_table_mgr.h"
+#include "vma/proto/rule_table_mgr.h"
+#include "vma/proto/igmp_mgr.h"
+
+#include "vma/proto/neighbour_table_mgr.h"
+#include "vma/netlink/netlink_wrapper.h"
+#include "vma/event/command.h"
+
+#include "vma/sock/sock-redirect.h"
+#include "vma/sock/fd_collection.h"
+#include "vma/sock/sockinfo_tcp.h"
+#include "vma/sock/sockinfo_udp.h"
+#include "vma/iomux/io_mux_call.h"
+
+#include "vma/util/instrumentation.h"
+
+void check_netperf_flags();
+
+// Do not rely on global variable initialization in code that might be called from library constructor (main_init)
+mce_sys_var & safe_mce_sys() {return mce_sys_var::instance();}
+
+#define MAX_BACKTRACE		25
+#define MAX_VERSION_STR_LEN	128
+#define MAX_CMD_LINE		2048
+
+/* print a boxed hint telling the user to rerun without LD_PRELOAD */
+void mce_sys_var::print_vma_load_failure_msg()
+{
+	vlog_printf(VLOG_ERROR,"***************************************************************************\n");
+	vlog_printf(VLOG_ERROR,"* Failed loading VMA library! Try executing the application without VMA.  *\n");
+	vlog_printf(VLOG_ERROR,"* 'unset LD_PRELOAD' environment variable and rerun the application.      *\n");
+	vlog_printf(VLOG_ERROR,"***************************************************************************\n");
+}
+
+
+
+/* mapping between VMA_SPEC profile levels and their accepted user-facing names */
+namespace vma_spec {
+	typedef struct {
+		vma_spec_t level;
+		const char *  output_name;
+		const char ** input_names;
+	} vma_spec_names;
+
+	static const char *names_none[] = {"none", "0",NULL};
+	static const char *spec_names_ulatency[] = {"ultra-latency", "10", NULL};
+	static const char *spec_names_latency[] = {"latency", "15", NULL};
+	static const char *spec_names_29west[] = {"29west", "29", NULL};
+	static const char *spec_names_wombat_fh[] = {"wombat_fh", "554", NULL};
+	static const char *spec_names_mcd[] = {"mcd", "623", NULL};
+	static const char *spec_names_mcd_irq[] = {"mcd-irq", "624", NULL};
+	static const char *spec_names_rti[] = {"rti", "784", NULL};
+	static const char *spec_names_7750[] = {"7750", NULL};
+	static const char *spec_names_multi_ring[] = {"multi_ring_latency", NULL};
+
+	// must be by order because "to_str" relies on that!
+	static const vma_spec_names specs[] = {
+		{MCE_SPEC_NONE,                        "NONE",                       (const char ** )names_none},
+		{MCE_SPEC_SOCKPERF_ULTRA_LATENCY_10,   "Ultra Latency",              (const char ** )spec_names_ulatency},
+		{MCE_SPEC_SOCKPERF_LATENCY_15,         "Latency",                    (const char ** )spec_names_latency},
+		{MCE_SPEC_29WEST_LBM_29,               "29West LBM Logic",           (const char ** )spec_names_29west},
+		{MCE_SPEC_WOMBAT_FH_LBM_554,           "Wombat FH LBM Logic",        (const char ** )spec_names_wombat_fh},
+		{MCE_SPEC_MCD_623,                     "Memcached Logic",            (const char ** )spec_names_mcd},
+		{MCE_SPEC_MCD_IRQ_624,                 "Memcached Interrupt Mode",   (const char ** )spec_names_mcd_irq},
+		{MCE_SPEC_RTI_784,                     "RTI Logic",                  (const char ** )spec_names_rti},
+		{MCE_SPEC_LL_7750,                     "7750 Low Latency Profile",   (const char ** )spec_names_7750},
+		{MCE_SPEC_LL_MULTI_RING,               "Multi Ring Latency Profile", (const char ** )spec_names_multi_ring},
+	};
+
+	// convert str to vVMA_spec_t; upon error - returns the given 'def_value'
+	vma_spec_t from_str(const char* str, vma_spec_t def_value)
+	{
+		size_t num_levels = sizeof(specs) / sizeof(specs[0]);
+		for (size_t i = 0; i < num_levels; ++i) {
+			const char ** input_name = specs[i].input_names;
+			while (*input_name) {
+				if (strcasecmp(str, *input_name) == 0)
+					return specs[i].level;
+				input_name++;
+			}
+		}
+
+		return def_value; // not found. use given def_value
+	}
+
+	// convert int to vVMA_spec_t; upon error - returns the given 'def_value'
+	vma_spec_t from_int(const int int_spec, vma_spec_t def_value)
+	{
+		if (int_spec >= MCE_SPEC_NONE && int_spec <= MCE_VMA__ALL) {
+			/* NOTE(review): the static_cast template argument (presumably
+			 * <vma_spec_t>) was lost by text extraction -- restore from upstream */
+			return static_cast(int_spec);
+		}
+		return def_value; // not found. use given def_value
+	}
+
+	/* relies on 'specs' being ordered and contiguous starting at MCE_SPEC_NONE */
+	const char * to_str(vma_spec_t level)
+	{
+		static int base = MCE_SPEC_NONE;
+		return specs[level - base].output_name;
+	}
+
+}
+
+/* Parse a comma-delimited CPU list (each element a single CPU or a
+ * "start-end" range) into 'cpu_set'.  Returns 0 on success, -1 on any
+ * parse error or a CPU index beyond CPU_SETSIZE. */
+int mce_sys_var::list_to_cpuset(char *cpulist, cpu_set_t *cpu_set)
+{
+	char comma[] = ",";
+	char dash[] = "-";
+	char *comma_saveptr, *dash_saveptr;
+
+	char *token, *subtoken, *endptr;
+	int range_start, range_end;
+	int i;
+
+	CPU_ZERO(cpu_set);
+
+	/*
+	 * When passed a CPU list, we expect comma(',') delimited values.
+	 */
+	token = strtok_r(cpulist, comma, &comma_saveptr);
+	if (!token) {
+		return -1;
+	}
+
+	/*
+	 * For each comma delimited value we need to parse the token based
+	 * on a dash('-') to see if we are dealing with a single cpu digit
+	 * or a range.
+	 */
+	while (token) {
+
+		subtoken = strtok_r(token, dash, &dash_saveptr);
+		if (!subtoken) {
+			return -1;
+		}
+
+		while (subtoken) {
+
+			errno = 0;
+			range_start = strtol(subtoken, &endptr, 10);
+			if ( (!range_start && *endptr) || errno) {
+				return -1;
+			}
+
+			/*
+			 * Here we assume that if we get a second subtoken
+			 * then we must be processing a range.
+			 */
+			subtoken = strtok_r(NULL, dash, &dash_saveptr);
+			if (subtoken) {
+				errno = 0;
+				range_end = strtol(subtoken, &endptr, 10);
+				if ( (!range_end && *endptr) || errno) {
+					return -1;
+				}
+				subtoken = NULL;
+			} else {
+				range_end = range_start;
+			}
+
+			for (i = range_start; i <= range_end; i++) {
+				if (i > (CPU_SETSIZE-1)) {
+					return -1;
+				} else {
+					CPU_SET(i,cpu_set);
+				}
+			}
+		}
+
+		token = strtok_r(NULL, comma, &comma_saveptr);
+	}
+
+	return 0;
+
+}
+
+/* Parse a hexadecimal CPU-mask string (without the "0x" prefix) into
+ * 'cpu_set'.  Returns 0 on success, -1 on a non-hex digit, a bit beyond
+ * CPU_SETSIZE, or an all-zero mask. */
+int mce_sys_var::hex_to_cpuset(char *start, cpu_set_t *cpu_set)
+{
+	const char *end;
+	char hexc[2];
+	int i, length, digit;
+	int bit = 0, set_one = 0;
+
+	/*
+	 * The least significant bits are at the end of the
+	 * string, so we need to start our processing at the
+	 * last character and work our way back to the start.
+	 */
+	length = strlen(start);
+	end = start + (length - 1);
+
+	CPU_ZERO(cpu_set);
+	while (length) {
+
+		*hexc = *end;
+		*(hexc+1) = 0; // NULL terminate the string or strtol can be buggy.
+		if (!isxdigit(*hexc)) {
+			return -1;
+		}
+
+		digit = strtol(hexc, NULL, 16);
+
+		/*
+		 * Each hex digit is 4 bits. For each bit set per
+		 * in the hex value set the corresponding CPU number
+		 * in the cpu_set.
+		 *
+		 * Since we are working on a single hex digit in a string
+		 * of unknown length we need to keep a running bit counter
+		 * so we don't lose track of our progress.
+		 */
+		for (i = 0; i < 4; i++)
+		{
+			if (digit & (1 << i)) {
+				if (bit > (CPU_SETSIZE-1)) {
+					return -1;
+				} else {
+					CPU_SET(bit,cpu_set);
+					set_one++;
+				}
+			}
+
+			bit++;
+		}
+
+		/* move the end pointer back a character */
+		end--;
+
+		/* one less character to process */
+		length--;
+	}
+
+	/*
+	 * passing all 0's is not legal. if no bits were set
+	 * and we make it to the end of the function then return
+	 * failure.
+	 */
+	if (!set_one) {
+		return -1;
+	} else {
+		return 0;
+	}
+
+}
+
+/* Parse an environment-variable CPU specification: "0x"/"0X" prefix selects
+ * hex-mask parsing, anything else is treated as a CPU list.  Works on a
+ * strdup'ed copy so the caller's string survives strtok_r. */
+int mce_sys_var::env_to_cpuset(char *orig_start, cpu_set_t *cpu_set)
+{
+	int ret;
+	char* start = strdup(orig_start); // save the caller string from strtok destruction.
+
+	/*
+	 * We expect a hex number or comma delimited cpulist. Check for
+	 * starting characters of "0x" or "0X" and if present then parse
+	 * the string as a hexidecimal value, otherwise treat it as a
+	 * cpulist.
+	 */
+	if ((strlen(start) > 2) &&
+		(start[0] == '0') &&
+		((start[1] == 'x') || (start[1] == 'X'))) {
+		ret = hex_to_cpuset(start + 2, cpu_set);
+	} else {
+		ret = list_to_cpuset(start, cpu_set);
+	}
+
+	free(start);
+	return ret;
+}
+
+/* Copy 'env_ptr' into 'mce_sys_name' (capacity 'mce_sys_max_size'),
+ * substituting the first "%d" occurrence with the current PID.  On any
+ * truncation of the leading part the result is an empty string. */
+void mce_sys_var::read_env_variable_with_pid(char* mce_sys_name, size_t mce_sys_max_size, char* env_ptr)
+{
+	int n = -1;
+	char* d_pos = NULL;
+
+	if (NULL == env_ptr || NULL == mce_sys_name || mce_sys_max_size < 2) {
+		return ;
+	}
+
+	d_pos = strstr(env_ptr, "%d");
+	if (!d_pos) { // no %d in the string
+		n = snprintf(mce_sys_name, mce_sys_max_size - 1, "%s", env_ptr);
+		if (unlikely((((int)mce_sys_max_size - 1) < n) || (n < 0))) {
+			mce_sys_name[0] = '\0';
+		}
+	} else { // has at least one occurrence of %d - replace the first one with the process PID
+		size_t bytes_num = MIN((size_t)(d_pos - env_ptr), mce_sys_max_size - 1);
+		strncpy(mce_sys_name, env_ptr, bytes_num);
+		mce_sys_name[bytes_num] = '\0';
+		n = snprintf(mce_sys_name + bytes_num, mce_sys_max_size - bytes_num - 1, "%d", getpid());
+		if (likely((0 < n) && (n < ((int)mce_sys_max_size - (int)bytes_num - 1)))) {
+			bytes_num += n;
+			snprintf(mce_sys_name + bytes_num, mce_sys_max_size - bytes_num, "%s", d_pos + 2);
+		}
+	}
+}
+
+/* Scan /proc/cpuinfo for 'flag' inside the "flags" lines.
+ * Returns true when found, false otherwise (including on open/alloc failure). */
+bool mce_sys_var::check_cpuinfo_flag(const char* flag)
+{
+	FILE *fp;
+	char *line;
+	bool ret = false;
+
+	fp = fopen("/proc/cpuinfo", "r");
+	if (!fp) {
+		vlog_printf(VLOG_ERROR, "error while fopen\n");
+		print_vma_load_failure_msg();
+		return false;
+	}
+	line = (char*)malloc(MAX_CMD_LINE);
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (!line) {
+		vlog_printf(VLOG_ERROR, "error while malloc\n");
+		print_vma_load_failure_msg();
+		goto exit;
+	}
+	BULLSEYE_EXCLUDE_BLOCK_END
+	while (fgets(line, MAX_CMD_LINE, fp)) {
+		/* NOTE(review): length 5 compares only "flags" -- the '\t' in the
+		 * literal is never checked, so e.g. a "flagsX" key would also match */
+		if (strncmp(line, "flags\t", 5) == 0) {
+			if (strstr(line, flag)) {
+				ret = true;
+				goto exit;
+			}
+		}
+	}
+
+exit:
+	fclose(fp);
+	free(line);
+	return ret;
+}
+
+/*
+ * Intel and AMD CPUs have reserved bit 31 of ECX of CPUID leaf 0x1 as the hypervisor present bit.
+ * This bit allows hypervisors to indicate their presence to the guest operating system.
+ * Hypervisors set this bit and physical CPUs (all existing and future CPUs) set this bit to zero.
+ * Guest operating systems can test bit 31 to detect if they are running inside a virtual machine.
+ */
+bool mce_sys_var::cpuid_hv()
+{
+#if defined(__x86_64__)
+	uint32_t _ecx;
+	/* NOTE(review): CPUID also clobbers EBX and EDX, which are not in the
+	 * clobber list here -- confirm against the upstream source */
+	__asm__ __volatile__("cpuid" \
+		: "=c"(_ecx) \
+		: "a"(0x01));
+	usleep(0);
+	return (bool)((_ecx >> 31) & 0x1);
+#else
+	return check_cpuinfo_flag(VIRTUALIZATION_FLAG);
+#endif
+}
+
+/*
+ * Intel and AMD have also reserved CPUID leaves 0x40000000 - 0x400000FF for software use.
+ * Hypervisors can use these leaves to provide an interface to pass information from the
+ * hypervisor to the guest operating system running inside a virtual machine.
+ * The hypervisor bit indicates the presence of a hypervisor and that it is safe to test
+ * these additional software leaves. VMware defines the 0x40000000 leaf as the hypervisor CPUID
+ * information leaf. Code running on a VMware hypervisor can test the CPUID information leaf
+ * for the hypervisor signature. VMware stores the string "VMwareVMware" in
+ * EBX, ECX, EDX of CPUID leaf 0x40000000.
+ */
+const char* mce_sys_var::cpuid_hv_vendor()
+{
+	/* 12-character vendor signature + NUL; per-thread to avoid races */
+	static __thread char vendor[13] = {0};
+
+	if (!cpuid_hv()) {
+		return NULL;
+	}
+#if defined(__x86_64__)
+	uint32_t _ebx = 0, _ecx = 0, _edx = 0;
+	__asm__ __volatile__("cpuid" \
+		: "=b"(_ebx), \
+		"=c"(_ecx), \
+		"=d"(_edx) \
+		: "a"(0x40000000));
+	/* unpack the signature bytes from EBX, ECX, EDX in order */
+	sprintf(vendor, "%c%c%c%c", _ebx, (_ebx >> 8), (_ebx >> 16), (_ebx >> 24));
+	sprintf(vendor + 4, "%c%c%c%c", _ecx, (_ecx >> 8), (_ecx >> 16), (_ecx >> 24));
+	sprintf(vendor + 8, "%c%c%c%c", _edx, (_edx >> 8), (_edx >> 16), (_edx >> 24));
+	vendor[12] = 0x00;
+#endif
+	return vendor;
+}
+
+/* classify the detected hypervisor vendor signature into the 'hypervisor' enum */
+void mce_sys_var::read_hv()
+{
+	const char *hyper_vendor_id = NULL;
+
+	hypervisor = mce_sys_var::HYPER_NONE;
+	hyper_vendor_id = cpuid_hv_vendor();
+	if (hyper_vendor_id) {
+		if (!strncmp("XenVMMXenVMM", hyper_vendor_id, 12)) {
+			hypervisor = HYPER_XEN;
+		} else if (!strncmp("KVMKVMKVM", hyper_vendor_id, 9)) {
+			hypervisor = HYPER_KVM;
+		} else if (!strncmp("Microsoft Hv", hyper_vendor_id, 12)) {
+			hypervisor = HYPER_MSHV;
+		} else if (!strncmp("VMwareVMware", hyper_vendor_id, 12)) {
+			hypervisor = HYPER_VMWARE;
+		} else {
+			hypervisor = HYPER_NONE;
+		}
+	}
+}
+
+/* Read /proc/self/cmdline into app_name (NUL separators become spaces,
+ * buffer doubled on demand) and then initialize every mce_sys_var knob to
+ * its MCE_DEFAULT_* value before environment overrides are applied. */
+void mce_sys_var::get_env_params()
+{
+	int c = 0, len =0;
+	char *env_ptr;
+	FILE *fp = NULL;
+	int app_name_size = MAX_CMD_LINE;
+	// Large buffer size to avoid need for realloc
+
+	fp = fopen("/proc/self/cmdline", "r");
+	if (!fp) {
+		vlog_printf(VLOG_ERROR, "error while fopen\n");
+		print_vma_load_failure_msg();
+		exit(1);
+	}
+
+	app_name = (char *)malloc(app_name_size);
+	BULLSEYE_EXCLUDE_BLOCK_START
+	if (!app_name) {
+		vlog_printf(VLOG_ERROR, "error while malloc\n");
+		print_vma_load_failure_msg();
+		exit(1);
+	}
+	BULLSEYE_EXCLUDE_BLOCK_END
+	while ((c = fgetc(fp)) != EOF){
+		app_name[len++] = (c==0?'
':c);
		/* Grow the buffer geometrically when the command line exceeds
		 * the current allocation; failure to grow is fatal. */
		if (len>=app_name_size) {
			app_name_size=app_name_size*2;
			app_name = (char*)realloc(app_name, app_name_size);
			BULLSEYE_EXCLUDE_BLOCK_START
			if (!app_name) {
				vlog_printf(VLOG_ERROR, "error while malloc\n");
				print_vma_load_failure_msg();
				exit(1);
			}
			BULLSEYE_EXCLUDE_BLOCK_END
		}
	}

	/* Overwrite the trailing separator with a terminator.
	 * NOTE(review): if /proc/self/cmdline is empty, len is still 0 here
	 * and this writes app_name[-1] - confirm this cannot happen for a
	 * preloaded application. */
	app_name[len-1] = '\0';
	fclose(fp);

	/* Initialize all file/path buffers to compile-time defaults. */
	memset(vma_time_measure_filename, 0, sizeof(vma_time_measure_filename));
	strcpy(vma_time_measure_filename, MCE_DEFAULT_TIME_MEASURE_DUMP_FILE);
	memset(log_filename, 0, sizeof(log_filename));
	memset(stats_filename, 0, sizeof(stats_filename));
	memset(stats_shmem_dirname, 0, sizeof(stats_shmem_dirname));
	memset(vmad_notify_dir, 0, sizeof(vmad_notify_dir));
	strcpy(stats_filename, MCE_DEFAULT_STATS_FILE);
	strcpy(vmad_notify_dir, MCE_DEFAULT_VMAD_FOLDER);
	strcpy(stats_shmem_dirname, MCE_DEFAULT_STATS_SHMEM_DIR);
	strcpy(conf_filename, MCE_DEFAULT_CONF_FILE);
	strcpy(app_id, MCE_DEFAULT_APP_ID);
	strcpy(internal_thread_cpuset, MCE_DEFAULT_INTERNAL_THREAD_CPUSET);
	strcpy(internal_thread_affinity_str, MCE_DEFAULT_INTERNAL_THREAD_AFFINITY_STR);

	/* Compile-time defaults for every tunable; VMA_SPEC presets and
	 * per-variable environment overrides below may replace them. */
	log_level = VLOG_DEFAULT;
	log_details = MCE_DEFAULT_LOG_DETAILS;
	log_colors = MCE_DEFAULT_LOG_COLORS;
	handle_sigintr = MCE_DEFAULT_HANDLE_SIGINTR;
	handle_segfault = MCE_DEFAULT_HANDLE_SIGFAULT;
	stats_fd_num_max = MCE_DEFAULT_STATS_FD_NUM;

	ring_allocation_logic_tx= MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX;
	ring_allocation_logic_rx= MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX;
	ring_migration_ratio_tx = MCE_DEFAULT_RING_MIGRATION_RATIO_TX;
	ring_migration_ratio_rx = MCE_DEFAULT_RING_MIGRATION_RATIO_RX;
	ring_limit_per_interface= MCE_DEFAULT_RING_LIMIT_PER_INTERFACE;
	ring_dev_mem_tx = MCE_DEFAULT_RING_DEV_MEM_TX;

	tcp_max_syn_rate = MCE_DEFAULT_TCP_MAX_SYN_RATE;

	tx_num_segs_tcp = MCE_DEFAULT_TX_NUM_SEGS_TCP;
	tx_num_bufs = MCE_DEFAULT_TX_NUM_BUFS;
#ifdef DEFINED_TSO
	tx_buf_size = MCE_DEFAULT_TX_BUF_SIZE;
#endif /* DEFINED_TSO */
	tx_num_wr = MCE_DEFAULT_TX_NUM_WRE;
	tx_num_wr_to_signal = MCE_DEFAULT_TX_NUM_WRE_TO_SIGNAL;
	tx_max_inline = MCE_DEFAULT_TX_MAX_INLINE;
	tx_mc_loopback_default = MCE_DEFAULT_TX_MC_LOOPBACK;
	tx_nonblocked_eagains = MCE_DEFAULT_TX_NONBLOCKED_EAGAINS;
	tx_prefetch_bytes = MCE_DEFAULT_TX_PREFETCH_BYTES;
	tx_bufs_batch_udp = MCE_DEFAULT_TX_BUFS_BATCH_UDP;
	tx_bufs_batch_tcp = MCE_DEFAULT_TX_BUFS_BATCH_TCP;

	rx_num_bufs = MCE_DEFAULT_RX_NUM_BUFS;
	rx_bufs_batch = MCE_DEFAULT_RX_BUFS_BATCH;
	rx_num_wr = MCE_DEFAULT_RX_NUM_WRE;
	rx_num_wr_to_post_recv = MCE_DEFAULT_RX_NUM_WRE_TO_POST_RECV;
	rx_poll_num = MCE_DEFAULT_RX_NUM_POLLS;
	rx_poll_num_init = MCE_DEFAULT_RX_NUM_POLLS_INIT;
	rx_udp_poll_os_ratio = MCE_DEFAULT_RX_UDP_POLL_OS_RATIO;
	hw_ts_conversion_mode = MCE_DEFAULT_HW_TS_CONVERSION_MODE;
	rx_poll_yield_loops = MCE_DEFAULT_RX_POLL_YIELD;
	select_handle_cpu_usage_stats = MCE_DEFAULT_SELECT_CPU_USAGE_STATS;
	rx_ready_byte_min_limit = MCE_DEFAULT_RX_BYTE_MIN_LIMIT;
	rx_prefetch_bytes = MCE_DEFAULT_RX_PREFETCH_BYTES;
	rx_prefetch_bytes_before_poll = MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL;
	rx_cq_drain_rate_nsec = MCE_DEFAULT_RX_CQ_DRAIN_RATE;
	rx_delta_tsc_between_cq_polls = 0;

	gro_streams_max = MCE_DEFAULT_GRO_STREAMS_MAX;

	tcp_3t_rules = MCE_DEFAULT_TCP_3T_RULES;
	eth_mc_l2_only_rules = MCE_DEFAULT_ETH_MC_L2_ONLY_RULES;
	mc_force_flowtag = MCE_DEFAULT_MC_FORCE_FLOWTAG;

	select_poll_num = MCE_DEFAULT_SELECT_NUM_POLLS;
	select_poll_os_force = MCE_DEFAULT_SELECT_POLL_OS_FORCE;
	select_poll_os_ratio = MCE_DEFAULT_SELECT_POLL_OS_RATIO;
	select_skip_os_fd_check = MCE_DEFAULT_SELECT_SKIP_OS;

	cq_moderation_enable = MCE_DEFAULT_CQ_MODERATION_ENABLE;
	cq_moderation_count = MCE_DEFAULT_CQ_MODERATION_COUNT;
	cq_moderation_period_usec = MCE_DEFAULT_CQ_MODERATION_PERIOD_USEC;
	cq_aim_max_count = MCE_DEFAULT_CQ_AIM_MAX_COUNT;
	cq_aim_max_period_usec = MCE_DEFAULT_CQ_AIM_MAX_PERIOD_USEC;
	cq_aim_interval_msec = MCE_DEFAULT_CQ_AIM_INTERVAL_MSEC;
	cq_aim_interrupts_rate_per_sec = MCE_DEFAULT_CQ_AIM_INTERRUPTS_RATE_PER_SEC;

	cq_poll_batch_max = MCE_DEFAULT_CQ_POLL_BATCH;
	progress_engine_interval_msec = MCE_DEFAULT_PROGRESS_ENGINE_INTERVAL_MSEC;
	progress_engine_wce_max = MCE_DEFAULT_PROGRESS_ENGINE_WCE_MAX;
	cq_keep_qp_full = MCE_DEFAULT_CQ_KEEP_QP_FULL;
	qp_compensation_level = MCE_DEFAULT_QP_COMPENSATION_LEVEL;
	internal_thread_arm_cq_enabled = MCE_DEFAULT_INTERNAL_THREAD_ARM_CQ_ENABLED;

	offloaded_sockets = MCE_DEFAULT_OFFLOADED_SOCKETS;
	timer_resolution_msec = MCE_DEFAULT_TIMER_RESOLUTION_MSEC;
	tcp_timer_resolution_msec= MCE_DEFAULT_TCP_TIMER_RESOLUTION_MSEC;
	internal_thread_tcp_timer_handling = MCE_DEFAULT_INTERNAL_THREAD_TCP_TIMER_HANDLING;
	tcp_ctl_thread = MCE_DEFAULT_TCP_CTL_THREAD;
	tcp_ts_opt = MCE_DEFAULT_TCP_TIMESTAMP_OPTION;
	tcp_nodelay = MCE_DEFAULT_TCP_NODELAY;
	tcp_quickack = MCE_DEFAULT_TCP_QUICKACK;
//	exception_handling is handled by its CTOR
	avoid_sys_calls_on_tcp_fd = MCE_DEFAULT_AVOID_SYS_CALLS_ON_TCP_FD;
	allow_privileged_sock_opt = MCE_DEFAULT_ALLOW_PRIVILEGED_SOCK_OPT;
	wait_after_join_msec = MCE_DEFAULT_WAIT_AFTER_JOIN_MSEC;
	thread_mode = MCE_DEFAULT_THREAD_MODE;
	buffer_batching_mode = MCE_DEFAULT_BUFFER_BATCHING_MODE;
	mem_alloc_type = MCE_DEFAULT_MEM_ALLOC_TYPE;
	enable_ipoib = MCE_DEFAULT_IPOIB_FLAG;
	enable_socketxtreme = MCE_DEFAULT_SOCKETXTREME;
#ifdef DEFINED_TSO
	enable_tso = MCE_DEFAULT_TSO;
#endif /* DEFINED_TSO */
	handle_fork = MCE_DEFAULT_FORK_SUPPORT;
	handle_bf = MCE_DEFAULT_BF_FLAG;
	close_on_dup2 = MCE_DEFAULT_CLOSE_ON_DUP2;
	mtu = MCE_DEFAULT_MTU;
	lwip_mss = MCE_DEFAULT_MSS;
	lwip_cc_algo_mod = MCE_DEFAULT_LWIP_CC_ALGO_MOD;
	mce_spec = MCE_SPEC_NONE;
	mce_spec_param1 = 1;
	mce_spec_param2 = 1;

	neigh_num_err_retries = MCE_DEFAULT_NEIGH_NUM_ERR_RETRIES;
	neigh_uc_arp_quata = MCE_DEFAULT_NEIGH_UC_ARP_QUATA;
	neigh_wait_till_send_arp_msec = MCE_DEFAULT_NEIGH_UC_ARP_DELAY_MSEC;
	timer_netlink_update_msec = MCE_DEFAULT_NETLINK_TIMER_MSEC;

	rx_poll_on_tx_tcp = MCE_DEFAULT_RX_POLL_ON_TX_TCP;
	trigger_dummy_send_getsockname = MCE_DEFAULT_TRIGGER_DUMMY_SEND_GETSOCKNAME;

#ifdef VMA_TIME_MEASURE
	vma_time_measure_num_samples = MCE_DEFAULT_TIME_MEASURE_NUM_SAMPLES;
#endif

	/* Detect hypervisor before reading overrides (used by the
	 * mem_alloc_type workaround further below). */
	read_hv();

	/* Configure enable_socketxtreme as first because
	 * this mode has some special predefined parameter limitations
	 */
	if((env_ptr = getenv(SYS_VAR_SOCKETXTREME )) != NULL) {
		enable_socketxtreme = atoi(env_ptr) ? true : false;
	}
	if (enable_socketxtreme) {
		/* Set following parameters as default for SocketXtreme mode */
		rx_num_wr = 1024;
		gro_streams_max = 0;
		progress_engine_interval_msec = MCE_CQ_DRAIN_INTERVAL_DISABLED;
	}

	if ((env_ptr = getenv(SYS_VAR_SPEC)) != NULL){
		mce_spec = (uint32_t)vma_spec::from_str(env_ptr, MCE_SPEC_NONE);
	}

	/* Named profiles: each preset overrides a coherent group of defaults;
	 * individual env vars read afterwards can still override presets. */
	switch (mce_spec) {
	case MCE_SPEC_SOCKPERF_ULTRA_LATENCY_10:
		tx_num_segs_tcp = 512; //MCE_DEFAULT_TX_NUM_SEGS_TCP (1000000)
		tx_num_bufs = 512; //MCE_DEFAULT_TX_NUM_BUFS (200000)
		tx_num_wr = 256; //MCE_DEFAULT_TX_NUM_WRE (3000)
		tx_num_wr_to_signal = 4; //MCE_DEFAULT_TX_NUM_WRE_TO_SIGNAL (64)
		tx_prefetch_bytes = MCE_DEFAULT_TX_PREFETCH_BYTES; //(256)
		tx_bufs_batch_udp = 1; //MCE_DEFAULT_TX_BUFS_BATCH_UDP (8)
		tx_bufs_batch_tcp = 1; //MCE_DEFAULT_TX_BUFS_BATCH_TCP;
		rx_num_bufs = 1024; //MCE_DEFAULT_RX_NUM_BUFS (200000)
		rx_bufs_batch = 4; //MCE_DEFAULT_RX_BUFS_BATCH (64)
		rx_num_wr = 256; //MCE_DEFAULT_RX_NUM_WRE (16000)
		rx_num_wr_to_post_recv = 4; //MCE_DEFAULT_RX_NUM_WRE_TO_POST_RECV (64)
		rx_poll_num = -1; //MCE_DEFAULT_RX_NUM_POLLS
#ifdef DEFINED_TSO
		enable_tso = false; //MCE_DEFAULT_TSO (true)
#endif /* DEFINED_TSO */
		rx_udp_poll_os_ratio = 0; //MCE_DEFAULT_RX_UDP_POLL_OS_RATIO
		rx_prefetch_bytes = MCE_DEFAULT_RX_PREFETCH_BYTES; //(256)
		rx_prefetch_bytes_before_poll = 256; //MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL 0
		select_poll_num = -1;
		select_poll_os_ratio = 0;
		select_skip_os_fd_check = 0;
		avoid_sys_calls_on_tcp_fd = true; //MCE_DEFAULT_AVOID_SYS_CALLS_ON_TCP_FD (false)
		gro_streams_max = 0; //MCE_DEFAULT_GRO_STREAMS_MAX (32)
		progress_engine_interval_msec = 0;
		cq_keep_qp_full = false; //MCE_DEFAULT_CQ_KEEP_QP_FULL(true)
		thread_mode = THREAD_MODE_SINGLE;
		mem_alloc_type = ALLOC_TYPE_HUGEPAGES;
		tcp_nodelay = true; // MCE_DEFAULT_TCP_NODELAY (false)
		ring_dev_mem_tx = 16384; // MCE_DEFAULT_RING_DEV_MEM_TX (0)
		strcpy(internal_thread_affinity_str, "0"); //MCE_DEFAULT_INTERNAL_THREAD_AFFINITY_STR;
		break;

	case MCE_SPEC_SOCKPERF_LATENCY_15:
		tx_num_wr = 256; //MCE_DEFAULT_TX_NUM_WRE (3000)
		tx_num_wr_to_signal = 4; //MCE_DEFAULT_TX_NUM_WRE_TO_SIGNAL(64)
		tx_bufs_batch_udp = 1; //MCE_DEFAULT_TX_BUFS_BATCH_UDP (8)
		tx_bufs_batch_tcp = 1; //MCE_DEFAULT_TX_BUFS_BATCH_TCP (16)
		rx_bufs_batch = 4; //MCE_DEFAULT_RX_BUFS_BATCH (64)
		rx_num_wr = 256; //MCE_DEFAULT_RX_NUM_WRE (16000)
		rx_num_wr_to_post_recv = 4; //MCE_DEFAULT_RX_NUM_WRE_TO_POST_RECV (64)
		rx_poll_num = -1; //MCE_DEFAULT_RX_NUM_POLLS (100000)
#ifdef DEFINED_TSO
		enable_tso = false; //MCE_DEFAULT_TSO (true)
#endif /* DEFINED_TSO */
		rx_prefetch_bytes_before_poll = 256; //MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL (0)
		select_poll_num = -1; //MCE_DEFAULT_SELECT_NUM_POLLS (100000)
		avoid_sys_calls_on_tcp_fd = true; //MCE_DEFAULT_AVOID_SYS_CALLS_ON_TCP_FD (false)
		gro_streams_max = 0; //MCE_DEFAULT_GRO_STREAMS_MAX (32)
		cq_keep_qp_full = false; //MCE_DEFAULT_CQ_KEEP_QP_FULL (true)
		thread_mode = THREAD_MODE_SINGLE; //MCE_DEFAULT_THREAD_MODE (THREAD_MODE_MULTI)
		mem_alloc_type = ALLOC_TYPE_HUGEPAGES; //MCE_DEFAULT_MEM_ALLOC_TYPE (ALLOC_TYPE_CONTIG)
		strcpy(internal_thread_affinity_str, "0"); //MCE_DEFAULT_INTERNAL_THREAD_AFFINITY_STR ("-1")
		progress_engine_interval_msec = 100; //MCE_DEFAULT_PROGRESS_ENGINE_INTERVAL_MSEC (10)
		select_poll_os_ratio = 100; //MCE_DEFAULT_SELECT_POLL_OS_RATIO (10)
		select_poll_os_force = 1; //MCE_DEFAULT_SELECT_POLL_OS_FORCE (0)
		tcp_nodelay = true; // MCE_DEFAULT_TCP_NODELAY (falst)
		ring_dev_mem_tx = 16384; // MCE_DEFAULT_RING_DEV_MEM_TX (0)
		break;

	case MCE_SPEC_29WEST_LBM_29:
		mce_spec_param1 = 5000;	// [u-sec] Time out to send next pipe_write
		mce_spec_param2 = 50;	// Num of max sequential pipe_write to drop
		rx_poll_num = 0;
		rx_udp_poll_os_ratio = 100;
		select_poll_num = 100000;
		select_poll_os_ratio = 100;
		select_skip_os_fd_check = 50;
		break;

	case MCE_SPEC_WOMBAT_FH_LBM_554:
		mce_spec_param1 = 5000;	// [u-sec] Time out to send next pipe_write
		mce_spec_param2 = 50;	// Num of max sequential pipe_write to drop
		rx_poll_num = 0;
		rx_udp_poll_os_ratio = 100;
		select_poll_num = 0;
		select_skip_os_fd_check = 20;
		break;

	case MCE_SPEC_RTI_784:
		rx_poll_num = -1;
// TODO - Need to replace old QP/CQ allocation logic here
//		qp_allocation_logic = QP_ALLOC_LOGIC__QP_PER_PEER_IP_PER_LOCAL_IP;
//		cq_allocation_logic = CQ_ALLOC_LOGIC__CQ_PER_QP;
		break;

	case MCE_SPEC_MCD_623:
		ring_allocation_logic_rx = RING_LOGIC_PER_CORE_ATTACH_THREADS;
		ring_allocation_logic_tx = RING_LOGIC_PER_CORE_ATTACH_THREADS;
		break;

	case MCE_SPEC_MCD_IRQ_624:
		ring_allocation_logic_rx = RING_LOGIC_PER_CORE_ATTACH_THREADS;
		ring_allocation_logic_tx = RING_LOGIC_PER_CORE_ATTACH_THREADS;
		select_poll_num = 0;
		rx_poll_num = 0;
		cq_moderation_enable = false;
		break;

	case MCE_SPEC_LL_7750:
		tx_num_bufs = 8192; // MCE_DEFAULT_TX_NUM_BUFS (200000), Global TX data buffers allocated
		rx_num_bufs = 204800; // MCE_DEFAULT_RX_NUM_BUFS (200000), RX data buffers used on all QPs on all HCAs
		log_level = VLOG_WARNING; //VLOG_DEFAULT(VLOG_INFO) VMA_TRACELEVEL
		stats_fd_num_max = 1024; //MCE_DEFAULT_STATS_FD_NUM(100), max. number of sockets monitored by VMA stats
		strcpy(internal_thread_affinity_str, "0x3"); // MCE_DEFAULT_INTERNAL_THREAD_AFFINITY_STR(-1), first 2 cores
		rx_poll_num = -1; //MCE_DEFAULT_RX_NUM_POLLS(100000), Infinite RX poll for ready packets (during read/recv)
		select_poll_num = -1; //MCE_DEFAULT_SELECT_NUM_POLLS(100000), Infinite poll the hardware on RX (before sleeping in epoll/select, etc)
		select_poll_os_ratio = 0; //MCE_DEFAULT_SELECT_POLL_OS_RATIO(10), Disable polling OS fd's (re-enabled if bound on OS fd)
		tcp_3t_rules = true; //MCE_DEFAULT_TCP_3T_RULES(false), Use only 3 tuple rules for TCP
		avoid_sys_calls_on_tcp_fd = 1; //MCE_DEFAULT_AVOID_SYS_CALLS_ON_TCP_FD (false), Disable handling control packets on a separate thread
		buffer_batching_mode = BUFFER_BATCHING_NONE; //MCE_DEFAULT_BUFFER_BATCHING_MODE(BUFFER_BATCHING_WITH_RECLAIM), Disable handling control packets on a separate thread
		tcp_ctl_thread = CTL_THREAD_NO_WAKEUP; //MCE_DEFAULT_TCP_CTL_THREAD (CTL_THREAD_DISABLE), wait for thread timer to expire
		break;

	case MCE_SPEC_LL_MULTI_RING:
		mem_alloc_type = ALLOC_TYPE_HUGEPAGES; //MCE_DEFAULT_MEM_ALLOC_TYPE (ALLOC_TYPE_CONTIG) VMA_MEM_ALLOC_TYPE
		select_poll_num = -1; //MCE_DEFAULT_SELECT_NUM_POLLS (100000) VMA_SELECT_POLL
		rx_poll_num = -1; //MCE_DEFAULT_RX_NUM_POLLS(100000) VMA_RX_POLL
		ring_allocation_logic_tx = RING_LOGIC_PER_THREAD; //MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX(RING_LOGIC_PER_INTERFACE) VMA_RING_ALLOCATION_LOGIC_TX
		ring_allocation_logic_rx = RING_LOGIC_PER_THREAD; //MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX(RING_LOGIC_PER_INTERFACE) VMA_RING_ALLOCATION_LOGIC_RX
		select_poll_os_ratio = 0; //MCE_DEFAULT_SELECT_POLL_OS_RATIO(10) VMA_SELECT_POLL_OS_RATIO
		select_skip_os_fd_check = 0; //MCE_DEFAULT_SELECT_SKIP_OS(4) VMA_SELECT_SKIP_OS
		rx_poll_on_tx_tcp = true; //MCE_DEFAULT_RX_POLL_ON_TX_TCP (false)
		trigger_dummy_send_getsockname = true; //MCE_DEFAULT_TRIGGER_DUMMY_SEND_GETSOCKNAME (false)
		break;

	case MCE_SPEC_NONE:
	default:
		break;
	}

	/* Per-variable environment overrides; each also sanity-checks and
	 * clamps the value where a valid range is defined. */
	if ((env_ptr = getenv(SYS_VAR_SPEC_PARAM1)) != NULL)
		mce_spec_param1 = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_SPEC_PARAM2)) != NULL)
		mce_spec_param2 = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_LOG_FILENAME)) != NULL){
		read_env_variable_with_pid(log_filename, sizeof(log_filename), env_ptr);
	}

	if ((env_ptr = getenv(SYS_VAR_STATS_FILENAME)) != NULL){
		read_env_variable_with_pid(stats_filename, sizeof(stats_filename), env_ptr);
	}

	if ((env_ptr = getenv(SYS_VAR_STATS_SHMEM_DIRNAME)) != NULL){
		read_env_variable_with_pid(stats_shmem_dirname, sizeof(stats_shmem_dirname), env_ptr);
	}

	if ((env_ptr = getenv(SYS_VAR_CONF_FILENAME)) != NULL){
		read_env_variable_with_pid(conf_filename, sizeof(conf_filename), env_ptr);
	}

	if ((env_ptr = getenv(SYS_VAR_VMAD_DIR)) != NULL){
		read_env_variable_with_pid(vmad_notify_dir, sizeof(vmad_notify_dir), env_ptr);
	}

	if ((env_ptr = getenv(SYS_VAR_LOG_LEVEL)) != NULL)
		log_level = log_level::from_str(env_ptr, VLOG_DEFAULT);

	if (log_level >= VLOG_DEBUG)
		log_details = 2;

	if ((env_ptr = getenv(SYS_VAR_LOG_DETAILS)) != NULL)
		log_details = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_LOG_COLORS)) != NULL)
		log_colors = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_APPLICATION_ID)) != NULL){
		read_env_variable_with_pid(app_id, sizeof(app_id), env_ptr);
	}

	if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGINTR)) != NULL)
		handle_sigintr = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_HANDLE_SIGSEGV)) != NULL)
		handle_segfault = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_STATS_FD_NUM)) != NULL) {
		stats_fd_num_max = (uint32_t)atoi(env_ptr);
		if (stats_fd_num_max > MAX_STATS_FD_NUM) {
			vlog_printf(VLOG_WARNING," Can only monitor maximum %d sockets in statistics \n", MAX_STATS_FD_NUM);
			stats_fd_num_max = MAX_STATS_FD_NUM;
		}
	}


	if ((env_ptr = getenv(SYS_VAR_TX_NUM_SEGS_TCP)) != NULL)
		tx_num_segs_tcp = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_TX_NUM_BUFS)) != NULL)
		tx_num_bufs = (uint32_t)atoi(env_ptr);

#ifdef DEFINED_TSO
	if ((env_ptr = getenv(SYS_VAR_TX_BUF_SIZE)) != NULL)
		tx_buf_size = (uint32_t)atoi(env_ptr);
#endif /* DEFINED_TSO */

	if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE)) != NULL)
		tx_num_wr = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_TX_NUM_WRE_TO_SIGNAL)) != NULL)
		tx_num_wr_to_signal = MIN(NUM_TX_WRE_TO_SIGNAL_MAX, MAX(1, (uint32_t)atoi(env_ptr)));
	/* Keep at least two signal windows worth of TX WREs. */
	if (tx_num_wr <= (tx_num_wr_to_signal * 2))
		tx_num_wr = tx_num_wr_to_signal * 2;

	if ((env_ptr = getenv(SYS_VAR_TX_MAX_INLINE)) != NULL)
		tx_max_inline = (uint32_t)atoi(env_ptr);
	if (tx_max_inline > MAX_SUPPORTED_IB_INLINE_SIZE) {
		vlog_printf(VLOG_WARNING,"VMA_TX_MAX_INLINE must be smaller or equal to %d [%d]\n",
			MAX_SUPPORTED_IB_INLINE_SIZE, tx_max_inline);
		tx_max_inline = MAX_SUPPORTED_IB_INLINE_SIZE;
	}
	/* ConnectX-4 hardware limit: cap the WRE count implied by the chosen
	 * inline size (constants match the device's WQ sizing). */
	unsigned int cx4_max_tx_wre_for_inl = (16 * 1024 * 64) / (VMA_ALIGN(VMA_ALIGN(tx_max_inline - 12, 64) + 12, 64));
	if (tx_num_wr > cx4_max_tx_wre_for_inl) {
		vlog_printf(VLOG_WARNING,"For the given VMA_TX_MAX_INLINE [%d], VMA_TX_WRE [%d] must be smaller than %d\n",
			tx_max_inline, tx_num_wr, cx4_max_tx_wre_for_inl);
		tx_num_wr = cx4_max_tx_wre_for_inl;
	}

	if ((env_ptr = getenv(SYS_VAR_TX_MC_LOOPBACK)) != NULL)
		tx_mc_loopback_default = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_TX_NONBLOCKED_EAGAINS)) != NULL)
		tx_nonblocked_eagains = atoi(env_ptr)? true : false;

	if ((env_ptr = getenv(SYS_VAR_TX_PREFETCH_BYTES)) != NULL)
		tx_prefetch_bytes = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_TX)) != NULL) {
		ring_allocation_logic_tx = (ring_logic_t)atoi(env_ptr);
		if (!is_ring_logic_valid(ring_allocation_logic_tx)) {
			vlog_printf(VLOG_WARNING,"%s = %d is not valid, setting logic to default = %d\n",
				SYS_VAR_RING_ALLOCATION_LOGIC_TX, ring_allocation_logic_tx, MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX);
			ring_allocation_logic_tx = MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX;
		}
	}

	if ((env_ptr = getenv(SYS_VAR_RING_ALLOCATION_LOGIC_RX)) != NULL) {
		ring_allocation_logic_rx = (ring_logic_t)atoi(env_ptr);
		if (!is_ring_logic_valid(ring_allocation_logic_rx)) {
			vlog_printf(VLOG_WARNING,"%s = %d is not valid, setting logic to default = %d\n",
				SYS_VAR_RING_ALLOCATION_LOGIC_RX, ring_allocation_logic_rx, MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX);
			ring_allocation_logic_rx = MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX;
		}
	}

	if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_TX)) != NULL)
		ring_migration_ratio_tx = (int32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_RING_MIGRATION_RATIO_RX)) != NULL)
		ring_migration_ratio_rx = (int32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_RING_LIMIT_PER_INTERFACE)) != NULL)
		ring_limit_per_interface = MAX(0, (int32_t)atoi(env_ptr));

	if ((env_ptr = getenv(SYS_VAR_RING_DEV_MEM_TX)) != NULL)
		ring_dev_mem_tx = MAX(0, (int32_t)atoi(env_ptr));

	if ((env_ptr = getenv(SYS_VAR_TCP_MAX_SYN_RATE)) != NULL)
		tcp_max_syn_rate = MIN(TCP_MAX_SYN_RATE_TOP_LIMIT, MAX(0, (int32_t)atoi(env_ptr)));

	if ((env_ptr = getenv(SYS_VAR_RX_NUM_BUFS)) != NULL)
		rx_num_bufs = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE_TO_POST_RECV)) != NULL)
		rx_num_wr_to_post_recv = MIN(NUM_RX_WRE_TO_POST_RECV_MAX, MAX(1, (uint32_t)atoi(env_ptr)));

	if ((env_ptr = getenv(SYS_VAR_RX_NUM_WRE)) != NULL)
		rx_num_wr = (uint32_t)atoi(env_ptr);
	/* Keep at least two post-recv batches worth of RX WREs. */
	if (rx_num_wr <= (rx_num_wr_to_post_recv * 2))
		rx_num_wr = rx_num_wr_to_post_recv * 2;

	if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS)) != NULL) {
		rx_poll_num = atoi(env_ptr);
	}
	if (rx_poll_num < MCE_MIN_RX_NUM_POLLS || rx_poll_num > MCE_MAX_RX_NUM_POLLS) {
		vlog_printf(VLOG_WARNING," Rx Poll loops should be between %d and %d [%d]\n", MCE_MIN_RX_NUM_POLLS, MCE_MAX_RX_NUM_POLLS, rx_poll_num);
		rx_poll_num = MCE_DEFAULT_RX_NUM_POLLS;
	}
	if ((env_ptr = getenv(SYS_VAR_RX_NUM_POLLS_INIT)) != NULL)
		rx_poll_num_init = atoi(env_ptr);
	if (rx_poll_num_init < MCE_MIN_RX_NUM_POLLS || rx_poll_num_init > MCE_MAX_RX_NUM_POLLS) {
		vlog_printf(VLOG_WARNING," Rx Poll loops should be between %d and %d [%d]\n", MCE_MIN_RX_NUM_POLLS, MCE_MAX_RX_NUM_POLLS, rx_poll_num_init);
		rx_poll_num_init = MCE_DEFAULT_RX_NUM_POLLS_INIT;
	}
	if (rx_poll_num == 0)
		rx_poll_num = 1; // Force at least one good polling loop

	if ((env_ptr = getenv(SYS_VAR_RX_UDP_POLL_OS_RATIO)) != NULL)
		rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_HW_TS_CONVERSION_MODE)) != NULL) {
		hw_ts_conversion_mode = (ts_conversion_mode_t)atoi(env_ptr);
		if ((uint32_t)hw_ts_conversion_mode >= TS_CONVERSION_MODE_LAST) {
			vlog_printf(VLOG_WARNING,"HW TS conversion size out of range [%d] (min=%d, max=%d). using default [%d]\n", hw_ts_conversion_mode, TS_CONVERSION_MODE_DISABLE , TS_CONVERSION_MODE_LAST - 1, MCE_DEFAULT_HW_TS_CONVERSION_MODE);
			hw_ts_conversion_mode = MCE_DEFAULT_HW_TS_CONVERSION_MODE;
		}
	}

	//The following 2 params were replaced by SYS_VAR_RX_UDP_POLL_OS_RATIO
	if ((env_ptr = getenv(SYS_VAR_RX_POLL_OS_RATIO)) != NULL) {
		rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr);
		vlog_printf(VLOG_WARNING,"The parameter VMA_RX_POLL_OS_RATIO is no longer in use. Parameter VMA_RX_UDP_POLL_OS_RATIO was set to %d instead\n", rx_udp_poll_os_ratio);
	}
	if ((env_ptr = getenv(SYS_VAR_RX_SKIP_OS)) != NULL) {
		rx_udp_poll_os_ratio = (uint32_t)atoi(env_ptr);
		vlog_printf(VLOG_WARNING,"The parameter VMA_RX_SKIP_OS is no longer in use. Parameter VMA_RX_UDP_POLL_OS_RATIO was set to %d instead\n", rx_udp_poll_os_ratio);
	}

	if ((env_ptr = getenv(SYS_VAR_RX_POLL_YIELD)) != NULL)
		rx_poll_yield_loops = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_SELECT_CPU_USAGE_STATS)) != NULL)
		select_handle_cpu_usage_stats = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_RX_BYTE_MIN_LIMIT)) != NULL)
		rx_ready_byte_min_limit = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES)) != NULL)
		rx_prefetch_bytes = (uint32_t)atoi(env_ptr);
	if (rx_prefetch_bytes < MCE_MIN_RX_PREFETCH_BYTES || rx_prefetch_bytes > MCE_MAX_RX_PREFETCH_BYTES) {
		vlog_printf(VLOG_WARNING," Rx prefetch bytes size out of range [%d] (min=%d, max=%d)\n", rx_prefetch_bytes, MCE_MIN_RX_PREFETCH_BYTES, MCE_MAX_RX_PREFETCH_BYTES);
		rx_prefetch_bytes = MCE_DEFAULT_RX_PREFETCH_BYTES;
	}

	if ((env_ptr = getenv(SYS_VAR_RX_PREFETCH_BYTES_BEFORE_POLL)) != NULL)
		rx_prefetch_bytes_before_poll = (uint32_t)atoi(env_ptr);
	if (rx_prefetch_bytes_before_poll != 0 && (rx_prefetch_bytes_before_poll < MCE_MIN_RX_PREFETCH_BYTES || rx_prefetch_bytes_before_poll > MCE_MAX_RX_PREFETCH_BYTES)) {
		vlog_printf(VLOG_WARNING," Rx prefetch bytes size out of range [%d] (min=%d, max=%d, disabled=0)\n", rx_prefetch_bytes_before_poll, MCE_MIN_RX_PREFETCH_BYTES, MCE_MAX_RX_PREFETCH_BYTES);
		rx_prefetch_bytes_before_poll = MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL;
	}

	if ((env_ptr = getenv(SYS_VAR_RX_CQ_DRAIN_RATE_NSEC)) != NULL)
		rx_cq_drain_rate_nsec = atoi(env_ptr);
	// Update the rx cq polling rate for draining logic
	tscval_t tsc_per_second = get_tsc_rate_per_second();
	rx_delta_tsc_between_cq_polls = tsc_per_second * rx_cq_drain_rate_nsec / NSEC_PER_SEC;

	if ((env_ptr = getenv(SYS_VAR_GRO_STREAMS_MAX)) != NULL)
		gro_streams_max = MAX(atoi(env_ptr), 0);

	if ((env_ptr = getenv(SYS_VAR_TCP_3T_RULES)) != NULL)
		tcp_3t_rules = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_ETH_MC_L2_ONLY_RULES)) != NULL)
		eth_mc_l2_only_rules = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_MC_FORCE_FLOWTAG)) != NULL)
		mc_force_flowtag = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_SELECT_NUM_POLLS)) != NULL)
		select_poll_num = atoi(env_ptr);

	if (select_poll_num < MCE_MIN_RX_NUM_POLLS || select_poll_num > MCE_MAX_RX_NUM_POLLS) {
		vlog_printf(VLOG_WARNING," Select Poll loops can not be below zero [%d]\n", select_poll_num);
		select_poll_num = MCE_DEFAULT_SELECT_NUM_POLLS;
	}

	if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_FORCE)) != NULL)
		select_poll_os_force = (uint32_t)atoi(env_ptr);

	/* Forcing OS polling implies polling the OS every iteration and not
	 * skipping OS fd checks. */
	if (select_poll_os_force) {
		select_poll_os_ratio = 1;
		select_skip_os_fd_check = 1;
	}

	if ((env_ptr = getenv(SYS_VAR_SELECT_POLL_OS_RATIO)) != NULL)
		select_poll_os_ratio = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_SELECT_SKIP_OS)) != NULL)
		select_skip_os_fd_check = (uint32_t)atoi(env_ptr);

#ifdef DEFINED_IBV_CQ_ATTR_MODERATE
	/* Infinite polling is incompatible with interrupt moderation. */
	if (rx_poll_num < 0 || select_poll_num < 0) {
		cq_moderation_enable = false;
	}
	if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_ENABLE)) != NULL)
		cq_moderation_enable = atoi(env_ptr) ? true : false;
	if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_COUNT)) != NULL)
		cq_moderation_count = (uint32_t)atoi(env_ptr);
	if (cq_moderation_count > rx_num_wr / 2) {
		cq_moderation_count = rx_num_wr / 2;
	}

	if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_PERIOD_USEC)) != NULL)
		cq_moderation_period_usec = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_COUNT)) != NULL)
		cq_aim_max_count = (uint32_t)atoi(env_ptr);
	if (cq_aim_max_count > rx_num_wr / 2){
		cq_aim_max_count = rx_num_wr / 2;
	}

	if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_PERIOD_USEC)) != NULL)
		cq_aim_max_period_usec = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERVAL_MSEC)) != NULL)
		cq_aim_interval_msec = (uint32_t)atoi(env_ptr);

	if (!cq_moderation_enable) {
		cq_aim_interval_msec = MCE_CQ_ADAPTIVE_MODERATION_DISABLED;
	}

	if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC)) != NULL)
		cq_aim_interrupts_rate_per_sec = (uint32_t)atoi(env_ptr);
#else
	/* No CQ moderation support in this verbs build: warn for each
	 * moderation variable the user tried to set. */
	if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_ENABLE)) != NULL) {
		vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_MODERATION_ENABLE);
	}
	if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_COUNT)) != NULL) {
		vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_MODERATION_COUNT);
	}
	if ((env_ptr = getenv(SYS_VAR_CQ_MODERATION_PERIOD_USEC)) != NULL) {
		vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_MODERATION_PERIOD_USEC);
	}
	if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_COUNT)) != NULL) {
		vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_AIM_MAX_COUNT);
	}
	if ((env_ptr = getenv(SYS_VAR_CQ_AIM_MAX_PERIOD_USEC)) != NULL) {
		vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_AIM_MAX_PERIOD_USEC);
	}
	if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERVAL_MSEC)) != NULL) {
		vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_AIM_INTERVAL_MSEC);
	}
	if ((env_ptr = getenv(SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC)) != NULL) {
		vlog_printf(VLOG_WARNING,"'%s' is not supported on this environment\n", SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC);
	}
#endif /* DEFINED_IBV_CQ_ATTR_MODERATE */

	if ((env_ptr = getenv(SYS_VAR_CQ_POLL_BATCH_MAX)) != NULL)
		cq_poll_batch_max = (uint32_t)atoi(env_ptr);
	if (cq_poll_batch_max < MCE_MIN_CQ_POLL_BATCH || cq_poll_batch_max > MCE_MAX_CQ_POLL_BATCH) {
		vlog_printf(VLOG_WARNING," Rx number of cq poll batchs should be between %d and %d [%d]\n", MCE_MIN_CQ_POLL_BATCH, MCE_MAX_CQ_POLL_BATCH, cq_poll_batch_max);
		cq_poll_batch_max = MCE_DEFAULT_CQ_POLL_BATCH;
	}

	if ((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_INTERVAL)) != NULL)
		progress_engine_interval_msec = (uint32_t)atoi(env_ptr);
	if (enable_socketxtreme && (progress_engine_interval_msec != MCE_CQ_DRAIN_INTERVAL_DISABLED)) {
		progress_engine_interval_msec = MCE_CQ_DRAIN_INTERVAL_DISABLED;
		vlog_printf(VLOG_DEBUG,"%s parameter is forced to %d in case %s is enabled\n",
			SYS_VAR_PROGRESS_ENGINE_INTERVAL, progress_engine_interval_msec, SYS_VAR_SOCKETXTREME);
	}

	if ((env_ptr = getenv(SYS_VAR_PROGRESS_ENGINE_WCE_MAX)) != NULL)
		progress_engine_wce_max = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_CQ_KEEP_QP_FULL)) != NULL)
		cq_keep_qp_full = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_QP_COMPENSATION_LEVEL)) != NULL)
		qp_compensation_level = (uint32_t)atoi(env_ptr);
	if (qp_compensation_level < rx_num_wr_to_post_recv)
		qp_compensation_level = rx_num_wr_to_post_recv;

	if ((env_ptr = getenv(SYS_VAR_OFFLOADED_SOCKETS)) != NULL)
		offloaded_sockets = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_TIMER_RESOLUTION_MSEC)) != NULL)
		timer_resolution_msec = atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_TCP_TIMER_RESOLUTION_MSEC)) != NULL)
		tcp_timer_resolution_msec = atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_TCP_TIMER_HANDLING)) != NULL) {
		internal_thread_tcp_timer_handling =
			atoi(env_ptr) == 1 ? INTERNAL_THREAD_TCP_TIMER_HANDLING_IMMEDIATE : INTERNAL_THREAD_TCP_TIMER_HANDLING_DEFERRED;
	}

	if ((env_ptr = getenv(SYS_VAR_TCP_CTL_THREAD)) != NULL) {
		tcp_ctl_thread = (tcp_ctl_thread_t)atoi(env_ptr);
		if (tcp_ctl_thread >= CTL_THREAD_LAST || tcp_ctl_thread < 0)
			tcp_ctl_thread = MCE_DEFAULT_TCP_CTL_THREAD;
	}

	if ((env_ptr = getenv(SYS_VAR_TCP_TIMESTAMP_OPTION)) != NULL) {
		tcp_ts_opt = (tcp_ts_opt_t)atoi(env_ptr);
		if ((uint32_t) tcp_ts_opt >= TCP_TS_OPTION_LAST) {
			vlog_printf(VLOG_WARNING,"TCP timestamp option value is out of range [%d] (min=%d, max=%d). using default [%d]\n", tcp_ts_opt, TCP_TS_OPTION_DISABLE , TCP_TS_OPTION_LAST - 1, MCE_DEFAULT_TCP_TIMESTAMP_OPTION);
			tcp_ts_opt = MCE_DEFAULT_TCP_TIMESTAMP_OPTION;
		}
	}

	if ((env_ptr = getenv(SYS_VAR_TCP_NODELAY)) != NULL) {
		tcp_nodelay = atoi(env_ptr) ? true : false;
	}

	if ((env_ptr = getenv(SYS_VAR_TCP_QUICKACK)) != NULL) {
		tcp_quickack = atoi(env_ptr) ? true : false;
	}

	// TODO: this should be replaced by calling "exception_handling.init()" that will be called from init()
	if ((env_ptr = getenv(vma_exception_handling::getSysVar())) != NULL) {
		exception_handling = vma_exception_handling(strtol(env_ptr, NULL, 10)); // vma_exception_handling is responsible for its invariant
	}

	if ((env_ptr = getenv(SYS_VAR_AVOID_SYS_CALLS_ON_TCP_FD)) != NULL) {
		avoid_sys_calls_on_tcp_fd = atoi(env_ptr) ? true : false;
	}

	if ((env_ptr = getenv(SYS_VAR_ALLOW_PRIVILEGED_SOCK_OPT)) != NULL) {
		allow_privileged_sock_opt = atoi(env_ptr) ? true : false;
	}

	/* The TCP timer must never tick faster than the general timer. */
	if(tcp_timer_resolution_msec < timer_resolution_msec){
		vlog_printf(VLOG_WARNING," TCP timer resolution [%s=%d] cannot be smaller than timer resolution [%s=%d]. Setting TCP timer resolution to %d msec.\n", SYS_VAR_TCP_TIMER_RESOLUTION_MSEC, tcp_timer_resolution_msec, SYS_VAR_TIMER_RESOLUTION_MSEC, timer_resolution_msec, timer_resolution_msec);
		tcp_timer_resolution_msec = timer_resolution_msec;
	}

	if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_ARM_CQ)) != NULL)
		internal_thread_arm_cq_enabled = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_CPUSET)) != NULL) {
		snprintf(internal_thread_cpuset, FILENAME_MAX, "%s", env_ptr);
	}

	// handle internal thread affinity - default is CPU-0
	if ((env_ptr = getenv(SYS_VAR_INTERNAL_THREAD_AFFINITY)) != NULL) {
		int n = snprintf(internal_thread_affinity_str,
				sizeof(internal_thread_affinity_str), "%s", env_ptr);
		if (unlikely(((int)sizeof(internal_thread_affinity_str) < n) || (n < 0))) {
			vlog_printf(VLOG_WARNING,"Failed to process: %s.\n",
					SYS_VAR_INTERNAL_THREAD_AFFINITY);
		}
	}
	if (env_to_cpuset(internal_thread_affinity_str, &internal_thread_affinity)) {
		vlog_printf(VLOG_WARNING," Failed to set internal thread affinity: %s...  deferring to cpu-0.\n",
				internal_thread_affinity_str);
	}

	if ((env_ptr = getenv(SYS_VAR_WAIT_AFTER_JOIN_MSEC)) != NULL)
		wait_after_join_msec = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_THREAD_MODE)) != NULL) {
		thread_mode = (thread_mode_t)atoi(env_ptr);
		if (thread_mode < 0 || thread_mode >= THREAD_MODE_LAST)
			thread_mode = MCE_DEFAULT_THREAD_MODE;
	}

	if ((env_ptr = getenv(SYS_VAR_BUFFER_BATCHING_MODE)) != NULL) {
		buffer_batching_mode = (buffer_batching_mode_t)atoi(env_ptr);
		if (buffer_batching_mode < 0 || buffer_batching_mode >= BUFFER_BATCHING_LAST)
			buffer_batching_mode = MCE_DEFAULT_BUFFER_BATCHING_MODE;
	}

	/* With batching disabled, force single-buffer batches everywhere. */
	if (buffer_batching_mode == BUFFER_BATCHING_NONE) {
		tx_bufs_batch_tcp = 1;
		tx_bufs_batch_udp = 1;
		rx_bufs_batch = 1;
	}

	if ((env_ptr = getenv(SYS_VAR_NETLINK_TIMER_MSEC)) != NULL)
		timer_netlink_update_msec = (uint32_t)atoi(env_ptr);


	if((env_ptr = getenv(SYS_VAR_NEIGH_NUM_ERR_RETRIES))!= NULL) {
		neigh_num_err_retries = (uint32_t)atoi(env_ptr);
	}
	if((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_DELAY_MSEC)) != NULL){
		neigh_wait_till_send_arp_msec = (uint32_t)atoi(env_ptr);
	}
	if((env_ptr = getenv(SYS_VAR_NEIGH_UC_ARP_QUATA)) != NULL){
		neigh_uc_arp_quata = (uint32_t)atoi(env_ptr);
	}

	if ((getenv(SYS_VAR_HUGETBL)) != NULL)
	{
		vlog_printf(VLOG_WARNING, "**********************************************************************************************************************\n");
		vlog_printf(VLOG_WARNING, "The '%s' parameter is no longer supported, please refer to '%s' in README.txt for more info\n", SYS_VAR_HUGETBL, SYS_VAR_MEM_ALLOC_TYPE);
		vlog_printf(VLOG_WARNING, "**********************************************************************************************************************\n");
	}

	if ((env_ptr = getenv(SYS_VAR_MEM_ALLOC_TYPE)) != NULL)
		mem_alloc_type = (alloc_mode_t)atoi(env_ptr);
	if (mem_alloc_type < 0 || mem_alloc_type >= ALLOC_TYPE_LAST_ALLOWED_TO_USE)
		mem_alloc_type = MCE_DEFAULT_MEM_ALLOC_TYPE;
	/* Hyper-V guests cannot use contiguous pages - fall back to hugepages
	 * (and propagate the choice via the environment for forked children). */
	if (mce_sys_var::HYPER_MSHV == hypervisor && mem_alloc_type == ALLOC_TYPE_CONTIG) {
		char mem_str[sizeof(int) + 1] = {0};
		len = snprintf(mem_str, sizeof(mem_str), "%d", ALLOC_TYPE_HUGEPAGES);
		if (likely((0 < len) && (len < (int)sizeof(mem_str)))) {
			setenv(SYS_VAR_MEM_ALLOC_TYPE, mem_str, 1); // Setenv to avoid core dump while valgrind is used.
		}
		vlog_printf(VLOG_DEBUG, "The '%s' parameter can not be %d for %s.\n",
				SYS_VAR_MEM_ALLOC_TYPE, mem_alloc_type, cpuid_hv_vendor());
		mem_alloc_type = ALLOC_TYPE_HUGEPAGES;
	}

	if ((env_ptr = getenv(SYS_VAR_BF)) != NULL)
		handle_bf = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_FORK)) != NULL)
		handle_fork = atoi(env_ptr) ? true : false;

	if((env_ptr = getenv(SYS_VAR_IPOIB )) != NULL)
		enable_ipoib = atoi(env_ptr) ? true : false;

#ifdef DEFINED_TSO
	if((env_ptr = getenv(SYS_VAR_TSO)) != NULL)
		enable_tso = atoi(env_ptr) ? true : false;

	/* TSO is incompatible with TX ring migration. */
	if (enable_tso && (ring_migration_ratio_tx != -1)) {
		ring_migration_ratio_tx = -1;
		vlog_printf(VLOG_DEBUG,"%s parameter is forced to %d in case %s is enabled\n",
			SYS_VAR_RING_MIGRATION_RATIO_TX, -1, SYS_VAR_TSO);
	}
#endif /* DEFINED_TSO */

	if ((env_ptr = getenv(SYS_VAR_CLOSE_ON_DUP2)) != NULL)
		close_on_dup2 = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_MTU)) != NULL)
		mtu = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_MSS)) != NULL)
		lwip_mss = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_TCP_CC_ALGO)) != NULL)
		lwip_cc_algo_mod = (uint32_t)atoi(env_ptr);

	if ((env_ptr = getenv(SYS_VAR_VMA_RX_POLL_ON_TX_TCP)) != NULL)
		rx_poll_on_tx_tcp = atoi(env_ptr) ? true : false;

	if ((env_ptr = getenv(SYS_VAR_VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME)) != NULL)
		trigger_dummy_send_getsockname = atoi(env_ptr) ? true : false;

#ifdef VMA_TIME_MEASURE
	if ((env_ptr = getenv(SYS_VAR_VMA_TIME_MEASURE_NUM_SAMPLES)) != NULL) {
		vma_time_measure_num_samples = (uint32_t)atoi(env_ptr);
		if(vma_time_measure_num_samples > INST_SIZE){
			vlog_printf(VLOG_WARNING, "The value of '%s' is bigger than %d. Time samples over %d will be dropped.\n", SYS_VAR_VMA_TIME_MEASURE_NUM_SAMPLES, INST_SIZE, INST_SIZE);
		}
	}

	if ((env_ptr = getenv(SYS_VAR_VMA_TIME_MEASURE_DUMP_FILE)) != NULL){
		read_env_variable_with_pid(vma_time_measure_filename, sizeof(vma_time_measure_filename), env_ptr);
	}
#endif

}


void set_env_params()
{
	// Need to call setenv() only after getenv() is done, because /bin/sh has
	// a custom setenv() which overrides original environment.

	//setenv("MLX4_SINGLE_THREADED", "1", 0);

	/*
	 * MLX4_DEVICE_FATAL_CLEANUP/MLX5_DEVICE_FATAL_CLEANUP/RDMAV_ALLOW_DISASSOC_DESTROY
	 * tells ibv_destroy functions we want to get success errno value
	 * in case of calling them when the device was removed.
+ * It helps to destroy resources in DEVICE_FATAL state + */ + setenv("MLX4_DEVICE_FATAL_CLEANUP", "1", 1); + setenv("MLX5_DEVICE_FATAL_CLEANUP", "1", 1); + setenv("RDMAV_ALLOW_DISASSOC_DESTROY", "1", 1); + + if (safe_mce_sys().handle_bf) { + setenv("MLX4_POST_SEND_PREFER_BF", "1", 1); + setenv("MLX5_POST_SEND_PREFER_BF", "1", 1); + } else { + /* todo - these seem not to work if inline is on, since libmlx is doing (inl || bf) when deciding to bf*/ + setenv("MLX4_POST_SEND_PREFER_BF", "0", 1); + setenv("MLX5_POST_SEND_PREFER_BF", "0", 1); + } + + switch (safe_mce_sys().mem_alloc_type) { + case ALLOC_TYPE_ANON: + setenv("MLX_QP_ALLOC_TYPE", "ANON", 0); + setenv("MLX_CQ_ALLOC_TYPE", "ANON", 0); + break; + case ALLOC_TYPE_HUGEPAGES: + setenv("RDMAV_HUGEPAGES_SAFE", "1", 0); + setenv("MLX_QP_ALLOC_TYPE", "ALL", 0); + setenv("MLX_CQ_ALLOC_TYPE", "ALL", 0); + break; + case ALLOC_TYPE_CONTIG: + default: + setenv("MLX_QP_ALLOC_TYPE", "PREFER_CONTIG", 0); + setenv("MLX_CQ_ALLOC_TYPE", "PREFER_CONTIG", 0); + break; + } +} diff --git a/src/vma/util/sys_vars.h b/src/vma/util/sys_vars.h new file mode 100644 index 0000000..763fbbb --- /dev/null +++ b/src/vma/util/sys_vars.h @@ -0,0 +1,749 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef SYS_VARS_H +#define SYS_VARS_H + +#include +#include +#include +#include + +#include "vtypes.h" +#include "config.h" + +#include "vma/ib/base/verbs_extra.h" +#include "vma/util/sysctl_reader.h" +#include "vma/vma_extra.h" + +typedef enum { + MCE_SPEC_NONE = 0, + MCE_SPEC_SOCKPERF_ULTRA_LATENCY_10, + MCE_SPEC_SOCKPERF_LATENCY_15, + MCE_SPEC_29WEST_LBM_29, + MCE_SPEC_WOMBAT_FH_LBM_554, + MCE_SPEC_MCD_623, + MCE_SPEC_MCD_IRQ_624, + MCE_SPEC_RTI_784, + MCE_SPEC_LL_7750, + MCE_SPEC_LL_MULTI_RING, + + MCE_VMA__ALL /* last element */ +} vma_spec_t; + +typedef enum { + ALLOC_TYPE_ANON = 0, + ALLOC_TYPE_CONTIG, + ALLOC_TYPE_HUGEPAGES, + ALLOC_TYPE_LAST_ALLOWED_TO_USE, + ALLOC_TYPE_EXTERNAL, /* not allowed as a global parameter */ +} alloc_mode_t; + +typedef enum { + TS_CONVERSION_MODE_DISABLE = 0, // TS_CONVERSION_MODE_DISABLE must be the first enum + TS_CONVERSION_MODE_RAW, + TS_CONVERSION_MODE_BEST_POSSIBLE, + TS_CONVERSION_MODE_SYNC, + TS_CONVERSION_MODE_PTP, + TS_CONVERSION_MODE_LAST +} ts_conversion_mode_t; + +static inline bool is_ring_logic_valid(ring_logic_t logic) +{ + switch (logic) { + case RING_LOGIC_PER_INTERFACE: + case RING_LOGIC_PER_IP: + case RING_LOGIC_PER_SOCKET: + case RING_LOGIC_PER_THREAD: + case RING_LOGIC_PER_CORE: + case 
RING_LOGIC_PER_CORE_ATTACH_THREADS: + return true; + default: + return false; + } +} + +static inline const char* ring_logic_str(ring_logic_t logic) +{ + switch (logic) { + case RING_LOGIC_PER_INTERFACE: return "(Ring per interface)"; + case RING_LOGIC_PER_IP: return "(Ring per ip)"; + case RING_LOGIC_PER_SOCKET: return "(Ring per socket)"; + case RING_LOGIC_PER_THREAD: return "(Ring per thread)"; + case RING_LOGIC_PER_CORE: return "(Ring per core)"; + case RING_LOGIC_PER_CORE_ATTACH_THREADS: return "(Ring per core - attach threads)"; + default: break; + } + return "unsupported"; +} + +typedef enum { + THREAD_MODE_SINGLE = 0, + THREAD_MODE_MULTI, + THREAD_MODE_MUTEX, + THREAD_MODE_PLENTY, + THREAD_MODE_LAST +} thread_mode_t; + +typedef enum { + BUFFER_BATCHING_NONE = 0, + BUFFER_BATCHING_WITH_RECLAIM, + BUFFER_BATCHING_NO_RECLAIM, + BUFFER_BATCHING_LAST, +} buffer_batching_mode_t; + +// See ibv_transport_type for general verbs transport types +typedef enum { + VMA_TRANSPORT_UNKNOWN = -1, + VMA_TRANSPORT_IB = 0, + VMA_TRANSPORT_ETH +} transport_type_t; + +static inline const char* priv_vma_transport_type_str(transport_type_t transport_type) +{ + BULLSEYE_EXCLUDE_BLOCK_START + switch (transport_type) { + case VMA_TRANSPORT_IB: return "IB"; + case VMA_TRANSPORT_ETH: return "ETH"; + case VMA_TRANSPORT_UNKNOWN: + default: break; + } + return "UNKNOWN"; + BULLSEYE_EXCLUDE_BLOCK_END +} + +typedef enum { + MSS_FOLLOW_MTU = 0 +} mss_mode_t; + +typedef enum { + MTU_FOLLOW_INTERFACE = 0 +} mtu_mode_t; + +typedef enum { + CTL_THREAD_DISABLE = 0, + CTL_THREAD_WITH_WAKEUP, + CTL_THREAD_NO_WAKEUP, + CTL_THREAD_LAST +} tcp_ctl_thread_t; + +typedef enum { + TCP_TS_OPTION_DISABLE = 0, // TCP_TS_OPTION_DISABLE must be the first enum + TCP_TS_OPTION_ENABLE, + TCP_TS_OPTION_FOLLOW_OS, + TCP_TS_OPTION_LAST +} tcp_ts_opt_t; + +static inline const char* ctl_thread_str(tcp_ctl_thread_t logic) +{ + switch (logic) { + case CTL_THREAD_DISABLE: return "(Disabled)"; + case 
CTL_THREAD_WITH_WAKEUP: return "(Enabled - with wakeup)"; + case CTL_THREAD_NO_WAKEUP: return "(Enabled - no wakeup)"; + default: break; + } + return "unsupported"; +} + +typedef enum { + INTERNAL_THREAD_TCP_TIMER_HANDLING_DEFERRED = 0, + INTERNAL_THREAD_TCP_TIMER_HANDLING_IMMEDIATE +} internal_thread_tcp_timer_handling_t; + +static inline const char* internal_thread_tcp_timer_handling_str(internal_thread_tcp_timer_handling_t handling) +{ + switch (handling) { + case INTERNAL_THREAD_TCP_TIMER_HANDLING_DEFERRED: return "(deferred)"; + case INTERNAL_THREAD_TCP_TIMER_HANDLING_IMMEDIATE: return "(immediate)"; + default: break; + } + return "unsupported"; +} + +namespace vma_spec { + // convert str to vVMA_spec_t; upon error - returns the given 'def_value' + vma_spec_t from_str(const char* str, vma_spec_t def_value = MCE_SPEC_NONE); + + // convert int to vVMA_spec_t; upon error - returns the given 'def_value' + vma_spec_t from_int(const int int_spec, vma_spec_t def_value = MCE_SPEC_NONE); + + const char * to_str(vma_spec_t level); +} + +//////////////////////////////////////////////////////////////////////////////// +class vma_exception_handling +{ +public: + + static const char *getName() { + return "Exception handling mode"; + } + + static const char *getSysVar() { + return "VMA_EXCEPTION_HANDLING"; + } + + typedef enum { + MODE_FIRST = -3, + MODE_EXIT = -2, + MODE_DEBUG = -1, + MODE_UNOFFLOAD = 0, + MODE_LOG_ERROR, + MODE_RETURN_ERROR, + MODE_ABORT, + MODE_LAST, + + MODE_DEFAULT = MODE_DEBUG + } mode; + + const char* to_str() + { + switch (m_mode) { + case MODE_EXIT: return "(exit on failed startup)"; + case MODE_DEBUG: return "(just log debug message)"; + case MODE_UNOFFLOAD: return "(log debug and un-offload)"; + case MODE_LOG_ERROR: return "(log error and un-offload)"; + case MODE_RETURN_ERROR: return "(Log Error and return error)"; + case MODE_ABORT: return "(Log error and Abort!)"; + default: break; + } + return "unsupported"; + } + + bool 
is_suit_un_offloading() { + return m_mode == MODE_UNOFFLOAD || m_mode == MODE_LOG_ERROR; + } + + vlog_levels_t get_log_severity() { + switch (m_mode) { + case MODE_EXIT: + case MODE_DEBUG: + case MODE_UNOFFLOAD: + return VLOG_DEBUG; + case MODE_LOG_ERROR: + case MODE_RETURN_ERROR: + case MODE_ABORT: + default: + return VLOG_ERROR; + } + } + + // + // cast constructors and cast operators + // + + vma_exception_handling(mode _mode = MODE_DEFAULT) : m_mode(_mode) { + if (m_mode >= MODE_LAST || m_mode <= MODE_FIRST) + m_mode = MODE_DEFAULT; + } + + explicit vma_exception_handling(int _mode) : m_mode((mode)_mode) { + if (m_mode >= MODE_LAST || m_mode <= MODE_FIRST) + m_mode = MODE_DEFAULT; + } + + operator mode() const { + return m_mode; + } + +private: + mode m_mode; +}; + +//////////////////////////////////////////////////////////////////////////////// +struct mce_sys_var { + static mce_sys_var & instance() { + static mce_sys_var the_instance; //singleton + return the_instance; + } + +public: + enum hyper_t { + HYPER_NONE = 0, + HYPER_XEN, + HYPER_KVM, + HYPER_MSHV, + HYPER_VMWARE + }; + +public: + void get_env_params(); + + char *app_name; + char app_id[MAX_APP_ID_LENGHT]; + + uint32_t mce_spec; + uint32_t mce_spec_param1; + uint32_t mce_spec_param2; + + vlog_levels_t log_level; + uint32_t log_details; + char log_filename[PATH_MAX]; + char stats_filename[PATH_MAX]; + char stats_shmem_dirname[PATH_MAX]; + char conf_filename[PATH_MAX]; + char vmad_notify_dir[PATH_MAX]; + bool log_colors; + bool handle_sigintr; + bool handle_segfault; + uint32_t stats_fd_num_max; + + ring_logic_t ring_allocation_logic_tx; + ring_logic_t ring_allocation_logic_rx; + int ring_migration_ratio_tx; + int ring_migration_ratio_rx; + int ring_limit_per_interface; + int ring_dev_mem_tx; + int tcp_max_syn_rate; + + uint32_t tx_num_segs_tcp; + uint32_t tx_num_bufs; +#ifdef DEFINED_TSO + uint32_t tx_buf_size; +#endif /* DEFINED_TSO */ + uint32_t tx_num_wr; + uint32_t tx_num_wr_to_signal; + uint32_t 
tx_max_inline; + bool tx_mc_loopback_default; + bool tx_nonblocked_eagains; + uint32_t tx_prefetch_bytes; + uint32_t tx_bufs_batch_udp; + uint32_t tx_bufs_batch_tcp; + + uint32_t rx_num_bufs; + uint32_t rx_bufs_batch; + uint32_t rx_num_wr; + uint32_t rx_num_wr_to_post_recv; + int32_t rx_poll_num; + int32_t rx_poll_num_init; + uint32_t rx_udp_poll_os_ratio; + ts_conversion_mode_t hw_ts_conversion_mode; + uint32_t rx_poll_yield_loops; + uint32_t rx_ready_byte_min_limit; + uint32_t rx_prefetch_bytes; + uint32_t rx_prefetch_bytes_before_poll; + uint32_t rx_cq_drain_rate_nsec; // If enabled this will cause the Rx to drain all wce in CQ before returning to user, + // Else (Default: Disabled) it will return when first ready packet is in socket queue + uint32_t rx_delta_tsc_between_cq_polls; + + uint32_t gro_streams_max; + + bool tcp_3t_rules; + bool eth_mc_l2_only_rules; + bool mc_force_flowtag; + + int32_t select_poll_num; + bool select_poll_os_force; + uint32_t select_poll_os_ratio; + uint32_t select_skip_os_fd_check; + bool select_handle_cpu_usage_stats; + + bool cq_moderation_enable; + uint32_t cq_moderation_count; + uint32_t cq_moderation_period_usec; + uint32_t cq_aim_max_count; + uint32_t cq_aim_max_period_usec; + uint32_t cq_aim_interval_msec; + uint32_t cq_aim_interrupts_rate_per_sec; + + + uint32_t cq_poll_batch_max; + uint32_t progress_engine_interval_msec; + uint32_t progress_engine_wce_max; + bool cq_keep_qp_full; + uint32_t qp_compensation_level; + + bool offloaded_sockets; + uint32_t timer_resolution_msec; + uint32_t tcp_timer_resolution_msec; + tcp_ctl_thread_t tcp_ctl_thread; + tcp_ts_opt_t tcp_ts_opt; + bool tcp_nodelay; + bool tcp_quickack; + vma_exception_handling exception_handling; + bool avoid_sys_calls_on_tcp_fd; + bool allow_privileged_sock_opt; + uint32_t wait_after_join_msec; + thread_mode_t thread_mode; + buffer_batching_mode_t buffer_batching_mode; + alloc_mode_t mem_alloc_type; + bool handle_fork; + bool close_on_dup2; + uint32_t mtu; /* 
effective MTU. If mtu==0 then auto calculate the MTU */ + uint32_t lwip_cc_algo_mod; + uint32_t lwip_mss; + char internal_thread_cpuset[FILENAME_MAX]; + char internal_thread_affinity_str[FILENAME_MAX]; + cpu_set_t internal_thread_affinity; + bool internal_thread_arm_cq_enabled; + internal_thread_tcp_timer_handling_t internal_thread_tcp_timer_handling; + bool handle_bf; + + bool enable_ipoib; + bool enable_socketxtreme; +#ifdef DEFINED_TSO + bool enable_tso; +#endif /* DEFINED_TSO */ + uint32_t timer_netlink_update_msec; + + //Neigh parameters + uint32_t neigh_uc_arp_quata; + uint32_t neigh_wait_till_send_arp_msec; + uint32_t neigh_num_err_retries; + + uint32_t vma_time_measure_num_samples; + char vma_time_measure_filename[PATH_MAX]; + sysctl_reader_t & sysctl_reader; + bool rx_poll_on_tx_tcp; + hyper_t hypervisor; + bool trigger_dummy_send_getsockname; +private: + void print_vma_load_failure_msg(); + int list_to_cpuset(char *cpulist, cpu_set_t *cpu_set); + int hex_to_cpuset(char *start, cpu_set_t *cpu_set); + int env_to_cpuset(char *orig_start, cpu_set_t *cpu_set); + void read_env_variable_with_pid(char* mce_sys_name, size_t mce_sys_max_size, char* env_ptr); + bool check_cpuinfo_flag(const char* flag); + bool cpuid_hv(); + const char* cpuid_hv_vendor(); + void read_hv(); + + // prevent unauthorized creation of objects + mce_sys_var () : sysctl_reader(sysctl_reader_t::instance()){ + // coverity[uninit_member] + get_env_params(); + } + mce_sys_var (const mce_sys_var &); + mce_sys_var & operator= (const mce_sys_var &); + + +}; + +extern mce_sys_var & safe_mce_sys(); + +#define SYS_VAR_LOG_LEVEL "VMA_TRACELEVEL" +#define SYS_VAR_LOG_DETAILS "VMA_LOG_DETAILS" +#define SYS_VAR_LOG_FILENAME "VMA_LOG_FILE" +#define SYS_VAR_STATS_FILENAME "VMA_STATS_FILE" +#define SYS_VAR_VMAD_DIR "VMA_VMAD_NOTIFY_DIR" +#define SYS_VAR_STATS_SHMEM_DIRNAME "VMA_STATS_SHMEM_DIR" +#define SYS_VAR_CONF_FILENAME "VMA_CONFIG_FILE" +#define SYS_VAR_LOG_COLORS "VMA_LOG_COLORS" +#define 
SYS_VAR_APPLICATION_ID "VMA_APPLICATION_ID" +#define SYS_VAR_HANDLE_SIGINTR "VMA_HANDLE_SIGINTR" +#define SYS_VAR_HANDLE_SIGSEGV "VMA_HANDLE_SIGSEGV" +#define SYS_VAR_STATS_FD_NUM "VMA_STATS_FD_NUM" + +#define SYS_VAR_RING_ALLOCATION_LOGIC_TX "VMA_RING_ALLOCATION_LOGIC_TX" +#define SYS_VAR_RING_ALLOCATION_LOGIC_RX "VMA_RING_ALLOCATION_LOGIC_RX" +#define SYS_VAR_RING_MIGRATION_RATIO_TX "VMA_RING_MIGRATION_RATIO_TX" +#define SYS_VAR_RING_MIGRATION_RATIO_RX "VMA_RING_MIGRATION_RATIO_RX" +#define SYS_VAR_RING_LIMIT_PER_INTERFACE "VMA_RING_LIMIT_PER_INTERFACE" +#define SYS_VAR_RING_DEV_MEM_TX "VMA_RING_DEV_MEM_TX" + +#define SYS_VAR_TX_NUM_SEGS_TCP "VMA_TX_SEGS_TCP" +#define SYS_VAR_TX_NUM_BUFS "VMA_TX_BUFS" +#ifdef DEFINED_TSO +#define SYS_VAR_TX_BUF_SIZE "VMA_TX_BUF_SIZE" +#endif /* DEFINED_TSO */ +#define SYS_VAR_TX_NUM_WRE "VMA_TX_WRE" +#define SYS_VAR_TX_NUM_WRE_TO_SIGNAL "VMA_TX_WRE_BATCHING" +#define SYS_VAR_TX_MAX_INLINE "VMA_TX_MAX_INLINE" +#define SYS_VAR_TX_MC_LOOPBACK "VMA_TX_MC_LOOPBACK" +#define SYS_VAR_TX_NONBLOCKED_EAGAINS "VMA_TX_NONBLOCKED_EAGAINS" +#define SYS_VAR_TX_PREFETCH_BYTES "VMA_TX_PREFETCH_BYTES" + +#define SYS_VAR_RX_NUM_BUFS "VMA_RX_BUFS" +#define SYS_VAR_RX_NUM_WRE "VMA_RX_WRE" +#define SYS_VAR_RX_NUM_WRE_TO_POST_RECV "VMA_RX_WRE_BATCHING" +#define SYS_VAR_RX_NUM_POLLS "VMA_RX_POLL" +#define SYS_VAR_RX_NUM_POLLS_INIT "VMA_RX_POLL_INIT" +#define SYS_VAR_RX_UDP_POLL_OS_RATIO "VMA_RX_UDP_POLL_OS_RATIO" +#define SYS_VAR_HW_TS_CONVERSION_MODE "VMA_HW_TS_CONVERSION" +// The following 2 params were replaced by VMA_RX_UDP_POLL_OS_RATIO +#define SYS_VAR_RX_POLL_OS_RATIO "VMA_RX_POLL_OS_RATIO" +#define SYS_VAR_RX_SKIP_OS "VMA_RX_SKIP_OS" +#define SYS_VAR_RX_POLL_YIELD "VMA_RX_POLL_YIELD" +#define SYS_VAR_RX_BYTE_MIN_LIMIT "VMA_RX_BYTES_MIN" +#define SYS_VAR_RX_PREFETCH_BYTES "VMA_RX_PREFETCH_BYTES" +#define SYS_VAR_RX_PREFETCH_BYTES_BEFORE_POLL "VMA_RX_PREFETCH_BYTES_BEFORE_POLL" +#define SYS_VAR_RX_CQ_DRAIN_RATE_NSEC "VMA_RX_CQ_DRAIN_RATE_NSEC" 
+#define SYS_VAR_GRO_STREAMS_MAX "VMA_GRO_STREAMS_MAX" +#define SYS_VAR_TCP_3T_RULES "VMA_TCP_3T_RULES" +#define SYS_VAR_ETH_MC_L2_ONLY_RULES "VMA_ETH_MC_L2_ONLY_RULES" +#define SYS_VAR_MC_FORCE_FLOWTAG "VMA_MC_FORCE_FLOWTAG" + +#define SYS_VAR_SELECT_CPU_USAGE_STATS "VMA_CPU_USAGE_STATS" +#define SYS_VAR_SELECT_NUM_POLLS "VMA_SELECT_POLL" +#define SYS_VAR_SELECT_POLL_OS_FORCE "VMA_SELECT_POLL_OS_FORCE" +#define SYS_VAR_SELECT_POLL_OS_RATIO "VMA_SELECT_POLL_OS_RATIO" +#define SYS_VAR_SELECT_SKIP_OS "VMA_SELECT_SKIP_OS" + +#define SYS_VAR_CQ_MODERATION_ENABLE "VMA_CQ_MODERATION_ENABLE" +#define SYS_VAR_CQ_MODERATION_COUNT "VMA_CQ_MODERATION_COUNT" +#define SYS_VAR_CQ_MODERATION_PERIOD_USEC "VMA_CQ_MODERATION_PERIOD_USEC" +#define SYS_VAR_CQ_AIM_MAX_COUNT "VMA_CQ_AIM_MAX_COUNT" +#define SYS_VAR_CQ_AIM_MAX_PERIOD_USEC "VMA_CQ_AIM_MAX_PERIOD_USEC" +#define SYS_VAR_CQ_AIM_INTERVAL_MSEC "VMA_CQ_AIM_INTERVAL_MSEC" +#define SYS_VAR_CQ_AIM_INTERRUPTS_RATE_PER_SEC "VMA_CQ_AIM_INTERRUPTS_RATE_PER_SEC" + +#define SYS_VAR_CQ_POLL_BATCH_MAX "VMA_CQ_POLL_BATCH_MAX" +#define SYS_VAR_PROGRESS_ENGINE_INTERVAL "VMA_PROGRESS_ENGINE_INTERVAL" +#define SYS_VAR_PROGRESS_ENGINE_WCE_MAX "VMA_PROGRESS_ENGINE_WCE_MAX" +#define SYS_VAR_CQ_KEEP_QP_FULL "VMA_CQ_KEEP_QP_FULL" +#define SYS_VAR_QP_COMPENSATION_LEVEL "VMA_QP_COMPENSATION_LEVEL" +#define SYS_VAR_OFFLOADED_SOCKETS "VMA_OFFLOADED_SOCKETS" +#define SYS_VAR_TIMER_RESOLUTION_MSEC "VMA_TIMER_RESOLUTION_MSEC" +#define SYS_VAR_TCP_TIMER_RESOLUTION_MSEC "VMA_TCP_TIMER_RESOLUTION_MSEC" +#define SYS_VAR_TCP_CTL_THREAD "VMA_TCP_CTL_THREAD" +#define SYS_VAR_TCP_TIMESTAMP_OPTION "VMA_TCP_TIMESTAMP_OPTION" +#define SYS_VAR_TCP_NODELAY "VMA_TCP_NODELAY" +#define SYS_VAR_TCP_QUICKACK "VMA_TCP_QUICKACK" +#define SYS_VAR_VMA_EXCEPTION_HANDLING (vma_exception_handling::getSysVar()) +#define SYS_VAR_AVOID_SYS_CALLS_ON_TCP_FD "VMA_AVOID_SYS_CALLS_ON_TCP_FD" +#define SYS_VAR_ALLOW_PRIVILEGED_SOCK_OPT "VMA_ALLOW_PRIVILEGED_SOCK_OPT" +#define 
SYS_VAR_WAIT_AFTER_JOIN_MSEC "VMA_WAIT_AFTER_JOIN_MSEC" +#define SYS_VAR_THREAD_MODE "VMA_THREAD_MODE" +#define SYS_VAR_BUFFER_BATCHING_MODE "VMA_BUFFER_BATCHING_MODE" +#define SYS_VAR_HUGETBL "VMA_HUGETBL" +#define SYS_VAR_MEM_ALLOC_TYPE "VMA_MEM_ALLOC_TYPE" +#define SYS_VAR_FORK "VMA_FORK" +#define SYS_VAR_BF "VMA_BF" +#define SYS_VAR_CLOSE_ON_DUP2 "VMA_CLOSE_ON_DUP2" +#define SYS_VAR_MTU "VMA_MTU" +#define SYS_VAR_TCP_MAX_SYN_RATE "VMA_TCP_MAX_SYN_RATE" +#define SYS_VAR_MSS "VMA_MSS" +#define SYS_VAR_TCP_CC_ALGO "VMA_TCP_CC_ALGO" +#define SYS_VAR_SPEC "VMA_SPEC" +#define SYS_VAR_SPEC_PARAM1 "VMA_SPEC_PARAM1" +#define SYS_VAR_SPEC_PARAM2 "VMA_SPEC_PARAM2" + +#define SYS_VAR_IPOIB "VMA_IPOIB" +#define SYS_VAR_SOCKETXTREME "VMA_SOCKETXTREME" +#ifdef DEFINED_TSO +#define SYS_VAR_TSO "VMA_TSO" +#endif /* DEFINED_TSO */ + +#define SYS_VAR_INTERNAL_THREAD_AFFINITY "VMA_INTERNAL_THREAD_AFFINITY" +#define SYS_VAR_INTERNAL_THREAD_CPUSET "VMA_INTERNAL_THREAD_CPUSET" +#define SYS_VAR_INTERNAL_THREAD_ARM_CQ "VMA_INTERNAL_THREAD_ARM_CQ" +#define SYS_VAR_INTERNAL_THREAD_TCP_TIMER_HANDLING "VMA_INTERNAL_THREAD_TCP_TIMER_HANDLING" + +#define SYS_VAR_NETLINK_TIMER_MSEC "VMA_NETLINK_TIMER" + +#define SYS_VAR_NEIGH_UC_ARP_QUATA "VMA_NEIGH_UC_ARP_QUATA" +#define SYS_VAR_NEIGH_UC_ARP_DELAY_MSEC "VMA_NEIGH_UC_ARP_DELAY_MSEC" +#define SYS_VAR_NEIGH_NUM_ERR_RETRIES "VMA_NEIGH_NUM_ERR_RETRIES" + +#define SYS_VAR_VMA_TIME_MEASURE_NUM_SAMPLES "VMA_TIME_MEASURE_NUM_SAMPLES" +#define SYS_VAR_VMA_TIME_MEASURE_DUMP_FILE "VMA_TIME_MEASURE_DUMP_FILE" +#define SYS_VAR_VMA_RX_POLL_ON_TX_TCP "VMA_RX_POLL_ON_TX_TCP" +#define SYS_VAR_VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME "VMA_TRIGGER_DUMMY_SEND_GETSOCKNAME" + +#define MCE_DEFAULT_LOG_FILE ("") +#define MCE_DEFAULT_CONF_FILE ("/etc/libvma.conf") +#define MCE_DEFAULT_STATS_FILE ("") +#define MCE_DEFAULT_VMAD_FOLDER (VMA_AGENT_PATH) +#define MCE_DEFAULT_STATS_SHMEM_DIR ("/tmp/") +#define MCE_DEFAULT_LOG_DETAILS (0) +#define MCE_DEFAULT_LOG_COLORS (true) 
+#define MCE_DEFAULT_APP_ID ("VMA_DEFAULT_APPLICATION_ID") +#define MCE_DEFAULT_HANDLE_SIGINTR (false) +#define MCE_DEFAULT_HANDLE_SIGFAULT (false) +#define MCE_DEFAULT_STATS_FD_NUM 100 +#define MCE_DEFAULT_RING_ALLOCATION_LOGIC_TX (RING_LOGIC_PER_INTERFACE) +#define MCE_DEFAULT_RING_ALLOCATION_LOGIC_RX (RING_LOGIC_PER_INTERFACE) +#define MCE_DEFAULT_RING_MIGRATION_RATIO_TX (100) +#define MCE_DEFAULT_RING_MIGRATION_RATIO_RX (100) +#define MCE_DEFAULT_RING_LIMIT_PER_INTERFACE (0) +#define MCE_DEFAULT_RING_DEV_MEM_TX (0) +#define MCE_DEFAULT_TCP_MAX_SYN_RATE (0) +#define MCE_DEFAULT_TX_NUM_SEGS_TCP (1000000) +#define MCE_DEFAULT_TX_NUM_BUFS (200000) +#ifdef DEFINED_TSO +#define MCE_DEFAULT_TX_BUF_SIZE (0) +#endif /* DEFINED_TSO */ +#define MCE_DEFAULT_TX_NUM_WRE (2048) +#define MCE_DEFAULT_TX_NUM_WRE_TO_SIGNAL (64) +#define MCE_DEFAULT_TX_MAX_INLINE (204) //+18(always inline ETH header) = 222 +#define MCE_DEFAULT_TX_BUILD_IP_CHKSUM (true) +#define MCE_DEFAULT_TX_MC_LOOPBACK (true) +#define MCE_DEFAULT_TX_NONBLOCKED_EAGAINS (false) +#define MCE_DEFAULT_TX_PREFETCH_BYTES (256) +#define MCE_DEFAULT_TX_BUFS_BATCH_UDP (8) +#define MCE_DEFAULT_TX_BUFS_BATCH_TCP (16) +#define MCE_DEFAULT_TX_NUM_SGE (2) +#define MCE_DEFAULT_RX_NUM_BUFS (200000) +#define MCE_DEFAULT_RX_BUFS_BATCH (64) +#define MCE_DEFAULT_RX_NUM_WRE (16000) +#define MCE_DEFAULT_RX_NUM_WRE_TO_POST_RECV (64) +#define MCE_DEFAULT_RX_NUM_SGE (1) +#define MCE_DEFAULT_RX_NUM_POLLS (100000) +#define MCE_DEFAULT_RX_NUM_POLLS_INIT (0) +#define MCE_DEFAULT_RX_UDP_POLL_OS_RATIO (100) +#define MCE_DEFAULT_HW_TS_CONVERSION_MODE (TS_CONVERSION_MODE_SYNC) +#define MCE_DEFAULT_RX_POLL_YIELD (0) +#define MCE_DEFAULT_RX_BYTE_MIN_LIMIT (65536) +#define MCE_DEFAULT_RX_PREFETCH_BYTES (256) +#define MCE_DEFAULT_RX_PREFETCH_BYTES_BEFORE_POLL (0) +#define MCE_DEFAULT_RX_CQ_DRAIN_RATE (MCE_RX_CQ_DRAIN_RATE_DISABLED) +#define MCE_DEFAULT_GRO_STREAMS_MAX (32) +#define MCE_DEFAULT_TCP_3T_RULES (false) +#define 
MCE_DEFAULT_ETH_MC_L2_ONLY_RULES (false) +#define MCE_DEFAULT_MC_FORCE_FLOWTAG (false) +#define MCE_DEFAULT_SELECT_NUM_POLLS (100000) +#define MCE_DEFAULT_SELECT_POLL_OS_FORCE (0) +#define MCE_DEFAULT_SELECT_POLL_OS_RATIO (10) +#define MCE_DEFAULT_SELECT_SKIP_OS (4) +#define MCE_DEFAULT_SELECT_CPU_USAGE_STATS (false) + +#ifdef DEFINED_IBV_CQ_ATTR_MODERATE +#define MCE_DEFAULT_CQ_MODERATION_ENABLE (true) +#else +#define MCE_DEFAULT_CQ_MODERATION_ENABLE (false) +#endif + +#define MCE_DEFAULT_CQ_MODERATION_COUNT (48) +#define MCE_DEFAULT_CQ_MODERATION_PERIOD_USEC (50) +#define MCE_DEFAULT_CQ_AIM_MAX_COUNT (560) +#define MCE_DEFAULT_CQ_AIM_MAX_PERIOD_USEC (250) +#define MCE_DEFAULT_CQ_AIM_INTERVAL_MSEC (250) +#define MCE_DEFAULT_CQ_AIM_INTERRUPTS_RATE_PER_SEC (5000) +#define MCE_DEFAULT_CQ_POLL_BATCH (16) +#define MCE_DEFAULT_PROGRESS_ENGINE_INTERVAL_MSEC (10) +#define MCE_DEFAULT_PROGRESS_ENGINE_WCE_MAX (10000) +#define MCE_DEFAULT_CQ_KEEP_QP_FULL (true) +#define MCE_DEFAULT_QP_COMPENSATION_LEVEL (256) +#define MCE_DEFAULT_INTERNAL_THREAD_ARM_CQ_ENABLED (false) +#define MCE_DEFAULT_QP_FORCE_MC_ATTACH (false) +#define MCE_DEFAULT_OFFLOADED_SOCKETS (true) +#define MCE_DEFAULT_TIMER_RESOLUTION_MSEC (10) +#define MCE_DEFAULT_TCP_TIMER_RESOLUTION_MSEC (100) +#define MCE_DEFAULT_TCP_CTL_THREAD (CTL_THREAD_DISABLE) +#define MCE_DEFAULT_TCP_TIMESTAMP_OPTION (TCP_TS_OPTION_DISABLE) +#define MCE_DEFAULT_TCP_NODELAY (false) +#define MCE_DEFAULT_TCP_QUICKACK (false) +#define MCE_DEFAULT_VMA_EXCEPTION_HANDLING (vma_exception_handling::MODE_DEFAULT) +#define MCE_DEFAULT_AVOID_SYS_CALLS_ON_TCP_FD (false) +#define MCE_DEFAULT_ALLOW_PRIVILEGED_SOCK_OPT (true) +#define MCE_DEFAULT_WAIT_AFTER_JOIN_MSEC (0) +#define MCE_DEFAULT_THREAD_MODE (THREAD_MODE_MULTI) +#define MCE_DEFAULT_BUFFER_BATCHING_MODE (BUFFER_BATCHING_WITH_RECLAIM) +#ifndef VMA_IBV_ACCESS_ALLOCATE_MR +#define MCE_DEFAULT_MEM_ALLOC_TYPE (ALLOC_TYPE_HUGEPAGES) +#else +#define MCE_DEFAULT_MEM_ALLOC_TYPE (ALLOC_TYPE_CONTIG) 
+#endif +#define MCE_DEFAULT_FORK_SUPPORT (true) +#define MCE_DEFAULT_BF_FLAG (true) +#define MCE_DEFAULT_CLOSE_ON_DUP2 (true) +#define MCE_DEFAULT_MTU (0) +#define MCE_DEFAULT_MSS (0) +#define MCE_DEFAULT_LWIP_CC_ALGO_MOD (0) +#define MCE_DEFAULT_INTERNAL_THREAD_AFFINITY (-1) +#define MCE_DEFAULT_INTERNAL_THREAD_AFFINITY_STR ("-1") +#define MCE_DEFAULT_INTERNAL_THREAD_CPUSET ("") +#define MCE_DEFAULT_INTERNAL_THREAD_TCP_TIMER_HANDLING (INTERNAL_THREAD_TCP_TIMER_HANDLING_DEFERRED) +#define MCE_DEFAULT_NETLINK_TIMER_MSEC (10000) + +#define MCE_DEFAULT_NEIGH_UC_ARP_QUATA 3 +#define MCE_DEFAULT_NEIGH_UC_ARP_DELAY_MSEC 10000 +#define MCE_DEFAULT_NEIGH_NUM_ERR_RETRIES 1 + +#define MCE_DEFAULT_TIME_MEASURE_NUM_SAMPLES (10000) +#define MCE_DEFAULT_TIME_MEASURE_DUMP_FILE "/tmp/VMA_inst.dump" + +#define MCE_MIN_NUM_SGE (1) +#define MCE_MAX_NUM_SGE (32) +#define MCE_MIN_RX_NUM_POLLS (-1) +#define MCE_MAX_RX_NUM_POLLS (100000000) +#define MCE_MIN_RX_PREFETCH_BYTES (32) /* Just enough for headers (IPoIB+IP+UDP)*/ +#define MCE_MAX_RX_PREFETCH_BYTES (2044) +#define MCE_RX_CQ_DRAIN_RATE_DISABLED (0) +#define MCE_CQ_DRAIN_INTERVAL_DISABLED (0) +#define MCE_CQ_ADAPTIVE_MODERATION_DISABLED (0) +#define MCE_MIN_CQ_POLL_BATCH (1) +#define MCE_MAX_CQ_POLL_BATCH (128) +#define MCE_DEFAULT_IPOIB_FLAG (1) +#define MCE_DEFAULT_SOCKETXTREME (false) +#ifdef DEFINED_TSO +#define MCE_DEFAULT_TSO (true) +#endif /* DEFINED_TSO */ +#define MCE_DEFAULT_RX_POLL_ON_TX_TCP (false) +#define MCE_DEFAULT_TRIGGER_DUMMY_SEND_GETSOCKNAME (false) + +#define MCE_ALIGNMENT ((unsigned long)63) +#define RX_BUF_SIZE(mtu) ((mtu) + IPOIB_HDR_LEN + GRH_HDR_LEN) // RX buffers are larger in IB +#define TX_BUF_SIZE(mtu) ((mtu) + 92) // Tx buffers are larger in Ethernet (they include L2 for RAW QP) +#define NUM_TX_WRE_TO_SIGNAL_MAX 64 +#define NUM_RX_WRE_TO_POST_RECV_MAX 1024 +#define TCP_MAX_SYN_RATE_TOP_LIMIT 100000 +#define DEFAULT_MC_TTL 64 +#define IFTYPE_PARAM_FILE "/sys/class/net/%s/type" +#define 
IFADDR_MTU_PARAM_FILE "/sys/class/net/%s/mtu" +#define UMCAST_PARAM_FILE "/sys/class/net/%s/umcast" +#define IPOIB_MODE_PARAM_FILE "/sys/class/net/%s/mode" +#define VERBS_DEVICE_PORT_PARAM_FILE "/sys/class/net/%s/dev_port" +#define VERBS_DEVICE_ID_PARAM_FILE "/sys/class/net/%s/dev_id" +#define BONDING_MODE_PARAM_FILE "/sys/class/net/%s/bonding/mode" +#define BONDING_SLAVES_PARAM_FILE "/sys/class/net/%s/bonding/slaves" +#define BONDING_ACTIVE_SLAVE_PARAM_FILE "/sys/class/net/%s/bonding/active_slave" +#define BONDING_FAILOVER_MAC_PARAM_FILE "/sys/class/net/%s/bonding/fail_over_mac" +#define BONDING_XMIT_HASH_POLICY_PARAM_FILE "/sys/class/net/%s/bonding/xmit_hash_policy" +#define BONDING_ROCE_LAG_FILE "/sys/class/net/%s/device/roce_lag_enable" +/* BONDING_SLAVE_STATE_PARAM_FILE is for kernel > 3.14 or RH7.2 and higher */ +#define BONDING_SLAVE_STATE_PARAM_FILE "/sys/class/net/%s/bonding_slave/state" +#define L2_ADDR_FILE_FMT "/sys/class/net/%.*s/address" +#define L2_BR_ADDR_FILE_FMT "/sys/class/net/%.*s/broadcast" +#define OPER_STATE_PARAM_FILE "/sys/class/net/%s/operstate" +#define RAW_QP_PRIVLIGES_PARAM_FILE "/sys/module/ib_uverbs/parameters/disable_raw_qp_enforcement" +#define FLOW_STEERING_MGM_ENTRY_SIZE_PARAM_FILE "/sys/module/mlx4_core/parameters/log_num_mgm_entry_size" +#define VIRTUAL_DEVICE_FOLDER "/sys/devices/virtual/net/%s/" +#define BOND_DEVICE_FILE "/proc/net/bonding/%s" + + +#define NETVSC_DEVICE_CLASS_FILE "/sys/class/net/%s/device/class_id" +#define NETVSC_DEVICE_LOWER_FILE "/sys/class/net/%s/lower_%s/ifindex" +#define NETVSC_DEVICE_UPPER_FILE "/sys/class/net/%s/upper_%s/ifindex" +#define NETVSC_ID "{f8615163-df3e-46c5-913f-f2d2f965ed0e}\n" + +#define MAX_STATS_FD_NUM 1024 +#define MAX_WINDOW_SCALING 14 + +#define VIRTUALIZATION_FLAG "hypervisor" + +extern bool g_b_exit; +extern bool g_is_forked_child; +extern bool g_init_global_ctors_done; + +#endif diff --git a/src/vma/util/sysctl_reader.h b/src/vma/util/sysctl_reader.h new file mode 100644 index 
0000000..b09db56 --- /dev/null +++ b/src/vma/util/sysctl_reader.h @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef SYSCNTL_READER_H_ +#define SYSCNTL_READER_H_ + +#include "vlogger/vlogger.h" +#include "utils.h" + +struct sysctl_tcp_mem { + int min_value; + int default_value; + int max_value; +}; + +class sysctl_reader_t { + +private: + + int sysctl_read(const char* path, int argument_num ,const char *format, ...){ + + FILE* pfile = fopen (path, "r"); + int ans; + + if (pfile == NULL) { + return -1; + } + + va_list arg; + va_start (arg, format); + ans = vfscanf(pfile, format, arg); + va_end (arg); + + fclose(pfile); + + if (ans != argument_num) { + return -1; + } + + return 0; + } + + void init(){ + } + + sysctl_reader_t() { + this->init(); + this->update_all(); + } + +public : + + static sysctl_reader_t & instance() { + static sysctl_reader_t the_instance; + return the_instance; + } + + void update_all(){ + get_tcp_max_syn_backlog(true); + get_listen_maxconn(true); + get_tcp_wmem(true); + get_tcp_rmem(true); + get_tcp_window_scaling(true); + get_net_core_rmem_max(true); + get_net_core_wmem_max(true); + get_net_ipv4_tcp_timestamps(true); + get_net_ipv4_ttl(true); + get_igmp_max_membership(true); + get_igmp_max_source_membership(true); + } + + int get_tcp_max_syn_backlog(bool update = false) { + static int val; + if (update) + val = read_file_to_int("/proc/sys/net/ipv4/tcp_max_syn_backlog", 1024); + return val; + } + + int get_listen_maxconn(bool update = false) { + static int val; + if (update) + val = read_file_to_int("/proc/sys/net/core/somaxconn", SOMAXCONN); + return val; + } + + const sysctl_tcp_mem *get_tcp_wmem(bool update = false) { + static sysctl_tcp_mem tcp_mem; + if (update) { + if (sysctl_read("/proc/sys/net/ipv4/tcp_wmem", 3, "%d %d %d", &tcp_mem.min_value, &tcp_mem.default_value, &tcp_mem.max_value) == -1) { + tcp_mem.min_value = 4096; + tcp_mem.default_value = 16384; + tcp_mem.max_value = 4194304; + vlog_printf(VLOG_WARNING, "sysctl_reader failed to read net.ipv4.tcp_wmem values - Using defaults : %d %d %d\n", tcp_mem.min_value, 
tcp_mem.default_value, tcp_mem.max_value); + } + } + return &tcp_mem; + } + + const sysctl_tcp_mem *get_tcp_rmem(bool update = false) { + static sysctl_tcp_mem tcp_mem; + if (update) { + if (sysctl_read("/proc/sys/net/ipv4/tcp_rmem", 3, "%d %d %d", &tcp_mem.min_value, &tcp_mem.default_value, &tcp_mem.max_value) == -1) { + // defaults were taken based on man (7) tcp + tcp_mem.min_value = 4096; + tcp_mem.default_value = 87380; + tcp_mem.max_value = 4194304; + vlog_printf(VLOG_WARNING, "sysctl_reader failed to read net.ipv4.tcp_rmem values - Using defaults : %d %d %d\n", tcp_mem.min_value, tcp_mem.default_value, tcp_mem.max_value); + } + } + return &tcp_mem; + } + + int get_tcp_window_scaling(bool update = false) { + static int val; + if (update) + val = read_file_to_int("/proc/sys/net/ipv4/tcp_window_scaling", 0); + return val; + } + + int get_net_core_rmem_max(bool update = false) { + static int val; + if (update) + val = read_file_to_int("/proc/sys/net/core/rmem_max", 229376); + return val; + } + + int get_net_core_wmem_max(bool update = false) { + static int val; + if (update) + val = read_file_to_int("/proc/sys/net/core/wmem_max", 229376); + return val; + } + + int get_net_ipv4_tcp_timestamps(bool update = false) { + static int val; + if (update) + val = read_file_to_int("/proc/sys/net/ipv4/tcp_timestamps", 0); + return val; + } + + int get_net_ipv4_ttl(bool update = false) { + static int val; + if (update) + val = read_file_to_int("/proc/sys/net/ipv4/ip_default_ttl", 64); + return val; + } + + int get_igmp_max_membership(bool update = false) { + static int val; + if (update) { + val = read_file_to_int("/proc/sys/net/ipv4/igmp_max_memberships", 1024); + if (0 > val) { + vlog_printf(VLOG_WARNING, "failed to read get_igmp_max_membership value"); + } + } + return val; + } + + int get_igmp_max_source_membership(bool update = false) { + static int val; + if (update) { + val = read_file_to_int("/proc/sys/net/ipv4/igmp_max_msf", 1024); + if (0 > val) { + 
vlog_printf(VLOG_WARNING, "failed to read get_igmp_max_source_membership value"); + } + } + return val; + } +}; + +#endif /* SYSCNTL_READER_H_ */ diff --git a/src/vma/util/to_str.h b/src/vma/util/to_str.h new file mode 100644 index 0000000..2e006fe --- /dev/null +++ b/src/vma/util/to_str.h @@ -0,0 +1,48 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +/* This class should be inherited by all classed that need to be printed +*/ +#ifndef TO_STR_H_ +#define TO_STR_H_ + +#include + +class tostr +{ +public: + virtual ~tostr(){}; + virtual const std::string to_str() const { return std::string(""); }; +}; + +#endif /* TO_STR_H_ */ diff --git a/src/vma/util/utils.cpp b/src/vma/util/utils.cpp new file mode 100644 index 0000000..c51ecf1 --- /dev/null +++ b/src/vma/util/utils.cpp @@ -0,0 +1,1108 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "utils.h" + +#include +#include +#include +#include +#include "vma/util/if.h" +#include +#include +#include +#include +#include +#include +#include //IP header (struct iphdr) definition +#ifdef HAVE_LINUX_ETHTOOL_H +#include // ioctl(SIOCETHTOOL) +#endif +#include +#include + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "vma/util/sys_vars.h" +#include "vma/util/sock_addr.h" +#include "vma/sock/sock-redirect.h" +#include "vma/util/vtypes.h" +#include "vma/ib/base/verbs_extra.h" + +#ifdef HAVE_SYS_CAPABILITY_H + #include +#endif + +using namespace std; + +#undef MODULE_NAME +#define MODULE_NAME "utils:" + + +int check_if_regular_file(char *path) +{ + static struct stat __sys_st; + + if (stat(path, &__sys_st)== 0) + { + BULLSEYE_EXCLUDE_BLOCK_START + if (!S_ISREG(__sys_st.st_mode)) + return -1; + BULLSEYE_EXCLUDE_BLOCK_END + } + + return 0; +} + +int get_sys_max_fd_num(int def_max_fd /*=1024*/) +{ + struct rlimit rlim; + BULLSEYE_EXCLUDE_BLOCK_START + if (getrlimit(RLIMIT_NOFILE, &rlim) == 0) + return rlim.rlim_cur; + BULLSEYE_EXCLUDE_BLOCK_END + return def_max_fd; +} + +int get_base_interface_name(const char *if_name, char *base_ifname, size_t sz_base_ifname) +{ + BULLSEYE_EXCLUDE_BLOCK_START + if ((!if_name) || (!base_ifname)) { + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + memset(base_ifname, 0, sz_base_ifname); + + if (get_vlan_base_name_from_ifname(if_name, base_ifname, sz_base_ifname)) { + return 0; + } + + //Am I already the base (not virtual, not alias, can be bond) + if ((!check_device_exist(if_name, VIRTUAL_DEVICE_FOLDER) || + check_device_exist(if_name, BOND_DEVICE_FILE)) && !strstr(if_name, ":")) { + snprintf(base_ifname, sz_base_ifname, "%s" ,if_name); + return 0; + } + + unsigned char vlan_if_address[MAX_L2_ADDR_LEN]; + const size_t ADDR_LEN = get_local_ll_addr(if_name, vlan_if_address, MAX_L2_ADDR_LEN, false); + if (ADDR_LEN > 0) { + struct ifaddrs *ifaddr, *ifa; + int rc = getifaddrs(&ifaddr); + 
BULLSEYE_EXCLUDE_BLOCK_START + if (rc == -1) { + __log_err("getifaddrs failed"); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + if (!strcmp(ifa->ifa_name, if_name)) { + continue; + } + + if (strstr(ifa->ifa_name, ":")) { + //alias + continue; + } + + if (check_device_exist(ifa->ifa_name, VIRTUAL_DEVICE_FOLDER)) { + //virtual + if (!check_device_exist(ifa->ifa_name, BOND_DEVICE_FILE)) { + continue; + } + } + + unsigned char tmp_mac[ADDR_LEN]; + if (ADDR_LEN == get_local_ll_addr(ifa->ifa_name, tmp_mac, ADDR_LEN, false)) { + int size_to_compare; + if (ADDR_LEN == ETH_ALEN) size_to_compare = ETH_ALEN; + else size_to_compare = IPOIB_HW_ADDR_GID_LEN; + int offset = ADDR_LEN - size_to_compare; + if (0 == memcmp(vlan_if_address + offset, tmp_mac + offset, size_to_compare) && 0 == (ifa->ifa_flags & IFF_MASTER)) { + // A bond name cannot be a base name of an interface even if both have the same MAC(ethernet) or GID(IB) addresses + snprintf(base_ifname, sz_base_ifname, "%s" ,ifa->ifa_name); + freeifaddrs(ifaddr); + __log_dbg("Found base_ifname %s for interface %s", base_ifname, if_name); + return 0; + } + } + } + + freeifaddrs(ifaddr); + } + snprintf(base_ifname, sz_base_ifname, "%s" ,if_name); + __log_dbg("no base for %s", base_ifname, if_name); + return 0; +} + +void print_roce_lag_warnings(char* interface, char* disable_path /* = NULL */, const char* port1 /* = NULL */, const char* port2 /* = NULL */) +{ + vlog_printf(VLOG_WARNING,"******************************************************************************************************\n"); + + if (port1 && port2) { + vlog_printf(VLOG_WARNING,"* Bond %s has two slaves of the same device while RoCE LAG is enabled (%s, %s).\n", interface, port1, port2); + vlog_printf(VLOG_WARNING,"* Unexpected behaviour may occur during runtime.\n"); + } else { + vlog_printf(VLOG_WARNING,"* Interface %s will not be offloaded.\n", interface); + vlog_printf(VLOG_WARNING,"* VMA cannot 
offload the device while RoCE LAG is enabled.\n"); + } + + vlog_printf(VLOG_WARNING,"* Please refer to VMA Release Notes for more info\n"); + + if (disable_path) { + vlog_printf(VLOG_WARNING,"* In order to disable RoCE LAG please use:\n"); + vlog_printf(VLOG_WARNING,"* echo 0 > %s\n", disable_path); + } + vlog_printf(VLOG_WARNING,"******************************************************************************************************\n"); +} + +void compute_tx_checksum(mem_buf_desc_t* p_mem_buf_desc, bool l3_csum, bool l4_csum) +{ + // L3 + if (l3_csum) { + struct iphdr* ip_hdr = p_mem_buf_desc->tx.p_ip_h; + ip_hdr->check = 0; // use 0 at csum calculation time + ip_hdr->check = compute_ip_checksum((unsigned short *)ip_hdr, ip_hdr->ihl * 2); + + // L4 + if (l4_csum) { + if (ip_hdr->protocol == IPPROTO_UDP) { + struct udphdr* udp_hdr = p_mem_buf_desc->tx.p_udp_h; + udp_hdr->check = 0; + __log_entry_func("using SW checksum calculation: ip_hdr->check=%d, udp_hdr->check=%d", ip_hdr->check, udp_hdr->check); + } else if (ip_hdr->protocol == IPPROTO_TCP) { + struct tcphdr* tcp_hdr = p_mem_buf_desc->tx.p_tcp_h; + tcp_hdr->check = 0; + tcp_hdr->check = compute_tcp_checksum(ip_hdr, (const uint16_t *)tcp_hdr); + __log_entry_func("using SW checksum calculation: ip_hdr->check=%d, tcp_hdr->check=%d", ip_hdr->check, tcp_hdr->check); + } + } + } +} + +unsigned short compute_ip_checksum(const unsigned short *buf, unsigned int nshort_words) +{ + unsigned long sum = 0; + + while (nshort_words--) { + sum += *buf; + buf++; + } + sum = (sum >> 16) + (sum & 0xffff); + sum += (sum >> 16); + return ~sum; +} + +/* + * get tcp checksum: given IP header and tcp segment (assume checksum field in TCP header contains zero) + * matches RFC 793 + * + * This code borrows from other places and their ideas. 
+ * */ +unsigned short compute_tcp_checksum(const struct iphdr *p_iphdr, const uint16_t *p_ip_payload) +{ + register unsigned long sum = 0; + uint16_t tcpLen = ntohs(p_iphdr->tot_len) - (p_iphdr->ihl<<2); // shift left 2 will multiply by 4 for converting to octets + + //add the pseudo header + //the source ip + sum += (p_iphdr->saddr >> 16) & 0xFFFF; + sum += (p_iphdr->saddr) & 0xFFFF; + //the dest ip + sum += (p_iphdr->daddr >> 16) & 0xFFFF; + sum += (p_iphdr->daddr) & 0xFFFF; + //protocol and reserved: 6 + sum += htons(IPPROTO_TCP); + //the length + sum += htons(tcpLen); + + //add the IP payload + while (tcpLen > 1) { + sum += * p_ip_payload++; + tcpLen -= 2; + } + //if any bytes left, pad the bytes and add + if(tcpLen > 0) { + sum += ((*p_ip_payload)&htons(0xFF00)); + } + //Fold 32-bit sum to 16 bits: add carrier to result + while (sum>>16) { + sum = (sum & 0xffff) + (sum >> 16); + } + sum = ~sum; + //computation result + return (unsigned short)sum; +} + +/* set udp checksum: given IP header and UDP datagram + * + * (assume checksum field in UDP header contains zero) + * This code borrows from other places and their ideas. + * Although according to rfc 768, If the computed checksum is zero, it is transmitted as all ones - + * this method will return the original value. 
+ */ +unsigned short compute_udp_checksum_rx(const struct iphdr *p_iphdr, const struct udphdr *udphdrp, mem_buf_desc_t* p_rx_wc_buf_desc) +{ + register unsigned long sum = 0; + unsigned short udp_len = htons(udphdrp->len); + const uint16_t *p_ip_payload = (const uint16_t *) udphdrp; + mem_buf_desc_t *p_ip_frag = p_rx_wc_buf_desc; + unsigned short ip_frag_len = p_ip_frag->rx.frag.iov_len + sizeof(struct udphdr); + unsigned short ip_frag_remainder = ip_frag_len; + + //add the pseudo header + sum += (p_iphdr->saddr >> 16) & 0xFFFF; + sum += (p_iphdr->saddr) & 0xFFFF; + //the dest ip + sum += (p_iphdr->daddr >> 16) & 0xFFFF; + sum += (p_iphdr->daddr) & 0xFFFF; + //protocol and reserved: 17 + sum += htons(IPPROTO_UDP); + //the length + sum += udphdrp->len; + + //add the IP payload + while (udp_len > 1) { + // Each packet but the last must contain a payload length that is a multiple of 8 + if (!ip_frag_remainder && p_ip_frag->p_next_desc) { + p_ip_frag = p_ip_frag->p_next_desc; + p_ip_payload = (const uint16_t *) p_ip_frag->rx.frag.iov_base; + ip_frag_remainder = ip_frag_len = p_ip_frag->rx.frag.iov_len; + } + + while (ip_frag_remainder > 1) { + sum += * p_ip_payload++; + ip_frag_remainder -= 2; + } + + udp_len -= (ip_frag_len - ip_frag_remainder); + } + + //if any bytes left, pad the bytes and add + if(udp_len > 0) { + sum += ((*p_ip_payload)&htons(0xFF00)); + } + + //Fold sum to 16 bits: add carrier to result + while (sum >> 16) { + sum = (sum & 0xffff) + (sum >> 16); + } + + sum = ~sum; + //computation result + return (unsigned short)sum; +} + +/** + * Copy iovec to buffer + * Returns total bytes copyed + */ +int memcpy_fromiovec(u_int8_t* p_dst, const struct iovec* p_iov, size_t sz_iov, size_t sz_src_start_offset, size_t sz_data) +{ + /* Skip to start offset */ + int n_iovpos = 0; + while (n_iovpos < (int)sz_iov && sz_src_start_offset >= p_iov[n_iovpos].iov_len) { + sz_src_start_offset -= p_iov[n_iovpos].iov_len; + n_iovpos++; + } + + /* Copy len size into pBuf */ + 
int n_total = 0; + while (n_iovpos < (int)sz_iov && sz_data > 0) { + if (p_iov[n_iovpos].iov_len) + { + u_int8_t* p_src = ((u_int8_t*)(p_iov[n_iovpos].iov_base)) + sz_src_start_offset; + int sz_data_block_to_copy = min(sz_data, p_iov[n_iovpos].iov_len - sz_src_start_offset); + sz_src_start_offset = 0; + + memcpy(p_dst, p_src, sz_data_block_to_copy); + + p_dst += sz_data_block_to_copy; + sz_data -= sz_data_block_to_copy; + n_total += sz_data_block_to_copy; + } + n_iovpos++; + } + return n_total; +} + +int netmask_bitcount(uint32_t netmask) +{ + // Sparse Ones runs in time proportional to the number of 1 bits. + // The mystical line n &= (n - 1) simply sets the rightmost 1 bit in n to 0. + int cnt = 0; + while (netmask) { + cnt++; + netmask &= (netmask - 1); + } + return cnt; +} + +void set_fd_block_mode(int fd, bool b_block) +{ + __log_dbg("fd[%d]: setting to %sblocking mode (%d)", fd, b_block?"":"non-", b_block); + + int flags = orig_os_api.fcntl(fd, F_GETFL); + BULLSEYE_EXCLUDE_BLOCK_START + if (flags < 0) { + __log_err("failed reading fd[%d] flag (rc=%d errno=%d %m)", fd, flags, errno); + return; + } + BULLSEYE_EXCLUDE_BLOCK_END + + if (b_block) + flags &= ~O_NONBLOCK; + else + flags |= O_NONBLOCK; + + int ret = orig_os_api.fcntl(fd, F_SETFL, flags); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + __log_err("failed changing fd[%d] to %sblocking mode (rc=%d errno=%d %m)", fd, b_block?"":"non-", flags, ret, errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + + return; +} + +bool compare_double(double a, double b) +{ + return fabs(a - b) < std::numeric_limits::epsilon(); +} + +const char* iphdr_protocol_type_to_str(const int type) +{ + BULLSEYE_EXCLUDE_BLOCK_START + switch (type) { + case IPPROTO_TCP: return "TCP"; + case IPPROTO_UDP: return "UDP"; + default: break; + } + return "Not supported"; + BULLSEYE_EXCLUDE_BLOCK_END +} + +int priv_read_file(const char *path, char *buf, size_t size, vlog_levels_t log_level /*= VLOG_ERROR*/) +{ + int len = -1; + int fd = open(path, 
O_RDONLY); + BULLSEYE_EXCLUDE_BLOCK_START + if (fd < 0) { + VLOG_PRINTF(log_level, "ERROR while opening file %s (errno %d %m)", path, errno); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + len = read(fd, buf, size); + BULLSEYE_EXCLUDE_BLOCK_START + if (len < 0) { + VLOG_PRINTF(log_level, "ERROR while reading from file %s (errno %d %m)", path, errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + close(fd); + return len; +} + +int read_file_to_int(const char *path, int default_value) +{ + char buf[25]; + int rc = priv_safe_read_file(path, buf, sizeof buf); + if (rc < 0) { + __log_warn("ERROR while getting int from from file %s, we'll use default %d", path, default_value); + } + return (rc < 0) ? default_value : atoi(buf); +} + +int get_ifinfo_from_ip(const struct sockaddr& addr, char* ifname, uint32_t& ifflags) +{ + struct ifaddrs *ifap = NULL; + struct ifaddrs *ifaphead = NULL; + + __log_func("checking local interface: %d.%d.%d.%d", NIPQUAD(get_sa_ipv4_addr(addr))); + + // Get interface list info + if (!getifaddrs(&ifaphead)) { + + // Find our interface + for (ifap = ifaphead; ifap; ifap = ifap->ifa_next) { + if (ifap->ifa_netmask == NULL) + continue; + __log_func("interface '%s': %d.%d.%d.%d/%d%s%s%s%s%s%s%s%s%s%s", + ifap->ifa_name, + NIPQUAD(get_sa_ipv4_addr(ifap->ifa_addr)), + netmask_bitcount(get_sa_ipv4_addr(ifap->ifa_netmask)), + (ifap->ifa_flags & IFF_UP ? " UP":""), + (ifap->ifa_flags & IFF_RUNNING ? " RUNNING":""), + (ifap->ifa_flags & IFF_NOARP ? " NO_ARP":""), + (ifap->ifa_flags & IFF_LOOPBACK ? " LOOPBACK":""), + (ifap->ifa_flags & IFF_BROADCAST ? " BROADCAST":""), + (ifap->ifa_flags & IFF_MULTICAST ? " MULTICAST":""), + (ifap->ifa_flags & IFF_MASTER ? " MASTER":""), + (ifap->ifa_flags & IFF_SLAVE ? " SLAVE":""), + (ifap->ifa_flags & IFF_DEBUG ? " IFF_DEBUG":""), + (ifap->ifa_flags & IFF_PROMISC ? 
" IFF_PROMISC":"") + ); + + if (get_sa_ipv4_addr(ifap->ifa_addr) == get_sa_ipv4_addr(addr)) { + + // Found match to users request + // Copy specific ifaddrs info to user + ifflags = ifap->ifa_flags; + strncpy(ifname, ifap->ifa_name, IFNAMSIZ); + __log_dbg("matching device found for ip '%d.%d.%d.%d' on '%s' (flags=%#X)", + NIPQUAD(get_sa_ipv4_addr(addr)), ifname, ifflags); + __log_dbg("interface '%s': %d.%d.%d.%d/%d%s%s%s%s%s%s%s%s%s%s", + ifap->ifa_name, + NIPQUAD(get_sa_ipv4_addr(ifap->ifa_addr)), + netmask_bitcount(get_sa_ipv4_addr(ifap->ifa_netmask)), + (ifap->ifa_flags & IFF_UP ? " UP":""), + (ifap->ifa_flags & IFF_RUNNING ? " RUNNING":""), + (ifap->ifa_flags & IFF_NOARP ? " NO_ARP":""), + (ifap->ifa_flags & IFF_LOOPBACK ? " LOOPBACK":""), + (ifap->ifa_flags & IFF_BROADCAST ? " BROADCAST":""), + (ifap->ifa_flags & IFF_MULTICAST ? " MULTICAST":""), + (ifap->ifa_flags & IFF_MASTER ? " MASTER":""), + (ifap->ifa_flags & IFF_SLAVE ? " SLAVE":""), + (ifap->ifa_flags & IFF_DEBUG ? " IFF_DEBUG":""), + (ifap->ifa_flags & IFF_PROMISC ? " IFF_PROMISC":"") + ); + + freeifaddrs(ifaphead); + return 0; + } + } + } + else { + __log_dbg("ERROR from getifaddrs() (errno=%d %m)", errno); + } + + __log_dbg("can't find local if address %d.%d.%d.%d in ifaddr list", NIPQUAD(get_sa_ipv4_addr(addr))); + + if (ifaphead) + freeifaddrs(ifaphead); + + return -1; +} + +int get_port_from_ifname(const char* ifname) +{ + int port_num, dev_id = -1, dev_port = -1; + // Depending of kernel version and OFED stack the files containing dev_id and dev_port may not exist. 
+ // if file reading fails *dev_id or *dev_port may remain unmodified + char num_buf[24] = {0}; + char dev_path[256] = {0}; + snprintf(dev_path, sizeof(dev_path), VERBS_DEVICE_PORT_PARAM_FILE, ifname); + if (priv_safe_try_read_file(dev_path, num_buf, sizeof(num_buf)) > 0) { + dev_port = strtol(num_buf, NULL, 0); // base=0 means strtol() can parse hexadecimal and decimal + __log_dbg("dev_port file=%s dev_port str=%s dev_port val=%d", dev_path, num_buf, dev_port); + } + snprintf(dev_path, sizeof(dev_path), VERBS_DEVICE_ID_PARAM_FILE, ifname); + if (priv_safe_try_read_file(dev_path, num_buf, sizeof(num_buf)) > 0) { + dev_id = strtol(num_buf, NULL, 0); // base=0 means strtol() can parse hexadecimal and decimal + __log_dbg("dev_id file= %s dev_id str=%s dev_id val=%d", dev_path, num_buf, dev_id); + } + + // take the max between dev_port and dev_id as port number + port_num = (dev_port > dev_id) ? dev_port : dev_id; + return ++port_num; +} + +int get_iftype_from_ifname(const char* ifname) +{ + __log_func("find interface type for ifname '%s'", ifname); + + char iftype_filename[100]; + char iftype_value_str[32]; + char base_ifname[32]; + char iftype_value = -1; + + get_base_interface_name(ifname, base_ifname, sizeof(base_ifname)); + sprintf(iftype_filename, IFTYPE_PARAM_FILE, base_ifname); + BULLSEYE_EXCLUDE_BLOCK_START + if (priv_read_file(iftype_filename, iftype_value_str, sizeof(iftype_value_str)) > 0) { + iftype_value = atoi(iftype_value_str); + } + BULLSEYE_EXCLUDE_BLOCK_END + return iftype_value; +} + +int get_if_mtu_from_ifname(const char* ifname) +{ + __log_func("find interface mtu for ifname '%s'", ifname); + + char if_mtu_len_filename[100]; + char if_mtu_value_str[32]; + char base_ifname[32]; + int if_mtu_value = 0; + + /* initially try reading MTU from ifname. 
In case of failure (expected in alias ifnames) - try reading MTU from base ifname */ + sprintf(if_mtu_len_filename, IFADDR_MTU_PARAM_FILE, ifname); + + if (priv_safe_try_read_file(if_mtu_len_filename, if_mtu_value_str, sizeof(if_mtu_value_str)) > 0) { + if_mtu_value = atoi(if_mtu_value_str); + } + else { + get_base_interface_name(ifname, base_ifname, sizeof(base_ifname)); + sprintf(if_mtu_len_filename, IFADDR_MTU_PARAM_FILE, base_ifname); + if (priv_safe_try_read_file(if_mtu_len_filename, if_mtu_value_str, sizeof(if_mtu_value_str)) > 0) { + if_mtu_value = atoi(if_mtu_value_str); + } + } + return if_mtu_value; +} + +int get_window_scaling_factor(int tcp_rmem_max, int core_rmem_max) +{ + __log_func("calculate OS tcp scaling window factor"); + + int scaling_factor = 0; + int space = MAX(tcp_rmem_max, core_rmem_max); + + while (space > 0xffff && scaling_factor < MAX_WINDOW_SCALING) { + space >>= 1; + scaling_factor++; + } + + __log_dbg("TCP scaling window factor is set to %d", scaling_factor); + return scaling_factor; +} + +int get_ipv4_from_ifname(char *ifname, struct sockaddr_in *addr) +{ + int ret = -1; + __log_func("find ip addr for ifname '%s'", ifname); + + int fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + BULLSEYE_EXCLUDE_BLOCK_START + if (fd < 0) { + __log_err("ERROR from socket() (errno=%d %m)", errno); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + + struct ifreq req; + memset(&req, 0, sizeof(req)); + strncpy(req.ifr_name, ifname, IFNAMSIZ-1); + ret = orig_os_api.ioctl(fd, SIOCGIFADDR, &req); + BULLSEYE_EXCLUDE_BLOCK_START + if (ret < 0) { + if (errno != ENODEV) { + __log_dbg("Failed getting ipv4 from interface '%s' (errno=%d %m)", ifname, errno); + } + else { + // Log in DEBUG (Maybe there is a better way to catch IPv6 only interfaces and not to get to this point?) 
+ __log_dbg("Failed getting ipv4 from interface '%s' (errno=%d %m)", ifname, errno); + } + orig_os_api.close(fd); + return -1; + } + + if (req.ifr_addr.sa_family != AF_INET) { + __log_err("%s: address family %d is not supported", ifname, req.ifr_addr.sa_family); + orig_os_api.close(fd); + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + memcpy(addr, &req.ifr_addr, sizeof(*addr)); + orig_os_api.close(fd); + return 0; +} + +int get_ipv4_from_ifindex(int ifindex, struct sockaddr_in *addr) +{ + char if_name[IFNAMSIZ]; + //todo should we use get_base_interface after getting the name? + BULLSEYE_EXCLUDE_BLOCK_START + if (if_indextoname(ifindex, if_name) && get_ipv4_from_ifname(if_name, addr) == 0) { + return 0; + } + BULLSEYE_EXCLUDE_BLOCK_END + return -1; +} + +uint16_t get_vlan_id_from_ifname(const char* ifname) +{ + // find vlan id from interface name + struct vlan_ioctl_args ifr; + int fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + + if (fd < 0) { + __log_err("ERROR from socket() (errno=%d %m)", errno); + return -1; + } + memset(&ifr, 0, sizeof(ifr)); + ifr.cmd = GET_VLAN_VID_CMD; + strncpy(ifr.device1, ifname, sizeof(ifr.device1) - 1); + + if (orig_os_api.ioctl(fd, SIOCGIFVLAN, &ifr) < 0) + { + __log_dbg("Failure in ioctl(SIOCGIFVLAN, cmd=GET_VLAN_VID_CMD) for interface '%s' (errno=%d %m)", ifname, errno); + orig_os_api.close(fd); + return 0; + } + + orig_os_api.close(fd); + + __log_dbg("found vlan id '%d' for interface '%s'", ifr.u.VID, ifname); + + return ifr.u.VID; +} + +size_t get_vlan_base_name_from_ifname(const char* ifname, char* base_ifname, size_t sz_base_ifname) +{ + // find vlan base name from interface name + struct vlan_ioctl_args ifr; + int fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) { + __log_err("ERROR from socket() (errno=%d %m)", errno); + return -1; + } + memset(&ifr,0, sizeof(ifr)); + ifr.cmd = GET_VLAN_REALDEV_NAME_CMD; + strncpy(ifr.device1, ifname, sizeof(ifr.device1) - 1); + + if (orig_os_api.ioctl(fd, SIOCGIFVLAN, &ifr) < 
0) + { + __log_dbg("Failure in ioctl(SIOCGIFVLAN, cmd=GET_VLAN_REALDEV_NAME_CMD) for interface '%s' (errno=%d %m)", ifname, errno); + orig_os_api.close(fd); + return 0; + } + + orig_os_api.close(fd); + + size_t name_len = strlen(ifr.u.device2); + if (base_ifname && name_len > 0) { + __log_dbg("found vlan base name '%s' for interface '%s'", ifr.u.device2, ifname); + strncpy(base_ifname, ifr.u.device2, sz_base_ifname); + return name_len; + } + + __log_dbg("did not find vlan base name for interface '%s'", ifname); + + return 0; +} + +int run_and_retreive_system_command(const char* cmd_line, char* return_str, int return_str_len) +{ + // TODO: NOTICE the current code will change the environment for all threads of our process + + BULLSEYE_EXCLUDE_BLOCK_START + if (!cmd_line) return -1; + if (return_str_len <= 0) return -1; + BULLSEYE_EXCLUDE_BLOCK_END + + // 29West may load vma dynamically (like sockperf with --load-vma) + for (int i=0; environ[i]; i++ ) { + if ( strstr(environ[i],"LD_PRELOAD=") ) { + environ[i][0] = '_'; + } + } + + // run system command and get response from FILE* + int rc = -1; + + FILE* file = popen(cmd_line, "r"); + if (file) { + int fd = fileno(file); + if (fd > 0) { + int actual_len = read(fd, return_str, return_str_len - 1); + if (actual_len > 0) { + return_str[actual_len] = '\0'; + } else { + return_str[0] = '\0'; + } + } + + // Check exit status code + rc = pclose(file); + if (rc == -1 && errno == ECHILD) { + /* suppress a case when termination status can be unavailable to pclose() */ + rc = 0; + } + + for (int i = 0; environ[i]; i++) { + if (strstr(environ[i], "_D_PRELOAD=")) { + environ[i][0] = 'L'; + } + } + } + return ((!rc && return_str) ? 
0 : -1); +} + +size_t get_local_ll_addr(IN const char * ifname, OUT unsigned char* addr, IN int addr_len, bool is_broadcast) +{ + char l2_addr_path[256] = {0}; + char buf[256] = {0}; + + // In case of alias (ib0/eth0:xx) take only the device name for that interface (ib0/eth0) + size_t ifname_len = strcspn(ifname, ":"); // TODO: this is temp code till we get base interface for any alias format of an interface + const char * l2_addr_path_fmt = is_broadcast ? L2_BR_ADDR_FILE_FMT : L2_ADDR_FILE_FMT; + snprintf(l2_addr_path, sizeof(l2_addr_path)-1, l2_addr_path_fmt, ifname_len, ifname); + + int len = priv_read_file(l2_addr_path, buf, sizeof(buf)); + int bytes_len = (len + 1) / 3; // convert len from semantic of hex format L2 address with ':' delimiter (and optional newline character) into semantic of byte array + __log_dbg("ifname=%s un-aliased-ifname=%.*s l2_addr_path=%s l2-addr=%s (addr-bytes_len=%d)", ifname, ifname_len, ifname, l2_addr_path, buf, bytes_len); + + BULLSEYE_EXCLUDE_BLOCK_START + if (len < 0) return 0; // failure in priv_read_file + if (addr_len < bytes_len) return 0; // error not enough room was provided by caller + BULLSEYE_EXCLUDE_BLOCK_END + + if (bytes_len == IPOIB_HW_ADDR_LEN && addr_len >= IPOIB_HW_ADDR_LEN) { // addr_len >= IPOIB_HW_ADDR_LEN is just for silencing coverity + sscanf(buf, IPOIB_HW_ADDR_SSCAN_FMT, IPOIB_HW_ADDR_SSCAN(addr)); + __log_dbg("found IB %s address " IPOIB_HW_ADDR_PRINT_FMT " for interface %s", is_broadcast?"BR":"UC", IPOIB_HW_ADDR_PRINT_ADDR(addr), ifname); + } + else if (bytes_len == ETH_ALEN) { + sscanf(buf, ETH_HW_ADDR_SSCAN_FMT, ETH_HW_ADDR_SSCAN(addr)); + __log_dbg("found ETH %s address" ETH_HW_ADDR_PRINT_FMT " for interface %s", is_broadcast?"BR":"UC", ETH_HW_ADDR_PRINT_ADDR(addr), ifname); + } + else { + return 0; // error + } + + return bytes_len; // success +} + +bool get_bond_active_slave_name(IN const char* bond_name, OUT char* active_slave_name, IN int sz) +{ + char active_slave_path[256] = {0}; + 
sprintf(active_slave_path, BONDING_ACTIVE_SLAVE_PARAM_FILE, bond_name); + BULLSEYE_EXCLUDE_BLOCK_START + if (priv_safe_read_file(active_slave_path, active_slave_name, sz) < 0) + return false; + if (strlen(active_slave_name) == 0) + return false; + BULLSEYE_EXCLUDE_BLOCK_END + char* p = strchr(active_slave_name, '\n'); + if (p) *p = '\0'; // Remove the tailing 'new line" char + return true; +} + +bool check_bond_roce_lag_exist(OUT char* bond_roce_lag_path, int sz, IN const char* slave_name) +{ + char sys_res[1024] = {0}; + snprintf(bond_roce_lag_path, sz, BONDING_ROCE_LAG_FILE, slave_name); + if (priv_read_file(bond_roce_lag_path, sys_res, 1024, VLOG_FUNC) > 0) { + if (strtol(sys_res, NULL,10) > 0 && errno != ERANGE) { + return true; + } + } + + return false; +} + +bool get_netvsc_slave(IN const char* ifname, OUT char* slave_name, OUT unsigned int &slave_flags) +{ + char netvsc_path[256]; + char base_ifname[IFNAMSIZ]; + get_base_interface_name(ifname, base_ifname, sizeof(base_ifname)); + struct ifaddrs *ifaddr, *ifa; + bool ret = false; + + if (getifaddrs(&ifaddr) == -1) { + __log_err("getifaddrs() failed (errno = %d %m)", errno); + return ret; + } + + for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + snprintf(netvsc_path, sizeof(netvsc_path), NETVSC_DEVICE_LOWER_FILE, base_ifname, ifa->ifa_name); + int fd = open(netvsc_path, O_RDONLY); + if (fd >= 0) { + close(fd); + memcpy(slave_name, ifa->ifa_name, IFNAMSIZ); + slave_flags = ifa->ifa_flags; + __log_dbg("Found slave_name = %s, slave_flags = %u", slave_name, slave_flags); + ret = true; + break; + } + } + + freeifaddrs(ifaddr); + + return ret; +} + +bool check_netvsc_device_exist(const char* ifname) +{ + char device_path[256] = {0}; + char base_ifname[IFNAMSIZ]; + get_base_interface_name(ifname, base_ifname, sizeof(base_ifname)); + sprintf(device_path, NETVSC_DEVICE_CLASS_FILE, base_ifname); + char sys_res[1024] = {0}; + if (priv_read_file(device_path, sys_res, 1024, VLOG_FUNC) > 0) { + if (strcmp(sys_res, 
NETVSC_ID) == 0) { + return true; + } + } + + return false; +} + +/* + * this function will work only for kernel > 3.14 or RH7.2 and higher + */ +bool get_bond_slave_state(IN const char* slave_name, OUT char* curr_state, IN int sz) +{ + char bond_slave_state_path[256] = {0}; + sprintf(bond_slave_state_path, BONDING_SLAVE_STATE_PARAM_FILE, slave_name); + BULLSEYE_EXCLUDE_BLOCK_START + if (priv_safe_try_read_file(bond_slave_state_path, curr_state, sz) < 0) + return false; + BULLSEYE_EXCLUDE_BLOCK_END + char* p = strchr(curr_state, '\n'); + if (p) *p = '\0'; // Remove the tailing 'new line" char + return true; +} + +bool get_bond_slaves_name_list(IN const char* bond_name, OUT char* slaves_list, IN int sz) +{ + char slaves_list_path[256] = {0}; + sprintf(slaves_list_path, BONDING_SLAVES_PARAM_FILE, bond_name); + BULLSEYE_EXCLUDE_BLOCK_START + if (priv_safe_read_file(slaves_list_path, slaves_list, sz) < 0) + return false; + BULLSEYE_EXCLUDE_BLOCK_END + char* p = strchr(slaves_list, '\n'); + if (p) *p = '\0'; // Remove the tailing 'new line" char + return true; +} + +bool check_device_exist(const char* ifname, const char *path) +{ + char device_path[256] = {0}; + sprintf(device_path, path, ifname); + int fd = orig_os_api.open(device_path, O_RDONLY); + if (fd >= 0) + orig_os_api.close(fd); + if (fd < 0 && errno == EMFILE) { + __log_warn("There are no free fds in the system. 
This may cause unexpected behavior"); + } + return (fd > 0); +} + +bool check_device_name_ib_name(const char* ifname, const char* ibname) +{ + int n = -1; + int fd = -1; + char ib_path[IBV_SYSFS_PATH_MAX]= {0}; + + n = snprintf(ib_path, sizeof(ib_path), "/sys/class/infiniband/%s/device/net/%s/ifindex", + ibname, ifname); + if (likely((0 < n) && (n < (int)sizeof(ib_path)))) { + fd = open(ib_path, O_RDONLY); + if (fd >= 0) { + close(fd); + return true; + } + } + + return false; +} + +bool get_interface_oper_state(IN const char* interface_name, OUT char* curr_state, IN int sz) +{ + char interface_state_path[256] = {0}; + sprintf(interface_state_path, OPER_STATE_PARAM_FILE, interface_name); + BULLSEYE_EXCLUDE_BLOCK_START + if (priv_safe_read_file(interface_state_path, curr_state, sz) < 0) + return false; + BULLSEYE_EXCLUDE_BLOCK_END + char* p = strchr(curr_state, '\n'); + if (p) *p = '\0'; // Remove the tailing 'new line" char + return true; +} + +int validate_ipoib_prop(const char* ifname, unsigned int ifflags, + const char prop_file[], const char *expected_val, + int val_size, OUT char *filename, OUT char* base_ifname) +{ + char mode[10]; + char ifname_tmp[IFNAMSIZ]; + char active_slave_name[IFNAMSIZ]; + + // In case of alias (ib0:xx) take only the device name for that interface (ib0) + strncpy(ifname_tmp, ifname, sizeof(ifname_tmp) - 1); + ifname_tmp[sizeof(ifname_tmp) - 1] = '\0'; + base_ifname = strtok(ifname_tmp, ":"); + + if (ifflags & IFF_MASTER) { + // this is a bond interface, let find the slave + BULLSEYE_EXCLUDE_BLOCK_START + if (!get_bond_active_slave_name(base_ifname, active_slave_name, IFNAMSIZ)) { + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + sprintf(filename, prop_file, active_slave_name); + } else { + sprintf(filename, prop_file, base_ifname); + } + + BULLSEYE_EXCLUDE_BLOCK_START + if (priv_read_file(filename, mode, val_size) <= 0) { + return -1; + } + BULLSEYE_EXCLUDE_BLOCK_END + if (strncmp(mode, expected_val, val_size)) { + return 1; + } else { + 
return 0; + } +} + +//NOTE RAW_QP_PRIVLIGES_PARAM_FILE does not exist on upstream drivers +int validate_raw_qp_privliges() +{ + // RAW_QP_PRIVLIGES_PARAM_FILE: "/sys/module/ib_uverbs/parameters/disable_raw_qp_enforcement" + char raw_qp_privliges_value = 0; + if (priv_read_file((const char*)RAW_QP_PRIVLIGES_PARAM_FILE, &raw_qp_privliges_value, 1, VLOG_DEBUG) <= 0) { + return -1; + } + if (raw_qp_privliges_value != '1') { + return 0; + } + return 1; +} + +bool validate_user_has_cap_net_raw_privliges() +{ +#ifdef HAVE_SYS_CAPABILITY_H + struct __user_cap_header_struct cap_header; + cap_user_header_t cap_header_ptr = &cap_header; + struct __user_cap_data_struct cap_data; + cap_user_data_t cap_data_ptr = &cap_data; + cap_header_ptr->pid = getpid(); + cap_header_ptr->version = _LINUX_CAPABILITY_VERSION; + if(capget(cap_header_ptr, cap_data_ptr) < 0) { + __log_dbg("error getting cap_net_raw permissions (%d %m)", errno); + return false; + } else { + __log_dbg("successfully got cap_net_raw permissions. 
Effective=%X Permitted=%X", cap_data_ptr->effective, cap_data_ptr->permitted); + } + return ((cap_data_ptr->effective & CAP_TO_MASK(CAP_NET_RAW)) != 0); +#else + __log_dbg("libcap-devel library is not installed, skipping cap_net_raw permission checks"); + return false; +#endif +} + +int validate_tso(int if_index) +{ +#ifdef HAVE_LINUX_ETHTOOL_H + int ret = -1; + int fd = -1; + struct ifreq req; + struct ethtool_value eval; + + fd = orig_os_api.socket(AF_INET, SOCK_DGRAM, 0); + if (fd < 0) { + __log_err("ERROR from socket() (errno=%d %m)", errno); + return -1; + } + memset(&req, 0, sizeof(req)); + eval.cmd = ETHTOOL_GTSO; + req.ifr_ifindex = if_index; + if_indextoname(if_index, req.ifr_name); + req.ifr_data = (char *)&eval; + ret = orig_os_api.ioctl(fd, SIOCETHTOOL, &req); + if (ret < 0) { + __log_dbg("ioctl(SIOCETHTOOL) cmd=ETHTOOL_GTSO (errno=%d %m)", errno); + } else { + ret = eval.data; + } + orig_os_api.close(fd); + return ret; +#else + NOT_IN_USE(if_index); + return -1; +#endif +} + +loops_timer::loops_timer() +{ + m_timeout_msec = -1; + m_timer_countdown = 0; + m_interval_it = 2048; + ts_clear(&m_start); + ts_clear(&m_elapsed); + ts_clear(&m_current); +} + +void loops_timer::start() +{ + ts_clear(&m_start); + // set to 1 so the first loop is fast and only after it m_start will be initialized + m_timer_countdown = 1; +} + +int loops_timer::time_left_msec() +{ + if ( m_timeout_msec == -1 ) + return -1; + + if (!ts_isset(&m_start)) { //VMA_RX_POLL==0 + gettime(&m_start); + } + timespec current; + gettime(¤t); + ts_sub(¤t, &m_start, &m_elapsed); + + //cover the case of left<0 + return (m_timeout_msec-ts_to_msec(&m_elapsed))>0 ? m_timeout_msec-ts_to_msec(&m_elapsed) : 0; +} + +/////////////////////////////////////////// +uint32_t fd2inode(int fd) +{ + struct stat buf; + int rc = fstat(fd, &buf); + return rc==0 ? 
buf.st_ino : 0; // no inode is 0 +} + +/////////////////////////////////////////// +vma_error::vma_error(const char* _message, const char* _function, const char* _filename, int _lineno, int _errnum) throw() + : message(_message), function(_function), filename(_filename), lineno(_lineno), errnum(_errnum) +{ + snprintf(formatted_message, sizeof(formatted_message), "vma_error <%s> (errno=%d %s) in %s:%d", message, errnum, strerror(errnum), filename, lineno); + formatted_message[ sizeof(formatted_message)-1 ] = '\0'; +} + +vma_error::~vma_error() throw() +{ +} + +const char* vma_error::what() const throw() +{ + return formatted_message; +} + +/////////////////////////////////////////// diff --git a/src/vma/util/utils.h b/src/vma/util/utils.h new file mode 100644 index 0000000..41697eb --- /dev/null +++ b/src/vma/util/utils.h @@ -0,0 +1,525 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef UTILS_H +#define UTILS_H + +#include +#include +#include +#include +#include +#include + +#include "vtypes.h" +#include "utils/rdtsc.h" +#include "vlogger/vlogger.h" +#include "vma/proto/mem_buf_desc.h" +#include "vma/util/vma_stats.h" + +struct iphdr; //forward declaration + +#define VMA_ALIGN(x, y) ((((x) + (y) - 1) / (y)) * (y) ) + +#define ARRAY_SIZE(arr) (sizeof(arr) / sizeof(arr[0])) + +/** +* Check if file type is regular +**/ +int check_if_regular_file (char *path); + +/** + * L3 and L4 Header Checksum Calculation + */ +void compute_tx_checksum(mem_buf_desc_t* p_mem_buf_desc, bool l3_csum, bool l4_csum); + +/** + * IP Header Checksum Calculation + */ +unsigned short compute_ip_checksum(const unsigned short *buf, unsigned int nshort_words); + +/** +* get tcp checksum: given IP header and tcp segment (assume checksum field in TCP header contains zero) +* matches RFC 793 +*/ +unsigned short compute_tcp_checksum(const struct iphdr *p_iphdr, const uint16_t *p_ip_payload); + +/** +* get udp checksum: given IP header and UDP datagram (assume checksum field in UDP header contains zero) +* matches RFC 793 +*/ +unsigned short compute_udp_checksum_rx(const struct iphdr *p_iphdr, const struct udphdr *udphdrp, mem_buf_desc_t* p_rx_wc_buf_desc); + +/** + * get user space max number of open fd's using getrlimit, default parameter equals to 1024 + */ + +int get_sys_max_fd_num(int def_max_fd=1024); + +/** + * iovec extensions + * Returns total bytes copyed + */ +int memcpy_fromiovec(u_int8_t* p_dst, const struct iovec* p_iov, size_t sz_iov, size_t sz_src_start_offset, size_t sz_data); + +/** + * get base interface from an aliased/vlan tagged one. i.e. 
eth2:1 --> eth2 / eth2.1 --> eth2 + * Function gets: interface name, output variable for base interface, output size; and returns the base interface + */ +int get_base_interface_name(const char *if_name, char *base_ifname, size_t sz_base_ifname); + +/** + * Count bitmask set bits + */ +int netmask_bitcount(uint32_t netmask); + + +/** + * Set the fd blocking mode + * @param fd the file descriptor on which to operate + * @param block 'true' to set to block + * 'false' to set to non-blocking + */ +void set_fd_block_mode(int fd, bool block); + +/** + * @param a number + * @param b number + * @return true if 'a' and 'b' are equal. else false. + */ +bool compare_double(double a, double b); + +/** + * Run a system command while bypassing LD_PRELOADed with VMA + * @param cmd_line to be executed without VMA in process space + * @param return_str is the output of the system call + */ +int run_and_retreive_system_command(const char* cmd_line, char* return_str, int return_str_len); + +const char* iphdr_protocol_type_to_str(const int type); + +/** + * Read content of file detailed in 'path' (usually a sysfs file) and + * store the file content into the given 'buf' up to 'size' characters. + * print log in case of failure according to the given 'log_level' argument. 
+ * @return length of content that was read, or -1 upon any error + */ +int priv_read_file(const char *path, char *buf, size_t size, vlog_levels_t log_level = VLOG_ERROR); + +/** + * like above 'priv_read_file' however make sure that upon success the result in buf is a null terminated string + */ +inline int priv_safe_read_file(const char *path, char *buf, size_t size, vlog_levels_t log_level = VLOG_ERROR){ + int ret = -1; + if (size > 0) { + ret = priv_read_file(path, buf, size - 1, log_level); + if (0 <= ret) buf[ret] = '\0'; + } + return ret; +} + + +/** + * like above however make sure that upon success the result in buf is a null terminated string and VLOG_DEBUG + */ +inline int priv_safe_try_read_file(const char *path, char *buf, size_t size) { + int ret = -1; + if (size > 0) { + ret = priv_read_file(path, buf, size - 1, VLOG_DEBUG); + if (0 <= ret) buf[ret] = '\0'; + } + return ret; +} + +/** + * Read content of file detailed in 'path' (usually a sysfs file) + * upon failure print error + * @return int value (atoi) of the file content, or 'default_value' upon failure + */ +int read_file_to_int(const char *path, int default_value); + +/** + * Get interface name and flags from local address + * + * @char ifname[IFNAMSIZ]; + * @unsigned int ifflags; Flags as from SIOCGIFFLAGS ioctl. + * + * @return zero on success + */ +int get_ifinfo_from_ip(const struct sockaddr& local_addr, char* ifname, uint32_t &ifflags); + +/** + * Get port number from interface name + * @param ifname input interface name of device (e.g. eth1, ib2) + * should be of size IFNAMSIZ + * @return zero on failure, else port number + */ +int get_port_from_ifname(const char* ifname); + +/** + * Get interface type value from interface name + * + * @param ifname input interface name of device (e.g. 
eth1, ib2) + * should be of size IFNAMSIZ + * @return if type on success or -1 on failure + */ +int get_iftype_from_ifname(const char* ifname); + +/** + * Get interface mtu from interface name + * + * @param ifname input interface name of device (e.g. eth1, ib2) + * should be of size IFNAMSIZ + * @return mtu length zero on failure + */ +int get_if_mtu_from_ifname(const char* ifname); + +/** + * Get the OS TCP window scaling factor when tcp_window_scaling is enabled. + * The value is calculated from the maximum receive buffer value. + * + * @param tcp_rmem_max the maximum size of the receive buffer used by each TCP socket + * @parma core_rmem_max contains the maximum socket receive buffer size in bytes which a user may set by using the SO_RCVBUF socket option. + * + * @return TCP window scaling factor + */ +int get_window_scaling_factor(int tcp_rmem_max, int core_rmem_max); + +/** + * Get Ethernet ipv4 address from interface name + * + * @param ifname input interface name of device (e.g. eth1, ib2) + * should be of size IFNAMSIZ + * @param sockaddr_in output interface inet address + * + * @return -1 on failure + */ +int get_ipv4_from_ifname(char *ifname, struct sockaddr_in *addr); + +/** + * Get Ethernet ipv4 address from interface index + * + * @param ifindex input interface index of device + * @param sockaddr_in output interface inet address + * + * @return -1 on failure + */ +int get_ipv4_from_ifindex(int ifindex, struct sockaddr_in *addr); + +/** + * Get vlan id from interface name + * + * @param ifname input interface name of device (e.g. eth2, eth2.5) + * @return the vlan id or 0 if not a vlan + */ +uint16_t get_vlan_id_from_ifname(const char* ifname); + +/** + * Get vlan base name from interface name + * + * @param ifname input interface name of device (e.g. eth2, eth2.5) + * @param base_ifname output base interface name of device (e.g. 
eth2) + * @param sz_base_ifname input the size of base_ifname param + * @return the vlan base name length or 0 if not a vlan + */ +size_t get_vlan_base_name_from_ifname(const char* ifname, char* base_ifname, size_t sz_base_ifname); + +/* Upon success - returns the actual address len in bytes; Upon error - returns zero*/ +size_t get_local_ll_addr(const char* ifname, unsigned char* addr, int addr_len, bool is_broadcast); + +/* Print warning while RoCE Lag is enabled */ +void print_roce_lag_warnings(char* interface, char* disable_path = NULL, const char* port1 = NULL, const char* port2 = NULL); + +bool get_bond_active_slave_name(IN const char* bond_name, OUT char* active_slave_name, IN int sz); +bool get_bond_slave_state(IN const char* slave_name, OUT char* curr_state, IN int sz); +bool get_bond_slaves_name_list(IN const char* bond_name, OUT char* slaves_list, IN int sz); +bool check_bond_roce_lag_exist(OUT char* bond_roce_lag_path, int sz, IN const char* slave_name); +bool check_device_exist(const char* ifname, const char *path); +bool check_device_name_ib_name(const char* ifname, const char* ibname); +bool check_netvsc_device_exist(const char* ifname); +bool get_netvsc_slave(IN const char* ifname, OUT char* slave_name, OUT unsigned int &slave_flags); +bool get_interface_oper_state(IN const char* interface_name, OUT char* slaves_list, IN int sz); + +int validate_ipoib_prop(const char* ifname, unsigned int ifflags, + const char prop_file[], const char *expected_val, + int val_size, char *filename, char* base_ifname); + +int validate_raw_qp_privliges(); + +bool validate_user_has_cap_net_raw_privliges(); + +/** + * Get TSO support using interface index + * + * @param if_index input interface index + * @return 0/1 or -1 on failure + */ +int validate_tso(int if_index); + +static inline int get_procname(int pid, char *proc, size_t size) +{ + char app_full_name[PATH_MAX] = {0}; + char proccess_proc_dir[FILE_NAME_MAX_SIZE] = {0}; + char* app_base_name = NULL; + int n = -1; + 
+ if (NULL == proc) { + return -1; + } + + n = snprintf(proccess_proc_dir, sizeof(proccess_proc_dir), "/proc/%d/exe", pid); + if (likely((0 < n) && (n < (int)sizeof(proccess_proc_dir)))) { + n = readlink(proccess_proc_dir, app_full_name, sizeof(app_full_name) - 1); + if (n > 0) { + app_full_name[n] = '\0'; + app_base_name = strrchr(app_full_name, '/'); + if (app_base_name) { + strncpy(proc, app_base_name + 1, size - 1); + proc[size - 1] = '\0'; + return 0; + } + } + } + + return -1; +} + +static inline in_addr_t prefix_to_netmask(int prefix_length) +{ + in_addr_t mask = 0; + + if (prefix_length <= 0 || prefix_length > 32) { + return 0; + } + mask = ~mask << (32 - prefix_length); + mask = htonl(mask); + return mask; +} + +//Creates multicast MAC from multicast IP +//inline void create_multicast_mac_from_ip(uint8_t (& mc_mac) [6], in_addr_t ip) +inline void create_multicast_mac_from_ip(unsigned char* mc_mac, in_addr_t ip) +{ + if(mc_mac == NULL) + return; + + mc_mac[0] = 0x01; + mc_mac[1] = 0x00; + mc_mac[2] = 0x5e; + mc_mac[3] = (uint8_t)((ip>> 8)&0x7f); + mc_mac[4] = (uint8_t)((ip>>16)&0xff); + mc_mac[5] = (uint8_t)((ip>>24)&0xff); +} + +static inline void create_mgid_from_ipv4_mc_ip(uint8_t *mgid, uint16_t pkey, uint32_t ip) +{ + +// +--------+----+----+-----------------+---------+-------------------+ +// | 8 | 4 | 4 | 16 bits | 16 bits | 80 bits | +// +--------+----+----+-----------------+---------+-------------------+ +// |11111111|0001|scop||< P_Key >| group ID | +// +--------+----+----+-----------------+---------+-------------------+ +// |11111111|0001|0010|01000000000011011| | group ID | +// +--------+----+----+-----------------+---------+-------------------+ + + //Fixed for multicast + mgid[0] = 0xff; + mgid[1] = 0x12; + + //IPoIB signature: 0x401b for ipv4, 0x601b for ipv6 + mgid[2] = 0x40; + mgid[3] = 0x1b; + + //P_Key + mgid[4] = (((unsigned char *)(&pkey))[0]); + mgid[5] = (((unsigned char *)(&pkey))[1]); + + //group ID - relevant only for ipv4 + mgid[6] 
= 0x00; + mgid[7] = 0x00; + mgid[8] = 0x00; + mgid[9] = 0x00; + mgid[10] = 0x00; + mgid[11] = 0x00; + mgid[12] = (uint8_t)((ip)&0x0f); + mgid[13] = (uint8_t)((ip>>8)&0xff); + mgid[14] = (uint8_t)((ip>>16)&0xff); + mgid[15] = (uint8_t)((ip>>24)&0xff); + + vlog_printf(VLOG_DEBUG, "Translated to mgid: %02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X\n", + ((unsigned char *)(mgid))[0],((unsigned char *)(mgid))[1], + ((unsigned char *)(mgid))[2],((unsigned char *)(mgid))[3], + ((unsigned char *)(mgid))[4],((unsigned char *)(mgid))[5], + ((unsigned char *)(mgid))[6],((unsigned char *)(mgid))[7], + ((unsigned char *)(mgid))[8],((unsigned char *)(mgid))[9], + ((unsigned char *)(mgid))[10],((unsigned char *)(mgid))[11], + ((unsigned char *)(mgid))[12],((unsigned char *)(mgid))[13], + ((unsigned char *)(mgid))[14],((unsigned char *)(mgid))[15]); +} + +/** + * special design for the rx loop. + */ +class loops_timer { + public: + loops_timer(); + void start(); + int time_left_msec(); + void set_timeout_msec(int timeout_msec) { m_timeout_msec = timeout_msec; } + int get_timeout_msec() { return m_timeout_msec; } + inline bool is_timeout() { + if (m_timeout_msec == -1) + return false; + + if (m_timer_countdown > 0) { + m_timer_countdown--; + return false; + } + //init counter + m_timer_countdown = m_interval_it; + + if (!ts_isset(&m_start)) { + gettime(&m_start); + } + //update timer + gettime(&m_current); + ts_sub(&m_current, &m_start, &m_elapsed); + vlog_printf(VLOG_FUNC_ALL, "update loops_timer (elapsed time=%d sec %d usec \n", ts_to_sec(&m_elapsed), ts_to_usec(&m_elapsed)); + + + + // test for timeout + if (m_timeout_msec <= ts_to_msec(&m_elapsed)) + return true; + + return false; + } + private: + timespec m_start; + timespec m_elapsed; + timespec m_current; + int m_interval_it; + int m_timer_countdown; + int m_timeout_msec; +}; + +// Returns the filesystem's inode number for the given 'fd' using 'fstat' system call that assumes 32 bit inodes +// This 
should be safe for 'proc' filesytem and for standard filesystems +uint32_t fd2inode(int fd); + + +/** + * @class vma_error + * + * base class for vma exceptions classes. + * Note: VMA code should NOT catch vma_error; VMA code should only catch exceptions of derived classes + */ +class vma_error : public std::exception { + char formatted_message[512]; +public: + const char * const message; + const char * const function; + const char * const filename; + const int lineno; + const int errnum; + + /** + * Create an object that contains const members for all the given arguments, plus a formatted message that will be + * available thru the 'what()' method of base class. + * + * The formatted_message will look like this: + * "vma_error (errno=24 Too many open files) in sock/sockinfo.cpp:61" + * catcher can print it to log like this: + * fdcoll_loginfo("recovering from %s", e.what()); + */ + vma_error(const char* _message, const char* _function, const char* _filename, int _lineno, int _errnum) throw(); + + virtual ~vma_error() throw(); + + virtual const char* what() const throw(); + +}; + +/** + * @class vma_exception + * NOTE: ALL exceptions that can be caught by VMA should be derived of this class + */ +class vma_exception : public vma_error { +public: + vma_exception(const char* _message, const char* _function, const char* _filename, int _lineno, int _errnum) throw() : + vma_error(_message, _function, _filename, _lineno, _errnum) + { + } +}; + + +#define create_vma_exception_class(clsname, basecls) \ + class clsname : public basecls { \ + public: \ + clsname(const char* _message, const char* _function, const char* _filename, int _lineno, int _errnum) throw() : \ + basecls(_message, _function, _filename, _lineno, _errnum) {} \ + } + +create_vma_exception_class(vma_unsupported_api, vma_error); + +#define throw_vma_exception(msg) throw vma_exception(msg, __PRETTY_FUNCTION__, __FILE__, __LINE__, errno) +// uses for throwing something that is derived from vma_error and has 
similar CTOR; msg will automatically be class name +#define vma_throw_object(_class) throw _class(#_class, __PRETTY_FUNCTION__, __FILE__, __LINE__, errno) +#define vma_throw_object_with_msg(_class, _msg) throw _class(_msg, __PRETTY_FUNCTION__, __FILE__, __LINE__, errno) + +/* Rounding up to nearest power of 2 */ +static inline uint32_t align32pow2(uint32_t x) +{ + x--; + x |= x >> 1; + x |= x >> 2; + x |= x >> 4; + x |= x >> 8; + x |= x >> 16; + + return x + 1; +} + + +static inline int ilog_2(uint32_t n) { + if (n == 0) + return 0; + + uint32_t t = 0; + while ((1 << t) < (int)n) + ++t; + + return (int)t; +} + +#endif diff --git a/src/vma/util/valgrind.h b/src/vma/util/valgrind.h new file mode 100644 index 0000000..46aad60 --- /dev/null +++ b/src/vma/util/valgrind.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef VMA_VALGRIND_H_ +#define VMA_VALGRIND_H_ + +#ifdef HAVE_CONFIG_H +# include "config.h" +#endif + +/* Valgrind compatibility */ +#ifndef NVALGRIND +# include +# ifndef VALGRIND_MAKE_MEM_DEFINED +# define VALGRIND_MAKE_MEM_DEFINED(p, n) VALGRIND_MAKE_READABLE(p, n) +# endif +# ifndef VALGRIND_MAKE_MEM_UNDEFINED +# define VALGRIND_MAKE_MEM_UNDEFINED(p, n) VALGRIND_MAKE_WRITABLE(p, n) +# endif +#else +# define VALGRIND_MAKE_MEM_DEFINED(p, n) +# define VALGRIND_MAKE_MEM_UNDEFINED(p, n) +# define VALGRIND_MAKE_MEM_NOACCESS(p, n) +# define VALGRIND_CREATE_MEMPOOL(n,p,x) +# define VALGRIND_DESTROY_MEMPOOL(p) +# define VALGRIND_MEMPOOL_ALLOC(n,p,x) +# define VALGRIND_MEMPOOL_FREE(n,p) +# define RUNNING_ON_VALGRIND 0 +#endif + + +#endif diff --git a/src/vma/util/vma_list.h b/src/vma/util/vma_list.h new file mode 100644 index 0000000..dc19c85 --- /dev/null +++ b/src/vma/util/vma_list.h @@ -0,0 +1,410 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef VMA_LIST_H +#define VMA_LIST_H + +#include "vma/util/list.h" +#include "vlogger/vlogger.h" + +#define VLIST_DEBUG 0 +#define VLIST_ID_SIZE 200 + +#define vlist_logwarn(log_fmt, log_args...) vlog_printf(VLOG_WARNING, "vlist[%p]:%d:%s() " log_fmt "\n", this, __LINE__, __FUNCTION__, ##log_args) +#define vlist_logerr(log_fmt, log_args...) vlog_printf(VLOG_ERROR, "vlist[%p]:%d:%s() " log_fmt "\n", this, __LINE__, __FUNCTION__, ##log_args) + +#if VLIST_DEBUG +template +class vma_list_t; +#define VLIST_DEBUG_PRINT_ERROR_IS_MEMBER vlist_logerr("Buff is already a member in a list! parent.id=[%s], this.id=[%s]", node_obj->list_id(), this->list_id()) +#define VLIST_DEBUG_SET_PARENT(node_obj, val) node_obj->parent = val +#else +#define VLIST_DEBUG_PRINT_ERROR_IS_MEMBER vlist_logerr("Buff is already a member in a list!") +#define VLIST_DEBUG_SET_PARENT(node_obj, val) +#endif + +#define NODE_OFFSET(_obj_type, _node_name) \ + ((size_t)(&(char &)(((_obj_type *) 1)->_node_name)) - 1) +#define GET_NODE(_obj, _obj_type, _offset_func) \ + ((list_node<_obj_type, _offset_func> *) ((size_t)(_obj) + (size_t)(_offset_func()))) + +template +class list_node { +public : + + /* head must be the first field! 
*/ + struct list_head head; + T *obj_ptr; + +#if VLIST_DEBUG + vma_list_t * parent; + + char* list_id() { + return this->parent->list_id(); + } + +#endif + + list_node() : obj_ptr(NULL) { + this->head.next = &this->head; + this->head.prev = &this->head; + VLIST_DEBUG_SET_PARENT(this, NULL); + } + + /* is_list_member - check if the node is already a member in a list. */ + bool is_list_member() { + return this->head.next != &this->head || this->head.prev != &this->head; + } +}; + +template +class list_iterator_t : public std::iterator +{ +public: + + list_iterator_t(T* ptr = NULL) : m_ptr(ptr) {} + + list_iterator_t(const list_iterator_t& iter) : m_ptr(iter.m_ptr) {} + + ~list_iterator_t() {} + + list_iterator_t& operator=(T* ptr) { + m_ptr = ptr; + return (*this); + } + + list_iterator_t& operator=(const list_iterator_t& iter) { + m_ptr = iter.m_ptr; + return (*this); + } + + operator bool() const { + return m_ptr ? true : false; + } + + bool operator==(const list_iterator_t& iter) const { + return m_ptr == iter.getConstPtr(); + } + + bool operator!=(const list_iterator_t& iter) const { + return m_ptr != iter.getConstPtr(); + } + + list_iterator_t operator++(int) { + list_iterator_t iter(*this); + increment_ptr(); + return iter; + } + + list_iterator_t& operator++() { + increment_ptr(); + return *this; + } + + list_iterator_t operator--(int) { + list_iterator_t iter(*this); + decrement_ptr(); + return iter; + } + + list_iterator_t& operator--() { + decrement_ptr(); + return *this; + } + + T* operator*() { + return m_ptr; + } + + const T* operator*() const { + return m_ptr; + } + + T* operator->() { + return m_ptr; + } + + T* getPtr() const { + return m_ptr; + } + + const T* getConstPtr() const { + return m_ptr; + } + +private: + + T* m_ptr; + + inline void increment_ptr() { + m_ptr = ((list_node *)GET_NODE(m_ptr, T, offset)->head.next)->obj_ptr; + } + + inline void decrement_ptr() { + m_ptr = ((list_node *)GET_NODE(m_ptr, T, offset)->head.prev)->obj_ptr; + } + +}; + 
+template +class vma_list_t +{ +public: + typedef list_iterator_t iterator; + vma_list_t() { + init_list(); + } + + void set_id(const char *format, ...) { + if (format) { + #if VLIST_DEBUG + va_list arg; + va_start (arg, format); + vsnprintf (id, sizeof(id) ,format, arg); + va_end (arg); + #endif + } + } + + ~vma_list_t() { + if (!empty()) { + vlist_logwarn("Destructor is not supported for non-empty list! size=%zu", m_size); + } + } + + vma_list_t (const vma_list_t& other) { + if (!other.empty()) + vlist_logwarn("Copy constructor is not supported for non-empty list! other.size=%zu", other.m_size); + init_list(); + } + + vma_list_t& operator=(const vma_list_t& other) { + if (!empty() || !other.empty()) + vlist_logwarn("Operator= is not supported for non-empty list! size=%zu, other.size=%zu", m_size, other.m_size); + if (this != &other) { + init_list(); + } + return *this; + } + + T* operator[](size_t idx) const { + return get(idx); + } + + inline bool empty() const { + return m_size == 0; + } + + inline size_t size() const { + return m_size; + } + + inline T* front() const { + if (unlikely(empty())) + return NULL; + return ((list_node *)m_list.head.next)->obj_ptr; + } + + inline T* back() const { + if (unlikely(empty())) + return NULL; +/* clang analyzer reports: + * Use of memory after it is freed + * This issue comes from ~chunk_list_t() + * Root cause is unknown. 
+ * TODO: Fix based on root cause instead of supressing + */ +#ifndef __clang_analyzer__ + return ((list_node *)m_list.head.prev)->obj_ptr; +#endif + } + + inline void pop_front() { + erase(front()); + } + + inline void pop_back() { + erase(back()); + } + + inline T* get_and_pop_front() { + T* list_front = front(); + pop_front(); + return list_front; + } + + inline T* get_and_pop_back() { + T* list_back = back(); + pop_back(); + return list_back; + } + + void erase(T* obj) { + if (unlikely(!obj)) { + vlist_logwarn("Got NULL object - ignoring"); + return; + } + + list_node *node_obj = GET_NODE(obj, T, offset); + VLIST_DEBUG_SET_PARENT(node_obj, NULL); + list_del_init(&node_obj->head); + m_size--; + } + + /** + * Clear content + * Removes all elements from the list container (which are NOT destroyed), and leaving the container with a size of 0. + * + * NOTE: we don't expect calling this method in normal situations (it is workaround at application shutdown); Hence, there is no cleanup of node.parent + */ + void clear_without_cleanup() { + init_list(); + } + + void push_back(T* obj) { + if (unlikely(!obj)) { + vlist_logwarn("Got NULL object - ignoring"); + return; + } + + list_node *node_obj = GET_NODE(obj, T, offset); + if (unlikely(node_obj->is_list_member())) { + VLIST_DEBUG_PRINT_ERROR_IS_MEMBER; + } + + VLIST_DEBUG_SET_PARENT(node_obj, this); + node_obj->obj_ptr = obj; + list_add_tail(&node_obj->head, &m_list.head); + m_size++; + } + + void push_front(T* obj) { + if (unlikely(!obj)) { + vlist_logwarn("Got NULL object - ignoring"); + return; + } + + list_node *node_obj = GET_NODE(obj, T, offset); + if (unlikely(node_obj->is_list_member())) { + VLIST_DEBUG_PRINT_ERROR_IS_MEMBER; + } + + VLIST_DEBUG_SET_PARENT(node_obj, this); + node_obj->obj_ptr = obj; + list_add(&node_obj->head, &m_list.head); + m_size++; + } + + T* get(size_t index) const { + if (m_size <= index) { + return NULL; + } else { + list_head* ans = m_list.head.next; + for (size_t i = 0 ; i < index ; 
i++) { + ans = ans->next; + } + return ((list_node *)ans)->obj_ptr; + } + } + + // concatenate 'from' at the head of this list + void splice_head(vma_list_t& from) { + + this->m_size += from.m_size; + list_splice(&from.m_list.head, &this->m_list.head); + from.init_list(); + // TODO: in case VLIST_DEBUG, this invalidates parent list of all nodes in the list + } + + // concatenate 'from' at the tail of this list + void splice_tail(vma_list_t& from) { + this->m_size += from.m_size; + list_splice_tail(&from.m_list.head, &this->m_list.head); + from.init_list(); + // TODO: in case VLIST_DEBUG, this invalidates parent list of all nodes in the list + } + + /** + * Swap content + * Exchanges the content of the container by the content of x, which is another list of the same type. Sizes may differ. + * + * After the call to this member function, the elements in this container are those which were in x before the call, and + * the elements of x are those which were in this. All iterators, references and pointers remain valid for the swapped objects. 
+ */ + void swap (vma_list_t& x) { + vma_list_t temp_list; + this->move_to_empty(temp_list); + x.move_to_empty(*this); + temp_list.move_to_empty(x); + } + + list_iterator_t begin() { + return list_iterator_t(front()); + } + + list_iterator_t end() { + return list_iterator_t(NULL); + } + +#if VLIST_DEBUG + char* list_id() { + return (char*)&id; + } +#endif + +private: + + list_node m_list; + size_t m_size; + +#if VLIST_DEBUG + char id[VLIST_ID_SIZE]; +#endif + + void move_to_empty(vma_list_t& to) { + assert(to.empty()); + to.m_size = this->m_size; + list_splice_tail(&this->m_list.head, &to.m_list.head); + this->init_list(); + // TODO: in case VLIST_DEBUG, this invalidates parent list of all nodes in the list + } + + void init_list() { + m_size = 0; + INIT_LIST_HEAD(&m_list.head); + #if VLIST_DEBUG + id[0] = '\0'; + #endif + } +}; + +#endif /* VMA_LIST_H */ diff --git a/src/vma/util/vma_stats.h b/src/vma/util/vma_stats.h new file mode 100644 index 0000000..15fba05 --- /dev/null +++ b/src/vma/util/vma_stats.h @@ -0,0 +1,401 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef VMA_STATS_H +#define VMA_STATS_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NUM_OF_SUPPORTED_CQS 16 +#define NUM_OF_SUPPORTED_RINGS 16 +#define NUM_OF_SUPPORTED_BPOOLS 2 +#define NUM_OF_SUPPORTED_EPFDS 32 +#define SHMEM_STATS_SIZE(fds_num) sizeof(sh_mem_t) + (fds_num * sizeof(socket_instance_block_t)) +#define FILE_NAME_MAX_SIZE (NAME_MAX + 1) +#define MC_TABLE_SIZE 1024 +#define MAP_SH_MEM(var,sh_stats) var = (sh_mem_t*)sh_stats +#define STATS_PUBLISHER_TIMER_PERIOD 10 // publisher will check for stats request every 10 msec +#define STATS_READER_DELAY STATS_PUBLISHER_TIMER_PERIOD + 5 // reader will wait for vma to wakeup and write statistics to shmem (with extra 5 msec overhead) +#define STATS_FD_STATISTICS_DISABLED -1 +#define STATS_FD_STATISTICS_LOG_LEVEL_DEFAULT VLOG_DEFAULT + +//statistic file +extern FILE* g_stats_file; + +// Common iomux stats +typedef struct { + pid_t threadid_last; + uint32_t n_iomux_poll_hit; + uint32_t n_iomux_poll_miss; + uint32_t n_iomux_timeouts; + uint32_t n_iomux_errors; + uint32_t n_iomux_rx_ready; + uint32_t n_iomux_os_rx_ready; + uint32_t n_iomux_polling_time; +} iomux_func_stats_t; + +typedef enum { + e_totals = 1, + e_deltas +} print_details_mode_t; + +typedef enum { + e_basic = 1, + e_medium, + e_full, + e_mc_groups, + e_netstat_like +} view_mode_t; + +typedef enum { + e_by_pid_str, + e_by_app_name, + e_by_runn_proccess +} proc_ident_mode_t; + +struct 
user_params_t { + int interval; + print_details_mode_t print_details_mode; + view_mode_t view_mode; + bool forbid_cleaning; + vlog_levels_t vma_log_level; + int vma_details_level; + bool zero_counters; + proc_ident_mode_t proc_ident_mode; + bool write_auth; + int cycles; + int fd_dump; + vlog_levels_t fd_dump_log_level; + std::string vma_stats_path; +}; + +extern user_params_t user_params; + +// Epoll group stats +typedef struct { + bool enabled; + int epfd; + iomux_func_stats_t stats; +} epoll_stats_t; + +// iomux function stat info +typedef struct { + iomux_func_stats_t poll; + iomux_func_stats_t select; + epoll_stats_t epoll[NUM_OF_SUPPORTED_EPFDS]; +} iomux_stats_t; + +// multicast stat info +typedef struct { + uint32_t sock_num; + in_addr_t mc_grp; +} mc_tbl_entry_t; + +typedef struct { + uint16_t max_grp_num; + mc_tbl_entry_t mc_grp_tbl[MC_TABLE_SIZE]; +} mc_grp_info_t; + +// socket stat info +typedef struct { + uint32_t n_rx_packets; + uint32_t n_rx_bytes; + uint32_t n_rx_poll_hit; + uint32_t n_rx_poll_miss; + uint32_t n_rx_ready_pkt_max; + uint32_t n_rx_ready_byte_drop; + uint32_t n_rx_ready_pkt_drop; + uint32_t n_rx_ready_byte_max; + uint32_t n_rx_errors; + uint32_t n_rx_eagain; + uint32_t n_rx_os_packets; + uint32_t n_rx_os_bytes; + uint32_t n_rx_poll_os_hit; + uint32_t n_rx_os_errors; + uint32_t n_rx_os_eagain; + uint32_t n_rx_migrations; + uint32_t n_tx_sent_pkt_count; + uint32_t n_tx_sent_byte_count; + uint32_t n_tx_errors; + uint32_t n_tx_drops; + uint32_t n_tx_retransmits; + uint32_t n_tx_os_packets; + uint32_t n_tx_os_bytes; + uint32_t n_tx_os_errors; + uint32_t n_tx_os_eagain; + uint32_t n_tx_migrations; + uint32_t n_tx_dummy; +} socket_counters_t; + +typedef struct { + int fd; + uint32_t inode; + uint32_t tcp_state; // enum tcp_state + uint8_t socket_type; // SOCK_STREAM, SOCK_DGRAM, ... 
+ uint8_t padding1[3]; + bool b_is_offloaded; + bool b_blocking; + bool b_mc_loop; + bool padding2; + in_addr_t bound_if; + in_addr_t connected_ip; + in_addr_t mc_tx_if; + in_port_t bound_port; + in_port_t connected_port; + pid_t threadid_last_rx; + pid_t threadid_last_tx; + uint32_t n_rx_ready_pkt_count; + uint32_t n_rx_ready_byte_count; + uint32_t n_rx_ready_byte_limit; + uint32_t n_rx_zcopy_pkt_count; + uint32_t n_tx_ready_byte_count; + socket_counters_t counters; + std::bitset mc_grp_map; + ring_logic_t ring_alloc_logic_rx; + ring_logic_t ring_alloc_logic_tx; + uint64_t ring_user_id_rx; + uint64_t ring_user_id_tx; + + void reset() { + fd = 0; + inode = tcp_state = 0; + socket_type = 0; + b_is_offloaded = b_blocking = b_mc_loop = false; + bound_if = connected_ip = mc_tx_if = (in_addr_t)0; + bound_port = connected_port = (in_port_t)0; + threadid_last_rx = threadid_last_tx = pid_t(0); + n_rx_ready_pkt_count = n_rx_ready_byte_count = n_rx_ready_byte_limit = n_rx_zcopy_pkt_count = n_tx_ready_byte_count = 0; + memset(&counters, 0, sizeof(counters)); + mc_grp_map.reset(); + ring_user_id_rx = ring_user_id_tx = 0; + ring_alloc_logic_rx = ring_alloc_logic_tx = RING_LOGIC_PER_INTERFACE; + }; +} socket_stats_t; + +typedef struct { + bool b_enabled; + socket_stats_t skt_stats; + + void reset() { + b_enabled = false; + skt_stats.reset(); + } +} socket_instance_block_t; + +// CQ stat info +typedef struct { + uint64_t n_rx_pkt_drop; + uint32_t n_rx_sw_queue_len; + uint32_t n_rx_drained_at_once_max; + uint32_t n_buffer_pool_len; +} cq_stats_t; + +typedef struct { + bool b_enabled; + cq_stats_t cq_stats; +} cq_instance_block_t; + +typedef enum { + RING_ETH = 0, + RING_ETH_CB, + RING_ETH_DIRECT, + RING_TAP, + RING_IB +} ring_type_t; + +static const char * const ring_type_str[] = { + "RING_ETH", + "RING_ETH_CB", + "RING_ETH_DIRECT", + "RING_TAP", + "RING_IB" +}; + +// Ring stat info +typedef struct { + uint64_t n_rx_pkt_count; + uint64_t n_rx_byte_count; + uint64_t n_tx_pkt_count; 
+ uint64_t n_tx_byte_count; + uint64_t n_tx_retransmits; + void* p_ring_master; + ring_type_t n_type; + union { + struct { + uint64_t n_rx_interrupt_requests; + uint64_t n_rx_interrupt_received; + uint32_t n_rx_cq_moderation_count; + uint32_t n_rx_cq_moderation_period; + uint64_t n_tx_dev_mem_pkt_count; + uint64_t n_tx_dev_mem_byte_count; + uint64_t n_tx_dev_mem_oob; + uint32_t n_tx_dev_mem_allocated; + } simple; + struct { + char s_tap_name[IFNAMSIZ]; + uint32_t n_tap_fd; + uint32_t n_rx_buffers; + uint32_t n_vf_plugouts; + } tap; + }; +} ring_stats_t; + +typedef struct { + bool b_enabled; + ring_stats_t ring_stats; +} ring_instance_block_t; + +// Buffer Pool stat info +typedef struct { + bool is_rx; + bool is_tx; + uint32_t n_buffer_pool_size; + uint32_t n_buffer_pool_no_bufs; +} bpool_stats_t; + +typedef struct { + bool b_enabled; + bpool_stats_t bpool_stats; +} bpool_instance_block_t; + +// Version info +typedef struct { + uint8_t vma_lib_maj; + uint8_t vma_lib_min; + uint8_t vma_lib_rev; + uint8_t vma_lib_rel; +} version_info_t; + +typedef struct sh_mem_t { + int reader_counter; //only copy to shm upon active reader + version_info_t ver_info; + char stats_protocol_ver[32]; + vlog_levels_t log_level; + uint8_t log_details_level; + int fd_dump; + vlog_levels_t fd_dump_log_level; + cq_instance_block_t cq_inst_arr[NUM_OF_SUPPORTED_CQS]; + ring_instance_block_t ring_inst_arr[NUM_OF_SUPPORTED_RINGS]; + bpool_instance_block_t bpool_inst_arr[NUM_OF_SUPPORTED_BPOOLS]; + mc_grp_info_t mc_info; + iomux_stats_t iomux; + size_t max_skt_inst_num; // number of elements allocated in 'socket_instance_block_t skt_inst_arr[]' + + /* IMPORTANT: MUST BE LAST ENTRY in struct: [0] is the allocation start point for all fd's + * + * Some compiler can report issue as 'array subscript is above array bounds' + * + * In ISO C90, you would have to give contents a length of 1, + * which means either you waste space or complicate the argument to malloc. 
+ * Note: + * - 1 was the portable way to go, though it was rather strange + * - 0 was better at indicating intent, but not legal as far as + * the Standard was concerned and supported as an extension by some compilers (including gcc) + * + * In ISO C99, you would use a flexible array member, which is slightly different in syntax and semantics: + * - Flexible array members are written as contents[] without the 0. + * - Flexible array members have incomplete type, and so the sizeof operator may not be applied. + * As a quirk of the original implementation of zero-length arrays, sizeof evaluates to zero. + * - Flexible array members may only appear as the last member of a struct that is otherwise non-empty. + * - A structure containing a flexible array member, or a union containing such a structure (possibly recursively), + * may not be a member of a structure or an element of an array. (However, these uses are permitted by GCC as extensions.) + */ + socket_instance_block_t skt_inst_arr[1]; //sockets statistics array + + void reset() { + reader_counter = 0; + memset(&ver_info, 0, sizeof(ver_info)); + memset(stats_protocol_ver, 0, sizeof(stats_protocol_ver)); + max_skt_inst_num = 0; + log_level = (vlog_levels_t)0; + log_details_level = 0; + fd_dump = 0; + fd_dump_log_level = (vlog_levels_t)0; + memset(cq_inst_arr, 0, sizeof(cq_inst_arr)); + memset(ring_inst_arr, 0, sizeof(ring_inst_arr)); + memset(bpool_inst_arr, 0, sizeof(bpool_inst_arr)); + memset(&mc_info, 0, sizeof(mc_info)); + memset(&iomux, 0, sizeof(iomux)); + for (uint32_t i = 0; i < max_skt_inst_num; i++) { + skt_inst_arr[i].reset(); + } + } +} sh_mem_t; + +typedef struct sh_mem_info { + char filename_sh_stats[PATH_MAX]; + size_t shmem_size; + int fd_sh_stats; + void* p_sh_stats; + int pid; +} sh_mem_info_t; + +// publisher functions +void vma_shmem_stats_open(vlog_levels_t** p_p_vma_log_level, uint8_t** p_p_vma_log_details); +void vma_shmem_stats_close(); + +void 
vma_stats_instance_create_socket_block(socket_stats_t*); +void vma_stats_instance_remove_socket_block(socket_stats_t*); + +void vma_stats_mc_group_add(in_addr_t mc_grp, socket_stats_t* p_socket_stats); +void vma_stats_mc_group_remove(in_addr_t mc_grp, socket_stats_t* p_socket_stats); + +void vma_stats_instance_create_ring_block(ring_stats_t*); +void vma_stats_instance_remove_ring_block(ring_stats_t*); + +void vma_stats_instance_create_cq_block(cq_stats_t*); +void vma_stats_instance_remove_cq_block(cq_stats_t*); + +void vma_stats_instance_create_bpool_block(bpool_stats_t*); +void vma_stats_instance_remove_bpool_block(bpool_stats_t*); + +void vma_stats_instance_get_poll_block(iomux_func_stats_t*); +void vma_stats_instance_get_select_block(iomux_func_stats_t*); + +void vma_stats_instance_create_epoll_block(int, iomux_func_stats_t*); +void vma_stats_instance_remove_epoll_block(iomux_func_stats_t* ep_stats); + +//reader functions +void print_full_stats(socket_stats_t* p_si_stats, mc_grp_info_t* p_mc_grp_info, FILE* filename); +void print_netstat_like(socket_stats_t* p_si_stats, mc_grp_info_t* p_mc_grp_info, FILE* file, int pid); +void print_netstat_like_headers(FILE* file); + +#endif // VMA_STATS_H diff --git a/src/vma/util/vtypes.h b/src/vma/util/vtypes.h new file mode 100644 index 0000000..1c83964 --- /dev/null +++ b/src/vma/util/vtypes.h @@ -0,0 +1,214 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef VTYPES_H +#define VTYPES_H + +#include +#include +#include +#include +#include + +#include "utils/types.h" +#include "utils/bullseye.h" +#ifndef IN +#define IN +#endif + +#ifndef OUT +#define OUT +#endif + +#ifndef INOUT +#define INOUT +#endif + +#if __BYTE_ORDER == __LITTLE_ENDIAN +static inline uint64_t htonll(uint64_t x) { return bswap_64(x); } +static inline uint64_t ntohll(uint64_t x) { return bswap_64(x); } +#elif __BYTE_ORDER == __BIG_ENDIAN +static inline uint64_t htonll(uint64_t x) { return x; } +static inline uint64_t ntohll(uint64_t x) { return x; } +#else +#error __BYTE_ORDER is neither __LITTLE_ENDIAN nor __BIG_ENDIAN +#endif + +#define likely(x) __builtin_expect(!!(x), 1) +#define unlikely(x) __builtin_expect(!!(x), 0) + +// Check if given IP address is in a specific ip class / range +#define ZERONET_N(a) (((long int)(a)) == (long int)(htonl(0x00000000))) +#define LOOPBACK_N(a) (((long int)(a) & htonl(0xff000000)) == htonl(0x7f000000)) +#define IN_CLASSD_N(a) (((long int)(a) & htonl(0xf0000000)) == htonl(0xe0000000)) +#define IN_CLASSE_N(a) (((long int)(a) & htonl(0xffffffff)) == htonl(0xffffffff)) +#define IN_MULTICAST_N(a) IN_CLASSD_N(a) +#define IS_BROADCAST_N(a) IN_CLASSE_N(a) + + +// printf formating when IP is in network byte ordering (for LITTLE_ENDIAN) +#define NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) (uint8_t)((ip)&0xff), (uint8_t)(((ip)>>8)&0xff),(uint8_t)(((ip)>>16)&0xff),(uint8_t)(((ip)>>24)&0xff) + +// printf formating when IP is in host byte ordering (for LITTLE_ENDIAN) +#define HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) (uint8_t)(((ip)>>24)&0xff),(uint8_t)(((ip)>>16)&0xff),(uint8_t)(((ip)>>8)&0xff),(uint8_t)((ip)&0xff) + + + +#if __BYTE_ORDER == __LITTLE_ENDIAN + +/* The host byte order is the same as network byte order, so these functions are all just identity. 
*/ + +# define NIPQUAD(ip) NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) +# define HIPQUAD(ip) HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) + +#else +# if __BYTE_ORDER == __BIG_ENDIAN + +# define NIPQUAD(ip) HOST_IP_PRINTQUAD_LITTLE_ENDIAN(ip) +# define HIPQUAD(ip) NETWORK_IP_PRINTQUAD_LITTLE_ENDIAN(ip) + +# endif +#endif + +#define ETH_HW_ADDR_PRINT_FMT "%02x:%02x:%02x:%02x:%02x:%02x" +#define ETH_HW_ADDR_PRINT_ADDR(__addr) \ + ((unsigned char *)(__addr))[0], ((unsigned char *)(__addr))[1], \ + ((unsigned char *)(__addr))[2], ((unsigned char *)(__addr))[3], \ + ((unsigned char *)(__addr))[4], ((unsigned char *)(__addr))[5] + + +#define IPOIB_HW_ADDR_PRINT_FMT_16 "%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X" +#define IPOIB_HW_ADDR_PRINT_ADDR_16(__addr) \ + ((unsigned char *)(__addr))[0],((unsigned char *)(__addr))[1], \ + ((unsigned char *)(__addr))[2],((unsigned char *)(__addr))[3], \ + ((unsigned char *)(__addr))[4],((unsigned char *)(__addr))[5], \ + ((unsigned char *)(__addr))[6],((unsigned char *)(__addr))[7], \ + ((unsigned char *)(__addr))[8],((unsigned char *)(__addr))[9], \ + ((unsigned char *)(__addr))[10],((unsigned char *)(__addr))[11], \ + ((unsigned char *)(__addr))[12],((unsigned char *)(__addr))[13], \ + ((unsigned char *)(__addr))[14],((unsigned char *)(__addr))[15] + +#define IPOIB_HW_ADDR_PRINT_FMT "%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X:%02X%02X" +#define IPOIB_HW_ADDR_PRINT_ADDR(__addr) \ + ((unsigned char *)(__addr))[0],((unsigned char *)(__addr))[1], \ + ((unsigned char *)(__addr))[2],((unsigned char *)(__addr))[3], \ + ((unsigned char *)(__addr))[4],((unsigned char *)(__addr))[5], \ + ((unsigned char *)(__addr))[6],((unsigned char *)(__addr))[7], \ + ((unsigned char *)(__addr))[8],((unsigned char *)(__addr))[9], \ + ((unsigned char *)(__addr))[10],((unsigned char *)(__addr))[11], \ + ((unsigned char *)(__addr))[12],((unsigned char *)(__addr))[13], \ + ((unsigned char 
*)(__addr))[14],((unsigned char *)(__addr))[15], \ + ((unsigned char *)(__addr))[16],((unsigned char *)(__addr))[17], \ + ((unsigned char *)(__addr))[18],((unsigned char *)(__addr))[19] + +#define ETH_HW_ADDR_SSCAN_FMT "%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX" +#define ETH_HW_ADDR_SSCAN(__addr) \ + &(__addr[0]),&(__addr[1]), \ + &(__addr[2]),&(__addr[3]), \ + &(__addr[4]),&(__addr[5]) + + +#define IPOIB_HW_ADDR_SSCAN_FMT "%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX:%02hhX" +#define IPOIB_HW_ADDR_SSCAN(__addr) \ + &(__addr[0]),&(__addr[1]), \ + &(__addr[2]),&(__addr[3]), \ + &(__addr[4]),&(__addr[5]), \ + &(__addr[6]),&(__addr[7]), \ + &(__addr[8]),&(__addr[9]), \ + &(__addr[10]),&(__addr[11]), \ + &(__addr[12]),&(__addr[13]), \ + &(__addr[14]),&(__addr[15]), \ + &(__addr[16]),&(__addr[17]), \ + &(__addr[18]),&(__addr[19]) + +#define ETH_HDR_LEN (ETH_HLEN) +#define ETH_VLAN_HDR_LEN (ETH_HDR_LEN + sizeof(struct vlanhdr)) +#define GRH_HDR_LEN (sizeof(struct ibv_grh)) +#define IPOIB_HDR_LEN (sizeof(struct ipoibhdr)) +#define IPOIB_HEADER ((uint32_t)0x08000000) +#define IPOIB_ARP_HEADER ((uint32_t)0x08060000) +#define IPOIB_HW_ADDR_LEN 20 +#define IPOIB_HW_ADDR_GID_LEN 16 +#define MAX_L2_ADDR_LEN (MAX(IPOIB_HW_ADDR_LEN, ETH_ALEN)) +#define IPV4_VERSION 0x4 +#define IPV4_HDR_LEN_WITHOUT_OPTIONS (sizeof(struct iphdr)) // Ip Header without any options +#define IPV4_IGMP_HDR_LEN (IPV4_HDR_LEN_WITHOUT_OPTIONS + sizeof(uint32_t)) +#define IPV4_IGMP_HDR_LEN_WORDS (IPV4_IGMP_HDR_LEN / sizeof(uint32_t)) +#define IGMP_HDR_LEN (sizeof(struct igmphdr)) +#define IGMP_HDR_LEN_WORDS (IGMP_HDR_LEN / sizeof(uint32_t)) +#define DONT_FRAGMENT_FLAG 0x4000 +#define MORE_FRAGMENTS_FLAG 0x2000 +#define FRAGMENT_OFFSET 0x1FFF +#define MAX_APP_ID_LENGHT 64 + +#define INPORT_ANY ((uint16_t)0x0000) + +#define MCE_IMM_DATA_MASK_MC_TX_LOOP_DISABLED (1 << 0) + +#define BROADCAST_IP "255.255.255.255" + +#ifndef 
ARPHRD_INFINIBAND +#define ARPHRD_INFINIBAND 32 /* InfiniBand */ +#endif + +#ifndef ARPHRD_ETHER +#define ARPHRD_ETHER 1 /* Ethernet 10Mbps */ +#endif + +#ifndef ARPHRD_LOOPBACK +#define ARPHRD_LOOPBACK 772 /* Loopback device */ +#endif + +#ifndef ETH_P_8021Q +#define ETH_P_8021Q 0x8100 /* 802.1Q VLAN Extended Header */ +#endif + +struct __attribute__ ((packed)) ipoibhdr { + uint32_t ipoib_header; +}; + +struct __attribute__((packed)) vlanhdr { + uint16_t h_vlan_TCI; /* Encapsulates priority and VLAN ID */ + uint16_t h_vlan_encapsulated_proto; /* packet type ID field (or len) */ +}; + +#include +//support for RH 5.7 and older OS +#ifndef EPOLLHUP +#define EPOLLHUP 0x010 +#endif +#ifndef EPOLLRDHUP +#define EPOLLRDHUP 0x2000 +#endif + +#endif //VTYPES_H diff --git a/src/vma/util/wakeup.cpp b/src/vma/util/wakeup.cpp new file mode 100644 index 0000000..c57bb28 --- /dev/null +++ b/src/vma/util/wakeup.cpp @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "wakeup.h" +#include + +#define MODULE_NAME "wakeup" + +#define wkup_logpanic __log_info_panic +#define wkup_logerr __log_info_err +#define wkup_logwarn __log_info_warn +#define wkup_loginfo __log_info_info +#define wkup_logdbg __log_info_dbg +#define wkup_logfunc __log_info_func +#define wkup_logfuncall __log_info_funcall +#define wkup_entry_dbg __log_entry_dbg + +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[epfd=%d]:%d:%s() " +#undef __INFO__ +#define __INFO__ m_epfd + +wakeup::wakeup() +{ + m_epfd = 0; + m_is_sleeping = 0; + memset(&m_ev, 0, sizeof(m_ev)); +} +void wakeup::going_to_sleep() +{ + BULLSEYE_EXCLUDE_BLOCK_START + if(likely(m_epfd)) + m_is_sleeping++; + else + { + wkup_logerr(" m_epfd is not initialized - cannot use wakeup mechanism\n"); + m_is_sleeping = 0; + } + BULLSEYE_EXCLUDE_BLOCK_END +} + +void wakeup::wakeup_set_epoll_fd(int epfd) +{ + m_epfd = epfd; +} diff --git a/src/vma/util/wakeup.h b/src/vma/util/wakeup.h new file mode 100644 index 0000000..d50573d --- /dev/null +++ b/src/vma/util/wakeup.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef WAKEUP_H +#define WAKEUP_H + +/** + * wakeup class that adds a wakeup functionality to socket (tcp and udp) and epoll. + */ +#include +#include "utils/lock_wrapper.h" + +class wakeup +{ +public: + wakeup(void); + virtual ~wakeup() {}; + virtual void do_wakeup() = 0; + virtual bool is_wakeup_fd(int fd) = 0; + virtual void remove_wakeup_fd() = 0; + void going_to_sleep(); + void return_from_sleep() { --m_is_sleeping; }; + +protected: + virtual void wakeup_set_epoll_fd(int epfd); + int m_is_sleeping; + + //lock_spin_recursive m_wakeup_lock; This lock is not needed for now. Maybe we will need it for epoll. 
+ + int m_epfd; + struct epoll_event m_ev; +}; + +#endif /* WAKEUP_H */ diff --git a/src/vma/util/wakeup_pipe.cpp b/src/vma/util/wakeup_pipe.cpp new file mode 100644 index 0000000..d53c404 --- /dev/null +++ b/src/vma/util/wakeup_pipe.cpp @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "utils/bullseye.h" +#include "vlogger/vlogger.h" +#include "wakeup_pipe.h" +#include "vma/sock/sock-redirect.h" + +#define MODULE_NAME "wakeup_pipe" + +#define wkup_logpanic __log_info_panic +#define wkup_logerr __log_info_err +#define wkup_logwarn __log_info_warn +#define wkup_loginfo __log_info_info +#define wkup_logdbg __log_info_dbg +#define wkup_logfunc __log_info_func +#define wkup_logfuncall __log_info_funcall +#define wkup_entry_dbg __log_entry_dbg + +#undef MODULE_HDR_INFO +#define MODULE_HDR_INFO MODULE_NAME "[epfd=%d]:%d:%s() " +#undef __INFO__ +#define __INFO__ m_epfd +#define UNINIT_PIPE_FD (-1) + +int wakeup_pipe::g_wakeup_pipes[2] = {UNINIT_PIPE_FD, UNINIT_PIPE_FD}; +atomic_t wakeup_pipe::ref_count = ATOMIC_INIT(0); + +wakeup_pipe::wakeup_pipe() +{ + int ref = atomic_fetch_and_inc(&ref_count); + if (ref == 0) { + BULLSEYE_EXCLUDE_BLOCK_START + if (orig_os_api.pipe(g_wakeup_pipes)) { + wkup_logpanic("wakeup pipe create failed (errno=%d %m)", errno); + } + if (orig_os_api.write(g_wakeup_pipes[1], "^", 1) != 1) { + wkup_logpanic("wakeup pipe write failed(errno=%d %m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + wkup_logdbg("created wakeup pipe [RD=%d, WR=%d]", g_wakeup_pipes[0], g_wakeup_pipes[1]); + + // ToDo - these pipe should be closed at some point + // orig_os_api.close(g_si_wakeup_pipes[1]); + // orig_os_api.close(g_si_wakeup_pipes[0]); + } + + m_ev.events = EPOLLIN; + m_ev.data.fd = g_wakeup_pipes[0]; +} + +void wakeup_pipe::do_wakeup() +{ + wkup_logfuncall(""); + + //m_wakeup_lock.lock(); + //This func should be called under socket / epoll lock + + //Call to wakeup only in case there is some thread that is sleeping on epoll + if (!m_is_sleeping) + { + wkup_logfunc("There is no thread in epoll_wait, therefore not calling for wakeup"); + //m_wakeup_lock.unlock(); + return; + } + + wkup_entry_dbg(""); + + int errno_tmp = errno; //don't let wakeup affect errno, as this can fail with EEXIST + BULLSEYE_EXCLUDE_BLOCK_START + if 
((orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_ADD, g_wakeup_pipes[0], &m_ev)) && (errno != EEXIST)) { + wkup_logerr("Failed to add wakeup fd to internal epfd (errno=%d %m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + errno = errno_tmp; + + //m_wakeup_lock.unlock(); + //sched_yield(); +} + +void wakeup_pipe::remove_wakeup_fd() +{ + if (m_is_sleeping) return; + wkup_entry_dbg(""); + int tmp_errno = errno; + if (orig_os_api.epoll_ctl(m_epfd, EPOLL_CTL_DEL, g_wakeup_pipes[0], NULL)) + { + BULLSEYE_EXCLUDE_BLOCK_START + if (errno == ENOENT) { + wkup_logdbg("Failed to delete global pipe from internal epfd it was already deleted"); + } else { + wkup_logerr("failed to delete global pipe from internal epfd (errno=%d %m)", errno); + } + BULLSEYE_EXCLUDE_BLOCK_END + } + errno = tmp_errno; +} + +wakeup_pipe::~wakeup_pipe() +{ + int ref = atomic_fetch_and_dec(&ref_count); + if (ref == 1) { + close(g_wakeup_pipes[0]); + close(g_wakeup_pipes[1]); + g_wakeup_pipes[0] = UNINIT_PIPE_FD; + g_wakeup_pipes[1] = UNINIT_PIPE_FD; + } +} diff --git a/src/vma/util/wakeup_pipe.h b/src/vma/util/wakeup_pipe.h new file mode 100644 index 0000000..78415cc --- /dev/null +++ b/src/vma/util/wakeup_pipe.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef WAKEUP_PIPE_H +#define WAKEUP_PIPE_H + +/** + * wakeup class that adds a wakeup functionality to socket (tcp and udp) and epoll using a pipe. + */ +#include "wakeup.h" +#include "utils/atomic.h" + +class wakeup_pipe : public wakeup +{ +public: + wakeup_pipe(void); + ~wakeup_pipe(); + virtual void do_wakeup(); + virtual inline bool is_wakeup_fd(int fd) + { + return fd == g_wakeup_pipes[0]; + }; + virtual void remove_wakeup_fd(); + +private: + static int g_wakeup_pipes[2]; + static atomic_t ref_count; +}; + +#endif /* WAKEUP_PIPE_H */ diff --git a/src/vma/vma_extra.h b/src/vma/vma_extra.h new file mode 100644 index 0000000..9e3bd70 --- /dev/null +++ b/src/vma/vma_extra.h @@ -0,0 +1,880 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef VMA_EXTRA_H +#define VMA_EXTRA_H + +#include +#include +#include + +/* + * Flags for recvfrom_zcopy() + */ +#define MSG_VMA_ZCOPY_FORCE 0x01000000 // don't fallback to bcopy +#define MSG_VMA_ZCOPY 0x00040000 // return: zero copy was done + +/* + * Options for setsockopt()/getsockopt() + */ +#define SO_VMA_GET_API 2800 +#define SO_VMA_USER_DATA 2801 +#define SO_VMA_RING_ALLOC_LOGIC 2810 +#define SO_VMA_RING_USER_MEMORY 2811 +#define SO_VMA_FLOW_TAG 2820 +#define SO_VMA_SHUTDOWN_RX 2821 + +/* + * Flags for Dummy send API + */ +#define VMA_SND_FLAGS_DUMMY MSG_SYN // equals to 0x400 + +/* + * Return values for the receive packet notify callback function + */ +typedef enum { + VMA_PACKET_DROP, /* VMA will drop the received packet and recycle + the buffer if no other socket needs it */ + + VMA_PACKET_RECV, /* VMA will queue the received packet on this socket ready queue. + The application will read it with the usual recv socket APIs */ + + VMA_PACKET_HOLD /* Application will handle the queuing of the received packet. The application + must return the descriptor to VMA using the free_packet function + But not in the context of VMA's callback itself. */ +} vma_recv_callback_retval_t; + + +/************ SocketXtreme API types definition start***************/ + +typedef enum { + VMA_SOCKETXTREME_PACKET = (1ULL << 32), /* New packet is available */ + VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED = (1ULL << 33) /* New connection is auto accepted by server */ +} vma_socketxtreme_events_t; + +/* + * Represents VMA buffer + * Used in SocketXtreme extended API. + */ +struct vma_buff_t { + struct vma_buff_t* next; /* next buffer (for last buffer next == NULL) */ + void* payload; /* pointer to data */ + uint16_t len; /* data length */ +}; + +/** + * Represents one VMA packet + * Used in SocketXtreme extended API. 
+ */ +struct vma_packet_desc_t { + size_t num_bufs; /* number of packet's buffers */ + uint16_t total_len; /* total data length */ + struct vma_buff_t* buff_lst; /* list of packet's buffers */ + struct timespec hw_timestamp; /* packet hw_timestamp */ +}; + +/* + * Represents VMA Completion. + * Used in SocketXtreme extended API. + */ +struct vma_completion_t { + /* Packet is valid in case VMA_SOCKETXTREME_PACKET event is set + */ + struct vma_packet_desc_t packet; + /* Set of events + */ + uint64_t events; + /* User provided data. + * By default this field has FD of the socket + * User is able to change the content using setsockopt() + * with level argument SOL_SOCKET and opname as SO_VMA_USER_DATA + */ + uint64_t user_data; + /* Source address (in network byte order) set for: + * VMA_SOCKETXTREME_PACKET and VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED events + */ + struct sockaddr_in src; + /* Connected socket's parent/listen socket fd number. + * Valid in case VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is set. + */ + int listen_fd; +}; + +/************ SocketXtreme API types definition end ***************/ + +/** + * Represents one VMA packets + * Used in zero-copy extended API. + */ +struct __attribute__ ((packed)) vma_packet_t { + void* packet_id; // packet identifier + size_t sz_iov; // number of fragments + struct iovec iov[]; // fragments size+data +}; + +/** + * Represents received packets in VMA + * Used in zero-copy extended API. 
+ */ +struct __attribute__ ((packed)) vma_packets_t { + size_t n_packet_num; // number of received packets + struct vma_packet_t pkts[]; // array of received packets +}; + +/* + * Structure holding additional information on the packet and socket + * Note: Check structure size value for future VMA libraries changes + */ +struct __attribute__ ((packed)) vma_info_t { + size_t struct_sz; /* Compare this value with sizeof(vma_info_t) to check version compatability */ + void* packet_id; /* VMA's handle to received packet buffer to be return if zero copy logic is used */ + + /* Packet addressing information (in network byte order) */ + struct sockaddr_in* src; + struct sockaddr_in* dst; + + /* Packet information */ + size_t payload_sz; + + /* Socket's information */ + uint32_t socket_ready_queue_pkt_count; /* Current count of packets waiting to be read from the socket */ + uint32_t socket_ready_queue_byte_count; /* Current count of bytes waiting to be read from the socket */ + + /* Packet timestamping information */ + struct timespec hw_timestamp; + struct timespec sw_timestamp; +}; + +struct vma_rate_limit_t { + uint32_t rate; /* rate limit in Kbps */ + uint32_t max_burst_sz; /* maximum burst size in bytes */ + uint16_t typical_pkt_sz; /* typical packet size in bytes */ +}; + +typedef enum { + VMA_CB_MASK_TIMESTAMP = (1 << 0), +} vma_completion_cb_mask; + +/** + * @param comp_mask attributes you want to get from @ref vma_cyclic_buffer_read. 
+ * see @ref vma_completion_cb_mask + * @param payload_ptr pointer to user data not including user header + * @param payload_length size of payload_ptr + * @param packets how many packets arrived + * @param usr_hdr_ptr points to the user header defined when creating the ring + * @param usr_hdr_ptr_length user header length + * @param hw_timestamp the HW time stamp of the first packet arrived + */ +struct vma_completion_cb_t { + uint32_t comp_mask; + void* payload_ptr; + size_t payload_length; + size_t packets; + void* usr_hdr_ptr; + size_t usr_hdr_ptr_length; + struct timespec hw_timestamp; +}; + +typedef int vma_ring_profile_key; + +typedef enum { + RING_LOGIC_PER_INTERFACE = 0, //!< RING_LOGIC_PER_INTERFACE + RING_LOGIC_PER_IP = 1, //!< RING_LOGIC_PER_IP + RING_LOGIC_PER_SOCKET = 10, //!< RING_LOGIC_PER_SOCKET + RING_LOGIC_PER_USER_ID = 11, //!< RING_LOGIC_PER_USER_ID + RING_LOGIC_PER_THREAD = 20, //!< RING_LOGIC_PER_THREAD + RING_LOGIC_PER_CORE = 30, //!< RING_LOGIC_PER_CORE + RING_LOGIC_PER_CORE_ATTACH_THREADS = 31, //!< RING_LOGIC_PER_CORE_ATTACH_THREADS + RING_LOGIC_LAST //!< RING_LOGIC_LAST +} ring_logic_t; + +typedef enum { + VMA_RING_ALLOC_MASK_RING_PROFILE_KEY = (1 << 0), + VMA_RING_ALLOC_MASK_RING_USER_ID = (1 << 1), + VMA_RING_ALLOC_MASK_RING_INGRESS = (1 << 2), + VMA_RING_ALLOC_MASK_RING_ENGRESS = (1 << 3), +} vma_ring_alloc_logic_attr_comp_mask; + +/** + * @brief pass this struct to vma using setsockopt with @ref SO_VMA_RING_ALLOC_LOGIC + * to set the allocation logic of this FD when he requests a ring. 
+ * @note ring_alloc_logic is a mandatory + * @param comp_mask - what fields are read when processing this struct + * see @ref vma_ring_alloc_logic_attr_comp_mask + * @param ring_alloc_logic- allocation ratio to use + * @param ring_profile_key - what ring profile to use - get the profile when + * creating ring using @ref vma_add_ring_profile in extra_api + * can only be set once + * @param user_idx - when used RING_LOGIC_PER_USER_ID int @ref ring_alloc_logic + * this is the user id to define. This lets you define the same ring for + * few FD's regardless the interface\thread\core. + * @param ingress - RX ring + * @param engress - TX ring + */ +struct vma_ring_alloc_logic_attr { + uint32_t comp_mask; + ring_logic_t ring_alloc_logic; + uint32_t ring_profile_key; + uint32_t user_id; + uint32_t ingress:1; + uint32_t engress:1; + uint32_t reserved:30; +}; + +/* + * @note you cannot use RAW_PACKET with hdr_bytes > 0 + */ +typedef enum { + RAW_PACKET, // Full wire packet in payload_ptr cyclic buffer + STRIP_NETWORK_HDRS, // Strip down packet's network headers in cyclic buffers. 
+ SEPERATE_NETWORK_HDRS, // Expose the packet's network headers in headers_ptr + PADDED_PACKET, // Full packet with padding to power of 2 +} vma_cb_packet_rec_mode; + +typedef enum { + VMA_CB_HDR_BYTE = (1 << 0), + VMA_CB_EXTERNAL_MEM = (1 << 1), +} vma_cb_ring_attr_mask; + +typedef enum { + VMA_MODIFY_RING_CQ_MODERATION = (1 << 0), + VMA_MODIFY_RING_CQ_ARM = (1 << 1), +} vma_modify_ring_mask; + +struct vma_cq_moderation_attr { + uint32_t cq_moderation_count; + uint32_t cq_moderation_period_usec; +}; + +struct vma_cq_arm_attr { +}; + +/** + * @param comp_mask - what fields should be read when processing this struct + * see @ref vma_modify_ring_mask + * @param ring_fd - ring fd + */ +struct vma_modify_ring_attr { + uint32_t comp_bit_mask; + int ring_fd; + union { + struct vma_cq_moderation_attr cq_moderation; + struct vma_cq_arm_attr cq_arm; + }; +}; + +/** + * @param comp_mask - what fields are read when processing this struct see @ref vma_cb_ring_attr_mask + * @param num - Minimum number of elements allocated in the circular buffer + * @param hdr_bytes - Bytes separated from UDP payload which are + * part of the application header + * @note this will be accesable from headers_ptr in @ref vma_completion_cb_t + * @param stride_bytes - Bytes separated for each ingress payload for alignment + * control (does not include the hdr_bytes). Should be smaller + * than MTU. + * + * @note your packet will be written to the memory in a different way depending + * on the packet_receive_mode and hdr_bytes. + * In all modes all the packets and\or headers will be contiguous in the memory. + * The number of headers\packets is equal to packets in @ref vma_completion_cb_t. + * the packet memory layout has five options: + * 1. RAW_PACKET - payload_ptr will point to the raw packet containing the + * network headers and user payload. + * 2. STRIP_NETWORK_HDRS - network headers will be ignored by VMA. 
+ * payload_ptr - will point to the first packet which it size is defined in + * stride_bytes. + * a. hdr_bytes > 0 + * usr_hdr_ptr will point to the first header. + * b. hdr_bytes = 0 + * usr_hdr_ptr is NULL + * 3. SEPERATE_NETWORK_HDRS - network headers will be dropped + * payload_ptr - will point to the first packet as it size is defined + * in stride_bytes. + * a. hdr_bytes > 0 + * usr_hdr_ptr will point to the first network header + user header + * (contiguous in memory). + * b. hdr_bytes = 0 + * usr_hdr_ptr will point to the first network header. + * 4. PADDED_PACKET - packet will be written to memory and additional padding + * will be added to the end of it to match the nearest power of two. + * e.g. if stride_bytes is 1400 then and the network size is 42 (eth+ip+udp) + * the padding will be 2048 - 1400 - 42 -> 606. + * This mode has the best performance and causes less PCI bus back pressure. + * In this mode hdr_bytes is ignored and usr_hdr_ptr is NULL. + * packet layout in PADDED_PACKET mode + * +--------------------------------------------------------------------------+ + * #| mac+ip+udp | datagram payload | alignment| + * +--------------------------------------------------------------------------+ + * 1| | e.g. RTP header | e.g. RTP payload | alignment | + * 2| | e.g. RTP header | e.g. 
RTP payload | alignment | + * +--------------------------------------------------------------------------+ + * + */ +struct vma_cyclic_buffer_ring_attr { + uint32_t comp_mask; + uint32_t num; + uint16_t stride_bytes; + uint16_t hdr_bytes; + vma_cb_packet_rec_mode packet_receive_mode; +}; + +struct vma_packet_queue_ring_attr { + uint32_t comp_mask; +}; + +struct vma_external_mem_attr { + uint32_t comp_mask; +}; + +typedef enum { + // for future use + VMA_RING_ATTR_LAST +} vma_ring_type_attr_mask; + +typedef enum { + VMA_RING_PACKET, + VMA_RING_CYCLIC_BUFFER, + VMA_RING_EXTERNAL_MEM, +} vma_ring_type; + +/** + * @param comp_mask - what fields are read when processing this struct + * see @ref vma_ring_type_attr_mask + * @param ring_type - use cyclic buffer ring or default packets ring + * + */ +struct vma_ring_type_attr { + uint32_t comp_mask; + vma_ring_type ring_type; + union { + struct vma_cyclic_buffer_ring_attr ring_cyclicb; + struct vma_packet_queue_ring_attr ring_pktq; + struct vma_external_mem_attr ring_ext; + }; +}; + +typedef enum { + VMA_HW_PP_EN = (1 << 0), + VMA_HW_UMR_EN = (1 << 1), + VMA_HW_MP_RQ_EN = (1 << 2), + VMA_HW_PP_BURST_EN = (1 << 3), +} mlx_hw_device_cap; + +struct dev_data { + uint32_t vendor_id; + uint32_t vendor_part_id; + uint32_t device_cap; // mlx_hw_device_cap +}; + +struct hw_cq_data { + void *buf; + volatile uint32_t *dbrec; + uint32_t cq_size; + uint32_t cqe_size; + uint32_t cqn; + void *uar; + // for notifications + uint32_t *cons_idx; +}; + +struct hw_wq_data { + void *buf; + uint32_t wqe_cnt; + uint32_t stride; + volatile uint32_t *dbrec; + struct hw_cq_data cq_data; +}; + +struct hw_rq_data { + struct hw_wq_data wq_data; + // TBD do we need it + uint32_t *head; + uint32_t *tail; +}; + +struct hw_sq_data { + struct hw_wq_data wq_data; + uint32_t sq_num; + struct { + void *reg; + uint32_t size; + uint32_t offset; + } bf; +}; + +typedef enum { + DATA_VALID_DEV, + DATA_VALID_SQ, + DATA_VALID_RQ, +} vma_mlx_hw_valid_data_mask; + 
+struct vma_mlx_hw_device_data { + uint32_t valid_mask; // see vma_mlx_hw_valid_data_mask + struct dev_data dev_data; + struct hw_sq_data sq_data; + struct hw_rq_data rq_data; +}; + +typedef enum { + VMA_EXTRA_API_REGISTER_RECV_CALLBACK = (1 << 0), + VMA_EXTRA_API_RECVFROM_ZCOPY = (1 << 1), + VMA_EXTRA_API_FREE_PACKETS = (1 << 2), + VMA_EXTRA_API_ADD_CONF_RULE = (1 << 3), + VMA_EXTRA_API_THREAD_OFFLOAD = (1 << 4), + VMA_EXTRA_API_DUMP_FD_STATS = (1 << 5), + VMA_EXTRA_API_SOCKETXTREME_POLL = (1 << 6), + VMA_EXTRA_API_SOCKETXTREME_FREE_VMA_PACKETS = (1 << 7), + VMA_EXTRA_API_SOCKETXTREME_REF_VMA_BUFF = (1 << 8), + VMA_EXTRA_API_SOCKETXTREME_FREE_VMA_BUFF = (1 << 9), + VMA_EXTRA_API_GET_SOCKET_RINGS_NUM = (1 << 10), + VMA_EXTRA_API_GET_SOCKET_RINGS_FDS = (1 << 11), + VMA_EXTRA_API_GET_SOCKET_TX_RING_FD = (1 << 12), + VMA_EXTRA_API_GET_SOCKET_NETWORK_HEADER = (1 << 13), + VMA_EXTRA_API_GET_RING_DIRECT_DESCRIPTORS = (1 << 14), + VMA_EXTRA_API_CYCLIC_BUFFER_READ = (1 << 15), + VMA_EXTRA_API_ADD_RING_PROFILE = (1 << 16), + VMA_EXTRA_API_REGISTER_MEMORY_ON_RING = (1 << 17), + VMA_EXTRA_API_DEREGISTER_MEMORY_ON_RING = (1 << 18), + VMA_EXTRA_API_GET_MEM_INFO = (1 << 19), + VMA_EXTRA_API_MODIFY_RING = (1 << 20), + VMA_EXTRA_API_GET_DPCP_DEVICES = (1 << 21) +} vma_extra_api_mask; + +/** + * + * VMA Notification callback for incoming packet on socket + * @param fd Socket's file descriptor which this packet refers to + * @param iov iovector structure array point holding the packet + * received data buffer pointers and size of each buffer + * @param iov_sz Size of iov array + * @param vma_info Additional information on the packet and socket + * @param context User-defined value provided during callback + * registration for each socket + * + * This callback function should be registered with VMA by calling + * register_recv_callback() in the extended API. It can be unregistered by + * setting a NULL function pointer. 
VMA will call the callback to notify + * of new incoming packets after the IP & UDP header processing and before + * they are queued in the socket's receive queue. + * Context of the callback will always be from one of the user's application + * threads when calling the following socket APIs: select, poll, epoll, recv, + * recvfrom, recvmsg, read, readv. + * + * Notes: + * - The application can call all of the Socket APIs control and send from + * within the callback context. + * - Packet loss might occur depending on the applications behavior in the + * callback context. + * - Parameters `iov' and `vma_info' are only valid until callback context + * is returned to VMA. User should copy these structures for later use + * if working with zero copy logic. + */ +typedef vma_recv_callback_retval_t +(*vma_recv_callback_t)(int fd, size_t sz_iov, struct iovec iov[], + struct vma_info_t* vma_info, void *context); + +/** + * VMA Extended Socket API + */ +struct __attribute__ ((packed)) vma_api_t { + /** + * Register a received packet notification callback. + * + * @param s Socket file descriptor. + * @param callback Callback function. + * @param context user contex for callback function. + * @return 0 - success, -1 - error + * + * errno is set to: EINVAL - not VMA offloaded socket + */ + int (*register_recv_callback)(int s, vma_recv_callback_t callback, void *context); + + /** + * Zero-copy revcfrom implementation. + * + * @param s Socket file descriptor. + * @param buf Buffer to fill with received data or pointers to data (see below). + * @param flags Pointer to flags (see below). + * @param from If not NULL, will be filled with source address (same as recvfrom). + * @param fromlen If not NULL, will be filled with source address size (same as recvfrom). + * + * This function attempts to receive a packet without doing data copy. + * The flags argument can contain the usual flags of recvmsg(), and also the + * MSG_VMA_ZCOPY_FORCE flag. 
If the latter is set, the function will not + * fall back to data copy. Otherwise, the function falls back to data copy + * if zero-copy cannot be performed. If zero-copy is done then MSG_VMA_ZCOPY + * flag is set upon exit. + * + * If zero copy is performed (MSG_VMA_ZCOPY flag is returned), the buffer + * is filled with a vma_packets_t structure, holding as much fragments + * as `len' allows. The total size of all fragments is returned. + * Otherwise the MSG_VMA_ZCOPY flag is not set and the buffer is filled + * with actual data and it's size is returned (same as recvfrom()) + * If no data was received the return value is zero. + * + * NOTE: The returned packet must be freed with free_packet() after + * the application finished using it. + */ + int (*recvfrom_zcopy)(int s, void *buf, size_t len, int *flags, + struct sockaddr *from, socklen_t *fromlen); + + /** + * Frees a packet received by recvfrom_zcopy() or held by receive callback. + * + * @param s Socket from which the packet was received. + * @param pkts Array of packet. + * @param count Number of packets in the array. + * @return 0 on success, -1 on failure + * + * errno is set to: EINVAL - not a VMA offloaded socket + * ENOENT - the packet was not received from `s'. + */ + int (*free_packets)(int s, struct vma_packet_t *pkts, size_t count); + + /* + * Add a libvma.conf rule to the top of the list. + * This rule will not apply to existing sockets which already considered the conf rules. + * (around connect/listen/send/recv ..) + * @param config_line A char buffer with the exact format as defined in libvma.conf, and should end with '\0'. + * @return 0 on success, or error code on failure. + */ + int (*add_conf_rule)(const char *config_line); + + /* + * Create sockets on pthread tid as offloaded/not-offloaded. + * This does not affect existing sockets. + * Offloaded sockets are still subject to libvma.conf rules. + * @param offload 1 for offloaded, 0 for not-offloaded. 
+ * @return 0 on success, or error code on failure. + */ + int (*thread_offload)(int offload, pthread_t tid); + + /** + * socketxtreme_poll() polls for VMA completions + * + * @param fd File descriptor. + * @param completions VMA completions array. + * @param ncompletions Maximum number of completion to return. + * @param flags Flags. + * @return On success, return the number of ready completions. + * On error, -1 is returned, and TBD:errno is set?. + * + * This function polls the `fd` for VMA completions and returns maximum `ncompletions` ready + * completions via `completions` array. + * The `fd` can represent a ring, socket or epoll file descriptor. + * + * VMA completions are indicated for incoming packets and/or for other events. + * If VMA_SOCKETXTREME_PACKET flag is enabled in vma_completion_t.events field + * the completion points to incoming packet descriptor that can be accesses + * via vma_completion_t.packet field. + * Packet descriptor points to VMA buffers that contain data scattered + * by HW, so the data is deliver to application with zero copy. + * Notice: after application finished using the returned packets + * and their buffers it must free them using socketxtreme_free_vma_packets(), + * socketxtreme_free_vma_buff() functions. + * + * If VMA_SOCKETXTREME_PACKET flag is disabled vma_completion_t.packet field is + * reserved. + * + * In addition to packet arrival event (indicated by VMA_SOCKETXTREME_PACKET flag) + * VMA also reports VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event and standard + * epoll events via vma_completion_t.events field. + * VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is reported when new connection is + * accepted by the server. + * When working with socketxtreme_poll() new connections are accepted + * automatically and accept(listen_socket) must not be called. 
+ * VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED event is reported for the new + * connected/child socket (vma_completion_t.user_data refers to child socket) + * and EPOLLIN event is not generated for the listen socket. + * For events other than packet arrival and new connection acceptance + * vma_completion_t.events bitmask composed using standard epoll API + * events types. + * Notice: the same completion can report multiple events, for example + * VMA_SOCKETXTREME_PACKET flag can be enabled together with EPOLLOUT event, + * etc... + * + * * errno is set to: EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ + int (*socketxtreme_poll)(int fd, struct vma_completion_t* completions, unsigned int ncompletions, int flags); + + /** + * Returns the amount of rings that are associated with socket. + * + * @param fd File Descriptor number of the socket. + * @return On success, return the amount of rings. + * On error, -1 is returned. + * + * errno is set to: EINVAL - not a VMA offloaded fd + */ + int (*get_socket_rings_num)(int fd); + + /** + * Returns FDs of the RX rings that are associated with the socket. + * + * This function gets socket FD + int array + array size and populates + * the array with FD numbers of the rings that are associated + * with the socket. + * + * @param fd File Descriptor number. + * @param ring_fds Array of ring fds + * @param ring_fds_sz Size of the array + * @return On success, return the number populated array entries. + * On error, -1 is returned. + * + * errno is set to: EINVAL - not a VMA offloaded fd + TBD + */ + int (*get_socket_rings_fds)(int fd, int *ring_fds, int ring_fds_sz); + + /** + * Returns the ring FD of the TX rings used by this socket. + * should be used after connect or joining a MC group. + * @param sock_fd - UDP socket fd + * @param to - the destination the socket is connected to. + * @param tolen - so len + * @return ring fd on success -1 on failure (e.g. 
no ring, non offloaded fd) + * @note @ref get_socket_rings_fds returns the RX ring fd + * errno is set to: EINVAL - not a VMA offloaded fd + * ENODATA - no rings fds available + */ + int (*get_socket_tx_ring_fd)(int sock_fd, struct sockaddr *to, socklen_t tolen); + + /** + * Frees packets received by socketxtreme_poll(). + * + * @param packets Packets to free. + * @param num Number of packets in `packets` array + * @return 0 on success, -1 on failure + * + * For each packet in `packet` array this function: + * - Updates receive queue size and the advertised TCP + * window size, if needed, for the socket that received + * the packet. + * - Frees vma buffer list that is associated with the packet. + * Notice: for each buffer in buffer list VMA decreases buffer's + * reference count and only buffers with reference count zero are deallocated. + * Notice: + * - Application can increase buffer reference count, + * in order to hold the buffer even after socketxtreme_free_vma_packets() + * was called for the buffer, using socketxtreme_ref_vma_buff(). + * - Application is responsible to free buffers, that + * couldn't be deallocated during socketxtreme_free_vma_packets() due to + * non zero reference count, using socketxtreme_free_vma_buff() function. + * + * errno is set to: EINVAL - NULL pointer is provided. + * EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ + int (*socketxtreme_free_vma_packets)(struct vma_packet_desc_t *packets, int num); + + /* This function increments the reference count of the buffer. + * This function should be used in order to hold the buffer + * even after socketxtreme_free_vma_packets() call. + * When buffer is not needed any more it should be freed via + * socketxtreme_free_vma_buff(). + * + * @param buff Buffer to update. + * @return On success, return buffer's reference count after the change + * On errors -1 is returned + * + * errno is set to: EINVAL - NULL pointer is provided. 
+ * EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ + int (*socketxtreme_ref_vma_buff)(struct vma_buff_t *buff); + + /* This function decrements the buff reference count. + * When buff's reference count reaches zero, the buff is + * deallocated. + * + * @param buff Buffer to free. + * @return On success, return buffer's reference count after the change + * On error -1 is returned + * + * Notice: return value zero means that buffer was deallocated. + * + * errno is set to: EINVAL - NULL pointer is provided. + * EOPNOTSUPP - socketXtreme was not enabled during configuration time. + */ + int (*socketxtreme_free_vma_buff)(struct vma_buff_t *buff); + + /* + * Dump fd statistics using VMA logger. + * @param fd to dump, 0 for all open fds. + * @param log_level dumping level corresponding vlog_levels_t enum (vlogger.h). + * @return 0 on success, or error code on failure. + * + * errno is set to: EOPNOTSUPP - Function is not supported when socketXtreme is enabled. + */ + int (*dump_fd_stats)(int fd, int log_level); + + /** + * Get data from the MP_RQ cyclic buffer + * @param fd - the fd of the ring to query - get it using @ref get_socket_rings_fds + * @param completion results see @ref struct vma_completion_cb_t + * @param min min number of packet to return, if not available + * will return 0 packets + * @param max max packets to return + * @param flags can be MSG_DONTWAIT, MSG_WAITALL (not yet supported), MSG_PEEK (not yet supported) + * @return 0 on success -1 on failure + * + * errno is set to: EOPNOTSUPP - Striding RQ is no supported. + */ + int (*vma_cyclic_buffer_read)(int fd, + struct vma_completion_cb_t *completion, + size_t min, size_t max, int flags); + + /** + * add a ring profile to VMA ring profile list. 
you can use this + * to create advacned rings like MP_RQ ring + * the need to pass vma the ring profile using the fd's setsockopt + * @param profile the profile to add to the list + * @param key - the profile key + * @return 0 on success -1 on failure + */ + int (*vma_add_ring_profile)(struct vma_ring_type_attr *profile, int *key); + + /** + * get the socket's network header created by VMA + * @param fd - the socket's fd + * @param ptr - pointer to write the data to. can be NULL see notes + * @param len - IN\OUT parameter + * IN - len given by user + * OUT- len used by header + * @return 0 on success -1 on error + * errno EINVAL - bad fd + * errno ENOBUFS - ptr is too small + * errno ENOTCONN - header no available since socket is not + * ofloaded or not connected + * @note this function should be called for connected socket + * @note calling with ptr NULL will update the len with the size needed + * by VMA so application will allocate the exact needed space + * @note application can: + * call twice once with ptr == NULL and get the size needed to allocate + * and call again to get the data. + * if application called with big enough buffer vma will update the + * size actually used. + */ + int (*get_socket_network_header)(int fd, void *ptr, uint16_t *len); + + /** + * get the HW descriptors created by VMA + * @param fd - the ring fd + * @param data - result see @ref vma_mlx_hw_device_data + * @return -1 on failure 0 on success + */ + int (*get_ring_direct_descriptors)(int fd, + struct vma_mlx_hw_device_data *data); + + /** + * register memory to use on a ring. 
+ * @param fd - the ring fd see @ref socketxtreme_get_socket_rings_fds + * @param addr - the virtual address to register + * @param length - the length of addr + * @param key - out parameter to use when accessing this memory + * @return 0 on success, -1 on failure + * + * @note in vma_extra_api ring is associated with device, although you + * can use the key in other rings using the same port we decided to leave + * the ring fd as the bridge in the "extra" convention instead of + * using an opaque ib_ctx or src ip (that can cause routing issues). + */ + int (*register_memory_on_ring)(int fd, void *addr, size_t length, + uint32_t *key); + + /** + * deregister the addr that was previously registered in this ring + * @return 0 on success, -1 on failure + * + * @note - this function doesn't free the memory + */ + int (*deregister_memory_on_ring)(int fd, void *addr, size_t length); + + /** + * returns memory information for the ring fd + * @param fd - ring fd + * @param addr - the buffer address used + * @return 0 on success, -1 on failure + * + */ + int (*get_mem_info)(int fd, void **addr, size_t *length, uint32_t *lkey); + + /** + * perform ring modifications + * + * @param mr_data ring modification parameters + * + * @return 0 on success -1 on failure 1 on busy + */ + int (*vma_modify_ring)(struct vma_modify_ring_attr *mr_data); + + /** + * Used to identify which methods were initialized by VMA as part of vma_get_api(). + * The value content is based on vma_extra_api_mask enum. + */ + uint64_t vma_extra_supported_mask; + + /** + * get dpcp devices allocated by VMA + * + * @param devices - pointer to write the data to. can be NULL see notes + * @param devices_num - IN\OUT parameter + * IN - devices size given by user + * OUT- devices returned to user + * + * @return 0 on success -1 otherwise + * + * @note application can: + * call twice once with devices == NULL and get the size needed to allocate + * and call again to get the filled device array. 
+ * if application is called with big enough buffer VMA will update the + * size actually used. + */ + int (*get_dpcp_devices)(uintptr_t **devices, size_t *devices_num); +}; + +/** + * Retrieve VMA extended API. + * + * @return Pointer to the VMA Extended Socket API, or NULL if VMA not found. + */ +static inline struct vma_api_t* vma_get_api() +{ + struct vma_api_t *api_ptr = NULL; + socklen_t len = sizeof(api_ptr); + + /* coverity[negative_returns] */ + int err = getsockopt(-1, SOL_SOCKET, SO_VMA_GET_API, &api_ptr, &len); + if (err < 0) { + return NULL; + } + return api_ptr; +} + +#endif /* VMA_EXTRA_H */ diff --git a/tests/Makefile.am b/tests/Makefile.am new file mode 100644 index 0000000..f61c097 --- /dev/null +++ b/tests/Makefile.am @@ -0,0 +1,18 @@ +SUBDIRS := timetest gtest latency_test pps_test throughput_test + +EXTRA_DIST = \ + timetest \ + gtest \ + async-echo-client \ + benchmarking_test \ + connect-disconnect \ + extra_api_tests \ + functionality \ + listen \ + low_pps_tcp_send_test \ + mc_loop_test \ + multithread_test \ + server_test \ + vma_perf_envelope \ + reuse_ud_test.c \ + select_t1.c diff --git a/tests/README_How_to_update_sockperf_in_VMA.txt b/tests/README_How_to_update_sockperf_in_VMA.txt new file mode 100644 index 0000000..158f6c1 --- /dev/null +++ b/tests/README_How_to_update_sockperf_in_VMA.txt @@ -0,0 +1,6 @@ +As of Oct 2015, sockperf was entirely split out of VMA. +sockperf is anyhow a standalone project in github (and previously in googlecode). +Recently sockperf started to have an *.rpm and *.deb for itself. 
+Resources: + https://github.com/Mellanox/sockperf + http://rpmfind.net/linux/rpm2html/search.php?query=sockperf diff --git a/tests/TODO b/tests/TODO new file mode 100644 index 0000000..31637cc --- /dev/null +++ b/tests/TODO @@ -0,0 +1,5 @@ +- setsockopt tests +- fork +- ip fragmentation tests +- multicast tests + diff --git a/tests/api-support-check.py b/tests/api-support-check.py new file mode 100755 index 0000000..247be6e --- /dev/null +++ b/tests/api-support-check.py @@ -0,0 +1,75 @@ +#!/usr/bin/env python +# +#@copyright: +# Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +#@author: Avner BenHanoch + +#@date: 18Oct2015 +# +# This script performs ioctl/fcntl/setsockopt tests for verifying VMA coverage and behavior + +# example for usage: +# LD_PRELOAD=libvma.so VMA_EXCEPTION_HANDLING=1 ./tests/api-support-check.py +# +import socket, sys, fcntl +import struct, os + +def test_ioctl(sock): + ifname = 'eth0' + addr = socket.inet_ntoa(fcntl.ioctl( + sock.fileno(), + 0x8915, # SIOCGIFADDR + struct.pack('256s', ifname) + )[20:24]) + return "ioctl test: %s=%s" % (ifname, addr) + +def test_fcntl(sock): + rv = fcntl.fcntl(sock, fcntl.F_SETFL, os.O_NDELAY) + + lockdata = struct.pack('hhllhh', fcntl.F_WRLCK, 0, 0, 0, 0, 0) + rv = fcntl.fcntl(sock, fcntl.F_SETLKW, lockdata) + return "fcntl test: returned with data of len=" + str(len(rv)) + +if __name__ == "__main__": + print "testing TCP:" + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + print test_ioctl(s) + print test_fcntl(s) + print "setsockopt test..."; s.setsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1) + print "getsockopt test..."; s.getsockopt(socket.SOL_SOCKET, socket.SO_BROADCAST, 1) + + print "testing UDP:" + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + print test_ioctl(s) + print test_fcntl(s) + print "setsockopt test..."; s.setsockopt(socket.IPPROTO_TCP, socket.TCP_CORK, 1) + print "getsockopt test..."; s.getsockopt(socket.IPPROTO_TCP, socket.TCP_CORK, 1) diff --git a/tests/async-echo-client/async-echo-client.py b/tests/async-echo-client/async-echo-client.py new file mode 100755 index 0000000..b437b36 --- /dev/null +++ b/tests/async-echo-client/async-echo-client.py @@ -0,0 +1,133 @@ +#!/usr/bin/env python +# +# +#@copyright: +# Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +#@author: Avner BenHanoch + +#@date: 20June2015 +# +# This script performs non blocking connect to a given TCP server +# It can serve for checking VMA behaviour with async connect when the server +# is up or down +# +import socket, select, os, time, sys, fcntl, errno + +EPOLL_TIMEOUT=-1 # infinity + +def is_writeable_timeout( sock ): + "use poll with zero timeout for checking for timeout on writeable check (otherwise, the socket is either writeable or has errors)" + + poller = select.poll() + poller.register(sock, select.POLLOUT) + pevents = poller.poll(0) + print "- poll returned: %s (is_writeable_timeout=%s)" % (str(pevents), str(len(pevents) == 0)) + return len(pevents) == 0 + +def async_echo_client(argv): + if (len(argv) <4): + print "Incorrect parameter : " + argv[0] + " server-ip server-port msg-for-echo [bind-IP]" + sys.exit(-1) + + IP = argv[1] + PORT = int(argv[2]) + msg = argv[3] + BIND_IP=None + if (len(argv) > 4): BIND_IP=argv[4] + + sock=None + sock_fd = 0 + epoll = select.epoll(1) # we only have 1 socket + + counter = 0 + success = False + while not success: + if sock == None: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock_fd = sock.fileno() + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + + flags = fcntl.fcntl(sock_fd, fcntl.F_GETFL, 0) + flags = flags|os.O_NONBLOCK + rv = fcntl.fcntl(sock_fd, fcntl.F_SETFL, flags) + + if (BIND_IP): sock.bind((BIND_IP, 0)) + + print "starting async connect..." 
+ err = sock.connect_ex((IP, PORT)) + if err != errno.EINPROGRESS and err != 0: + print "error %d"%err + sys.exit (1) + + #epoll.register(sock_fd, select.EPOLLOUT | select.EPOLLET) + epoll.register(sock_fd, select.EPOLLOUT ) + + print "calling epoll - for getting for connect result" + eevents = epoll.poll(EPOLL_TIMEOUT) + print "- epoll returned: %s" % str(eevents) + if len(eevents) == 0: continue # epoll timeout + if is_writeable_timeout (sock): + pass # timeout - not writeable and no errors - call epoll again on same registered socket + else: + fd, events = eevents[0] + if events & (select.EPOLLERR | select.EPOLLHUP): # error on socket close it and restart from begining + print counter, "connection was NOT established successfully (will retry in 1 second) - Is the server up?" + counter += 1 + sock.close() + sock = None + time.sleep(1) + else: + print " **** connection established successfully after %d failures" % counter + success = True + + + + print "* sending..." + sock.send(msg) + + print "* minor sleep before receiving..." + time.sleep (1) + + print "* receiving..." + data = sock.recv(1024) + print ' **** Received:', data + + print "starting disconnect..." + epoll.close() + sock.close() + print "Done..." 
+ +def main(): + async_echo_client(sys.argv) + +if __name__ == "__main__": + main() diff --git a/tests/async-echo-client/echo-server.py b/tests/async-echo-client/echo-server.py new file mode 100755 index 0000000..6627d03 --- /dev/null +++ b/tests/async-echo-client/echo-server.py @@ -0,0 +1,28 @@ +#!/usr/bin/env python + +""" +A simple echo server +""" + +import socket, sys + +if (len(sys.argv) <3): + print "Incorrect parameter : " + sys.argv[0] + " server-ip server-port" + sys.exit(-1) + + +host = sys.argv[1] +port = int(sys.argv[2]) + +backlog = 5 +size = 1024 +s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +s.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +s.bind((host,port)) +s.listen(backlog) +while 1: + client, address = s.accept() + data = client.recv(size) + if data: + client.send(data) + client.close() diff --git a/tests/async-echo-client/syncronized-echo-client.py b/tests/async-echo-client/syncronized-echo-client.py new file mode 100755 index 0000000..8205e91 --- /dev/null +++ b/tests/async-echo-client/syncronized-echo-client.py @@ -0,0 +1,123 @@ +#!/usr/bin/env python +# +# +#@copyright: +# Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. 
+# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +#@author: Avner BenHanoch + +#@date: 05July2015 +# +# This script performs blocking connect to a given TCP server +# It can serve for checking VMA behaviour with a blocking connect when the server +# is up or down +# +import socket, select, os, time, sys, fcntl + +def is_connected( sock ): + "use poll with zero timeout for checking if socket is writeable and has no errors" + + poller = select.poll() + poller.register(sock, select.POLLOUT) + pevents = poller.poll(0) + rv = True; + if len(pevents) == 0: rv = False + else: + fd, flag = pevents[0] + rv = (flag == select.POLLOUT) # we only asked for POLLOUT, verify that we didn't get also errors + + print "- poll returned: %s (SUCCESS=%s)" % (str(pevents), str(rv)) + return rv + +def syncronized_echo_client(argv): + if (len(argv) <4): + print "Incorrect parameter : " + argv[0] + " server-ip server-port msg-for-echo [bind-IP]" + sys.exit(-1) + + IP = argv[1] + PORT = int(argv[2]) + msg = argv[3] + + BIND_IP=None + + if (len(argv) > 4): + BIND_IP=argv[4] + + sock=None + sock_fd = 0 + + counter = 0 + success = False + while not success: + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock_fd = sock.fileno() + sock.setsockopt(socket.IPPROTO_TCP, socket.TCP_NODELAY, 1) + + flags = fcntl.fcntl(sock_fd, 
fcntl.F_GETFL, 0) + flags = flags & ~os.O_NONBLOCK # set blocking + rv = fcntl.fcntl(sock_fd, fcntl.F_SETFL, flags) + + if (BIND_IP): sock.bind((BIND_IP, 0)) + + print "starting synchronized connect..." + err = sock.connect_ex((IP, PORT)) + if err != 0: + print "error %d"%err + + if is_connected( sock ): + success = True + else: + print counter, "connection was NOT established successfully (will retry in 1 second) - Is the server up?" + sock.close() + time.sleep(1) + counter += 1 + + print " **** connection established successfully after %d failures" % counter + + print "* sending..." + sock.send(msg) + + print "* sleeping for 1 second" + time.sleep (1) + + print "* receiving..." + data = sock.recv(1024) + print ' **** Received:', data + + print "starting disconnect..." + sock.close() + print "Done..." + +def main(): + syncronized_echo_client(sys.argv) # for functionality test purposes + +if __name__ == "__main__": + main() diff --git a/tests/benchmarking_test/Accelerators_Benchmarking.sh b/tests/benchmarking_test/Accelerators_Benchmarking.sh new file mode 100755 index 0000000..b305000 --- /dev/null +++ b/tests/benchmarking_test/Accelerators_Benchmarking.sh @@ -0,0 +1,311 @@ +#!/bin/sh + +# +# configurable parameters +# in order to change parameter add PARAMETER_NAME=VALUE before running the script. 
+#--------------------------------------------------- + +PPS=${PPS:-"100 1000 10000 50000 125000 500000 1000000 2000000 max"} +#2048 4096 8192 16384 32768 65536"} +M_SIZE=${M_SIZE:-"12 32 64 128 192 256 512 768 1024 1460"} +FD_NUMBER=${FD_NUMBER:-"1 2 5 10 25 50 100 250 500"} +#1000 25000 5000 10000 +LOOP=${LOOP:-"1"} +SOCKPERF=${SOCKPERF:-sockperf} +DURATION=${DURATION:-30} +VMA_SELECT_POLL_VAL=${VMA_SELECT_POLL_VAL:-"-1 0 1000 1000000"} +VMA_RX_POLL_VAL=${VMA_RX_POLL_VAL:-"-1"} +VMA_ENV_FLAGS=${VMA_ENV_FLAGS:-"LD_PRELOAD=libvma.so"} +IOMUX_TYPE=${IOMUX_TYPE:-"s p e"} +SERVER_FLAG=${SERVER_FLAG:-" "} +CLIENT_FLAG=${CLIENT_FLAG:=" "} +TEST=${TEST:-"pp_test tp_test tp_udp_mc_imux_test pp_udp_mc_imux_test pp_tcp_imux_test pp_udp_uc_imux_test"} +#----to do---need to add tcp_imux uc_imux + +#taskset -c 4,5,6 env + +function run_latancy_test +{ +#good for ping pong or under load test +#1 - flags +#2 - env_flags + env ${2} ${SOCKPERF} ${1} ${CLIENT_FLAG} | egrep "Latency|dropped|std-dev|50.00 =|||99.99 =|observations"| awk '{" "}; {if ($3=="Latency") printf "%4.3f ",$5 ; if ($4 ~ "std-dev") printf "%s ",$4; if ($4=="50.00") printf "%4.3f ",$6;if ($4=="99.99") printf "%4.3f ",$6; if ($3=="") printf "%4.3f ", $6 ; if ($3=="dropped") printf "%s %s %s ", $6 , $11 , $16; if ($3=="") printf "%4.3f ", $6 ; if ($9=="observations") printf "%d ", $3}' >> ${OUT_FILE} + echo " " >> ${OUT_FILE} + sleep 1 +} + +function run_throughput_test +{ + #1 - flags + #2 - env_flags + env ${2} ${SOCKPERF} tp ${1} ${CLIENT_FLAG}|egrep "Summary:"| awk '{" "} {if ($3=="Message") printf "%d ",$6 ; if ($3=="BandWidth") printf "%4.3f ",$5}' >> ${OUT_FILE} + sleep 1 +} + +function create_feed_file_uni() +{ + #1 size + #2 ip + #3 port + #4 feed file + port=$1-1 + let port=port+$3 + until [ $port -lt $3 ]; do + echo "$2:$port" >> $4 + let port-=1 + done +} +function create_feed_file_tcp() +{ + #1 size + #2 ip + #3 port + #4 feed file + +# echo "T:$2:$3" >> "$4_sr" + port=$1-1 + let port=port+$3 + until [ 
$port -lt $3 ]; do + echo "T:$2:$3" >> $4 + let port-=1 + done +} + +# different addresses +function create_feed_file_multi() +{ + #1 size + #2 feed file + port=10005 + ip_1=224 + ip_2=4 + ip_3=1 + ip_4=3 + counter=0 + while [ $counter -lt $1 ]; do + echo "$ip_1.$ip_2.$ip_3.$ip_4:$port" >> $2 + let counter=counter+1 + let port=port+1 + let ip_4=ip_4+1 + if [ $ip_4 = 255 ]; then + let ip_3=ip_3+1 + ip_4=3 + fi + done +} + +#create_feed_file_uni 10 17.17.17.10 10005 ${FEED_FILE} +#create_feed_file_tcp 10 17.17.17.10 10005 ${FEED_FILE} +#cat "${FEED_FILE}_sr" +#create_feed_file_multi 1000 ${FEED_FILE} + + + +#--------------------------------------TP-MC-IMUX------------------- +function tp_udp_mc_imux_test() +{ + echo "TP measurement UDP MC FEED_FILE" >> ${OUT_FILE} + echo "VMA_SELECT_POLL Imoux_type Fd_number Message-size PPS Message-rate bandwidth" >> ${OUT_FILE} + + for imoux_type in ${IOMUX_TYPE}; do + for fd_num in ${FD_NUMBER}; do + rm ${FEED_FILE} + create_feed_file_multi ${fd_num} ${FEED_FILE} + scp "${FEED_FILE}" "${SERVER}:${FEED_FILE}" + + for select_poll in ${VMA_SELECT_POLL_VAL}; do + ssh $SERVER pkill -f sockperf + sleep 1 + ssh $SERVER env VMA_RX_POLL="-1" VMA_SELECT_POLL=${select_poll} ${VMA_ENV_FLAGS} ${SOCKPERF} server -f ${FEED_FILE} -F ${imoux_type} ${SERVER_FLAG} & + sleep 5 + for pps_num in ${PPS}; do + for j in ${M_SIZE}; do + echo -n "${select_poll} ${imoux_type} ${fd_num} $j ${pps_num} " >> ${OUT_FILE} + run_throughput_test "-m ${j} --mps ${pps_num} -t ${DURATION} -f ${FEED_FILE} -F ${imoux_type}" "VMA_SELECT_POLL=${select_poll} VMA_RX_POLL=-1 ${VMA_ENV_FLAGS}" + done + done + done + done + rm ${FEED_FILE} + done + echo " " >> ${OUT_FILE} +} + +#--------------------------------------TP--------------------- +function tp_test() +{ + echo "TP measurement UDP MC" >> ${OUT_FILE} + echo "VMA_RX_POLL Message-size PPS Message-rate bandwidth" >> ${OUT_FILE} + + for rx_poll in ${VMA_RX_POLL_VAL}; do + ssh $SERVER pkill -f sockperf + sleep 1 + ssh $SERVER 
env VMA_SELECT_POLL="-1" VMA_RX_POLL=${rx_poll} ${VMA_ENV_FLAGS} ${SOCKPERF} server -i ${SERVER_ADD} ${SERVER_FLAG} & + sleep 5 + for pps_num in ${PPS}; do + for j in ${M_SIZE}; do + echo -n "${rx_poll} $j ${pps_num} " >> ${OUT_FILE} + run_throughput_test "-m ${j} --mps ${pps_num} -t ${DURATION} -i ${SERVER_ADD}" "VMA_SELECT_POLL=-1 VMA_RX_POLL=${rx_poll} ${VMA_ENV_FLAGS}" + done + done + done + echo " " >> ${OUT_FILE} +} + + +#--------------------------------------PP-MC-IMUX------------------- +function pp_udp_mc_imux_test() +{ + echo "Latency Ping-pong measurement UDP MC FEED_FILE" >> ${OUT_FILE} + echo "VMA_SELECT_POLL Imoux_type Fd_number Message-size PPS std-dev dropped-messages duplicated-messages out-of-order-messages Average_Latency Total_observations Max_Latency 99%_percentile 50%_percentile Min_Latency" >> ${OUT_FILE} + + for imoux_type in ${IOMUX_TYPE}; do + for fd_num in ${FD_NUMBER}; do + rm ${FEED_FILE} + create_feed_file_multi ${fd_num} ${FEED_FILE} + scp "${FEED_FILE}" "${SERVER}:${FEED_FILE}" + + for select_poll in ${VMA_SELECT_POLL_VAL}; do + ssh $SERVER pkill -f sockperf + sleep 1 + ssh $SERVER env VMA_RX_POLL="-1" VMA_SELECT_POLL=${select_poll} ${VMA_ENV_FLAGS} ${SOCKPERF} server -f ${FEED_FILE} -F ${imoux_type} ${SERVER_FLAG} & + sleep 5 + for pps_num in ${PPS}; do + for j in ${M_SIZE}; do + echo -n "${select_poll} ${imoux_type} ${fd_num} $j ${pps_num} " >> ${OUT_FILE} + run_latancy_test "pp -m ${j} --mps ${pps_num} -t ${DURATION} -f ${FEED_FILE} -F ${imoux_type}" "VMA_SELECT_POLL=${select_poll} VMA_RX_POLL=-1 ${VMA_ENV_FLAGS}" + done + done + done + done + rm ${FEED_FILE} + done + echo " " >> ${OUT_FILE} +} + +#--------------------------------------PP--------------------- +function pp_test() +{ + echo "Latency Ping-pong measurement UDP" >> ${OUT_FILE} + echo "VMA_RX_POLL Message-size PPS std-dev dropped-messages duplicated-messages out-of-order-messages Average_Latency Total_observations Max_Latency 99%_percentile 50%_percentile Min_Latency" 
>> ${OUT_FILE} + + for rx_poll in ${VMA_RX_POLL_VAL}; do + ssh $SERVER pkill -f sockperf + sleep 1 + ssh $SERVER env VMA_SELECT_POLL="-1" VMA_RX_POLL=${rx_poll} ${VMA_ENV_FLAGS} ${SOCKPERF} server -i ${SERVER_ADD} ${SERVER_FLAG} & + sleep 5 + for pps_num in ${PPS}; do + for j in ${M_SIZE}; do + echo -n "${rx_poll} $j ${pps_num} " >> ${OUT_FILE} + run_latancy_test "pp -m ${j} --mps ${pps_num} -t ${DURATION} -i ${SERVER_ADD}" "VMA_SELECT_POLL=-1 VMA_RX_POLL=${rx_poll} ${VMA_ENV_FLAGS}" + done + done + done + echo " " >> ${OUT_FILE} +} + + +#--------------------------------------PP-TCP-IMUX------------------- +function pp_tcp_imux_test() +{ + echo "Latency Ping-pong measurement TCP FEED_FILE" >> ${OUT_FILE} + echo "VMA_SELECT_POLL Imoux_type Fd_number Message-size PPS std-dev dropped-messages duplicated-messages out-of-order-messages Average_Latency Total_observations Max_Latency 99%_percentile 50%_percentile Min_Latency" >> ${OUT_FILE} + + for imoux_type in ${IOMUX_TYPE}; do + for fd_num in ${FD_NUMBER}; do + rm ${FEED_FILE} + create_feed_file_tcp ${fd_num} ${SERVER_ADD} 10005 ${FEED_FILE} + scp "${FEED_FILE}" "${SERVER}:${FEED_FILE}" + + for select_poll in ${VMA_SELECT_POLL_VAL}; do + ssh $SERVER pkill -f sockperf + sleep 1 + ssh $SERVER env VMA_RX_POLL="-1" VMA_SELECT_POLL=${select_poll} ${VMA_ENV_FLAGS} ${SOCKPERF} server -f ${FEED_FILE} -F ${imoux_type} ${SERVER_FLAG} & + sleep 5 + for pps_num in ${PPS}; do + for j in ${M_SIZE}; do + echo -n "${select_poll} ${imoux_type} ${fd_num} $j ${pps_num} " >> ${OUT_FILE} + run_latancy_test "pp -m ${j} --mps ${pps_num} -t ${DURATION} -f ${FEED_FILE} -F ${imoux_type}" "VMA_SELECT_POLL=${select_poll} VMA_RX_POLL=-1 ${VMA_ENV_FLAGS}" + done + done + done + done + rm ${FEED_FILE} + done + echo " " >> ${OUT_FILE} +} + + +#--------------------------------------PP-UDP-UC-IMUX------------------- +function pp_udp_uc_imux_test() +{ + echo "Latency Ping-pong measurement UDP UC FEED_FILE" >> ${OUT_FILE} + echo "VMA_SELECT_POLL 
Imoux_type Fd_number Message-size PPS std-dev dropped-messages duplicated-messages out-of-order-messages Average_Latency Total_observations Max_Latency 99%_percentile 50%_percentile Min_Latency" >> ${OUT_FILE} + + for imoux_type in ${IOMUX_TYPE}; do + for fd_num in ${FD_NUMBER}; do + rm ${FEED_FILE} + create_feed_file_uni ${fd_num} ${SERVER_ADD} 10005 ${FEED_FILE} + scp "${FEED_FILE}" "${SERVER}:${FEED_FILE}" + + for select_poll in ${VMA_SELECT_POLL_VAL}; do + ssh $SERVER pkill -f sockperf + sleep 1 + ssh $SERVER env VMA_RX_POLL="-1" VMA_SELECT_POLL=${select_poll} ${VMA_ENV_FLAGS} ${SOCKPERF} server -f ${FEED_FILE} -F ${imoux_type} ${SERVER_FLAG} & + sleep 5 + for pps_num in ${PPS}; do + for j in ${M_SIZE}; do + echo -n "${select_poll} ${imoux_type} ${fd_num} $j ${pps_num} " >> ${OUT_FILE} + run_latancy_test "pp -m ${j} --mps ${pps_num} -t ${DURATION} -f ${FEED_FILE} -F ${imoux_type}" "VMA_SELECT_POLL=${select_poll} VMA_RX_POLL=-1 ${VMA_ENV_FLAGS}" + done + done + done + done + rm ${FEED_FILE} + done + echo " " >> ${OUT_FILE} +} + +#-----------------------------------main----------------------------------------------- + +echo "" +echo "Usahge: $0 " +echo "" +echo "to change script parameter write: >> PARAMETER_NAME=VALUE $0" +echo "" +echo "chopse test - TEST=test_name $0 ..." 
+for start_test in ${TEST}; do + echo $start_test +done +echo "" +echo "script parameter:" +echo "PPS - value: ${PPS}" +echo "M_SIZE - value: ${M_SIZE}" +echo "FD_NUMBER - value: ${FD_NUMBER}" +echo "LOOP - value: ${LOOP}" +echo "SOCKPERF - value: ${SOCKPERF}" +echo "DURATION - value: ${DURATION}" +echo "VMA_SELECT_POLL_VAL - value: ${VMA_SELECT_POLL_VAL}" +echo "SERVER_FLAG - value: ${SERVER_FLAG}" +echo "CLIENT_FLAG - value: ${CLIENT_FLAG}" +echo "VMA_RX_POLL_VAL - value: ${VMA_RX_POLL_VAL}" +echo "VMA_ENV_FLAGS - value: ${VMA_ENV_FLAGS}" +echo "IOMUX_TYPE - value: ${IOMUX_TYPE}" + +if [ $# -ne 3 ]; then + exit +else + #echo "config print all configurable parameters" + echo "output will be print to ${3}" +fi + +SERVER=${1} +SERVER_ADD=${2} +OUT_FILE=${3} +FEED_FILE="/tmp/feed_file" + +for start_test in ${TEST}; do + $start_test +done + +exit diff --git a/tests/bindtodevice/client.py b/tests/bindtodevice/client.py new file mode 100755 index 0000000..cb9e814 --- /dev/null +++ b/tests/bindtodevice/client.py @@ -0,0 +1,57 @@ +#!/usr/bin/env python +# +# +#@copyright: +# Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. 
+# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +#@author: Alex Rosenbaum + +#@date: 18May2016 +# +# +# +# +import socket, time, sys + +if (len(sys.argv) <3): + print "In correct parameter : " + sys.argv[0] + " dst_ip_address dst_port src_ifname" + sys.exit(-1) + + +print "UDP target IP:port=<", sys.argv[1], ":", sys.argv[2], ">" +print "ifname:", sys.argv[3] + +sock=socket.socket(socket.AF_INET, socket.SOCK_DGRAM) +sock.setblocking(0) +sock.setsockopt(socket.SOL_SOCKET, 25, sys.argv[3]+'\0') # SO_BINDTODEVICE +sock.sendto("HELLO WORLD", (sys.argv[1], int(sys.argv[2]))) +time.sleep(1) +sock.close() diff --git a/tests/connect-disconnect/client.py b/tests/connect-disconnect/client.py new file mode 100755 index 0000000..03f9d08 --- /dev/null +++ b/tests/connect-disconnect/client.py @@ -0,0 +1,110 @@ +#!/usr/bin/env python +# +# +#@copyright: +# Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. 
+# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +#@author: Avner BenHanoch + +#@date: 31Mar2015 +# +# This script performs non blocking connect (and disconnect) to a given TCP server +# It can serve for checking [latency] effect of connect/disconnect on other client (sockperf) +# Also it can be used directly with [sockperf] server by adding debug calls to LogDuration in vma code +# (note: most effect is expected by 1st packet from a machine; hence, feel free to decrease num sockets) +# +# +# +import socket, select, os, time, sys +import datetime as dt +NUM_SOCKETS=70 +DURATION=10 # seconds + +if (len(sys.argv) <3): + print "In correct parameter : " + sys.argv[0] + " IP_Address port" + sys.exit(-1) + +sock_map = {} +for x in range(0, NUM_SOCKETS-1): + sock=socket.socket(socket.AF_INET, socket.SOCK_STREAM) + sock.setblocking(0) + sock_map[sock.fileno()] = sock + + +print "starting connect..." 
+n1=dt.datetime.now() +for sock in sock_map.itervalues(): + err = sock.connect_ex((sys.argv[1], int(sys.argv[2]))) + if err != 115 and err != 0: + print "error %d"%err + sys.exit (1) +n2=dt.datetime.now() +print "connect loop took ", (n2-n1).microseconds, "usec" + +n1=dt.datetime.now() +epoll = select.epoll(NUM_SOCKETS) + +for sock_fd in sock_map.iterkeys(): + epoll.register(sock_fd, select.EPOLLOUT | select.EPOLLET) + +counter = 0 +while True: + events = epoll.poll(1) + + for sock_fd, event in events: + err = sock_map[sock_fd].getsockopt(socket.SOL_SOCKET, socket.SO_ERROR) + if err != 0: + print "error %d"%err + sys.exit (1) + + epoll.unregister(sock_fd) + counter += 1 + + if (counter >= NUM_SOCKETS-1): + break +n2=dt.datetime.now() +delta=(n2-n1).seconds +print "connection established successfully within %d seconds (num sockets = %d)" % (delta, counter+1) + +epoll.close() + +left = DURATION - delta +print " >> sleeping for %d more seconds..." % left +time.sleep(left) +print "after sleep" + +print "starting disconnect..." +n1=dt.datetime.now() +for sock in sock_map.itervalues(): + sock.close() +n2=dt.datetime.now() +print "disconnect loop took ", (n2-n1).microseconds, "usec" +print "Done..." 
diff --git a/tests/extra_api_tests/socketxtreme/server_tcp.c b/tests/extra_api_tests/socketxtreme/server_tcp.c new file mode 100644 index 0000000..0405999 --- /dev/null +++ b/tests/extra_api_tests/socketxtreme/server_tcp.c @@ -0,0 +1,365 @@ +/* server_tcp.c + * + * build: + * epoll: gcc server_tcp.c -o server_tcp.out -DVMA_DEV="ens1f0" -DVMA_API=0 -I/usr/include + * xtreme: gcc server_tcp.c -o server_tcp.out -DVMA_DEV="ens1f0" -DVMA_API=1 -I + * + * usage: + * epoll: sudo server_tcp.out 1.1.3.15:17000 + * socketxtreme: sudo env LD_PRELOAD=libvma.so server_tcp.out 1.1.3.15:17000 + */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* mlock */ +#include +#include +#include +#include + +#if defined(VMA_API) && (VMA_API == 1) +#include +#endif /* VMA_API */ + +/* Bind to device */ +#if !defined(VMA_DEV) +#define IB_DEV "ens3f1" +#else +#define QUOTE(name) #name +#define STR(macro) QUOTE(macro) +#define IB_DEV STR(VMA_DEV) +#endif + +/* Number of listeners */ +#define SFD_NUM 2 + +/* Number of peers */ +#define FD_NUM 10 + +#define EXIT_FAILURE 1 + +#if defined(VMA_API) && (VMA_API == 1) +static struct vma_api_t *_vma_api = NULL; +static int _vma_ring_fd = -1; +#endif /* VMA_API */ + +static volatile int _done = 0; + +static inline char *_addr2str(struct sockaddr_in *addr) { + static __thread char addrbuf[100]; + inet_ntop(AF_INET, &addr->sin_addr, addrbuf, sizeof(addrbuf)); + sprintf(addrbuf, "%s:%d", addrbuf, ntohs(addr->sin_port)); + + return addrbuf; +} + +static void _proc_signal(int signal_id) +{ + _done = signal_id; +} + +static int _set_noblock(int fd) +{ + int rc = 0; + int flag; + + flag = fcntl(fd, F_GETFL); + if (flag < 0) { + rc = -errno; + printf("failed to get socket flags %s\n", strerror(errno)); + } + flag |= O_NONBLOCK; + rc = fcntl(fd, F_SETFL, flag); + if (rc < 0) { + rc = -errno; + printf("failed to set socket flags %s\n", 
strerror(errno)); + } + + return rc; +} + +static int _tcp_create_and_bind(struct sockaddr_in *addr) +{ + int rc = 0; + int fd; + int flag; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + if (!fd) { + rc = -EBUSY; + printf("Failed to create socket\n"); + goto err; + } + +#if defined(IB_DEV) + rc = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, (void *)IB_DEV, strlen(IB_DEV)); + if (rc < 0) { + printf("Failed to setsockopt(SO_BINDTODEVICE) for %s: %s\n", IB_DEV, strerror(errno)); + exit(1); + } +#endif + + flag = 1; + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &flag, sizeof(int)); + if (rc < 0) { + printf("Failed to setsockopt(SO_REUSEADDR): %s\n", strerror(errno)); + goto err; + } + + rc = bind(fd, (struct sockaddr *) addr, sizeof(*addr)); + if (rc < 0) { + rc = -EBUSY; + printf("Failed to bind socket\n"); + goto err; + } + + listen(fd, SOMAXCONN); + + printf("Listen : fd=%d %s\n", fd, _addr2str((struct sockaddr_in *)addr)); + +err: + return (rc == 0 ? fd : (-1)); +} + +int main(int argc, char *argv[]) +{ + struct sigaction sa; + int ret = 0; + int efd; + int sfd[SFD_NUM]; + int fd = -1; + int max_events = 0; + int max_sfd = 0; + struct epoll_event ev; + uint64_t event; + struct epoll_event *events = NULL; + struct conn_info { + int *fds; + int count; + char msg[1024]; + } conns; +#if defined(VMA_API) && (VMA_API == 1) + struct vma_completion_t *vma_comps; +#endif /* VMA_API */ + int flag; + struct sockaddr_in addr; + struct sockaddr in_addr; + socklen_t in_len; + int i = 0; + int j = 0; + + /* catch SIGINT to exit */ + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = _proc_signal; + sa.sa_flags = 0; + sigemptyset(&(sa.sa_mask)); + if (sigaction(SIGINT, &sa, NULL) != 0) { + perror("Failed to create signal handler"); + exit(EXIT_FAILURE); + } + + /* Step:1 Initialize VMA API */ +#if defined(VMA_API) && (VMA_API == 1) + _vma_api = vma_get_api(); + if (_vma_api == NULL) { + printf("VMA Extra API not found\n"); + } +#endif /* VMA_API */ + + max_events = 
FD_NUM + sizeof(sfd) / sizeof(sfd[0]); + + conns.count = 0; + conns.fds = calloc(max_events, sizeof(*conns.fds)); + assert(conns.fds); + +#if defined(VMA_API) && (VMA_API == 1) + vma_comps = calloc(max_events, sizeof(*vma_comps)); + assert(vma_comps); +#else + efd = epoll_create1(0); + assert(efd >= 0); + + events = calloc(max_events, sizeof(*events)); + assert(events); +#endif /* VMA_API */ + + printf("Launching mode...\n"); + + /* Step:2 Create listen socket */ + for (i = 0; (i < SFD_NUM) && (argc > (i + 1)); i++) { + char *optarg = argv[i + 1]; + char *token1 = NULL; + char *token2 = NULL; + const char s[2] = ":"; + + token1 = strtok(optarg, s); + token2 = strtok(NULL, s); + + memset(&addr, 0, sizeof(addr)); + addr.sin_family = PF_INET; + addr.sin_addr.s_addr = inet_addr(token1); + addr.sin_port = htons(atoi(token2)); + sfd[i] = _tcp_create_and_bind(&addr); + if (sfd[i] < 0) { + perror("Failed to create socket"); + exit(EXIT_FAILURE); + } + max_sfd++; + } + + /* Step:3 Need to get ring or set listen socket */ +#if defined(VMA_API) && (VMA_API == 1) + if (_vma_ring_fd < 0) { + _vma_api->get_socket_rings_fds(sfd[0], &_vma_ring_fd, 1); + assert((-1) != _vma_ring_fd); + } +#else + for (i = 0; i < max_sfd; i++) { + ev.events = EPOLLIN; + ev.data.fd = sfd[i]; + if (epoll_ctl(efd, EPOLL_CTL_ADD, sfd[i], &ev) == -1) { + perror("epoll_ctl() failed"); + exit(EXIT_FAILURE); + } + } +#endif /* VMA_API */ + + while (!_done) { + int n = 0; + + /* Step:4 Get events */ +#if defined(VMA_API) && (VMA_API == 1) + while (0 == n) { + n = _vma_api->socketxtreme_poll(_vma_ring_fd, vma_comps, max_events, 0); + } +#else + n = epoll_wait(efd, events, max_events, 0); +#endif /* VMA_API */ + for (j = 0; j < n; j++) { + +#if defined(VMA_API) && (VMA_API == 1) + event = vma_comps[j].events; + event |= ( event & VMA_SOCKETXTREME_PACKET ? EPOLLIN : 0); + fd = (event & VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED ? 
vma_comps[j].listen_fd : vma_comps[j].user_data); +#else + event = events[j].events; + fd = events[j].data.fd; +#endif /* VMA_API */ + + if ((event & EPOLLERR) || (event & EPOLLHUP) || (event & EPOLLRDHUP)) { + printf("epoll error\n"); + exit(EXIT_FAILURE); + } + + /* Step:5 Accept connections */ + for (i = 0; i < max_sfd; i++) { + if (fd == sfd[i]) break; + } + if (i < max_sfd) { + in_len = sizeof(in_addr); +#if defined(VMA_API) && (VMA_API == 1) + fd = vma_comps[j].user_data; + memcpy(&in_addr, &vma_comps[j].src, in_len); +#else + fd = accept(fd, &in_addr, &in_len); + if (fd < 0) { + printf("Accept failed: %s", strerror(errno)); + exit(EXIT_FAILURE); + } + ev.events = EPOLLIN | EPOLLET; + ev.data.fd = fd; + if (epoll_ctl(efd, EPOLL_CTL_ADD, fd, &ev) == -1) { + printf("epoll_ctl() failed: %s", strerror(errno)); + exit(EXIT_FAILURE); + } +#endif /* VMA_API */ + + conns.fds[conns.count] = fd; + conns.count++; + + printf("Accepted: #%d by sfd=%d fd=%d from %s\n", conns.count, sfd[i], fd, _addr2str((struct sockaddr_in *)&in_addr)); + + flag = 1; + ret = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); + if (ret < 0) { + printf("Failed to disable NAGLE: %s\n", strerror(errno)); + exit(EXIT_FAILURE); + } + + ret = _set_noblock(fd); + continue; + } + + /* Step:6 Process data */ + if (event & EPOLLIN) { +#if defined(VMA_API) && (VMA_API == 1) + printf("vma_comps[j].packet.num_bufs equal to %d \n", vma_comps[j].packet.num_bufs); + assert(1 == vma_comps[j].packet.num_bufs); + assert(sizeof(conns.msg) > vma_comps[j].packet.total_len); + memcpy(conns.msg, vma_comps[j].packet.buff_lst->payload, vma_comps[j].packet.total_len); + ret = vma_comps[j].packet.total_len; + _vma_api->socketxtreme_free_vma_packets(&vma_comps[j].packet, 1); +#else + ret = recv(fd, conns.msg, sizeof(conns.msg), 0); +#endif /* VMA_API */ + if (ret < 0) { + exit(EXIT_FAILURE); + } + if (ret > 0) { + conns.msg[ret - 1] = '\0'; + } else { + conns.msg[0] = '\0'; + } + printf("Received: 
fd=%d ret=%d %s\n", fd, ret, conns.msg); + } + } + } + +err: + + for (i = 0; i < max_sfd; i++) { + if (sfd[i] > 0) { + close(sfd[i]); + } + } + + for (i = 0; i < conns.count; i++) { + if (conns.fds[i] > 0) { +#if defined(VMA_API) && (VMA_API == 1) +#else + epoll_ctl(efd, EPOLL_CTL_DEL, conns.fds[i], NULL); +#endif /* VMA_API */ + close(conns.fds[i]); + } + } + if (conns.fds) { + free(conns.fds); + } + +#if defined(VMA_API) && (VMA_API == 1) + if (vma_comps) { + free(vma_comps); + } +#else + if (events) { + free(events); + } +#endif /* VMA_API */ + + close(efd); + + exit(0); +} diff --git a/tests/extra_api_tests/tcp_zcopy_cb/README b/tests/extra_api_tests/tcp_zcopy_cb/README new file mode 100755 index 0000000..06eddad --- /dev/null +++ b/tests/extra_api_tests/tcp_zcopy_cb/README @@ -0,0 +1,27 @@ +Test Name: + extra_APIs_callback + +Author(s): + Ahmad Shanably ashanably@asaltech.com + +Short description: + This is a client server test built to test callback functionality + of VMA extra API for TCP connection. + +Supported OSes: + Linux + +Description: + This test used to filter received packet using user defined function that is registered + by extra vma APIs (register_recv_callback()) and then choose one of following action: + - receive packet using general receive functions. + - free buffer that contain previously held packet, and then hold new received packet + - drop received packet + + This test uses TCP connection that have empty queue and handle received packet from application thread + +Known issues: + None. 
+ +To do: + None \ No newline at end of file diff --git a/tests/extra_api_tests/tcp_zcopy_cb/client.c b/tests/extra_api_tests/tcp_zcopy_cb/client.c new file mode 100755 index 0000000..33255c9 --- /dev/null +++ b/tests/extra_api_tests/tcp_zcopy_cb/client.c @@ -0,0 +1,138 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" + +extern struct config_t config; + +int client_management(int *mngSocket); +int send_data(int testSock, int mngSock); + +int client_main(){ + int rc; + int result = -1; + int testSock = INVALID_SOCKET; + int mngSock = INVALID_SOCKET; + struct sockaddr_in serverAddr; + + printf("Enter Function client_main\n"); + + rc = client_management(&mngSock); + CHECK_VALUE("client_management", rc, 0, goto cleanup); + + /* open client socket */ + testSock = socket(AF_INET, SOCK_STREAM, 0); + CHECK_NOT_EQUAL("socket", testSock, INVALID_SOCKET, goto cleanup); + + /* Prepare server information (family, port and address) */ + serverAddr.sin_family = AF_INET; + serverAddr.sin_port = htons(config.port); + serverAddr.sin_addr.s_addr = inet_addr(config.sip); + + /* Sync other side is listen*/ + rc = sync_side(mngSock, 0); + CHECK_VALUE("sync_side", rc, 0, goto cleanup); + + rc = connect(testSock, (struct sockaddr *) &serverAddr, sizeof(serverAddr)); + CHECK_VALUE("connect", rc, 0, goto cleanup); + + rc = send_data(testSock, mngSock); + CHECK_VALUE("send_data", rc, 0, goto cleanup); + + /* sync for termination */ + rc = sync_side(mngSock, 0); + CHECK_VALUE("sync_side", rc, 0, goto cleanup); + + result = 0; + cleanup: + if(testSock != INVALID_SOCKET){ + close(testSock); + CHECK_VALUE("close", rc, 0, result = -1); + } + if(mngSock != INVALID_SOCKET){ + close(mngSock); + CHECK_VALUE("close", rc, 0, result = -1); + } + return result; +} + + +int client_management( + int *mngSocket){ + int rc; + int result = -1; + struct sockaddr_in servAddr; + struct hostent *server; + + printf("Enter 
Function client_management\n"); + + /* Generate a socket */ + *mngSocket = socket(AF_INET, SOCK_STREAM, 0); + CHECK_NOT_EQUAL("socket", *mngSocket, INVALID_SOCKET, goto cleanup); + + server = gethostbyname(config.mngip); + CHECK_NOT_EQUAL("gethostbyname", server, NULL, goto cleanup); + + bzero((char *) &servAddr, sizeof(servAddr)); + servAddr.sin_family = AF_INET; + servAddr.sin_addr.s_addr = inet_addr(config.mngip); + servAddr.sin_port = htons(config.port + 15); + + rc = connect(*mngSocket,(struct sockaddr *) &servAddr,sizeof(servAddr)); + CHECK_VALUE("connect", rc, 0, goto cleanup); + + printf("Client Connects to host %s pport %d\n",config.mngip, config.port + 15); + + result = 0; + cleanup: + return result; +} + +/** + *Send data using TCP socket. + * + * Params: + * *sock : File descriptor represent test socket + * mngSock : File descriptor used for management += * Returns: + * These calls return 0, or -1 if an error occurred. + **/ +int send_data(int testSock, int mngSock){ + + int result = -1; + int rc; + char* message; + + printf("Enter Function send_data\n"); + + rc = sync_side(mngSock, 0); + CHECK_VALUE("sync_side", rc, 0, goto cleanup); + + if (config.callbackReturn == RECV) { + message = "recv"; + } + else if (config.callbackReturn == HOLD){ + message = "hold"; + } + else { + message = "drop"; + } + + rc = send(testSock, message, 20, 0); + CHECK_NOT_EQUAL("send", rc, -1, goto cleanup); + CHECK_NOT_EQUAL("send", rc, 0, goto cleanup); + + result = 0; +cleanup: + return result; +} + diff --git a/tests/extra_api_tests/tcp_zcopy_cb/client.h b/tests/extra_api_tests/tcp_zcopy_cb/client.h new file mode 100755 index 0000000..05dbce2 --- /dev/null +++ b/tests/extra_api_tests/tcp_zcopy_cb/client.h @@ -0,0 +1,6 @@ +#ifndef _CLIENT_H_ +#define _CLIENT_H_ + +int client_main(); + +#endif diff --git a/tests/extra_api_tests/tcp_zcopy_cb/main.c b/tests/extra_api_tests/tcp_zcopy_cb/main.c new file mode 100755 index 0000000..27746c6 --- /dev/null +++ 
b/tests/extra_api_tests/tcp_zcopy_cb/main.c @@ -0,0 +1,120 @@ +#include +#include +#include + +#include "types.h" +#include "server.h" +#include "client.h" + +int process_arg(char *argv[]); +void print_config(void); + +struct config_t config = { + 0, /* Is server */ + "0", /* Server's IP */ + "0", /* Management IP */ + 5000, /* Server's port num */ + 0, /* Using NONBlocking FDs*/ + 1, /* Bind Reusable Addres */ + RECV /* Callback return operation to applied on packet */ +}; + +/**************************************** + *Function: main * + ****************************************/ +int main(int argc, char *argv[]) +{ + int test_result = 1; + int rc; + + if (argc < 6 || process_arg(argv)) { + printf("usage: Incorrect parameter \n" + "%s \n", argv[0]); + return -1; + } + + print_config(); + + if (config.server) { + rc = server_main(); + CHECK_VALUE("server_main", rc, 0, goto cleanup); + } + else { + rc = client_main(); + CHECK_VALUE("client_main", rc, 0, goto cleanup); + } + + test_result = 0; + + cleanup: + if(!test_result) + printf("Test pass\n"); + else + printf("Test Fail\n"); + + return test_result; +} + +/* Fill entered arguments in config_t variable */ +int process_arg(char *argv[]) { + + if(strcmp(argv[1], "SERVER") == 0){ + config.server = 1; + } + else if(strcmp(argv[1], "CLIENT") == 0){ + config.server = 0; + } + else { + printf("unknown application type %s\n", argv[1]); + return -1; + } + + strcpy(config.sip, argv[2]); + + strcpy(config.mngip, argv[3]); + + if(strcmp(argv[4], "NONBLOCKING") == 0){ + config.nonBlocking = 1; + } + else if(strcmp(argv[4], "BLOCKING") == 0){ + config.nonBlocking = 0; + } + else { + printf("unknown blocking type %s\n", argv[5]); + return -1; + } + + if(strcmp(argv[5], "RECV") == 0){ + config.callbackReturn = RECV; + } + else if(strcmp(argv[5], "HOLD") == 0){ + config.callbackReturn = HOLD; + } + else if(strcmp(argv[5], "DROP") == 0){ + config.callbackReturn = DROP; + } + else { + printf("unknown return operation %s\n", 
argv[6]); + return -1; + } + + return 0; +} + +/**************************************** + *Function: print_config * + ****************************************/ +void print_config(void) +{ + printf("-----------------------------------------\n"); + printf("Is Server: %s\n", config.server ? "YES" : "NO"); + printf("Server IP %s\n", config.sip); + printf("Management IP: %s\n", config.mngip); + printf("Port Number: %d\n", config.port); + printf("-----------------------------------------\n"); +} + + + + + diff --git a/tests/extra_api_tests/tcp_zcopy_cb/server.c b/tests/extra_api_tests/tcp_zcopy_cb/server.c new file mode 100755 index 0000000..ec01d8e --- /dev/null +++ b/tests/extra_api_tests/tcp_zcopy_cb/server.c @@ -0,0 +1,325 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "types.h" + +#define BUFFER_SIZE 1024 +#define TIMEOUT 5 + +extern struct config_t config; + +typedef vma_recv_callback_retval_t (*vma_recv_callback_t)(int fd, size_t sz_iov, struct iovec iov[], + struct vma_info_t* vma_info, void *context); +vma_recv_callback_retval_t myapp_vma_recv_pkt_notify_callback( + int fd, + size_t iov_sz, + struct iovec iov[], + struct vma_info_t* vma_info, + void *context); +void free_packet(void* packet_id, int fd); +int server_management(int *mangSocket); +int get_sock_fd(int *sock, int mangSock); +int receive_data(int *sock, int mangSock); +struct vma_api_t *vma_api = NULL; + +int server_main(){ + int sock = INVALID_SOCKET; + int mangSock = INVALID_SOCKET; + int result = -1; + int rc; + struct timeval timeout; + struct pending_packet_t pending_packet; + + printf("Enter Function server_main\n"); + + if (config.callbackReturn == HOLD) { + pending_packet.valid = 0; + } + + rc = server_management(&mangSock); + CHECK_VALUE("server_management", rc, 0, goto cleanup); + + rc = get_sock_fd(&sock, mangSock); + CHECK_VALUE("get_sock_fd", rc, 0, goto cleanup); + + if(config.nonBlocking){ + rc = 
make_socket_non_blocking(sock); + CHECK_VALUE("make_socket_non_blocking", rc, 0, goto cleanup); + } + else { + timeout.tv_sec = TIMEOUT; + timeout.tv_usec = 0; + + rc = setsockopt (sock, SOL_SOCKET, SO_RCVTIMEO, (char *)&timeout, + sizeof(timeout)); + CHECK_VALUE("setsockopt", rc, 0, goto cleanup); + } + + vma_api = vma_get_api(); + CHECK_NOT_EQUAL("vma_get_api", vma_api, NULL, goto cleanup); + + printf("Server gets VMA APIs\n"); + + rc = vma_api->register_recv_callback(sock, myapp_vma_recv_pkt_notify_callback, &pending_packet); + CHECK_VALUE("register_recv_callback", rc, 0, goto cleanup); + + printf("Callback function registered with VMA\n"); + + rc = receive_data(&sock, mangSock); + CHECK_VALUE("receive_data", rc, 0, goto cleanup); + + if (config.callbackReturn == HOLD) { + CHECK_VALUE("pending_packet.valid", pending_packet.valid, 1, goto cleanup); + free_packet(pending_packet.vma_info->packet_id, sock); + free(pending_packet.vma_info); + } + + /* sync for termination */ + rc = sync_side(mangSock, 1); + CHECK_VALUE("sync_side", rc, 0, goto cleanup); + + result = 0; + cleanup: + if(sock != INVALID_SOCKET){ + rc = close(sock); + CHECK_VALUE("close", rc, 0, result = -1); + } + + if(mangSock != INVALID_SOCKET) { + rc = close(mangSock); + CHECK_VALUE("close", rc, 0, result = -1); + } + return result; +} + +void myapp_processes_packet_func( + struct iovec* iov, + size_t iov_sz, + void* packet_id, + int s){ + printf("Enter Function myapp_processes_packet_func\n"); + /*myapp_processes_packet_func(.....);*/ + + /* Return zero copied packet buffer back to VMA + // Would be better to collect a bunch of buffers and return them all at once + // which will save locks inside VMA + */ + free_packet(packet_id, s); +} + +/** + * Free VMA buffer reserved for given packet + * Params: + * *packet_id : ID of packet to remove + * fd : File descriptor for socket. 
+ **/ +void free_packet(void* packet_id, int fd){ + + struct vma_packet_t* vma_packet; + vma_packet = malloc(sizeof(vma_packet->packet_id)); + vma_packet->packet_id = packet_id; + vma_api->free_packets(fd, vma_packet, 1); + free(vma_packet); +} + +vma_recv_callback_retval_t myapp_vma_recv_pkt_notify_callback( + int fd, + size_t iov_sz, + struct iovec iov[], + struct vma_info_t* vma_info, + void *context) +{ + struct pending_packet_t *p_pending_packet; + + printf("Enter Function myapp_vma_recv_pkt_notify_callback\n"); + + if (strcmp(iov[0].iov_base, "recv") == 0) { + printf("VMA's info struct is not something we recognize so un register the application's callback function\n"); + printf("VMA extra API filtered to VMA_PACKET_RECV\n"); + return VMA_PACKET_RECV; + } + + if (strcmp(iov[0].iov_base, "drop") == 0){ + printf("VMA extra API filtered to VMA_PACKET_DROP\n"); + return VMA_PACKET_DROP; + } + + if (strcmp(iov[0].iov_base, "hold") == 0){ + printf("VMA extra API filtered to VMA_PACKET_HOLD\n"); + + /* In hold case we check pending_packet,free its holding buffer if its valid and then fill it with new packet data, + so each packet will be freed in the next callback */ + p_pending_packet = (struct pending_packet_t *)context; + if (p_pending_packet->valid) + myapp_processes_packet_func(p_pending_packet->iov, p_pending_packet->iovec_size, p_pending_packet->vma_info->packet_id, fd); + memcpy(p_pending_packet->iov, iov, sizeof(struct iovec)*iov_sz); + p_pending_packet->iovec_size = iov_sz; + p_pending_packet->vma_info = malloc(sizeof(struct vma_info_t)); + memcpy (p_pending_packet->vma_info, vma_info, sizeof(struct vma_info_t)); + p_pending_packet->valid = 1; + + return VMA_PACKET_HOLD; + } + printf("VMA extra API filtered to VMA_PACKET_RECV\n"); + + return VMA_PACKET_RECV; +} + + +int server_management( + int *mangSocket){ + int rc; + int result = -1; + int on = 1; + int mainSocket = INVALID_SOCKET; + socklen_t clilen; + struct sockaddr_in servAddr; + struct sockaddr_in 
cliAddr; + + printf("Enter Function server_management\n"); + + mainSocket = socket(AF_INET, SOCK_STREAM, 0); + CHECK_NOT_EQUAL("socket", mainSocket, INVALID_SOCKET, goto cleanup); + + bzero((char *) &servAddr, sizeof(servAddr)); + servAddr.sin_family = AF_INET; + servAddr.sin_addr.s_addr = inet_addr(config.mngip); + servAddr.sin_port = htons(config.port + 15); + + rc = setsockopt(mainSocket, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + CHECK_VALUE("setsockopt", rc, 0, goto cleanup); + + rc = bind(mainSocket, (struct sockaddr *) &servAddr, sizeof(servAddr)); + CHECK_VALUE("bind", rc, 0, goto cleanup); + + listen(mainSocket,1); + CHECK_VALUE("listen", rc, 0, goto cleanup); + clilen = sizeof(cliAddr); + + *mangSocket = accept(mainSocket, (struct sockaddr *) &cliAddr, &clilen); + CHECK_NOT_EQUAL("accept", *mangSocket, INVALID_SOCKET, goto cleanup); + + printf("server Accepting new client\n"); + result = 0; + cleanup: + return result; +} + +/** + *Generate TCP socket, bind it to specific address, listen and accept new connection. + * + * Params: + * *sock : File descriptor represent generated socket + * mangSock : File descriptor used for management + * Returns: + * These calls return 0, or -1 if an error occurred. 
+ **/ +int get_sock_fd(int *sock, int mangSock){ + int rc; + int on = 1; + int result = -1; + int mainSocket = INVALID_SOCKET; + struct sockaddr_in sAddr; + struct sockaddr_in cliAddr; + socklen_t clilen; + + printf("Enter Function get_sock_fd\n"); + + memset(&sAddr, 0, sizeof(sAddr)); + + mainSocket = socket(AF_INET, SOCK_STREAM, 0); + CHECK_NOT_EQUAL("socket", mainSocket, INVALID_SOCKET, goto cleanup); + + /* Set server Address */ + sAddr.sin_family = AF_INET; + sAddr.sin_port = htons(config.port); + sAddr.sin_addr.s_addr = inet_addr(config.sip); + + if(config.reuseAddr){ + rc = setsockopt(mainSocket, SOL_SOCKET, SO_REUSEADDR, &on, sizeof(on)); + CHECK_VALUE("setsockopt", rc, 0, goto cleanup); + } + + /* Bind socket to server address */ + rc = bind(mainSocket, (struct sockaddr *) &sAddr, sizeof(sAddr)); + CHECK_VALUE("bind", rc, 0, goto cleanup); + + memset(&cliAddr, 0, sizeof(cliAddr)); + + rc = listen(mainSocket, 1); + CHECK_VALUE("listen", rc, 0, goto cleanup); + + /* sync to connect from other side */ + rc = sync_side(mangSock, 1); + CHECK_VALUE("sync_side", rc, 0, goto cleanup); + + clilen = sizeof(cliAddr); + + *sock = accept(mainSocket, (struct sockaddr *) &cliAddr, &clilen); + CHECK_NOT_EQUAL("accept", *sock, INVALID_SOCKET, goto cleanup); + + printf("server Accepting new client\n"); + + result = 0; +cleanup: + return result; +} + +/** + *Receive data from given TCP socket fd. + * + * Params: + * *sock : File descriptor represent test socket + * mangSock : File descriptor used for management + * Returns: + * These calls return 0, or -1 if an error occurred. 
+ **/ +int receive_data(int *sock, int mangSock){ + + int result = -1; + int rc; + void* recv_data; + + printf("Enter Function receive_data\n"); + + recv_data = malloc(sizeof(char) * BUFFER_SIZE); + CHECK_NOT_EQUAL("malloc", recv_data, NULL, goto cleanup); + + rc = sync_side(mangSock, 1); + CHECK_VALUE("sync_side", rc, 0, goto cleanup); + + if(config.nonBlocking){ + rc = select_read(sock, TIMEOUT, 0); + if (config.callbackReturn == DROP) { + CHECK_VALUE("select_read", rc, 0, goto cleanup); + } + else { + CHECK_NOT_EQUAL("select_read", rc, 0, goto cleanup); + CHECK_NOT_EQUAL("select_read", rc, -1, goto cleanup); + } + } + + rc = recv(*sock, recv_data, BUFFER_SIZE, 0); + if (config.callbackReturn == RECV) { + CHECK_NOT_EQUAL("recv", rc, -1, goto cleanup); + CHECK_NOT_EQUAL("recv", rc, 0, goto cleanup); + } + else { + CHECK_VALUE("recv", rc, -1, goto cleanup); + CHECK_VALUE("recv", errno, EAGAIN, goto cleanup); + } + + result = 0; +cleanup: + if (recv_data) + free(recv_data); + return result; +} + diff --git a/tests/extra_api_tests/tcp_zcopy_cb/server.h b/tests/extra_api_tests/tcp_zcopy_cb/server.h new file mode 100755 index 0000000..06a0ba3 --- /dev/null +++ b/tests/extra_api_tests/tcp_zcopy_cb/server.h @@ -0,0 +1,6 @@ +#ifndef _SERVER_H_ +#define _SERVER_H_ + +int server_main(); + +#endif diff --git a/tests/extra_api_tests/tcp_zcopy_cb/types.h b/tests/extra_api_tests/tcp_zcopy_cb/types.h new file mode 100755 index 0000000..51f92ad --- /dev/null +++ b/tests/extra_api_tests/tcp_zcopy_cb/types.h @@ -0,0 +1,49 @@ +#ifndef _TYPES_H_ +#define _TYPES_H_ + +#include +#include +#include +#include + +int make_socket_non_blocking (int sfd); +int select_read(int *fd, int sec, int usec); +int sync_side(int sock, int front); + +enum callback_return{ + RECV, + HOLD, + DROP +} ; + +struct __attribute__ ((packed)) config_t { + int server; + char sip[20]; + char mngip[20]; + int port; + int nonBlocking; + int reuseAddr; + enum callback_return callbackReturn; +}; + +struct 
__attribute__ ((packed)) pending_packet_t{ + int valid; + int iovec_size; + struct iovec iov[10]; + struct vma_info_t *vma_info; +}; + +#define INVALID_SOCKET -1 + +#define CHECK_VALUE(verb, act_val, exp_val, cmd) if((exp_val) != (act_val)){ \ + printf("Error in %s, expected value %d, actual value %d\n", \ + (verb), (exp_val), (act_val)); \ + cmd; \ + } + +#define CHECK_NOT_EQUAL(verb, act_val, exp_val, cmd) if((exp_val) == (act_val)){ \ + printf("Error in %s\n", (verb)); \ + cmd; \ + } + +#endif diff --git a/tests/extra_api_tests/tcp_zcopy_cb/utils.c b/tests/extra_api_tests/tcp_zcopy_cb/utils.c new file mode 100755 index 0000000..c49e3c6 --- /dev/null +++ b/tests/extra_api_tests/tcp_zcopy_cb/utils.c @@ -0,0 +1,98 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "types.h" + +extern struct config_t config; + +/**************************************** + * FUNCTION: make_socket_non_blocking * + ****************************************/ +int make_socket_non_blocking ( + int sfd){ + int flags; + int rc; + + printf("Enter Function make_socket_non_blocking\n"); + + flags = fcntl (sfd, F_GETFL, 0); + CHECK_NOT_EQUAL("fcntl F_GETFL", flags, -1, return -1); + + flags |= O_NONBLOCK; + + rc = fcntl (sfd, F_SETFL, flags); + CHECK_NOT_EQUAL("fcntl F_SETFL", rc, -1, return -1); + + return 0; +} + +/**************************************** + * FUNCTION: select_read * + ****************************************/ +int select_read( + int *fd, + int sec, + int usec){ + int result = -1; + int retval; + fd_set rfds; + struct timeval tv; + + printf("Enter Function select_read\n"); + + /* Watch stdin (Passed fd) to see when it has input. */ + FD_ZERO(&rfds); + FD_SET(*fd, &rfds); + + /* Wait up to five seconds. */ + tv.tv_sec = sec; + tv.tv_usec = usec; + + retval = select(*fd + 1, &rfds, NULL, NULL, &tv); + /* Don't rely on the value of tv now! 
*/ + CHECK_NOT_EQUAL("select", retval, -1, goto cleanup); + result = (retval)? retval : 0; + /* If retval 0, No data within specific seconds. */ + cleanup: + return result; +} + +/**************************************** + * FUNCTION: sync_side * + ****************************************/ +int sync_side( + int sock, + int front){ + int rc; + int result = -1; + char data; + + printf("Enter Function sync_side\n"); + + if(front){ + rc = send(sock, &data, 1, 0); + CHECK_NOT_EQUAL("send", rc, 0, goto cleanup); + + rc = recv(sock, &data, 1, 0); + CHECK_NOT_EQUAL("recv", rc, 0, goto cleanup); + } + else{ + rc = recv(sock, &data, 1, 0); + CHECK_VALUE("recv", rc, 1, goto cleanup); + + rc = send(sock, &data, 1, 0); + CHECK_NOT_EQUAL("send", rc, 0, goto cleanup); + } + result = 0; + cleanup: + return result; +} diff --git a/tests/functionality/getsockname_test.c b/tests/functionality/getsockname_test.c new file mode 100644 index 0000000..db2e83c --- /dev/null +++ b/tests/functionality/getsockname_test.c @@ -0,0 +1,194 @@ +#include +#include +#include +#include +#include +#include + +#define BUFLEN 1024 + +void usage(char *prog) +{ + printf("Usage: %s { OPTIONS }\n where options are:\n" + " -s/c client/server (if not specified running as server)\n" + " -p port\n" + " -r remote server address(for the client side)\n" + , prog); + exit(0); +} + +int run_tcp_client(char* remote_server, int remote_port) +{ + int s; + struct sockaddr_in servaddr; + int rc; + char msg[] = "asdfg"; + + if ( (s = ::socket(AF_INET, SOCK_STREAM, 0)) < 0 ) { + perror("socket"); + fprintf(stderr, "Error creating socket.\n"); + } + + int resize = 128; + void* test; + socklen_t test_size; + rc = setsockopt(s, SOL_SOCKET, SO_RCVBUF, &resize, sizeof(int)); + if (rc < 0) + perror("setsockopt"); + + getsockopt(s, SOL_SOCKET, SO_RCVBUF, &test, &test_size); + printf("SO_RECVBUF level=%d SOL_SOCKET level=%d test %d\n", SO_RCVBUF, SOL_SOCKET, (int*)test); + + memset(&servaddr, 0, sizeof(servaddr)); + 
servaddr.sin_family = AF_INET; + servaddr.sin_port = htons(remote_port); + if ( inet_aton(remote_server, &servaddr.sin_addr) <= 0 ) { + printf("ERROR: Invalid remote IP address.\n"); + return -1; + } + + printf("Connecting..\n"); + rc = ::connect(s, (struct sockaddr *) &servaddr, sizeof(servaddr)); + if ( (rc < 0) ) { + printf("ECHOCLNT: Error calling connect()\n"); + perror("connect"); + close(s); + } + + sleep(1); + + for (int i=0; i<5; i++) + { + printf("Sending...\n"); + rc = send(s, msg, 6, 0); + sleep(1); + } + shutdown(s, SHUT_RDWR); + close (s); + + sleep(1); +} + +int run_tcp_server(int server_port) +{ + int i; /* index counter for loop operations */ + int rc; /* system calls return value storage */ + int s; /* socket descriptor */ + int ws; /* new connection's socket descriptor */ + char buf[1024]; /* buffer for incoming data */ + struct sockaddr_in sa, tmp; /* Internet address struct */ + struct sockaddr_in csa; /* client's address struct */ + socklen_t size_csa = sizeof(sockaddr_in); /* size of client's address struct */ + + + + /* initiate machine's Internet address structure */ + /* first clear out the struct, to avoid garbage */ + memset(&sa, 0, sizeof(sa)); + /* Using Internet address family */ + sa.sin_family = AF_INET; + /* copy port number in network byte order */ + sa.sin_port = htons(server_port); + /* we will accept cnnections coming through any IP */ + /* address that belongs to our host, using the */ + /* INADDR_ANY wild-card. */ + sa.sin_addr.s_addr = INADDR_ANY; + /* allocate a free socket */ + /* Internet address family, Stream socket */ + s = socket(AF_INET, SOCK_STREAM, 0); + if (s < 0) { + perror("socket: allocation failed"); + } + + //bind the socket to the newly formed address + rc = bind(s, (struct sockaddr *)&sa, sizeof(sa)); + if (rc) { + perror("bind"); + } + /* ask the system to listen for incoming connections */ + /* to the address we just bound. 
specify that up to */ + /* 5 pending connection requests will be queued by the */ + /* system, if we are not directly awaiting them using */ + /* the accept() system call, when they arrive. */ + rc = listen(s, 1024); + + + /* check there was no error */ + if (rc) { + perror("listen"); + } + + memset(&tmp, 0, sizeof(tmp)); + rc = getsockname(s, (struct sockaddr *)&tmp, &size_csa); + + /* check there was no error */ + if (rc) { + perror("getsockname"); + } + printf("Listening on port %d\n", ntohs(tmp.sin_port)); + + ws = accept(s, (struct sockaddr *)&csa, &size_csa); + + printf("Connected...\n"); + + while ( true ) + { + rc = recv(ws, buf, 1024, 0); + if ( rc < 0 ) { + perror("recv"); + return -1; + } + + printf("Got msg, size=%d\n", rc); + if ( rc == 0 ) { + shutdown(ws, SHUT_RDWR); + close(ws); + printf("Closing %d\n", ws); + return 0; + } + printf("Recieved: %s\n", buf); + } + +} + +int main(int argc, char* argv[]) +{ + char optstring[20] = "r:p:hsc"; + char c; + + char* remote_server; + int port; + bool run_as_server = true; + + while ((c = getopt(argc, argv, optstring)) != -1) + { + switch(c) + { + case 'r': + remote_server = strdup(optarg); + break; + case 'p': + port = atoi(optarg); + break; + case 's': + run_as_server = true; + break; + case 'c': + run_as_server = false; + break; + case 'h': + usage(argv[0]); + break; + default: + usage(argv[0]); + break; + } + } + + if ( run_as_server ) + run_tcp_server(port); + else + run_tcp_client(remote_server, port); + +} + diff --git a/tests/functionality/iomux/1epoll_1socket_twice.c b/tests/functionality/iomux/1epoll_1socket_twice.c new file mode 100644 index 0000000..4b06c89 --- /dev/null +++ b/tests/functionality/iomux/1epoll_1socket_twice.c @@ -0,0 +1,133 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXEVENTS 64 + +static int make_socket_non_blocking(int sfd) +{ + int flags, s; + + flags = fcntl(sfd, F_GETFL, 0); + if (flags == -1) { + 
perror("fcntl"); + return -1; + } + + flags |= O_NONBLOCK; + s = fcntl(sfd, F_SETFL, flags); + if (s == -1) { + perror("fcntl"); + return -1; + } + + return 0; +} + +static int create_and_bind(char *port) +{ + struct addrinfo hints; + struct addrinfo *result, *rp; + int s, sfd; + + memset(&hints, 0, sizeof (struct addrinfo)); + hints.ai_family = AF_UNSPEC; /* Return IPv4 and IPv6 choices */ + hints.ai_socktype = SOCK_STREAM; /* We want a TCP socket */ + hints.ai_flags = AI_PASSIVE; /* All interfaces */ + + s = getaddrinfo(NULL, port, &hints, &result); + if (s != 0) { + fprintf(stderr, "getaddrinfo: %s\n", gai_strerror (s)); + return -1; + } + + for (rp = result; rp != NULL; rp = rp->ai_next) { + sfd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); + if (sfd == -1) + continue; + + s = bind(sfd, rp->ai_addr, rp->ai_addrlen); + if (s == 0) { + /* We managed to bind successfully! */ + break; + } + + close(sfd); + } + + if (rp == NULL) { + fprintf(stderr, "Could not bind\n"); + return -1; + } + + freeaddrinfo(result); + + return sfd; +} + +int main(int argc, char *argv[]) +{ + int sfd, s; + int efd; + struct epoll_event event; + struct epoll_event *events; + + sfd = create_and_bind((char*)"6666"); + if (sfd == -1) + goto failure; + + printf("--> create socket %d\n", sfd); + + s = make_socket_non_blocking(sfd); + if (s == -1) + goto failure; + + printf("--> set socket %d non blocking\n", sfd); + + s = listen(sfd, SOMAXCONN); + if (s == -1) { + perror("--> listen"); + goto failure; + } + + efd = epoll_create1(0); + if (efd == -1) { + perror("--> epoll_create"); + goto failure; + } + + printf("--> create epoll %d\n",efd); + + event.data.fd = sfd; + event.events = EPOLLIN | EPOLLET; + s = epoll_ctl(efd, EPOLL_CTL_ADD, sfd, &event); + if (s == -1) { + perror("--> epoll_ctl"); + goto failure; + } + + printf("--> socket %d was registered to epoll %d\n", sfd, efd); + + s = epoll_ctl(efd, EPOLL_CTL_ADD, sfd, &event); + if (s == -1) { + if (errno == EEXIST) { + 
printf("--> socket %d was already registered to epoll %d, errno = %d\n", sfd, efd, errno); + printf("--> SUCCESS\n"); + return EXIT_SUCCESS; + } else { + printf("--> socket %d was already registered to epoll %d, errno should be set to EEXIST, errno = %d\n", sfd, efd, errno); + goto failure; + } + } + +failure: + printf("--> FAILURE\n"); + return EXIT_FAILURE; +} diff --git a/tests/functionality/iomux/2epoll_1socket.c b/tests/functionality/iomux/2epoll_1socket.c new file mode 100644 index 0000000..a2149c2 --- /dev/null +++ b/tests/functionality/iomux/2epoll_1socket.c @@ -0,0 +1,170 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define MAXEVENTS 64 + +static int make_socket_non_blocking(int sfd) +{ + int flags, s; + + flags = fcntl(sfd, F_GETFL, 0); + if (flags == -1) { + perror("fcntl"); + return -1; + } + + flags |= O_NONBLOCK; + s = fcntl(sfd, F_SETFL, flags); + if (s == -1) { + perror("fcntl"); + return -1; + } + + return 0; +} + +static int create_and_bind(char *port) +{ + struct addrinfo hints; + struct addrinfo *result, *rp; + int s, sfd; + + memset(&hints, 0, sizeof (struct addrinfo)); + hints.ai_family = AF_UNSPEC; /* Return IPv4 and IPv6 choices */ + hints.ai_socktype = SOCK_STREAM; /* We want a TCP socket */ + hints.ai_flags = AI_PASSIVE; /* All interfaces */ + + s = getaddrinfo(NULL, port, &hints, &result); + if (s != 0) { + fprintf(stderr, "getaddrinfo: %s\n", gai_strerror (s)); + return -1; + } + + for (rp = result; rp != NULL; rp = rp->ai_next) + { + sfd = socket(rp->ai_family, rp->ai_socktype, rp->ai_protocol); + if (sfd == -1) + continue; + + s = bind(sfd, rp->ai_addr, rp->ai_addrlen); + if (s == 0) { + /* We managed to bind successfully! 
*/ + break; + } + + close(sfd); + } + + if (rp == NULL) { + fprintf(stderr, "Could not bind\n"); + return -1; + } + + freeaddrinfo(result); + + return sfd; +} + +int main(int argc, char *argv[]) +{ + int sfd, s, with_vma; + int efd, efd2; + struct epoll_event event; + struct epoll_event *events; + + if (argc != 2) { + printf("--> Usage: ./2epoll_1socket \n"); + printf("--> With VMA run ./2epoll_1socket 1\n"); + printf("--> With OS run ./2epoll_1socket 0\n"); + return EXIT_FAILURE; + } + + with_vma = atoi(argv[1]); + if (with_vma) + printf("--> running with VMA\n"); + else + printf("--> running with OS\n"); + + sfd = create_and_bind((char*)"6666"); + if (sfd == -1) + goto failure; + + printf("--> create socket %d\n", sfd); + + s = make_socket_non_blocking(sfd); + if (s == -1) + goto failure; + + printf("--> set socket %d non blocking\n", sfd); + + s = listen(sfd, SOMAXCONN); + if (s == -1) { + perror("--> listen"); + goto failure; + } + + efd = epoll_create1(0); + if (efd == -1) { + perror("--> epoll_create"); + goto failure; + } + + printf("--> created epoll %d\n",efd); + + efd2 = epoll_create1(0); + if (efd2 == -1) { + perror("--> epoll_create"); + goto failure; + } + + printf("--> created epoll %d\n",efd2); + + event.data.fd = sfd; + event.events = EPOLLIN | EPOLLET; + + s = epoll_ctl(efd, EPOLL_CTL_ADD, sfd, &event); + if (s == -1) { + perror("--> epoll_ctl 1"); + goto failure; + } + + printf("--> socket %d was registered to epoll %d\n", sfd, efd); + + s = epoll_ctl(efd2, EPOLL_CTL_ADD, sfd, &event); + if (with_vma) { + if (s == -1) { + if (errno == ENOMEM) { + printf("--> socket %d was already registered to epoll %d, cant register to another epfd %d, errno = %d\n", sfd, efd, efd2, errno); + printf("--> SUCCESS\n"); + return EXIT_SUCCESS; + } else { + printf("--> socket %d was already registered to epoll %d, cant register to another epfd %d, errno should be set to ENOMEM, errno = %d\n", sfd, efd, efd2, errno); + goto failure; + } + } else { + printf("--> 
epoll_ctl didnot return with error, VMA support only 1 epfd for each socket\n", sfd, efd, efd2, errno); + goto failure; + } + } else { + if (s == -1) { + printf("--> epoll_ctl return with error, errno = %d\n", errno); + goto failure; + } + } + + printf("--> socket %d was registered to epoll %d\n", sfd, efd); + printf("--> SUCCESS\n"); + return EXIT_SUCCESS; + +failure: + printf("--> FAILURE\n"); + return EXIT_FAILURE; +} diff --git a/tests/functionality/iomux/client.py b/tests/functionality/iomux/client.py new file mode 100755 index 0000000..1523c42 --- /dev/null +++ b/tests/functionality/iomux/client.py @@ -0,0 +1,96 @@ +#!/usr/bin/python +# Written By: Avner BenHanoch +# Date: 2011-01-11 +# +""" +A client that use ctrl socket for instructing server to sleep than it send +payload on data socket during sleep time. +It compares the time the TCP window was blocked to the requested sleep time +It exit with success iff these times are close enough +In addition, this code tests select on 1 write fd using zero/fixed/infinity timeout +""" +import socket +import select +import time +import sys + +HOST = 'alf6' # The remote host +if len (sys.argv) > 1: HOST = sys.argv[1] +PPORT = 50007 # pyload port +CPORT = PPORT + 1 # ctrl port +SIZE = 1024 # size of send buf +PAYLOAD = '0' * SIZE # payload for send +SECSLEEP = 2 # seconds for requesting server to sleep without recveing data +SECGRACE = 0.2 # seconds GRACE for mismatch in sleep request vs. 
actual blocking time +WRITEABLE_INDICATION = 100 * 1024 # see block_till_writeable() function below + +def print_info (msg): + print "INFO: ", msg + +readfds=[] + +# for the sake of this test, socket is defined writeable if we could +# successfully use it for sending 'WRITEABLE_INDICATION' bytes of data +def block_till_writeable (sock): + sent = 0 + ret = 0 + while sent < WRITEABLE_INDICATION: + print_info(">>> before select infinity (send-ret=%d, sent=%d)" % (ret, sent)) + readready,writeready,exceptready = select.select(readfds,[sock],[]) + if sock in writeready: + print_info("<<< after select infinity, sock is writeable (sent=%d)" % sent) + ret = sock.send(PAYLOAD) + sent += ret + else: + raise Exception("no writeable socket after select infinity") + #sys.stdin.read(1) + return sent + +#ctrl socket +csocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +csocket.connect((HOST, CPORT)) + +#payload socket +psocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +psocket.connect((HOST, PPORT)) +psocket.setblocking(0) + +#instruct peer to sleep +print_info("instructing peer to sleep %f seconds and flooding it with data" % SECSLEEP) +csocket.send(str(SECSLEEP)) #ctrl +# flood sleeping peer with data +size = 0 + +print_info(">>> before select (size=%d)" % size) +readready,writeready,exceptready = select.select(readfds,[psocket],[], 0) +print_info("<<< after select (size=%d)" % size) +while psocket in writeready: + ret = psocket.send(PAYLOAD) + size += ret + if size > 300*1024: + raise Exception("socket is always writeable (size=%d)" % size) + print_info(">>> before select (send-ret=%d, size=%d)" % (ret, size)) + readready,writeready,exceptready = select.select(readfds,[psocket],[], 0) + print_info("<<< after select (size=%d)" % size) + +#wait till payload socket is ready for write +t1 = time.time() +print_info("---->>> TCP window was closed after sending %d bytes. Waiting till window is open..." 
% size ) +res = block_till_writeable(psocket) +t2 = time.time() + +#check results +blocked = t2 - t1 +diff = abs(SECSLEEP - blocked) +print_info ("<<<---- blocked time=%f; requested block=%f" % (blocked, SECSLEEP) ) +if SECGRACE >= diff: + print_info("SUCCESS in test: grace of %f >= diff of %f" % (SECGRACE,diff) ) + ret = 0 +else: + print_info("FAIL in test: grace of %f < diff of %f" % (SECGRACE,diff) ) + ret = 255 +print_info ("[total bytes sent = %d]" % (size + res) ) + +psocket.close() +csocket.close() +sys.exit (ret) \ No newline at end of file diff --git a/tests/functionality/iomux/eclient.py b/tests/functionality/iomux/eclient.py new file mode 100644 index 0000000..0e6b991 --- /dev/null +++ b/tests/functionality/iomux/eclient.py @@ -0,0 +1,108 @@ +#!/usr/bin/python +# Written By: Avner BenHanoch +# Date: 2011-01-11 +# +""" +An epoll client that use ctrl socket for instructing server to sleep than it send +payload on data socket during sleep time. +It compares the time the TCP window was blocked to the requested sleep time +It exit with success iff these times are close enough +In addition, this code tests epoll on 1 write fd using zero/fixed/infinity timeout +""" +import socket +import select +import time +import sys + +HOST = 'alf6' # The remote host +if len (sys.argv) > 1: HOST = sys.argv[1] +PPORT = 50007 # pyload port +CPORT = PPORT + 1 # ctrl port +SIZE = 1024 # size of send buf +PAYLOAD = '0' * SIZE # payload for send +SECSLEEP = 2 # seconds for requesting server to sleep without recveing data +SECGRACE = 0.2 # seconds GRACE for mismatch in sleep request vs. 
actual blocking time +WRITEABLE_INDICATION = 100 * 1024 # see block_till_writeable() function below + +def print_info (msg): + print "INFO: ", msg + +#ctrl socket +csocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +csocket.connect((HOST, CPORT)) + +#print_info ("sleeping 3 second to allow server...") #dbg +#time.sleep(3) + +#payload socket +psocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +psocket.connect((HOST, PPORT)) +psocket.setblocking(0) + +#print_info ("sleeping 3 second to allow server...") #dbg +#time.sleep(3) + +#create epoll handle +epfd = select.epoll() +epfd.register(psocket.fileno(), select.EPOLLOUT) + + +# for the sake of this test, socket is defined writeable if we could +# successfully use it for sending 'WRITEABLE_INDICATION' bytes of data +def block_till_writeable (sock): + sent = 0 + ret = 0 + while sent < WRITEABLE_INDICATION: + print_info(">>> before epoll infinity (send-ret=%d, sent=%d)" % (ret, sent)) + events = epfd.poll() + if sock and sock.fileno() in dict(events).keys() and dict(events)[sock.fileno()] & select.EPOLLOUT: + print_info("<<< after epoll infinity, sock is writeable (sent=%d)" % sent) + ret = sock.send(PAYLOAD) + sent += ret + else: + print events + raise Exception("no writeable socket after epoll infinity") + return sent + + +#instruct peer to sleep +print_info("instructing peer to sleep %f seconds and flooding it with data" % SECSLEEP) +csocket.send(str(SECSLEEP)) #ctrl +# flood sleeping peer with data +size = 0 + +print_info(">>> before epoll (size=%d)" % size) +events = epfd.poll(0) +print_info("<<< after epoll (size=%d)" % size) +while psocket.fileno() in dict(events).keys(): + ret = psocket.send(PAYLOAD) + size += ret + if size > 300*1024: + raise Exception("socket is always writeable (size=%d)" % size) + print_info(">>> before epoll (send-ret=%d, size=%d)" % (ret, size)) + events = epfd.poll(0) + print_info("<<< after epoll (size=%d)" % size) + +#wait till payload socket is ready for write +t1 = 
time.time() +print_info("---->>> TCP window was closed after sending %d bytes. Waiting till window is open..." % size ) +res = block_till_writeable(psocket) +t2 = time.time() + +#check results +blocked = t2 - t1 +diff = abs(SECSLEEP - blocked) +print_info ("<<<---- blocked time=%f; requested block=%f" % (blocked, SECSLEEP) ) +if SECGRACE >= diff: + print_info("SUCCESS in test: grace of %f >= diff of %f" % (SECGRACE,diff) ) + ret = 0 +else: + print_info("FAIL in test: grace of %f < diff of %f" % (SECGRACE,diff) ) + ret = 255 +print_info ("[total bytes sent = %d]" % (size + res) ) + +epfd.unregister(psocket.fileno()) +psocket.close() +csocket.close() +epfd.close() +sys.exit (ret) \ No newline at end of file diff --git a/tests/functionality/iomux/eserver.py b/tests/functionality/iomux/eserver.py new file mode 100644 index 0000000..83f26c3 --- /dev/null +++ b/tests/functionality/iomux/eserver.py @@ -0,0 +1,122 @@ +#!/usr/bin/env python +# Written By: Avner BenHanoch +# Date: 2011-03-08 +""" +epoll server test program, identical to the select server.py test program +that uses epoll instead of select + +can be used with the select client.py test program + +NOTE: epoll is only supported in python 2.6 and above +""" + +import select +import socket +import sys +import time + + +HOST = '' # IP for listening on +if len (sys.argv) > 1: HOST = sys.argv[1] +PPORT = 50007 # pyload port +CPORT = PPORT + 1 # ctrl port +SIZE = 8192 # size of recv buf +backlog = 1 + +def print_info (msg): + print "INFO: ", msg + + +#server for payload channel +pserver = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +pserver.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +pserver.bind((HOST,PPORT)) +pserver.listen(backlog) + +#server for ctrl channel +cserver = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +cserver.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +cserver.bind((HOST,CPORT)) +cserver.listen(backlog) + + +psocket = None #connected payload socket +csocket = None 
#connected ctrl socket + +totalBytes = 0 +timeout = 1000 + +epfd = select.epoll() +epfd.register(pserver.fileno(), select.EPOLLIN) +epfd.register(cserver.fileno(), select.EPOLLIN) + +while True: + print_info ("waiting for traffic; sleeping %d seconds on epoll..." % timeout) #dbg + events = epfd.poll(timeout) + print_info ("--------> epoll returned %d input fds" % len(events) ) #dbg + + for fileno, event in events: + if cserver and fileno == cserver.fileno(): # new connection on ctrl server socket + if csocket: raise Exception ("payload socket is already connected") + csocket, address = cserver.accept() + epfd.register(csocket.fileno(), select.EPOLLIN) + print_info ("accepted ctrl socket; peer=%s" % str(address)) + + elif pserver and fileno == pserver.fileno(): # new connection on payload server socket + if psocket: raise Exception ("payload socket is already connected") + psocket, address = pserver.accept() + epfd.register(psocket.fileno(), select.EPOLLIN) + print_info ("accepted payload socket; peer=%s" % str(address)) + + elif csocket and fileno == csocket.fileno(): #data on ctrl socket + buf = csocket.recv(SIZE) + if buf: + print_info ("got instruction on ctrl socket") + t = float(buf) + print_info (">>> going to sleep for %f seconds..." 
% t) + t1 = time.time() + time.sleep(t) + t2 = time.time() + print_info ("<<< sleep was finished after %f seconds" % (t2-t1)) + else: #EOF + print_info ("got EOF on ctrl socket") + epfd.unregister(csocket.fileno()) + csocket.close() + csocket = None + if psocket: timeout = 1 # wait for ordinary close of payload socket + + elif psocket and fileno == psocket.fileno() : #data on payload socket + buf = psocket.recv(SIZE) + if buf: + size = len(buf) + print_info ("got data on payload socket; len is: %d" % size) #dbg + totalBytes += size + else: #EOF + print_info (" ====> got EOF on payload socket; total bytes received: %d <<=====" % totalBytes) + totalBytes = 0 + epfd.unregister(psocket.fileno()) + psocket.close() + psocket = None + if csocket: timeout = 1 # wait for ordinary close of ctrl socket + + if not events: #timeout + print_info ("epoll (%d seconds) timeout" % timeout) + if csocket: + print_info ("closing ctrl socket") + epfd.unregister(csocket.fileno()) + csocket.close() + csocket = None + if psocket: + print_info (" ====> closing payload socket (without EOF); total bytes received: %d <<=====" % totalBytes) + totalBytes = 0 + epfd.unregister(psocket.fileno()) + psocket.close() + psocket = None + timeout = 1000 + + +if pserver: pserver.close() +if cserver: cserver.close() +if psocket: psocket.close() +if csocket: csocket.close() +if epfd: epfd.close() \ No newline at end of file diff --git a/tests/functionality/iomux/pclient.py b/tests/functionality/iomux/pclient.py new file mode 100644 index 0000000..67e8328 --- /dev/null +++ b/tests/functionality/iomux/pclient.py @@ -0,0 +1,107 @@ +#!/usr/bin/python +# Written By: Avner BenHanoch +# Date: 2011-01-11 +# +""" +An poll client that use ctrl socket for instructing server to sleep than it send +payload on data socket during sleep time. 
+It compares the time the TCP window was blocked to the requested sleep time +It exit with success iff these times are close enough +In addition, this code tests poll on 1 write fd using zero/fixed/infinity timeout +""" +import socket +import select +import time +import sys + +HOST = 'alf6' # The remote host +if len (sys.argv) > 1: HOST = sys.argv[1] +PPORT = 50007 # pyload port +CPORT = PPORT + 1 # ctrl port +SIZE = 1024 # size of send buf +PAYLOAD = '0' * SIZE # payload for send +SECSLEEP = 2 # seconds for requesting server to sleep without recveing data +SECGRACE = 0.2 # seconds GRACE for mismatch in sleep request vs. actual blocking time +WRITEABLE_INDICATION = 100 * 1024 # see block_till_writeable() function below + +def print_info (msg): + print "INFO: ", msg + +#ctrl socket +csocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +csocket.connect((HOST, CPORT)) + +#print_info ("sleeping 3 second to allow server...") #dbg +#time.sleep(3) + +#payload socket +psocket = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +psocket.connect((HOST, PPORT)) +psocket.setblocking(0) + +#print_info ("sleeping 3 second to allow server...") #dbg +#time.sleep(3) + +#create poll handle +pollarr = select.poll() +pollarr.register(psocket.fileno(), select.POLLOUT) + + +# for the sake of this test, socket is defined writeable if we could +# successfully use it for sending 'WRITEABLE_INDICATION' bytes of data +def block_till_writeable (sock): + sent = 0 + ret = 0 + while sent < WRITEABLE_INDICATION: + print_info(">>> before poll infinity (send-ret=%d, sent=%d)" % (ret, sent)) + events = pollarr.poll() + if sock and sock.fileno() in dict(events).keys() and dict(events)[sock.fileno()] & select.POLLOUT: + print_info("<<< after poll infinity, sock is writeable (sent=%d)" % sent) + ret = sock.send(PAYLOAD) + sent += ret + else: + print events + raise Exception("no writeable socket after poll infinity") + return sent + + +#instruct peer to sleep +print_info("instructing peer to 
sleep %f seconds and flooding it with data" % SECSLEEP) +csocket.send(str(SECSLEEP)) #ctrl +# flood sleeping peer with data +size = 0 + +print_info(">>> before poll (size=%d)" % size) +events = pollarr.poll(0) +print_info("<<< after poll (size=%d)" % size) +while psocket.fileno() in dict(events).keys(): + ret = psocket.send(PAYLOAD) + size += ret + if size > 300*1024: + raise Exception("socket is always writeable (size=%d)" % size) + print_info(">>> before poll (send-ret=%d, size=%d)" % (ret, size)) + events = pollarr.poll(0) + print_info("<<< after poll (size=%d)" % size) + +#wait till payload socket is ready for write +t1 = time.time() +print_info("---->>> TCP window was closed after sending %d bytes. Waiting till window is open..." % size ) +res = block_till_writeable(psocket) +t2 = time.time() + +#check results +blocked = t2 - t1 +diff = abs(SECSLEEP - blocked) +print_info ("<<<---- blocked time=%f; requested block=%f" % (blocked, SECSLEEP) ) +if SECGRACE >= diff: + print_info("SUCCESS in test: grace of %f >= diff of %f" % (SECGRACE,diff) ) + ret = 0 +else: + print_info("FAIL in test: grace of %f < diff of %f" % (SECGRACE,diff) ) + ret = 255 +print_info ("[total bytes sent = %d]" % (size + res) ) + +pollarr.unregister(psocket.fileno()) +psocket.close() +csocket.close() +sys.exit (ret) diff --git a/tests/functionality/iomux/pserver.py b/tests/functionality/iomux/pserver.py new file mode 100644 index 0000000..20d6ec6 --- /dev/null +++ b/tests/functionality/iomux/pserver.py @@ -0,0 +1,121 @@ +#!/usr/bin/env python +# Written By: Avner BenHanoch +# Date: 2011-03-08 +""" +poll server test program, identical to the select server.py test program +that uses poll instead of select + +can be used with the select client.py test program + +NOTE: poll is only supported in python 2.6 and above +""" + +import select +import socket +import sys +import time + + +HOST = '' # IP for listening on +if len (sys.argv) > 1: HOST = sys.argv[1] +PPORT = 50007 # pyload port +CPORT = 
PPORT + 1 # ctrl port +SIZE = 8192 # size of recv buf +backlog = 1 + +def print_info (msg): + print "INFO: ", msg + + +#server for payload channel +pserver = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +pserver.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +pserver.bind((HOST,PPORT)) +pserver.listen(backlog) + +#server for ctrl channel +cserver = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +cserver.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +cserver.bind((HOST,CPORT)) +cserver.listen(backlog) + + +psocket = None #connected payload socket +csocket = None #connected ctrl socket + +totalBytes = 0 +timeout = 1000000 + +pollarr = select.poll() +pollarr.register(pserver.fileno(), select.POLLIN) +pollarr.register(cserver.fileno(), select.POLLIN) + +while True: + print_info ("waiting for traffic; sleeping %d milliseconds on poll..." % timeout) #dbg + events = pollarr.poll(timeout) + print_info ("--------> poll returned %d input fds" % len(events) ) #dbg + + for fileno, event in events: + if cserver and fileno == cserver.fileno(): # new connection on ctrl server socket + if csocket: raise Exception ("payload socket is already connected") + csocket, address = cserver.accept() + pollarr.register(csocket.fileno(), select.POLLIN) + print_info ("accepted ctrl socket; peer=%s" % str(address)) + + elif pserver and fileno == pserver.fileno(): # new connection on payload server socket + if psocket: raise Exception ("payload socket is already connected") + psocket, address = pserver.accept() + pollarr.register(psocket.fileno(), select.POLLIN) + print_info ("accepted payload socket; peer=%s" % str(address)) + + elif csocket and fileno == csocket.fileno(): #data on ctrl socket + buf = csocket.recv(SIZE) + if buf: + print_info ("got instruction on ctrl socket") + t = float(buf) + print_info (">>> going to sleep for %f seconds..." 
% t) + t1 = time.time() + time.sleep(t) + t2 = time.time() + print_info ("<<< sleep was finished after %f seconds" % (t2-t1)) + else: #EOF + print_info ("got EOF on ctrl socket") + pollarr.unregister(csocket.fileno()) + csocket.close() + csocket = None + if psocket: timeout = 1 # wait for ordinary close of payload socket + + elif psocket and fileno == psocket.fileno() : #data on payload socket + buf = psocket.recv(SIZE) + if buf: + size = len(buf) + print_info ("got data on payload socket; len is: %d" % size) #dbg + totalBytes += size + else: #EOF + print_info (" ====> got EOF on payload socket; total bytes received: %d <<=====" % totalBytes) + totalBytes = 0 + pollarr.unregister(psocket.fileno()) + psocket.close() + psocket = None + if csocket: timeout = 1 # wait for ordinary close of ctrl socket + + if not events: #timeout + print_info ("poll (%d seconds) timeout" % timeout) + if csocket: + print_info ("closing ctrl socket") + pollarr.unregister(csocket.fileno()) + csocket.close() + csocket = None + if psocket: + print_info (" ====> closing payload socket (without EOF); total bytes received: %d <<=====" % totalBytes) + totalBytes = 0 + pollarr.unregister(psocket.fileno()) + psocket.close() + psocket = None + timeout = 1000000 + + +if pserver: pserver.close() +if cserver: cserver.close() +if psocket: psocket.close() +if csocket: csocket.close() diff --git a/tests/functionality/iomux/server.py b/tests/functionality/iomux/server.py new file mode 100755 index 0000000..d8df7db --- /dev/null +++ b/tests/functionality/iomux/server.py @@ -0,0 +1,117 @@ +#!/usr/bin/env python +# Written By: Avner BenHanoch +# Date: 2011-01-11 +""" +A recv [without send] server that uses ctrl channel for getting sleep instructions +and avoid recv during sleep time. +This beavior will fastly flood TCP window, thus enabling testing behavior of +peer with TCP window. 
+In addition, this code tests select on 2 - 4 read fds with timeout +""" + +import select +import socket +import sys +import time + + +HOST = '' # IP for listening on +if len (sys.argv) > 1: HOST = sys.argv[1] +PPORT = 50007 # pyload port +CPORT = PPORT + 1 # ctrl port +SIZE = 8192 # size of recv buf +backlog = 1 + +def print_info (msg): + print "INFO: ", msg + + +#server for payload channel +pserver = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +pserver.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +pserver.bind((HOST,PPORT)) +pserver.listen(backlog) + +#server for ctrl channel +cserver = socket.socket(socket.AF_INET, socket.SOCK_STREAM) +cserver.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1) +cserver.bind((HOST,CPORT)) +cserver.listen(backlog) + + +psocket = None #connected payload socket +csocket = None #connected ctrl socket + +totalBytes = 0 +readfds = [pserver, cserver] +timeout = 1000 + +while True: + print_info ("waiting for traffic; sleeping %d seconds on select..." % timeout) + readready,writeready,exceptready = select.select(readfds,[],[], timeout) + ####print_info ("--------> select returned %d input fds" % len(readready) ) + + for s in readready: + if s == cserver: # new connection on ctrl server socket + if csocket: raise Exception ("payload socket is already connected") + csocket, address = cserver.accept() + readfds.append(csocket) + print_info ("accepted ctrl socket; peer=%s" % str(address)) + + elif s == pserver: # new connection on payload server socket + if psocket: raise Exception ("payload socket is already connected") + psocket, address = pserver.accept() + readfds.append(psocket) + print_info ("accepted payload socket; peer=%s" % str(address)) + + elif s == csocket: #data on ctrl socket + buf = s.recv(SIZE) + if buf: + print_info ("got instruction on ctrl socket") + t = float(buf) + print_info (">>> going to sleep for %f seconds..." 
% t) + t1 = time.time() + time.sleep(t) + t2 = time.time() + print_info ("<<< sleep was finished after %f seconds" % (t2-t1)) + else: #EOF + print_info ("got EOF on ctrl socket") + csocket.close() + readfds.remove(csocket) + csocket = None + if psocket: timeout = 1 # wait for ordinary close of payload socket + + elif s == psocket: #data on payload socket + buf = s.recv(SIZE) + if buf: + size = len(buf) + print_info ("got data on payload socket; len is: %d" % size) + totalBytes += size + else: #EOF + print_info (" ====> got EOF on payload socket; total bytes received: %d <<=====" % totalBytes) + totalBytes = 0 + psocket.close() + readfds.remove(psocket) + psocket = None + if csocket: timeout = 1 # wait for ordinary close of ctrl socket + + if not readready: #timeout + print_info ("select (%d seconds) timeout" % timeout) + if csocket: + print_info ("closing ctrl socket") + csocket.close() + readfds.remove(csocket) + csocket = None + if psocket: + print_info (" ====> closing payload socket (without EOF); total bytes received: %d <<=====" % totalBytes) + totalBytes = 0 + psocket.close() + readfds.remove(psocket) + psocket = None + timeout = 1000 + + +if pserver: pserver.close() +if cserver: cserver.close() +if psocket: psocket.close() +if csocket: csocket.close() diff --git a/tests/gtest/Makefile.am b/tests/gtest/Makefile.am new file mode 100644 index 0000000..d8edd5d --- /dev/null +++ b/tests/gtest/Makefile.am @@ -0,0 +1,162 @@ +noinst_PROGRAMS = gtest + +CXXFLAGS = $(GTEST_CXXFLAGS) + +gtest_LDADD = + +gtest_CPPFLAGS = \ + -I$(top_srcdir)/ \ + -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/vma \ + -I$(top_srcdir)/tests/gtest + +gtest_LDFLAGS = $(GTEST_LDFLAGS) -no-install +gtest_CXXFLAGS = $(GTEST_CXXFLAGS) -g -O3 -fno-tree-vectorize + +gtest_SOURCES = \ + main.cc \ + \ + common/gtest-all.cc \ + common/sys.cc \ + \ + common/base.cc \ + \ + sock/sock_base.cc \ + sock/sock_socket.cc \ + \ + mix/mix_base.cc \ + mix/sg_array.cc \ + mix/mix_list.cc \ + \ + tcp/tcp_base.cc \ 
+ tcp/tcp_bind.cc \ + tcp/tcp_connect.cc \ + tcp/tcp_connect_nb.cc \ + tcp/tcp_event.cc \ + tcp/tcp_send.cc \ + tcp/tcp_sendto.cc \ + \ + udp/udp_base.cc \ + udp/udp_bind.cc \ + udp/udp_send.cc \ + udp/udp_sendto.cc \ + \ + vma/vma_base.cc \ + vma/vma_ring.cc \ + vma/vma_poll.cc \ + vma/vma_sockopt.cc \ + \ + vmad/vmad_base.cc \ + vmad/vmad_bitmap.cc \ + vmad/vmad_hash.cc \ + vmad/vmad_init.cc \ + vmad/vmad_state.cc \ + vmad/vmad_flow.cc + +noinst_HEADERS = \ + common/gtest.h \ + common/tap.h \ + common/def.h \ + common/sys.h \ + common/log.h \ + common/cmn.h \ + \ + common/base.h \ + \ + sock/sock_base.h \ + \ + mix/mix_base.h \ + \ + tcp/tcp_base.h \ + \ + udp/udp_base.h \ + \ + vma/vma_base.h + + +# This workaround allows to compile files located +# at another directory. +# This place resolve make distcheck isue +nodist_gtest_SOURCES = \ + hash.c + +CLEANFILES = hash.c + +hash.c: + @echo "#include \"$(top_builddir)/tools/daemon/$@\"" >$@ + + +.PHONY: test gdb valgrind + + +all-local: gtest + + +# +# List variables +# +GTEST_FILTER ?= * +GTEST_EXTRA_ARGS = +LAUNCHER = LD_PRELOAD=$(abs_top_builddir)/src/vma/.libs/libvma.so +VALGRIND_EXTRA_ARGS = + +GTEST_ARGS = \ + --gtest_filter=$(GTEST_FILTER) \ + $(GTEST_EXTRA_ARGS) + +VALGRIND_ARGS = \ + --tool=memcheck \ + --leak-check=full \ + --track-origins=yes \ + --fair-sched=try \ + $(VALGRIND_EXTRA_ARGS) + +# +# List of rules +# +libvma: + $(MAKE) -C $(top_builddir) + +help: + @echo + @echo "Targets:" + @echo " list : List unit tests." + @echo " test : Run unit tests." + @echo " test_gdb : Run unit tests with GDB." + @echo " test_valgrind : Run unit tests with Valgrind." 
+ @echo + @echo "Environment variables:" + @echo " GTEST_FILTER : Unit tests filter (\"$(GTEST_FILTER)\")" + @echo " GTEST_EXTRA_ARGS : Additional arguments for gtest (\"$(GTEST_EXTRA_ARGS)\")" + @echo " LAUNCHER : Custom launcher for gtest executable (\"$(LAUNCHER)\")" + @echo " VALGRIND_EXTRA_ARGS : Additional arguments for Valgrind (\"$(VALGRIND_EXTRA_ARGS)\")" + @echo + +# +# List unit tests +# +list: gtest + $(abs_builddir)/gtest --gtest_list_tests $(GTEST_ARGS) + +# +# Run unit tests +# +test: libvma gtest + @rm -f core.* + env $(LAUNCHER) $(abs_builddir)/gtest $(GTEST_ARGS) + +# +# Run unit tests with GDB +# +test_gdb: libvma gtest + echo -e 'r\ninit-if-undefined $$_exitcode=-1\nif $$_exitcode>=0\n\tq\nend' > .gdbcommands + env $(LAUNCHER) \ + gdb -x .gdbcommands --args $(GDB_ARGS) \ + $(abs_builddir)/gtest $(GTEST_ARGS) + +# +# Run unit tests with valgrind +# +test_valgrind: libvma gtest + env $(LAUNCHER) LD_LIBRARY_PATH="$(VALGRIND_LIBPATH):${LD_LIBRARY_PATH}" \ + valgrind $(VALGRIND_ARGS) $(abs_builddir)/gtest $(GTEST_ARGS) diff --git a/tests/gtest/README b/tests/gtest/README new file mode 100644 index 0000000..ce14ac0 --- /dev/null +++ b/tests/gtest/README @@ -0,0 +1,21 @@ +VMA Testing Environment +======================= + +This set of tests is based on Google Test C++ environment +https://github.com/google/googletest + +Testing VMA Extended API +------------------------ +1. Enable VMA_EXTRA_API_ENABLED in tests/gtest/vma/vma_base.cc +2. Configure libvma with --enable-socketxtreme option and build it +3. Compile the gtests by issuing from VMA root directory: + make -C tests/gtest +4. Launch tests by issuing: + LD_PRELOAD=path_to_your_libvma.so tests/gtest/gtest --addr=client_ip:server_ip + +Testing Environment +------------------- +The VMA Extended API gtest environment requires that there be two interfaces +connected back to back or via a switch. One is the server and the other is the +client. 
+ diff --git a/tests/gtest/common/base.cc b/tests/gtest/common/base.cc new file mode 100644 index 0000000..a0f831c --- /dev/null +++ b/tests/gtest/common/base.cc @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "base.h" + +test_base::test_base() +{ + port = gtest_conf.port; + memcpy(&client_addr, &gtest_conf.client_addr, sizeof(client_addr)); + memcpy(&server_addr, &gtest_conf.server_addr, sizeof(server_addr)); + memcpy(&remote_addr, &gtest_conf.remote_addr, sizeof(remote_addr)); + + bogus_port = 49999; + bogus_addr.sin_family = PF_INET; + bogus_addr.sin_addr.s_addr = inet_addr("1.1.1.1"); + bogus_addr.sin_port = 0; +} + +test_base::~test_base() +{ +} + +void *test_base::thread_func(void *arg) +{ + test_base *self = reinterpret_cast<test_base *>(arg); + self->barrier(); /* Let all threads start in the same time */ + return NULL; +} + +void test_base::init() +{ +} + +void test_base::cleanup() +{ +} + +bool test_base::barrier() +{ + int ret = pthread_barrier_wait(&m_barrier); + if (ret == 0) { + return false; + } else if (ret == PTHREAD_BARRIER_SERIAL_THREAD) { + return true; + } else { + log_fatal("pthread_barrier_wait() failed\n"); + } + return false; +} + +int test_base::sock_noblock(int fd) +{ + int rc = 0; + int flag; + + flag = fcntl(fd, F_GETFL); + if (flag < 0) { + rc = -errno; + log_error("failed to get socket flags %s\n", strerror(errno)); + } + flag |= O_NONBLOCK; + rc = fcntl(fd, F_SETFL, flag); + if (rc < 0) { + rc = -errno; + log_error("failed to set socket flags %s\n", strerror(errno)); + } + + return rc; +} + +int test_base::event_wait(struct epoll_event *event) +{ + int rc = 0; + int fd; + int efd = -1; + int timeout = 10 * 1000; + + if (!event) { + return -1; + } + + fd = event->data.fd; + efd = epoll_create1(0); + rc = epoll_ctl(efd, EPOLL_CTL_ADD, fd, event); + if (rc < 0) { + log_error("failed epoll_ctl() %s\n", strerror(errno)); + goto err; + } + + rc = epoll_wait(efd, event, 1, timeout); + if (rc < 0) { + log_error("failed epoll_wait() %s\n", strerror(errno)); + } + + epoll_ctl(efd, EPOLL_CTL_DEL, fd, NULL); + +err: + close(efd); + + return rc; +} diff --git 
a/tests/gtest/common/base.h b/tests/gtest/common/base.h new file mode 100644 index 0000000..d5c650b --- /dev/null +++ b/tests/gtest/common/base.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef TESTS_GTEST_COMMON_BASE_H_ +#define TESTS_GTEST_COMMON_BASE_H_ + + +/** + * Base class for tests + */ +class test_base { +public: + static int sock_noblock(int fd); + static int event_wait(struct epoll_event *event); + +protected: + test_base(); + virtual ~test_base(); + +protected: + virtual void cleanup(); + virtual void init(); + bool barrier(); + + struct sockaddr_in client_addr; + struct sockaddr_in server_addr; + struct sockaddr_in remote_addr; + struct sockaddr_in bogus_addr; + uint16_t port; + uint16_t bogus_port; + +private: + static void *thread_func(void *arg); + + pthread_barrier_t m_barrier; +}; + +#endif /* TESTS_GTEST_COMMON_BASE_H_ */ diff --git a/tests/gtest/common/cmn.h b/tests/gtest/common/cmn.h new file mode 100644 index 0000000..5483973 --- /dev/null +++ b/tests/gtest/common/cmn.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TESTS_GTEST_COMMON_CMN_H_ +#define TESTS_GTEST_COMMON_CMN_H_ + +#include +#include +#include + +namespace cmn { + +class test_skip_exception : public std::exception { +public: + test_skip_exception(const std::string& reason = "") : m_reason(reason) { + } + virtual ~test_skip_exception() throw() { + } + + virtual const char* what() const throw() { + return (std::string("[ SKIPPED ] ") + m_reason).c_str(); + } + +private: + const std::string m_reason; +}; + +#define SKIP_TRUE(_expr, _reason) \ + if (!(_expr)) { \ + throw cmn::test_skip_exception(_reason); \ + } + +} /* namespace: cmn */ + +#endif /* TESTS_GTEST_COMMON_CMN_H_ */ diff --git a/tests/gtest/common/def.h b/tests/gtest/common/def.h new file mode 100644 index 0000000..cd05879 --- /dev/null +++ b/tests/gtest/common/def.h @@ -0,0 +1,120 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TESTS_GTEST_COMMON_DEF_H_ +#define TESTS_GTEST_COMMON_DEF_H_ + +#include +#include +#include +#include +#include +#define __STDC_FORMAT_MACROS +#include /* printf PRItn */ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* printf PRItn */ +#include +#include +#include +#include +#include +#include +#include +#include + +#include "common/gtest.h" /* Google framework header */ + +#define INLINE __inline + +#ifndef UNREFERENCED_PARAMETER +#define UNREFERENCED_PARAMETER(P) ((void)P) +#endif + +#define QUOTE(name) #name +#define STR(macro) QUOTE(macro) + +#define ARRAY_SIZE(a) (sizeof(a)/sizeof(a[0])) + + +/* Platform specific 16-byte alignment macro switch. + On Visual C++ it would substitute __declspec(align(16)). + On GCC it substitutes __attribute__((aligned (16))). +*/ + +#if defined(_MSC_VER) +#define ALIGN(x) __declspec(align(x)) +#else +#define ALIGN(x) __attribute__((aligned (x))) +#endif + +#if !defined( EOK ) +#define EOK 0 /* no error */ +#endif + +#ifndef container_of +/** + * container_of - cast a member of a structure out to the containing structure + * @ptr: the pointer to the member. + * @type: the type of the container struct this is embedded in. + * @member: the name of the member within the struct. 
+ * + */ +#define container_of(ptr, type, member) (type *)((char *)(ptr) - offsetof(type,member)) +#endif + +#define UNDEFINED_VALUE (-1) + +struct gtest_configure_t { + int log_level; + int random_seed; + struct sockaddr_in client_addr; + struct sockaddr_in server_addr; + struct sockaddr_in remote_addr; + uint16_t port; +}; + +#endif /* TESTS_GTEST_COMMON_DEF_H_ */ diff --git a/tests/gtest/common/gtest-all.cc b/tests/gtest/common/gtest-all.cc new file mode 100644 index 0000000..25ebbc3 --- /dev/null +++ b/tests/gtest/common/gtest-all.cc @@ -0,0 +1,9118 @@ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: mheule@google.com (Markus Heule) +// +// Google C++ Testing Framework (Google Test) +// +// Sometimes it's desirable to build Google Test by compiling a single file. +// This file serves this purpose. + +// This line ensures that gtest.h can be compiled on its own, even +// when it's fused. +#include "gtest.h" + +// The following lines pull in the real gtest *.cc files. +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) + +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// Utilities for testing Google Test itself and code that uses Google Test +// (e.g. frameworks built on top of Google Test). + +#ifndef GTEST_INCLUDE_GTEST_GTEST_SPI_H_ +#define GTEST_INCLUDE_GTEST_GTEST_SPI_H_ + + +namespace testing { + +// This helper class can be used to mock out Google Test failure reporting +// so that we can test Google Test or code that builds on Google Test. +// +// An object of this class appends a TestPartResult object to the +// TestPartResultArray object given in the constructor whenever a Google Test +// failure is reported. It can either intercept only failures that are +// generated in the same thread that created this object or it can intercept +// all generated failures. The scope of this mock object can be controlled with +// the second argument to the two arguments constructor. +class GTEST_API_ ScopedFakeTestPartResultReporter + : public TestPartResultReporterInterface { + public: + // The two possible mocking modes of this object. + enum InterceptMode { + INTERCEPT_ONLY_CURRENT_THREAD, // Intercepts only thread local failures. + INTERCEPT_ALL_THREADS // Intercepts all failures. + }; + + // The c'tor sets this object as the test part result reporter used + // by Google Test. The 'result' parameter specifies where to report the + // results. This reporter will only catch failures generated in the current + // thread. 
DEPRECATED + explicit ScopedFakeTestPartResultReporter(TestPartResultArray* result); + + // Same as above, but you can choose the interception scope of this object. + ScopedFakeTestPartResultReporter(InterceptMode intercept_mode, + TestPartResultArray* result); + + // The d'tor restores the previous test part result reporter. + virtual ~ScopedFakeTestPartResultReporter(); + + // Appends the TestPartResult object to the TestPartResultArray + // received in the constructor. + // + // This method is from the TestPartResultReporterInterface + // interface. + virtual void ReportTestPartResult(const TestPartResult& result); + private: + void Init(); + + const InterceptMode intercept_mode_; + TestPartResultReporterInterface* old_reporter_; + TestPartResultArray* const result_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedFakeTestPartResultReporter); +}; + +namespace internal { + +// A helper class for implementing EXPECT_FATAL_FAILURE() and +// EXPECT_NONFATAL_FAILURE(). Its destructor verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. +class GTEST_API_ SingleFailureChecker { + public: + // The constructor remembers the arguments. + SingleFailureChecker(const TestPartResultArray* results, + TestPartResult::Type type, + const string& substr); + ~SingleFailureChecker(); + private: + const TestPartResultArray* const results_; + const TestPartResult::Type type_; + const string substr_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(SingleFailureChecker); +}; + +} // namespace internal + +} // namespace testing + +// A set of macros for testing Google Test assertions or code that's expected +// to generate Google Test fatal failures. It verifies that the given +// statement will cause exactly one fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. 
EXPECT_FATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_FATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - 'statement' cannot reference local non-static variables or +// non-static members of the current object. +// - 'statement' cannot return a value. +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. The AcceptsMacroThatExpandsToUnprotectedComma test in +// gtest_unittest.cc will fail to compile if we do that. +#define EXPECT_FATAL_FAILURE(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper {\ + public:\ + static void Execute() { statement; }\ + };\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ + GTestExpectFatalFailureHelper::Execute();\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_FATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do { \ + class GTestExpectFatalFailureHelper {\ + public:\ + static void Execute() { statement; }\ + };\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kFatalFailure, (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ALL_THREADS, >est_failures);\ + 
GTestExpectFatalFailureHelper::Execute();\ + }\ + } while (::testing::internal::AlwaysFalse()) + +// A macro for testing Google Test assertions or code that's expected to +// generate Google Test non-fatal failures. It asserts that the given +// statement will cause exactly one non-fatal Google Test failure with 'substr' +// being part of the failure message. +// +// There are two different versions of this macro. EXPECT_NONFATAL_FAILURE only +// affects and considers failures generated in the current thread and +// EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS does the same but for all threads. +// +// 'statement' is allowed to reference local variables and members of +// the current object. +// +// The verification of the assertion is done correctly even when the statement +// throws an exception or aborts the current function. +// +// Known restrictions: +// - You cannot stream a failure message to this macro. +// +// Note that even though the implementations of the following two +// macros are much alike, we cannot refactor them to use a common +// helper macro, due to some peculiarity in how the preprocessor +// works. If we do that, the code won't compile when the user gives +// EXPECT_NONFATAL_FAILURE() a statement that contains a macro that +// expands to code containing an unprotected comma. The +// AcceptsMacroThatExpandsToUnprotectedComma test in gtest_unittest.cc +// catches that. +// +// For the same reason, we have to write +// if (::testing::internal::AlwaysTrue()) { statement; } +// instead of +// GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) +// to avoid an MSVC warning on unreachable code. 
+#define EXPECT_NONFATAL_FAILURE(statement, substr) \ + do {\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter:: \ + INTERCEPT_ONLY_CURRENT_THREAD, >est_failures);\ + if (::testing::internal::AlwaysTrue()) { statement; }\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#define EXPECT_NONFATAL_FAILURE_ON_ALL_THREADS(statement, substr) \ + do {\ + ::testing::TestPartResultArray gtest_failures;\ + ::testing::internal::SingleFailureChecker gtest_checker(\ + >est_failures, ::testing::TestPartResult::kNonFatalFailure, \ + (substr));\ + {\ + ::testing::ScopedFakeTestPartResultReporter gtest_reporter(\ + ::testing::ScopedFakeTestPartResultReporter::INTERCEPT_ALL_THREADS,\ + >est_failures);\ + if (::testing::internal::AlwaysTrue()) { statement; }\ + }\ + } while (::testing::internal::AlwaysFalse()) + +#endif // GTEST_INCLUDE_GTEST_GTEST_SPI_H_ + +#include +#include +#include +#include +#include +#include +#include + +#include +#include // NOLINT +#include +#include + +#if GTEST_OS_LINUX + +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +# define GTEST_HAS_GETTIMEOFDAY_ 1 + +# include // NOLINT +# include // NOLINT +# include // NOLINT +// Declares vsnprintf(). This header is not available on Windows. +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include + +#elif GTEST_OS_SYMBIAN +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include // NOLINT + +#elif GTEST_OS_ZOS +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include // NOLINT + +// On z/OS we additionally need strings.h for strcasecmp. +# include // NOLINT + +#elif GTEST_OS_WINDOWS_MOBILE // We are on Windows CE. + +# include // NOLINT + +#elif GTEST_OS_WINDOWS // We are on Windows proper. 
+ +# include // NOLINT +# include // NOLINT +# include // NOLINT +# include // NOLINT + +# if GTEST_OS_WINDOWS_MINGW +// MinGW has gettimeofday() but not _ftime64(). +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +// TODO(kenton@google.com): There are other ways to get the time on +// Windows, like GetTickCount() or GetSystemTimeAsFileTime(). MinGW +// supports these. consider using them instead. +# define GTEST_HAS_GETTIMEOFDAY_ 1 +# include // NOLINT +# endif // GTEST_OS_WINDOWS_MINGW + +// cpplint thinks that the header is already included, so we want to +// silence it. +# include // NOLINT + +#else + +// Assume other platforms have gettimeofday(). +// TODO(kenton@google.com): Use autoconf to detect availability of +// gettimeofday(). +# define GTEST_HAS_GETTIMEOFDAY_ 1 + +// cpplint thinks that the header is already included, so we want to +// silence it. +# include // NOLINT +# include // NOLINT + +#endif // GTEST_OS_LINUX + +#if GTEST_HAS_EXCEPTIONS +# include +#endif + +#if GTEST_CAN_STREAM_RESULTS_ +# include // NOLINT +# include // NOLINT +#endif + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick is to +// prevent a user from accidentally including gtest-internal-inl.h in +// his code. +#define GTEST_IMPLEMENTATION_ 1 +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// Utility functions and classes used by the Google C++ testing framework. +// +// Author: wan@google.com (Zhanyong Wan) +// +// This file contains purely Google Test's internal implementation. Please +// DO NOT #INCLUDE IT IN A USER PROGRAM. + +#ifndef GTEST_SRC_GTEST_INTERNAL_INL_H_ +#define GTEST_SRC_GTEST_INTERNAL_INL_H_ + +// GTEST_IMPLEMENTATION_ is defined to 1 iff the current translation unit is +// part of Google Test's implementation; otherwise it's undefined. +#if !GTEST_IMPLEMENTATION_ +// A user is trying to include this from his code - just say no. +# error "gtest-internal-inl.h is part of Google Test's internal implementation." +# error "It must not be included except by Google Test itself." 
+#endif // GTEST_IMPLEMENTATION_ + +#ifndef _WIN32_WCE +# include +#endif // !_WIN32_WCE +#include +#include // For strtoll/_strtoul64/malloc/free. +#include // For memmove. + +#include +#include +#include + + +#if GTEST_OS_WINDOWS +# include // NOLINT +#endif // GTEST_OS_WINDOWS + + +namespace testing { + +// Declares the flags. +// +// We don't want the users to modify this flag in the code, but want +// Google Test's own unit tests to be able to access it. Therefore we +// declare it here as opposed to in gtest.h. +GTEST_DECLARE_bool_(death_test_use_fork); + +namespace internal { + +// The value of GetTestTypeId() as seen from within the Google Test +// library. This is solely for testing GetTestTypeId(). +GTEST_API_ extern const TypeId kTestTypeIdInGoogleTest; + +// Names of the flags (needed for parsing Google Test flags). +const char kAlsoRunDisabledTestsFlag[] = "also_run_disabled_tests"; +const char kBreakOnFailureFlag[] = "break_on_failure"; +const char kCatchExceptionsFlag[] = "catch_exceptions"; +const char kColorFlag[] = "color"; +const char kFilterFlag[] = "filter"; +const char kListTestsFlag[] = "list_tests"; +const char kOutputFlag[] = "output"; +const char kPrintTimeFlag[] = "print_time"; +const char kRandomSeedFlag[] = "random_seed"; +const char kRepeatFlag[] = "repeat"; +const char kShuffleFlag[] = "shuffle"; +const char kStackTraceDepthFlag[] = "stack_trace_depth"; +const char kStreamResultToFlag[] = "stream_result_to"; +const char kThrowOnFailureFlag[] = "throw_on_failure"; + +// A valid random seed must be in [1, kMaxRandomSeed]. +const int kMaxRandomSeed = 99999; + +// g_help_flag is true iff the --help flag or an equivalent form is +// specified on the command line. +GTEST_API_ extern bool g_help_flag; + +// Returns the current time in milliseconds. +GTEST_API_ TimeInMillis GetTimeInMillis(); + +// Returns true iff Google Test should use colors in the output. 
+GTEST_API_ bool ShouldUseColor(bool stdout_is_tty); + +// Formats the given time in milliseconds as seconds. +GTEST_API_ std::string FormatTimeInMillisAsSeconds(TimeInMillis ms); + +// Parses a string for an Int32 flag, in the form of "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +GTEST_API_ bool ParseInt32Flag( + const char* str, const char* flag, Int32* value); + +// Returns a random seed in range [1, kMaxRandomSeed] based on the +// given --gtest_random_seed flag value. +inline int GetRandomSeedFromFlag(Int32 random_seed_flag) { + const unsigned int raw_seed = (random_seed_flag == 0) ? + static_cast(GetTimeInMillis()) : + static_cast(random_seed_flag); + + // Normalizes the actual seed to range [1, kMaxRandomSeed] such that + // it's easy to type. + const int normalized_seed = + static_cast((raw_seed - 1U) % + static_cast(kMaxRandomSeed)) + 1; + return normalized_seed; +} + +// Returns the first valid random seed after 'seed'. The behavior is +// undefined if 'seed' is invalid. The seed after kMaxRandomSeed is +// considered to be 1. +inline int GetNextRandomSeed(int seed) { + GTEST_CHECK_(1 <= seed && seed <= kMaxRandomSeed) + << "Invalid random seed " << seed << " - must be in [1, " + << kMaxRandomSeed << "]."; + const int next_seed = seed + 1; + return (next_seed > kMaxRandomSeed) ? 1 : next_seed; +} + +// This class saves the values of all Google Test flags in its c'tor, and +// restores them in its d'tor. +class GTestFlagSaver { + public: + // The c'tor. 
+ GTestFlagSaver() { + also_run_disabled_tests_ = GTEST_FLAG(also_run_disabled_tests); + break_on_failure_ = GTEST_FLAG(break_on_failure); + catch_exceptions_ = GTEST_FLAG(catch_exceptions); + color_ = GTEST_FLAG(color); + death_test_style_ = GTEST_FLAG(death_test_style); + death_test_use_fork_ = GTEST_FLAG(death_test_use_fork); + filter_ = GTEST_FLAG(filter); + internal_run_death_test_ = GTEST_FLAG(internal_run_death_test); + list_tests_ = GTEST_FLAG(list_tests); + output_ = GTEST_FLAG(output); + print_time_ = GTEST_FLAG(print_time); + random_seed_ = GTEST_FLAG(random_seed); + repeat_ = GTEST_FLAG(repeat); + shuffle_ = GTEST_FLAG(shuffle); + stack_trace_depth_ = GTEST_FLAG(stack_trace_depth); + stream_result_to_ = GTEST_FLAG(stream_result_to); + throw_on_failure_ = GTEST_FLAG(throw_on_failure); + } + + // The d'tor is not virtual. DO NOT INHERIT FROM THIS CLASS. + ~GTestFlagSaver() { + GTEST_FLAG(also_run_disabled_tests) = also_run_disabled_tests_; + GTEST_FLAG(break_on_failure) = break_on_failure_; + GTEST_FLAG(catch_exceptions) = catch_exceptions_; + GTEST_FLAG(color) = color_; + GTEST_FLAG(death_test_style) = death_test_style_; + GTEST_FLAG(death_test_use_fork) = death_test_use_fork_; + GTEST_FLAG(filter) = filter_; + GTEST_FLAG(internal_run_death_test) = internal_run_death_test_; + GTEST_FLAG(list_tests) = list_tests_; + GTEST_FLAG(output) = output_; + GTEST_FLAG(print_time) = print_time_; + GTEST_FLAG(random_seed) = random_seed_; + GTEST_FLAG(repeat) = repeat_; + GTEST_FLAG(shuffle) = shuffle_; + GTEST_FLAG(stack_trace_depth) = stack_trace_depth_; + GTEST_FLAG(stream_result_to) = stream_result_to_; + GTEST_FLAG(throw_on_failure) = throw_on_failure_; + } + private: + // Fields for saving the original values of flags. 
+ bool also_run_disabled_tests_; + bool break_on_failure_; + bool catch_exceptions_; + String color_; + String death_test_style_; + bool death_test_use_fork_; + String filter_; + String internal_run_death_test_; + bool list_tests_; + String output_; + bool print_time_; + bool pretty_; + internal::Int32 random_seed_; + internal::Int32 repeat_; + bool shuffle_; + internal::Int32 stack_trace_depth_; + String stream_result_to_; + bool throw_on_failure_; +} GTEST_ATTRIBUTE_UNUSED_; + +// Converts a Unicode code point to a narrow string in UTF-8 encoding. +// code_point parameter is of type UInt32 because wchar_t may not be +// wide enough to contain a code point. +// The output buffer str must containt at least 32 characters. +// The function returns the address of the output buffer. +// If the code_point is not a valid Unicode code point +// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output +// as '(Invalid Unicode 0xXXXXXXXX)'. +GTEST_API_ char* CodePointToUtf8(UInt32 code_point, char* str); + +// Converts a wide string to a narrow string in UTF-8 encoding. +// The wide string is assumed to have the following encoding: +// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS) +// UTF-32 if sizeof(wchar_t) == 4 (on Linux) +// Parameter str points to a null-terminated wide string. +// Parameter num_chars may additionally limit the number +// of wchar_t characters processed. -1 is used when the entire string +// should be processed. +// If the string contains code points that are not valid Unicode code points +// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output +// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding +// and contains invalid UTF-16 surrogate pairs, values in those pairs +// will be encoded as individual Unicode characters from Basic Normal Plane. 
+GTEST_API_ String WideStringToUtf8(const wchar_t* str, int num_chars);
+
+// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file
+// if the variable is present. If a file already exists at this location, this
+// function will write over it. If the variable is present, but the file cannot
+// be created, prints an error and exits.
+void WriteToShardStatusFileIfNeeded();
+
+// Checks whether sharding is enabled by examining the relevant
+// environment variable values. If the variables are present,
+// but inconsistent (e.g., shard_index >= total_shards), prints
+// an error and exits. If in_subprocess_for_death_test, sharding is
+// disabled because it must only be applied to the original test
+// process. Otherwise, we could filter out death tests we intended to execute.
+GTEST_API_ bool ShouldShard(const char* total_shards_str,
+                            const char* shard_index_str,
+                            bool in_subprocess_for_death_test);
+
+// Parses the environment variable var as an Int32. If it is unset,
+// returns default_val. If it is not an Int32, prints an error and
+// and aborts.
+GTEST_API_ Int32 Int32FromEnvOrDie(const char* env_var, Int32 default_val);
+
+// Given the total number of shards, the shard index, and the test id,
+// returns true iff the test should be run on this shard. The test id is
+// some arbitrary but unique non-negative integer assigned to each test
+// method. Assumes that 0 <= shard_index < total_shards.
+GTEST_API_ bool ShouldRunTestOnShard(
+    int total_shards, int shard_index, int test_id);
+
+// STL container utilities.
+
+// Returns the number of elements in the given container that satisfy
+// the given predicate.
+template <class Container, typename Predicate>
+inline int CountIf(const Container& c, Predicate predicate) {
+  // Implemented as an explicit loop since std::count_if() in libCstd on
+  // Solaris has a non-standard signature.
+  int count = 0;
+  for (typename Container::const_iterator it = c.begin(); it != c.end(); ++it) {
+    if (predicate(*it))
+      ++count;
+  }
+  return count;
+}
+
+// Applies a function/functor to each element in the container.
+template <class Container, typename Functor>
+void ForEach(const Container& c, Functor functor) {
+  std::for_each(c.begin(), c.end(), functor);
+}
+
+// Returns the i-th element of the vector, or default_value if i is not
+// in range [0, v.size()).
+template <typename E>
+inline E GetElementOr(const std::vector<E>& v, int i, E default_value) {
+  return (i < 0 || i >= static_cast<int>(v.size())) ? default_value : v[i];
+}
+
+// Performs an in-place shuffle of a range of the vector's elements.
+// 'begin' and 'end' are element indices as an STL-style range;
+// i.e. [begin, end) are shuffled, where 'end' == size() means to
+// shuffle to the end of the vector.
+template <typename E>
+void ShuffleRange(internal::Random* random, int begin, int end,
+                  std::vector<E>* v) {
+  const int size = static_cast<int>(v->size());
+  GTEST_CHECK_(0 <= begin && begin <= size)
+      << "Invalid shuffle range start " << begin << ": must be in range [0, "
+      << size << "].";
+  GTEST_CHECK_(begin <= end && end <= size)
+      << "Invalid shuffle range finish " << end << ": must be in range ["
+      << begin << ", " << size << "].";
+
+  // Fisher-Yates shuffle, from
+  // http://en.wikipedia.org/wiki/Fisher-Yates_shuffle
+  for (int range_width = end - begin; range_width >= 2; range_width--) {
+    const int last_in_range = begin + range_width - 1;
+    const int selected = begin + random->Generate(range_width);
+    std::swap((*v)[selected], (*v)[last_in_range]);
+  }
+}
+
+// Performs an in-place shuffle of the vector's elements.
+template <typename E>
+inline void Shuffle(internal::Random* random, std::vector<E>* v) {
+  ShuffleRange(random, 0, static_cast<int>(v->size()), v);
+}
+
+// A function for deleting an object.  Handy for being used as a
+// functor.
+template <typename T>
+static void Delete(T* x) {
+  delete x;
+}
+
+// A predicate that checks the key of a TestProperty against a known key.
+// +// TestPropertyKeyIs is copyable. +class TestPropertyKeyIs { + public: + // Constructor. + // + // TestPropertyKeyIs has NO default constructor. + explicit TestPropertyKeyIs(const char* key) + : key_(key) {} + + // Returns true iff the test name of test property matches on key_. + bool operator()(const TestProperty& test_property) const { + return String(test_property.key()).Compare(key_) == 0; + } + + private: + String key_; +}; + +// Class UnitTestOptions. +// +// This class contains functions for processing options the user +// specifies when running the tests. It has only static members. +// +// In most cases, the user can specify an option using either an +// environment variable or a command line flag. E.g. you can set the +// test filter using either GTEST_FILTER or --gtest_filter. If both +// the variable and the flag are present, the latter overrides the +// former. +class GTEST_API_ UnitTestOptions { + public: + // Functions for processing the gtest_output flag. + + // Returns the output format, or "" for normal printed output. + static String GetOutputFormat(); + + // Returns the absolute path of the requested output file, or the + // default (test_detail.xml in the original working directory) if + // none was explicitly specified. + static String GetAbsolutePathToOutputFile(); + + // Functions for processing the gtest_filter flag. + + // Returns true iff the wildcard pattern matches the string. The + // first ':' or '\0' character in pattern marks the end of it. + // + // This recursive algorithm isn't very efficient, but is clear and + // works well enough for matching test names, which are short. + static bool PatternMatchesString(const char *pattern, const char *str); + + // Returns true iff the user-specified filter matches the test case + // name and the test name. + static bool FilterMatchesTest(const String &test_case_name, + const String &test_name); + +#if GTEST_OS_WINDOWS + // Function for supporting the gtest_catch_exception flag. 
+ + // Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the + // given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. + // This function is useful as an __except condition. + static int GTestShouldProcessSEH(DWORD exception_code); +#endif // GTEST_OS_WINDOWS + + // Returns true if "name" matches the ':' separated list of glob-style + // filters in "filter". + static bool MatchesFilter(const String& name, const char* filter); +}; + +// Returns the current application's name, removing directory path if that +// is present. Used by UnitTestOptions::GetOutputFile. +GTEST_API_ FilePath GetCurrentExecutableName(); + +// The role interface for getting the OS stack trace as a string. +class OsStackTraceGetterInterface { + public: + OsStackTraceGetterInterface() {} + virtual ~OsStackTraceGetterInterface() {} + + // Returns the current OS stack trace as a String. Parameters: + // + // max_depth - the maximum number of stack frames to be included + // in the trace. + // skip_count - the number of top frames to be skipped; doesn't count + // against max_depth. + virtual String CurrentStackTrace(int max_depth, int skip_count) = 0; + + // UponLeavingGTest() should be called immediately before Google Test calls + // user code. It saves some information about the current stack that + // CurrentStackTrace() will use to find and hide Google Test stack frames. + virtual void UponLeavingGTest() = 0; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetterInterface); +}; + +// A working implementation of the OsStackTraceGetterInterface interface. +class OsStackTraceGetter : public OsStackTraceGetterInterface { + public: + OsStackTraceGetter() : caller_frame_(NULL) {} + virtual String CurrentStackTrace(int max_depth, int skip_count); + virtual void UponLeavingGTest(); + + // This string is inserted in place of stack frames that are part of + // Google Test's implementation. 
+ static const char* const kElidedFramesMarker; + + private: + Mutex mutex_; // protects all internal state + + // We save the stack frame below the frame that calls user code. + // We do this because the address of the frame immediately below + // the user code changes between the call to UponLeavingGTest() + // and any calls to CurrentStackTrace() from within the user code. + void* caller_frame_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(OsStackTraceGetter); +}; + +// Information about a Google Test trace point. +struct TraceInfo { + const char* file; + int line; + String message; +}; + +// This is the default global test part result reporter used in UnitTestImpl. +// This class should only be used by UnitTestImpl. +class DefaultGlobalTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultGlobalTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. Reports the test part + // result in the current test. + virtual void ReportTestPartResult(const TestPartResult& result); + + private: + UnitTestImpl* const unit_test_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultGlobalTestPartResultReporter); +}; + +// This is the default per thread test part result reporter used in +// UnitTestImpl. This class should only be used by UnitTestImpl. +class DefaultPerThreadTestPartResultReporter + : public TestPartResultReporterInterface { + public: + explicit DefaultPerThreadTestPartResultReporter(UnitTestImpl* unit_test); + // Implements the TestPartResultReporterInterface. The implementation just + // delegates to the current global test part result reporter of *unit_test_. + virtual void ReportTestPartResult(const TestPartResult& result); + + private: + UnitTestImpl* const unit_test_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DefaultPerThreadTestPartResultReporter); +}; + +// The private implementation of the UnitTest class. 
We don't protect +// the methods under a mutex, as this class is not accessible by a +// user and the UnitTest class that delegates work to this class does +// proper locking. +class GTEST_API_ UnitTestImpl { + public: + explicit UnitTestImpl(UnitTest* parent); + virtual ~UnitTestImpl(); + + // There are two different ways to register your own TestPartResultReporter. + // You can register your own repoter to listen either only for test results + // from the current thread or for results from all threads. + // By default, each per-thread test result repoter just passes a new + // TestPartResult to the global test result reporter, which registers the + // test part result for the currently running test. + + // Returns the global test part result reporter. + TestPartResultReporterInterface* GetGlobalTestPartResultReporter(); + + // Sets the global test part result reporter. + void SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter); + + // Returns the test part result reporter for the current thread. + TestPartResultReporterInterface* GetTestPartResultReporterForCurrentThread(); + + // Sets the test part result reporter for the current thread. + void SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter); + + // Gets the number of successful test cases. + int successful_test_case_count() const; + + // Gets the number of failed test cases. + int failed_test_case_count() const; + + // Gets the number of all test cases. + int total_test_case_count() const; + + // Gets the number of all test cases that contain at least one test + // that should run. + int test_case_to_run_count() const; + + // Gets the number of successful tests. + int successful_test_count() const; + + // Gets the number of failed tests. + int failed_test_count() const; + + // Gets the number of disabled tests. + int disabled_test_count() const; + + // Gets the number of all tests. 
+ int total_test_count() const; + + // Gets the number of tests that should run. + int test_to_run_count() const; + + // Gets the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Returns true iff the unit test passed (i.e. all test cases passed). + bool Passed() const { return !Failed(); } + + // Returns true iff the unit test failed (i.e. some test case failed + // or something outside of all tests failed). + bool Failed() const { + return failed_test_case_count() > 0 || ad_hoc_test_result()->Failed(); + } + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + const TestCase* GetTestCase(int i) const { + const int index = GetElementOr(test_case_indices_, i, -1); + return index < 0 ? NULL : test_cases_[i]; + } + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + TestCase* GetMutableTestCase(int i) { + const int index = GetElementOr(test_case_indices_, i, -1); + return index < 0 ? NULL : test_cases_[index]; + } + + // Provides access to the event listener list. + TestEventListeners* listeners() { return &listeners_; } + + // Returns the TestResult for the test that's currently running, or + // the TestResult for the ad hoc test if no test is running. + TestResult* current_test_result(); + + // Returns the TestResult for the ad hoc test. + const TestResult* ad_hoc_test_result() const { return &ad_hoc_test_result_; } + + // Sets the OS stack trace getter. + // + // Does nothing if the input and the current OS stack trace getter + // are the same; otherwise, deletes the old getter and makes the + // input the current getter. 
+ void set_os_stack_trace_getter(OsStackTraceGetterInterface* getter); + + // Returns the current OS stack trace getter if it is not NULL; + // otherwise, creates an OsStackTraceGetter, makes it the current + // getter, and returns it. + OsStackTraceGetterInterface* os_stack_trace_getter(); + + // Returns the current OS stack trace as a String. + // + // The maximum number of stack frames to be included is specified by + // the gtest_stack_trace_depth flag. The skip_count parameter + // specifies the number of top frames to be skipped, which doesn't + // count against the number of frames to be included. + // + // For example, if Foo() calls Bar(), which in turn calls + // CurrentOsStackTraceExceptTop(1), Foo() will be included in the + // trace but Bar() and CurrentOsStackTraceExceptTop() won't. + String CurrentOsStackTraceExceptTop(int skip_count); + + // Finds and returns a TestCase with the given name. If one doesn't + // exist, creates one and returns it. + // + // Arguments: + // + // test_case_name: name of the test case + // type_param: the name of the test's type parameter, or NULL if + // this is not a typed or a type-parameterized test. + // set_up_tc: pointer to the function that sets up the test case + // tear_down_tc: pointer to the function that tears down the test case + TestCase* GetTestCase(const char* test_case_name, + const char* type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc); + + // Adds a TestInfo to the unit test. + // + // Arguments: + // + // set_up_tc: pointer to the function that sets up the test case + // tear_down_tc: pointer to the function that tears down the test case + // test_info: the TestInfo object + void AddTestInfo(Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc, + TestInfo* test_info) { + // In order to support thread-safe death tests, we need to + // remember the original working directory when the test program + // was first invoked. 
We cannot do this in RUN_ALL_TESTS(), as + // the user may have changed the current directory before calling + // RUN_ALL_TESTS(). Therefore we capture the current directory in + // AddTestInfo(), which is called to register a TEST or TEST_F + // before main() is reached. + if (original_working_dir_.IsEmpty()) { + original_working_dir_.Set(FilePath::GetCurrentDir()); + GTEST_CHECK_(!original_working_dir_.IsEmpty()) + << "Failed to get the current working directory."; + } + + GetTestCase(test_info->test_case_name(), + test_info->type_param(), + set_up_tc, + tear_down_tc)->AddTestInfo(test_info); + } + +#if GTEST_HAS_PARAM_TEST + // Returns ParameterizedTestCaseRegistry object used to keep track of + // value-parameterized tests and instantiate and register them. + internal::ParameterizedTestCaseRegistry& parameterized_test_registry() { + return parameterized_test_registry_; + } +#endif // GTEST_HAS_PARAM_TEST + + // Sets the TestCase object for the test that's currently running. + void set_current_test_case(TestCase* a_current_test_case) { + current_test_case_ = a_current_test_case; + } + + // Sets the TestInfo object for the test that's currently running. If + // current_test_info is NULL, the assertion results will be stored in + // ad_hoc_test_result_. + void set_current_test_info(TestInfo* a_current_test_info) { + current_test_info_ = a_current_test_info; + } + + // Registers all parameterized tests defined using TEST_P and + // INSTANTIATE_TEST_CASE_P, creating regular tests for each test/parameter + // combination. This method can be called more then once; it has guards + // protecting from registering the tests more then once. If + // value-parameterized tests are disabled, RegisterParameterizedTests is + // present but does nothing. + void RegisterParameterizedTests(); + + // Runs all tests in this UnitTest object, prints the result, and + // returns true if all tests are successful. 
If any exception is
+// thrown during a test, this test is considered to be failed, but
+// the rest of the tests will still be run.
+  bool RunAllTests();
+
+  // Clears the results of all tests, except the ad hoc tests.
+  void ClearNonAdHocTestResult() {
+    ForEach(test_cases_, TestCase::ClearTestCaseResult);
+  }
+
+  // Clears the results of ad-hoc test assertions.
+  void ClearAdHocTestResult() {
+    ad_hoc_test_result_.Clear();
+  }
+
+  enum ReactionToSharding {
+    HONOR_SHARDING_PROTOCOL,
+    IGNORE_SHARDING_PROTOCOL
+  };
+
+  // Matches the full name of each test against the user-specified
+  // filter to decide whether the test should run, then records the
+  // result in each TestCase and TestInfo object.
+  // If shard_tests == HONOR_SHARDING_PROTOCOL, further filters tests
+  // based on sharding variables in the environment.
+  // Returns the number of tests that should run.
+  int FilterTests(ReactionToSharding shard_tests);
+
+  // Prints the names of the tests matching the user-specified filter flag.
+  void ListTestsMatchingFilter();
+
+  const TestCase* current_test_case() const { return current_test_case_; }
+  TestInfo* current_test_info() { return current_test_info_; }
+  const TestInfo* current_test_info() const { return current_test_info_; }
+
+  // Returns the vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*>& environments() { return environments_; }
+
+  // Getters for the per-thread Google Test trace stack.
+  std::vector<TraceInfo>& gtest_trace_stack() {
+    return *(gtest_trace_stack_.pointer());
+  }
+  const std::vector<TraceInfo>& gtest_trace_stack() const {
+    return gtest_trace_stack_.get();
+  }
+
+#if GTEST_HAS_DEATH_TEST
+  void InitDeathTestSubprocessControlInfo() {
+    internal_run_death_test_flag_.reset(ParseInternalRunDeathTestFlag());
+  }
+  // Returns a pointer to the parsed --gtest_internal_run_death_test
+  // flag, or NULL if that flag was not specified.
+  // This information is useful only in a death test child process.
+ // Must not be called before a call to InitGoogleTest. + const InternalRunDeathTestFlag* internal_run_death_test_flag() const { + return internal_run_death_test_flag_.get(); + } + + // Returns a pointer to the current death test factory. + internal::DeathTestFactory* death_test_factory() { + return death_test_factory_.get(); + } + + void SuppressTestEventsIfInSubprocess(); + + friend class ReplaceDeathTestFactory; +#endif // GTEST_HAS_DEATH_TEST + + // Initializes the event listener performing XML output as specified by + // UnitTestOptions. Must not be called before InitGoogleTest. + void ConfigureXmlOutput(); + +#if GTEST_CAN_STREAM_RESULTS_ + // Initializes the event listener for streaming test results to a socket. + // Must not be called before InitGoogleTest. + void ConfigureStreamingOutput(); +#endif + + // Performs initialization dependent upon flag values obtained in + // ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to + // ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest + // this function is also called from RunAllTests. Since this function can be + // called more than once, it has to be idempotent. + void PostFlagParsingInit(); + + // Gets the random seed used at the start of the current test iteration. + int random_seed() const { return random_seed_; } + + // Gets the random number generator. + internal::Random* random() { return &random_; } + + // Shuffles all test cases, and the tests within each test case, + // making sure that death tests are still run first. + void ShuffleTests(); + + // Restores the test cases and tests to their order before the first shuffle. + void UnshuffleTests(); + + // Returns the value of GTEST_FLAG(catch_exceptions) at the moment + // UnitTest::Run() starts. 
+  bool catch_exceptions() const { return catch_exceptions_; }
+
+ private:
+  friend class ::testing::UnitTest;
+
+  // Used by UnitTest::Run() to capture the state of
+  // GTEST_FLAG(catch_exceptions) at the moment it starts.
+  void set_catch_exceptions(bool value) { catch_exceptions_ = value; }
+
+  // The UnitTest object that owns this implementation object.
+  UnitTest* const parent_;
+
+  // The working directory when the first TEST() or TEST_F() was
+  // executed.
+  internal::FilePath original_working_dir_;
+
+  // The default test part result reporters.
+  DefaultGlobalTestPartResultReporter default_global_test_part_result_reporter_;
+  DefaultPerThreadTestPartResultReporter
+      default_per_thread_test_part_result_reporter_;
+
+  // Points to (but doesn't own) the global test part result reporter.
+  TestPartResultReporterInterface* global_test_part_result_repoter_;
+
+  // Protects read and write access to global_test_part_result_reporter_.
+  internal::Mutex global_test_part_result_reporter_mutex_;
+
+  // Points to (but doesn't own) the per-thread test part result reporter.
+  internal::ThreadLocal<TestPartResultReporterInterface*>
+      per_thread_test_part_result_reporter_;
+
+  // The vector of environments that need to be set-up/torn-down
+  // before/after the tests are run.
+  std::vector<Environment*> environments_;
+
+  // The vector of TestCases in their original order.  It owns the
+  // elements in the vector.
+  std::vector<TestCase*> test_cases_;
+
+  // Provides a level of indirection for the test case list to allow
+  // easy shuffling and restoring the test case order.  The i-th
+  // element of this vector is the index of the i-th test case in the
+  // shuffled order.
+  std::vector<int> test_case_indices_;
+
+#if GTEST_HAS_PARAM_TEST
+  // ParameterizedTestRegistry object used to register value-parameterized
+  // tests.
+  internal::ParameterizedTestCaseRegistry parameterized_test_registry_;
+
+  // Indicates whether RegisterParameterizedTests() has been called already.
+ bool parameterized_tests_registered_; +#endif // GTEST_HAS_PARAM_TEST + + // Index of the last death test case registered. Initially -1. + int last_death_test_case_; + + // This points to the TestCase for the currently running test. It + // changes as Google Test goes through one test case after another. + // When no test is running, this is set to NULL and Google Test + // stores assertion results in ad_hoc_test_result_. Initially NULL. + TestCase* current_test_case_; + + // This points to the TestInfo for the currently running test. It + // changes as Google Test goes through one test after another. When + // no test is running, this is set to NULL and Google Test stores + // assertion results in ad_hoc_test_result_. Initially NULL. + TestInfo* current_test_info_; + + // Normally, a user only writes assertions inside a TEST or TEST_F, + // or inside a function called by a TEST or TEST_F. Since Google + // Test keeps track of which test is current running, it can + // associate such an assertion with the test it belongs to. + // + // If an assertion is encountered when no TEST or TEST_F is running, + // Google Test attributes the assertion result to an imaginary "ad hoc" + // test, and records the result in ad_hoc_test_result_. + TestResult ad_hoc_test_result_; + + // The list of event listeners that can be used to track events inside + // Google Test. + TestEventListeners listeners_; + + // The OS stack trace getter. Will be deleted when the UnitTest + // object is destructed. By default, an OsStackTraceGetter is used, + // but the user can set this field to use a custom getter if that is + // desired. + OsStackTraceGetterInterface* os_stack_trace_getter_; + + // True iff PostFlagParsingInit() has been called. + bool post_flag_parse_init_performed_; + + // The random number seed used at the beginning of the test run. + int random_seed_; + + // Our random number generator. + internal::Random random_; + + // How long the test took to run, in milliseconds. 
+  TimeInMillis elapsed_time_;
+
+#if GTEST_HAS_DEATH_TEST
+  // The decomposed components of the gtest_internal_run_death_test flag,
+  // parsed when RUN_ALL_TESTS is called.
+  internal::scoped_ptr<InternalRunDeathTestFlag> internal_run_death_test_flag_;
+  internal::scoped_ptr<internal::DeathTestFactory> death_test_factory_;
+#endif  // GTEST_HAS_DEATH_TEST
+
+  // A per-thread stack of traces created by the SCOPED_TRACE() macro.
+  internal::ThreadLocal<std::vector<TraceInfo> > gtest_trace_stack_;
+
+  // The value of GTEST_FLAG(catch_exceptions) at the moment RunAllTests()
+  // starts.
+  bool catch_exceptions_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTestImpl);
+};  // class UnitTestImpl
+
+// Convenience function for accessing the global UnitTest
+// implementation object.
+inline UnitTestImpl* GetUnitTestImpl() {
+  return UnitTest::GetInstance()->impl();
+}
+
+#if GTEST_USES_SIMPLE_RE
+
+// Internal helper functions for implementing the simple regular
+// expression matcher.
+GTEST_API_ bool IsInSet(char ch, const char* str);
+GTEST_API_ bool IsAsciiDigit(char ch);
+GTEST_API_ bool IsAsciiPunct(char ch);
+GTEST_API_ bool IsRepeat(char ch);
+GTEST_API_ bool IsAsciiWhiteSpace(char ch);
+GTEST_API_ bool IsAsciiWordChar(char ch);
+GTEST_API_ bool IsValidEscape(char ch);
+GTEST_API_ bool AtomMatchesChar(bool escaped, char pattern, char ch);
+GTEST_API_ bool ValidateRegex(const char* regex);
+GTEST_API_ bool MatchRegexAtHead(const char* regex, const char* str);
+GTEST_API_ bool MatchRepetitionAndRegexAtHead(
+    bool escaped, char ch, char repeat, const char* regex, const char* str);
+GTEST_API_ bool MatchRegexAnywhere(const char* regex, const char* str);
+
+#endif  // GTEST_USES_SIMPLE_RE
+
+// Parses the command line for Google Test flags, without initializing
+// other parts of Google Test.
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, char** argv);
+GTEST_API_ void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv);
+
+#if GTEST_HAS_DEATH_TEST
+
+// Returns the message describing the last system error, regardless of the
+// platform.
+GTEST_API_ String GetLastErrnoDescription();
+
+# if GTEST_OS_WINDOWS
+// Provides leak-safe Windows kernel handle ownership.
+class AutoHandle {
+ public:
+  AutoHandle() : handle_(INVALID_HANDLE_VALUE) {}
+  explicit AutoHandle(HANDLE handle) : handle_(handle) {}
+
+  ~AutoHandle() { Reset(); }
+
+  HANDLE Get() const { return handle_; }
+  void Reset() { Reset(INVALID_HANDLE_VALUE); }
+  void Reset(HANDLE handle) {
+    if (handle != handle_) {
+      if (handle_ != INVALID_HANDLE_VALUE)
+        ::CloseHandle(handle_);
+      handle_ = handle;
+    }
+  }
+
+ private:
+  HANDLE handle_;
+
+  GTEST_DISALLOW_COPY_AND_ASSIGN_(AutoHandle);
+};
+# endif  // GTEST_OS_WINDOWS
+
+// Attempts to parse a string into a positive integer pointed to by the
+// number parameter.  Returns true if that is possible.
+// GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we can use
+// it here.
+template <typename Integer>
+bool ParseNaturalNumber(const ::std::string& str, Integer* number) {
+  // Fail fast if the given string does not begin with a digit;
+  // this bypasses strtoXXX's "optional leading whitespace and plus
+  // or minus sign" semantics, which are undesirable here.
+  if (str.empty() || !IsDigit(str[0])) {
+    return false;
+  }
+  errno = 0;
+
+  char* end;
+  // BiggestConvertible is the largest integer type that system-provided
+  // string-to-number conversion routines can return.
+
+# if GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  // MSVC and C++ Builder define __int64 instead of the standard long long.
+  typedef unsigned __int64 BiggestConvertible;
+  const BiggestConvertible parsed = _strtoui64(str.c_str(), &end, 10);
+
+# else
+
+  typedef unsigned long long BiggestConvertible;  // NOLINT
+  const BiggestConvertible parsed = strtoull(str.c_str(), &end, 10);
+
+# endif  // GTEST_OS_WINDOWS && !defined(__GNUC__)
+
+  const bool parse_success = *end == '\0' && errno == 0;
+
+  // TODO(vladl@google.com): Convert this to compile time assertion when it is
+  // available.
+ GTEST_CHECK_(sizeof(Integer) <= sizeof(parsed)); + + const Integer result = static_cast(parsed); + if (parse_success && static_cast(result) == parsed) { + *number = result; + return true; + } + return false; +} +#endif // GTEST_HAS_DEATH_TEST + +// TestResult contains some private methods that should be hidden from +// Google Test user but are required for testing. This class allow our tests +// to access them. +// +// This class is supplied only for the purpose of testing Google Test's own +// constructs. Do not use it in user tests, either directly or indirectly. +class TestResultAccessor { + public: + static void RecordProperty(TestResult* test_result, + const TestProperty& property) { + test_result->RecordProperty(property); + } + + static void ClearTestPartResults(TestResult* test_result) { + test_result->ClearTestPartResults(); + } + + static const std::vector& test_part_results( + const TestResult& test_result) { + return test_result.test_part_results(); + } +}; + +} // namespace internal +} // namespace testing + +#endif // GTEST_SRC_GTEST_INTERNAL_INL_H_ +#undef GTEST_IMPLEMENTATION_ + +#if GTEST_OS_WINDOWS +# define vsnprintf _vsnprintf +#endif // GTEST_OS_WINDOWS + +namespace testing { + +using internal::CountIf; +using internal::ForEach; +using internal::GetElementOr; +using internal::Shuffle; + +// Constants. + +// A test whose test case name or test name matches this filter is +// disabled and not run. +static const char kDisableTestFilter[] = "DISABLED_*:*/DISABLED_*"; + +// A test case whose name matches this filter is considered a death +// test case and will be run before test cases whose name doesn't +// match this filter. +static const char kDeathTestCaseFilter[] = "*DeathTest:*DeathTest/*"; + +// A test filter that matches everything. +static const char kUniversalFilter[] = "*"; + +// The default output file for XML output. 
+static const char kDefaultOutputFile[] = "test_detail.xml"; + +// The environment variable name for the test shard index. +static const char kTestShardIndex[] = "GTEST_SHARD_INDEX"; +// The environment variable name for the total number of test shards. +static const char kTestTotalShards[] = "GTEST_TOTAL_SHARDS"; +// The environment variable name for the test shard status file. +static const char kTestShardStatusFile[] = "GTEST_SHARD_STATUS_FILE"; + +namespace internal { + +// The text used in failure messages to indicate the start of the +// stack trace. +const char kStackTraceMarker[] = "\nStack trace:\n"; + +// g_help_flag is true iff the --help flag or an equivalent form is +// specified on the command line. +bool g_help_flag = false; + +} // namespace internal + +GTEST_DEFINE_bool_( + also_run_disabled_tests, + internal::BoolFromGTestEnv("also_run_disabled_tests", false), + "Run disabled tests too, in addition to the tests normally being run."); + +GTEST_DEFINE_bool_( + break_on_failure, + internal::BoolFromGTestEnv("break_on_failure", false), + "True iff a failed assertion should be a debugger break-point."); + +GTEST_DEFINE_bool_( + catch_exceptions, + internal::BoolFromGTestEnv("catch_exceptions", true), + "True iff " GTEST_NAME_ + " should catch exceptions and treat them as test failures."); + +GTEST_DEFINE_string_( + color, + internal::StringFromGTestEnv("color", "auto"), + "Whether to use colors in the output. Valid values: yes, no, " + "and auto. 'auto' means to use colors if the output is " + "being sent to a terminal and the TERM environment variable " + "is set to xterm, xterm-color, xterm-256color, linux or cygwin."); + +GTEST_DEFINE_string_( + filter, + internal::StringFromGTestEnv("filter", kUniversalFilter), + "A colon-separated list of glob (not regex) patterns " + "for filtering the tests to run, optionally followed by a " + "'-' and a : separated list of negative patterns (tests to " + "exclude). 
A test is run if it matches one of the positive " + "patterns and does not match any of the negative patterns."); + +GTEST_DEFINE_bool_(list_tests, false, + "List all tests without running them."); + +GTEST_DEFINE_string_( + output, + internal::StringFromGTestEnv("output", ""), + "A format (currently must be \"xml\"), optionally followed " + "by a colon and an output file name or directory. A directory " + "is indicated by a trailing pathname separator. " + "Examples: \"xml:filename.xml\", \"xml::directoryname/\". " + "If a directory is specified, output files will be created " + "within that directory, with file-names based on the test " + "executable's name and, if necessary, made unique by adding " + "digits."); + +GTEST_DEFINE_bool_( + print_time, + internal::BoolFromGTestEnv("print_time", true), + "True iff " GTEST_NAME_ + " should display elapsed time in text output."); + +GTEST_DEFINE_int32_( + random_seed, + internal::Int32FromGTestEnv("random_seed", 0), + "Random number seed to use when shuffling test orders. Must be in range " + "[1, 99999], or 0 to use a seed based on the current time."); + +GTEST_DEFINE_int32_( + repeat, + internal::Int32FromGTestEnv("repeat", 1), + "How many times to repeat each test. Specify a negative number " + "for repeating forever. Useful for shaking out flaky tests."); + +GTEST_DEFINE_bool_( + show_internal_stack_frames, false, + "True iff " GTEST_NAME_ " should include internal stack frames when " + "printing test failure stack traces."); + +GTEST_DEFINE_bool_( + shuffle, + internal::BoolFromGTestEnv("shuffle", false), + "True iff " GTEST_NAME_ + " should randomize tests' order on every run."); + +GTEST_DEFINE_int32_( + stack_trace_depth, + internal::Int32FromGTestEnv("stack_trace_depth", kMaxStackTraceDepth), + "The maximum number of stack frames to print when an " + "assertion fails. 
The valid range is 0 through 100, inclusive."); + +GTEST_DEFINE_string_( + stream_result_to, + internal::StringFromGTestEnv("stream_result_to", ""), + "This flag specifies the host name and the port number on which to stream " + "test results. Example: \"localhost:555\". The flag is effective only on " + "Linux."); + +GTEST_DEFINE_bool_( + throw_on_failure, + internal::BoolFromGTestEnv("throw_on_failure", false), + "When this flag is specified, a failed assertion will throw an exception " + "if exceptions are enabled or exit the program with a non-zero code " + "otherwise."); + +namespace internal { + +// Generates a random number from [0, range), using a Linear +// Congruential Generator (LCG). Crashes if 'range' is 0 or greater +// than kMaxRange. +UInt32 Random::Generate(UInt32 range) { + // These constants are the same as are used in glibc's rand(3). + state_ = (1103515245U*state_ + 12345U) % kMaxRange; + + GTEST_CHECK_(range > 0) + << "Cannot generate a number in the range [0, 0)."; + GTEST_CHECK_(range <= kMaxRange) + << "Generation of a number in [0, " << range << ") was requested, " + << "but this can only generate numbers in [0, " << kMaxRange << ")."; + + // Converting via modulus introduces a bit of downward bias, but + // it's simple, and a linear congruential generator isn't too good + // to begin with. + return state_ % range; +} + +// GTestIsInitialized() returns true iff the user has initialized +// Google Test. Useful for catching the user mistake of not initializing +// Google Test before calling RUN_ALL_TESTS(). +// +// A user must call testing::InitGoogleTest() to initialize Google +// Test. g_init_gtest_count is set to the number of times +// InitGoogleTest() has been called. We don't protect this variable +// under a mutex as it is only accessed in the main thread. 
+int g_init_gtest_count = 0;
+static bool GTestIsInitialized() { return g_init_gtest_count != 0; }
+
+// Iterates over a vector of TestCases, keeping a running sum of the
+// results of calling a given int-returning method on each.
+// Returns the sum.
+static int SumOverTestCaseList(const std::vector<TestCase*>& case_list,
+                               int (TestCase::*method)() const) {
+  int sum = 0;
+  for (size_t i = 0; i < case_list.size(); i++) {
+    sum += (case_list[i]->*method)();
+  }
+  return sum;
+}
+
+// Returns true iff the test case passed.
+static bool TestCasePassed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Passed();
+}
+
+// Returns true iff the test case failed.
+static bool TestCaseFailed(const TestCase* test_case) {
+  return test_case->should_run() && test_case->Failed();
+}
+
+// Returns true iff test_case contains at least one test that should
+// run.
+static bool ShouldRunTestCase(const TestCase* test_case) {
+  return test_case->should_run();
+}
+
+// AssertHelper constructor.
+AssertHelper::AssertHelper(TestPartResult::Type type,
+                           const char* file,
+                           int line,
+                           const char* message)
+    : data_(new AssertHelperData(type, file, line, message)) {
+}
+
+AssertHelper::~AssertHelper() {
+  delete data_;
+}
+
+// Message assignment, for assertion streaming support.
+void AssertHelper::operator=(const Message& message) const {
+  UnitTest::GetInstance()->
+    AddTestPartResult(data_->type, data_->file, data_->line,
+                      AppendUserMessage(data_->message, message),
+                      UnitTest::GetInstance()->impl()
+                      ->CurrentOsStackTraceExceptTop(1)
+                      // Skips the stack frame for this function itself.
+                      );  // NOLINT
+}
+
+// Mutex for linked pointers.
+GTEST_DEFINE_STATIC_MUTEX_(g_linked_ptr_mutex);
+
+// Application pathname gotten in InitGoogleTest.
+String g_executable_path;
+
+// Returns the current application's name, removing directory path if that
+// is present.
+FilePath GetCurrentExecutableName() { + FilePath result; + +#if GTEST_OS_WINDOWS + result.Set(FilePath(g_executable_path).RemoveExtension("exe")); +#else + result.Set(FilePath(g_executable_path)); +#endif // GTEST_OS_WINDOWS + + return result.RemoveDirectoryName(); +} + +// Functions for processing the gtest_output flag. + +// Returns the output format, or "" for normal printed output. +String UnitTestOptions::GetOutputFormat() { + const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + if (gtest_output_flag == NULL) return String(""); + + const char* const colon = strchr(gtest_output_flag, ':'); + return (colon == NULL) ? + String(gtest_output_flag) : + String(gtest_output_flag, colon - gtest_output_flag); +} + +// Returns the name of the requested output file, or the default if none +// was explicitly specified. +String UnitTestOptions::GetAbsolutePathToOutputFile() { + const char* const gtest_output_flag = GTEST_FLAG(output).c_str(); + if (gtest_output_flag == NULL) + return String(""); + + const char* const colon = strchr(gtest_output_flag, ':'); + if (colon == NULL) + return String(internal::FilePath::ConcatPaths( + internal::FilePath( + UnitTest::GetInstance()->original_working_dir()), + internal::FilePath(kDefaultOutputFile)).ToString() ); + + internal::FilePath output_name(colon + 1); + if (!output_name.IsAbsolutePath()) + // TODO(wan@google.com): on Windows \some\path is not an absolute + // path (as its meaning depends on the current drive), yet the + // following logic for turning it into an absolute path is wrong. + // Fix it. 
+    output_name = internal::FilePath::ConcatPaths(
+        internal::FilePath(UnitTest::GetInstance()->original_working_dir()),
+        internal::FilePath(colon + 1));
+
+  if (!output_name.IsDirectory())
+    return output_name.ToString();
+
+  internal::FilePath result(internal::FilePath::GenerateUniqueFileName(
+      output_name, internal::GetCurrentExecutableName(),
+      GetOutputFormat().c_str()));
+  return result.ToString();
+}
+
+// Returns true iff the wildcard pattern matches the string.  The
+// first ':' or '\0' character in pattern marks the end of it.
+//
+// This recursive algorithm isn't very efficient, but is clear and
+// works well enough for matching test names, which are short.
+// (Worst case is exponential in the number of '*'s, since the '*'
+// case below branches into two recursive calls.)
+bool UnitTestOptions::PatternMatchesString(const char *pattern,
+                                           const char *str) {
+  switch (*pattern) {
+    case '\0':
+    case ':':  // Either ':' or '\0' marks the end of the pattern.
+      return *str == '\0';
+    case '?':  // Matches any single character.
+      return *str != '\0' && PatternMatchesString(pattern + 1, str + 1);
+    case '*':  // Matches any string (possibly empty) of characters.
+      return (*str != '\0' && PatternMatchesString(pattern, str + 1)) ||
+          PatternMatchesString(pattern + 1, str);
+    default:  // Non-special character.  Matches itself.
+      return *pattern == *str &&
+          PatternMatchesString(pattern + 1, str + 1);
+  }
+}
+
+// Returns true iff any single ':'-separated pattern in 'filter'
+// matches 'name'.
+bool UnitTestOptions::MatchesFilter(const String& name, const char* filter) {
+  const char *cur_pattern = filter;
+  for (;;) {
+    if (PatternMatchesString(cur_pattern, name.c_str())) {
+      return true;
+    }
+
+    // Finds the next pattern in the filter.
+    cur_pattern = strchr(cur_pattern, ':');
+
+    // Returns if no more pattern can be found.
+    if (cur_pattern == NULL) {
+      return false;
+    }
+
+    // Skips the pattern separator (the ':' character).
+    cur_pattern++;
+  }
+}
+
+// TODO(keithray): move String function implementations to gtest-string.cc.
+
+// Returns true iff the user-specified filter matches the test case
+// name and the test name.
+bool UnitTestOptions::FilterMatchesTest(const String &test_case_name, + const String &test_name) { + const String& full_name = String::Format("%s.%s", + test_case_name.c_str(), + test_name.c_str()); + + // Split --gtest_filter at '-', if there is one, to separate into + // positive filter and negative filter portions + const char* const p = GTEST_FLAG(filter).c_str(); + const char* const dash = strchr(p, '-'); + String positive; + String negative; + if (dash == NULL) { + positive = GTEST_FLAG(filter).c_str(); // Whole string is a positive filter + negative = String(""); + } else { + positive = String(p, dash - p); // Everything up to the dash + negative = String(dash+1); // Everything after the dash + if (positive.empty()) { + // Treat '-test1' as the same as '*-test1' + positive = kUniversalFilter; + } + } + + // A filter is a colon-separated list of patterns. It matches a + // test if any pattern in it matches the test. + return (MatchesFilter(full_name, positive.c_str()) && + !MatchesFilter(full_name, negative.c_str())); +} + +#if GTEST_HAS_SEH +// Returns EXCEPTION_EXECUTE_HANDLER if Google Test should handle the +// given SEH exception, or EXCEPTION_CONTINUE_SEARCH otherwise. +// This function is useful as an __except condition. +int UnitTestOptions::GTestShouldProcessSEH(DWORD exception_code) { + // Google Test should handle a SEH exception if: + // 1. the user wants it to, AND + // 2. this is not a breakpoint exception, AND + // 3. this is not a C++ exception (VC++ implements them via SEH, + // apparently). + // + // SEH exception code for C++ exceptions. + // (see http://support.microsoft.com/kb/185294 for more information). + const DWORD kCxxExceptionCode = 0xe06d7363; + + bool should_handle = true; + + if (!GTEST_FLAG(catch_exceptions)) + should_handle = false; + else if (exception_code == EXCEPTION_BREAKPOINT) + should_handle = false; + else if (exception_code == kCxxExceptionCode) + should_handle = false; + + return should_handle ? 
EXCEPTION_EXECUTE_HANDLER : EXCEPTION_CONTINUE_SEARCH; +} +#endif // GTEST_HAS_SEH + +} // namespace internal + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. Intercepts only failures from the current thread. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + TestPartResultArray* result) + : intercept_mode_(INTERCEPT_ONLY_CURRENT_THREAD), + result_(result) { + Init(); +} + +// The c'tor sets this object as the test part result reporter used by +// Google Test. The 'result' parameter specifies where to report the +// results. +ScopedFakeTestPartResultReporter::ScopedFakeTestPartResultReporter( + InterceptMode intercept_mode, TestPartResultArray* result) + : intercept_mode_(intercept_mode), + result_(result) { + Init(); +} + +void ScopedFakeTestPartResultReporter::Init() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + old_reporter_ = impl->GetGlobalTestPartResultReporter(); + impl->SetGlobalTestPartResultReporter(this); + } else { + old_reporter_ = impl->GetTestPartResultReporterForCurrentThread(); + impl->SetTestPartResultReporterForCurrentThread(this); + } +} + +// The d'tor restores the test part result reporter used by Google Test +// before. +ScopedFakeTestPartResultReporter::~ScopedFakeTestPartResultReporter() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + if (intercept_mode_ == INTERCEPT_ALL_THREADS) { + impl->SetGlobalTestPartResultReporter(old_reporter_); + } else { + impl->SetTestPartResultReporterForCurrentThread(old_reporter_); + } +} + +// Increments the test part result count and remembers the result. +// This method is from the TestPartResultReporterInterface interface. 
+void ScopedFakeTestPartResultReporter::ReportTestPartResult(
+    const TestPartResult& result) {
+  result_->Append(result);
+}
+
+namespace internal {
+
+// Returns the type ID of ::testing::Test.  We should always call this
+// instead of GetTypeId< ::testing::Test>() to get the type ID of
+// testing::Test.  This is to work around a suspected linker bug when
+// using Google Test as a framework on Mac OS X.  The bug causes
+// GetTypeId< ::testing::Test>() to return different values depending
+// on whether the call is from the Google Test framework itself or
+// from user test code.  GetTestTypeId() is guaranteed to always
+// return the same value, as it always calls GetTypeId<>() from the
+// gtest.cc, which is within the Google Test framework.
+TypeId GetTestTypeId() {
+  return GetTypeId<Test>();
+}
+
+// The value of GetTestTypeId() as seen from within the Google Test
+// library.  This is solely for testing GetTestTypeId().
+extern const TypeId kTestTypeIdInGoogleTest = GetTestTypeId();
+
+// This predicate-formatter checks that 'results' contains a test part
+// failure of the given type and that the failure message contains the
+// given substring.
+AssertionResult HasOneFailure(const char* /* results_expr */,
+                              const char* /* type_expr */,
+                              const char* /* substr_expr */,
+                              const TestPartResultArray& results,
+                              TestPartResult::Type type,
+                              const string& substr) {
+  const String expected(type == TestPartResult::kFatalFailure ?
+ "1 fatal failure" : + "1 non-fatal failure"); + Message msg; + if (results.size() != 1) { + msg << "Expected: " << expected << "\n" + << " Actual: " << results.size() << " failures"; + for (int i = 0; i < results.size(); i++) { + msg << "\n" << results.GetTestPartResult(i); + } + return AssertionFailure() << msg; + } + + const TestPartResult& r = results.GetTestPartResult(0); + if (r.type() != type) { + return AssertionFailure() << "Expected: " << expected << "\n" + << " Actual:\n" + << r; + } + + if (strstr(r.message(), substr.c_str()) == NULL) { + return AssertionFailure() << "Expected: " << expected << " containing \"" + << substr << "\"\n" + << " Actual:\n" + << r; + } + + return AssertionSuccess(); +} + +// The constructor of SingleFailureChecker remembers where to look up +// test part results, what type of failure we expect, and what +// substring the failure message should contain. +SingleFailureChecker:: SingleFailureChecker( + const TestPartResultArray* results, + TestPartResult::Type type, + const string& substr) + : results_(results), + type_(type), + substr_(substr) {} + +// The destructor of SingleFailureChecker verifies that the given +// TestPartResultArray contains exactly one failure that has the given +// type and contains the given substring. If that's not the case, a +// non-fatal failure will be generated. 
+SingleFailureChecker::~SingleFailureChecker() { + EXPECT_PRED_FORMAT3(HasOneFailure, *results_, type_, substr_); +} + +DefaultGlobalTestPartResultReporter::DefaultGlobalTestPartResultReporter( + UnitTestImpl* unit_test) : unit_test_(unit_test) {} + +void DefaultGlobalTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->current_test_result()->AddTestPartResult(result); + unit_test_->listeners()->repeater()->OnTestPartResult(result); +} + +DefaultPerThreadTestPartResultReporter::DefaultPerThreadTestPartResultReporter( + UnitTestImpl* unit_test) : unit_test_(unit_test) {} + +void DefaultPerThreadTestPartResultReporter::ReportTestPartResult( + const TestPartResult& result) { + unit_test_->GetGlobalTestPartResultReporter()->ReportTestPartResult(result); +} + +// Returns the global test part result reporter. +TestPartResultReporterInterface* +UnitTestImpl::GetGlobalTestPartResultReporter() { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + return global_test_part_result_repoter_; +} + +// Sets the global test part result reporter. +void UnitTestImpl::SetGlobalTestPartResultReporter( + TestPartResultReporterInterface* reporter) { + internal::MutexLock lock(&global_test_part_result_reporter_mutex_); + global_test_part_result_repoter_ = reporter; +} + +// Returns the test part result reporter for the current thread. +TestPartResultReporterInterface* +UnitTestImpl::GetTestPartResultReporterForCurrentThread() { + return per_thread_test_part_result_reporter_.get(); +} + +// Sets the test part result reporter for the current thread. +void UnitTestImpl::SetTestPartResultReporterForCurrentThread( + TestPartResultReporterInterface* reporter) { + per_thread_test_part_result_reporter_.set(reporter); +} + +// Gets the number of successful test cases. +int UnitTestImpl::successful_test_case_count() const { + return CountIf(test_cases_, TestCasePassed); +} + +// Gets the number of failed test cases. 
+int UnitTestImpl::failed_test_case_count() const {
+  return CountIf(test_cases_, TestCaseFailed);
+}
+
+// Gets the number of all test cases.
+int UnitTestImpl::total_test_case_count() const {
+  return static_cast<int>(test_cases_.size());
+}
+
+// Gets the number of all test cases that contain at least one test
+// that should run.
+int UnitTestImpl::test_case_to_run_count() const {
+  return CountIf(test_cases_, ShouldRunTestCase);
+}
+
+// Gets the number of successful tests.
+int UnitTestImpl::successful_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::successful_test_count);
+}
+
+// Gets the number of failed tests.
+int UnitTestImpl::failed_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::failed_test_count);
+}
+
+// Gets the number of disabled tests.
+int UnitTestImpl::disabled_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::disabled_test_count);
+}
+
+// Gets the number of all tests.
+int UnitTestImpl::total_test_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::total_test_count);
+}
+
+// Gets the number of tests that should run.
+int UnitTestImpl::test_to_run_count() const {
+  return SumOverTestCaseList(test_cases_, &TestCase::test_to_run_count);
+}
+
+// Returns the current OS stack trace as a String.
+//
+// The maximum number of stack frames to be included is specified by
+// the gtest_stack_trace_depth flag.  The skip_count parameter
+// specifies the number of top frames to be skipped, which doesn't
+// count against the number of frames to be included.
+//
+// For example, if Foo() calls Bar(), which in turn calls
+// CurrentOsStackTraceExceptTop(1), Foo() will be included in the
+// trace but Bar() and CurrentOsStackTraceExceptTop() won't.
+String UnitTestImpl::CurrentOsStackTraceExceptTop(int skip_count) {
+  (void)skip_count;
+  return String("");
+}
+
+// Returns the current time in milliseconds.
+TimeInMillis GetTimeInMillis() { +#if GTEST_OS_WINDOWS_MOBILE || defined(__BORLANDC__) + // Difference between 1970-01-01 and 1601-01-01 in milliseconds. + // http://analogous.blogspot.com/2005/04/epoch.html + const TimeInMillis kJavaEpochToWinFileTimeDelta = + static_cast(116444736UL) * 100000UL; + const DWORD kTenthMicrosInMilliSecond = 10000; + + SYSTEMTIME now_systime; + FILETIME now_filetime; + ULARGE_INTEGER now_int64; + // TODO(kenton@google.com): Shouldn't this just use + // GetSystemTimeAsFileTime()? + GetSystemTime(&now_systime); + if (SystemTimeToFileTime(&now_systime, &now_filetime)) { + now_int64.LowPart = now_filetime.dwLowDateTime; + now_int64.HighPart = now_filetime.dwHighDateTime; + now_int64.QuadPart = (now_int64.QuadPart / kTenthMicrosInMilliSecond) - + kJavaEpochToWinFileTimeDelta; + return now_int64.QuadPart; + } + return 0; +#elif GTEST_OS_WINDOWS && !GTEST_HAS_GETTIMEOFDAY_ + __timeb64 now; + +# ifdef _MSC_VER + + // MSVC 8 deprecates _ftime64(), so we want to suppress warning 4996 + // (deprecated function) there. + // TODO(kenton@google.com): Use GetTickCount()? Or use + // SystemTimeToFileTime() +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4996) // Temporarily disables warning 4996. + _ftime64(&now); +# pragma warning(pop) // Restores the warning state. +# else + + _ftime64(&now); + +# endif // _MSC_VER + + return static_cast(now.time) * 1000 + now.millitm; +#elif GTEST_HAS_GETTIMEOFDAY_ + struct timeval now; + gettimeofday(&now, NULL); + return static_cast(now.tv_sec) * 1000 + now.tv_usec / 1000; +#else +# error "Don't know how to get the current time on your system." +#endif +} + +// Utilities + +// class String + +// Returns the input enclosed in double quotes if it's not NULL; +// otherwise returns "(null)". For example, "\"Hello\"" is returned +// for input "Hello". +// +// This is useful for printing a C string in the syntax of a literal. 
+// +// Known issue: escape sequences are not handled yet. +String String::ShowCStringQuoted(const char* c_str) { + return c_str ? String::Format("\"%s\"", c_str) : String("(null)"); +} + +// Copies at most length characters from str into a newly-allocated +// piece of memory of size length+1. The memory is allocated with new[]. +// A terminating null byte is written to the memory, and a pointer to it +// is returned. If str is NULL, NULL is returned. +static char* CloneString(const char* str, size_t length) { + if (str == NULL) { + return NULL; + } else { + char* const clone = new char[length + 1]; + posix::StrNCpy(clone, str, length); + clone[length] = '\0'; + return clone; + } +} + +// Clones a 0-terminated C string, allocating memory using new. The +// caller is responsible for deleting[] the return value. Returns the +// cloned string, or NULL if the input is NULL. +const char * String::CloneCString(const char* c_str) { + return (c_str == NULL) ? + NULL : CloneString(c_str, strlen(c_str)); +} + +#if GTEST_OS_WINDOWS_MOBILE +// Creates a UTF-16 wide string from the given ANSI string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the wide string, or NULL if the +// input is NULL. +LPCWSTR String::AnsiToUtf16(const char* ansi) { + if (!ansi) return NULL; + const int length = strlen(ansi); + const int unicode_length = + MultiByteToWideChar(CP_ACP, 0, ansi, length, + NULL, 0); + WCHAR* unicode = new WCHAR[unicode_length + 1]; + MultiByteToWideChar(CP_ACP, 0, ansi, length, + unicode, unicode_length); + unicode[unicode_length] = 0; + return unicode; +} + +// Creates an ANSI string from the given wide string, allocating +// memory using new. The caller is responsible for deleting the return +// value using delete[]. Returns the ANSI string, or NULL if the +// input is NULL. 
+const char* String::Utf16ToAnsi(LPCWSTR utf16_str) { + if (!utf16_str) return NULL; + const int ansi_length = + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, + NULL, 0, NULL, NULL); + char* ansi = new char[ansi_length + 1]; + WideCharToMultiByte(CP_ACP, 0, utf16_str, -1, + ansi, ansi_length, NULL, NULL); + ansi[ansi_length] = 0; + return ansi; +} + +#endif // GTEST_OS_WINDOWS_MOBILE + +// Compares two C strings. Returns true iff they have the same content. +// +// Unlike strcmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CStringEquals(const char * lhs, const char * rhs) { + if ( lhs == NULL ) return rhs == NULL; + + if ( rhs == NULL ) return false; + + return strcmp(lhs, rhs) == 0; +} + +#if GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING + +// Converts an array of wide chars to a narrow string using the UTF-8 +// encoding, and streams the result to the given Message object. +static void StreamWideCharsToMessage(const wchar_t* wstr, size_t length, + Message* msg) { + // TODO(wan): consider allowing a testing::String object to + // contain '\0'. This will make it behave more like std::string, + // and will allow ToUtf8String() to return the correct encoding + // for '\0' s.t. we can get rid of the conditional here (and in + // several other places). + for (size_t i = 0; i != length; ) { // NOLINT + if (wstr[i] != L'\0') { + *msg << WideStringToUtf8(wstr + i, static_cast(length - i)); + while (i != length && wstr[i] != L'\0') + i++; + } else { + *msg << '\0'; + i++; + } + } +} + +#endif // GTEST_HAS_STD_WSTRING || GTEST_HAS_GLOBAL_WSTRING + +} // namespace internal + +#if GTEST_HAS_STD_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. 
+Message& Message::operator <<(const ::std::wstring& wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_GLOBAL_WSTRING +// Converts the given wide string to a narrow string using the UTF-8 +// encoding, and streams the result to this Message object. +Message& Message::operator <<(const ::wstring& wstr) { + internal::StreamWideCharsToMessage(wstr.c_str(), wstr.length(), this); + return *this; +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +// AssertionResult constructors. +// Used in EXPECT_TRUE/FALSE(assertion_result). +AssertionResult::AssertionResult(const AssertionResult& other) + : success_(other.success_), + message_(other.message_.get() != NULL ? + new ::std::string(*other.message_) : + static_cast< ::std::string*>(NULL)) { +} + +// Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. +AssertionResult AssertionResult::operator!() const { + AssertionResult negation(!success_); + if (message_.get() != NULL) + negation << *message_; + return negation; +} + +// Makes a successful assertion result. +AssertionResult AssertionSuccess() { + return AssertionResult(true); +} + +// Makes a failed assertion result. +AssertionResult AssertionFailure() { + return AssertionResult(false); +} + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << message. +AssertionResult AssertionFailure(const Message& message) { + return AssertionFailure() << message; +} + +namespace internal { + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. 
For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// expected_expression: "foo" +// actual_expression: "bar" +// expected_value: "5" +// actual_value: "6" +// +// The ignoring_case parameter is true iff the assertion is a +// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will +// be inserted into the message. +AssertionResult EqFailure(const char* expected_expression, + const char* actual_expression, + const String& expected_value, + const String& actual_value, + bool ignoring_case) { + Message msg; + msg << "Value of: " << actual_expression; + if (actual_value != actual_expression) { + msg << "\n Actual: " << actual_value; + } + + msg << "\nExpected: " << expected_expression; + if (ignoring_case) { + msg << " (ignoring case)"; + } + if (expected_value != expected_expression) { + msg << "\nWhich is: " << expected_value; + } + + return AssertionFailure() << msg; +} + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +String GetBoolAssertionFailureMessage(const AssertionResult& assertion_result, + const char* expression_text, + const char* actual_predicate_value, + const char* expected_predicate_value) { + const char* actual_message = assertion_result.message(); + Message msg; + msg << "Value of: " << expression_text + << "\n Actual: " << actual_predicate_value; + if (actual_message[0] != '\0') + msg << " (" << actual_message << ")"; + msg << "\nExpected: " << expected_predicate_value; + return msg.GetString(); +} + +// Helper function for implementing ASSERT_NEAR. +AssertionResult DoubleNearPredFormat(const char* expr1, + const char* expr2, + const char* abs_error_expr, + double val1, + double val2, + double abs_error) { + const double diff = fabs(val1 - val2); + if (diff <= abs_error) return AssertionSuccess(); + + // TODO(wan): do not print the value of an expression if it's + // already a literal. 
+ return AssertionFailure()
+ << "The difference between " << expr1 << " and " << expr2
+ << " is " << diff << ", which exceeds " << abs_error_expr << ", where\n"
+ << expr1 << " evaluates to " << val1 << ",\n"
+ << expr2 << " evaluates to " << val2 << ", and\n"
+ << abs_error_expr << " evaluates to " << abs_error << ".";
+}
+
+
+// Helper template for implementing FloatLE() and DoubleLE().
+template <typename RawType>
+AssertionResult FloatingPointLE(const char* expr1,
+ const char* expr2,
+ RawType val1,
+ RawType val2) {
+ // Returns success if val1 is less than val2,
+ if (val1 < val2) {
+ return AssertionSuccess();
+ }
+
+ // or if val1 is almost equal to val2.
+ const FloatingPoint<RawType> lhs(val1), rhs(val2);
+ if (lhs.AlmostEquals(rhs)) {
+ return AssertionSuccess();
+ }
+
+ // Note that the above two checks will both fail if either val1 or
+ // val2 is NaN, as the IEEE floating-point standard requires that
+ // any predicate involving a NaN must return false.
+
+ ::std::stringstream val1_ss;
+ val1_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+ << val1;
+
+ ::std::stringstream val2_ss;
+ val2_ss << std::setprecision(std::numeric_limits<RawType>::digits10 + 2)
+ << val2;
+
+ return AssertionFailure()
+ << "Expected: (" << expr1 << ") <= (" << expr2 << ")\n"
+ << " Actual: " << StringStreamToString(&val1_ss) << " vs "
+ << StringStreamToString(&val2_ss);
+}
+
+} // namespace internal
+
+// Asserts that val1 is less than, or almost equal to, val2. Fails
+// otherwise. In particular, it fails if either val1 or val2 is NaN.
+AssertionResult FloatLE(const char* expr1, const char* expr2,
+ float val1, float val2) {
+ return internal::FloatingPointLE<float>(expr1, expr2, val1, val2);
+}
+
+// Asserts that val1 is less than, or almost equal to, val2. Fails
+// otherwise. In particular, it fails if either val1 or val2 is NaN. 
+AssertionResult DoubleLE(const char* expr1, const char* expr2,
+ double val1, double val2) {
+ return internal::FloatingPointLE<double>(expr1, expr2, val1, val2);
+}
+
+namespace internal {
+
+// The helper function for {ASSERT|EXPECT}_EQ with int or enum
+// arguments.
+AssertionResult CmpHelperEQ(const char* expected_expression,
+ const char* actual_expression,
+ BiggestInt expected,
+ BiggestInt actual) {
+ if (expected == actual) {
+ return AssertionSuccess();
+ }
+
+ return EqFailure(expected_expression,
+ actual_expression,
+ FormatForComparisonFailureMessage(expected, actual),
+ FormatForComparisonFailureMessage(actual, expected),
+ false);
+}
+
+// A macro for implementing the helper functions needed to implement
+// ASSERT_?? and EXPECT_?? with integer or enum arguments. It is here
+// just to avoid copy-and-paste of similar code.
+#define GTEST_IMPL_CMP_HELPER_(op_name, op)\
+AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \
+ BiggestInt val1, BiggestInt val2) {\
+ if (val1 op val2) {\
+ return AssertionSuccess();\
+ } else {\
+ return AssertionFailure() \
+ << "Expected: (" << expr1 << ") " #op " (" << expr2\
+ << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\
+ << " vs " << FormatForComparisonFailureMessage(val2, val1);\
+ }\
+}
+
+// Implements the helper function for {ASSERT|EXPECT}_NE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(NE, !=)
+// Implements the helper function for {ASSERT|EXPECT}_LE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LE, <=)
+// Implements the helper function for {ASSERT|EXPECT}_LT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(LT, < )
+// Implements the helper function for {ASSERT|EXPECT}_GE with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GE, >=)
+// Implements the helper function for {ASSERT|EXPECT}_GT with int or
+// enum arguments.
+GTEST_IMPL_CMP_HELPER_(GT, > )
+
+#undef GTEST_IMPL_CMP_HELPER_
+
+// The helper function for {ASSERT|EXPECT}_STREQ. 
+AssertionResult CmpHelperSTREQ(const char* expected_expression, + const char* actual_expression, + const char* expected, + const char* actual) { + if (String::CStringEquals(expected, actual)) { + return AssertionSuccess(); + } + + return EqFailure(expected_expression, + actual_expression, + String::ShowCStringQuoted(expected), + String::ShowCStringQuoted(actual), + false); +} + +// The helper function for {ASSERT|EXPECT}_STRCASEEQ. +AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, + const char* actual_expression, + const char* expected, + const char* actual) { + if (String::CaseInsensitiveCStringEquals(expected, actual)) { + return AssertionSuccess(); + } + + return EqFailure(expected_expression, + actual_expression, + String::ShowCStringQuoted(expected), + String::ShowCStringQuoted(actual), + true); +} + +// The helper function for {ASSERT|EXPECT}_STRNE. +AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2) { + if (!String::CStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() << "Expected: (" << s1_expression << ") != (" + << s2_expression << "), actual: \"" + << s1 << "\" vs \"" << s2 << "\""; + } +} + +// The helper function for {ASSERT|EXPECT}_STRCASENE. +AssertionResult CmpHelperSTRCASENE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2) { + if (!String::CaseInsensitiveCStringEquals(s1, s2)) { + return AssertionSuccess(); + } else { + return AssertionFailure() + << "Expected: (" << s1_expression << ") != (" + << s2_expression << ") (ignoring case), actual: \"" + << s1 << "\" vs \"" << s2 << "\""; + } +} + +} // namespace internal + +namespace { + +// Helper functions for implementing IsSubString() and IsNotSubstring(). + +// This group of overloaded functions return true iff needle is a +// substring of haystack. NULL is considered a substring of itself +// only. 
+
+bool IsSubstringPred(const char* needle, const char* haystack) {
+ if (needle == NULL || haystack == NULL)
+ return needle == haystack;
+
+ return strstr(haystack, needle) != NULL;
+}
+
+bool IsSubstringPred(const wchar_t* needle, const wchar_t* haystack) {
+ if (needle == NULL || haystack == NULL)
+ return needle == haystack;
+
+ return wcsstr(haystack, needle) != NULL;
+}
+
+// StringType here can be either ::std::string or ::std::wstring.
+template <typename StringType>
+bool IsSubstringPred(const StringType& needle,
+ const StringType& haystack) {
+ return haystack.find(needle) != StringType::npos;
+}
+
+// This function implements either IsSubstring() or IsNotSubstring(),
+// depending on the value of the expected_to_be_substring parameter.
+// StringType here can be const char*, const wchar_t*, ::std::string,
+// or ::std::wstring.
+template <typename StringType>
+AssertionResult IsSubstringImpl(
+ bool expected_to_be_substring,
+ const char* needle_expr, const char* haystack_expr,
+ const StringType& needle, const StringType& haystack) {
+ if (IsSubstringPred(needle, haystack) == expected_to_be_substring)
+ return AssertionSuccess();
+
+ const bool is_wide_string = sizeof(needle[0]) > 1;
+ const char* const begin_string_quote = is_wide_string ? "L\"" : "\"";
+ return AssertionFailure()
+ << "Value of: " << needle_expr << "\n"
+ << " Actual: " << begin_string_quote << needle << "\"\n"
+ << "Expected: " << (expected_to_be_substring ? "" : "not ")
+ << "a substring of " << haystack_expr << "\n"
+ << "Which is: " << begin_string_quote << haystack << "\"";
+}
+
+} // namespace
+
+// IsSubstring() and IsNotSubstring() check whether needle is a
+// substring of haystack (NULL is considered a substring of itself
+// only), and return an appropriate error message when they fail. 
+ +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} + +#if GTEST_HAS_STD_WSTRING +AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack) { + return IsSubstringImpl(true, needle_expr, haystack_expr, needle, haystack); +} + +AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack) { + return IsSubstringImpl(false, needle_expr, haystack_expr, needle, haystack); +} +#endif // GTEST_HAS_STD_WSTRING + +namespace internal { + +#if GTEST_OS_WINDOWS + +namespace { + +// Helper function for IsHRESULT{SuccessFailure} predicates 
+AssertionResult HRESULTFailureHelper(const char* expr, + const char* expected, + long hr) { // NOLINT +# if GTEST_OS_WINDOWS_MOBILE + + // Windows CE doesn't support FormatMessage. + const char error_text[] = ""; + +# else + + // Looks up the human-readable system message for the HRESULT code + // and since we're not passing any params to FormatMessage, we don't + // want inserts expanded. + const DWORD kFlags = FORMAT_MESSAGE_FROM_SYSTEM | + FORMAT_MESSAGE_IGNORE_INSERTS; + const DWORD kBufSize = 4096; // String::Format can't exceed this length. + // Gets the system's human readable message string for this HRESULT. + char error_text[kBufSize] = { '\0' }; + DWORD message_length = ::FormatMessageA(kFlags, + 0, // no source, we're asking system + hr, // the error + 0, // no line width restrictions + error_text, // output buffer + kBufSize, // buf size + NULL); // no arguments for inserts + // Trims tailing white space (FormatMessage leaves a trailing cr-lf) + for (; message_length && IsSpace(error_text[message_length - 1]); + --message_length) { + error_text[message_length - 1] = '\0'; + } + +# endif // GTEST_OS_WINDOWS_MOBILE + + const String error_hex(String::Format("0x%08X ", hr)); + return ::testing::AssertionFailure() + << "Expected: " << expr << " " << expected << ".\n" + << " Actual: " << error_hex << error_text << "\n"; +} + +} // namespace + +AssertionResult IsHRESULTSuccess(const char* expr, long hr) { // NOLINT + if (SUCCEEDED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "succeeds", hr); +} + +AssertionResult IsHRESULTFailure(const char* expr, long hr) { // NOLINT + if (FAILED(hr)) { + return AssertionSuccess(); + } + return HRESULTFailureHelper(expr, "fails", hr); +} + +#endif // GTEST_OS_WINDOWS + +// Utility functions for encoding Unicode text (wide strings) in +// UTF-8. 
+
+// A Unicode code-point can have upto 21 bits, and is encoded in UTF-8
+// like this:
+//
+// Code-point length Encoding
+// 0 - 7 bits 0xxxxxxx
+// 8 - 11 bits 110xxxxx 10xxxxxx
+// 12 - 16 bits 1110xxxx 10xxxxxx 10xxxxxx
+// 17 - 21 bits 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
+
+// The maximum code-point a one-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint1 = (static_cast<UInt32>(1) << 7) - 1;
+
+// The maximum code-point a two-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint2 = (static_cast<UInt32>(1) << (5 + 6)) - 1;
+
+// The maximum code-point a three-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint3 = (static_cast<UInt32>(1) << (4 + 2*6)) - 1;
+
+// The maximum code-point a four-byte UTF-8 sequence can represent.
+const UInt32 kMaxCodePoint4 = (static_cast<UInt32>(1) << (3 + 3*6)) - 1;
+
+// Chops off the n lowest bits from a bit pattern. Returns the n
+// lowest bits. As a side effect, the original bit pattern will be
+// shifted to the right by n bits.
+inline UInt32 ChopLowBits(UInt32* bits, int n) {
+ const UInt32 low_bits = *bits & ((static_cast<UInt32>(1) << n) - 1);
+ *bits >>= n;
+ return low_bits;
+}
+
+// Converts a Unicode code point to a narrow string in UTF-8 encoding.
+// code_point parameter is of type UInt32 because wchar_t may not be
+// wide enough to contain a code point.
+// The output buffer str must containt at least 32 characters.
+// The function returns the address of the output buffer.
+// If the code_point is not a valid Unicode code point
+// (i.e. outside of Unicode range U+0 to U+10FFFF) it will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. 
+char* CodePointToUtf8(UInt32 code_point, char* str) {
+ if (code_point <= kMaxCodePoint1) {
+ str[1] = '\0';
+ str[0] = static_cast<char>(code_point); // 0xxxxxxx
+ } else if (code_point <= kMaxCodePoint2) {
+ str[2] = '\0';
+ str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[0] = static_cast<char>(0xC0 | code_point); // 110xxxxx
+ } else if (code_point <= kMaxCodePoint3) {
+ str[3] = '\0';
+ str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[0] = static_cast<char>(0xE0 | code_point); // 1110xxxx
+ } else if (code_point <= kMaxCodePoint4) {
+ str[4] = '\0';
+ str[3] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[2] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[1] = static_cast<char>(0x80 | ChopLowBits(&code_point, 6)); // 10xxxxxx
+ str[0] = static_cast<char>(0xF0 | code_point); // 11110xxx
+ } else {
+ // The longest string String::Format can produce when invoked
+ // with these parameters is 28 character long (not including
+ // the terminating nul character). We are asking for 32 character
+ // buffer just in case. This is also enough for strncpy to
+ // null-terminate the destination string.
+ posix::StrNCpy(
+ str, String::Format("(Invalid Unicode 0x%X)", code_point).c_str(), 32);
+ str[31] = '\0'; // Makes sure no change in the format to strncpy leaves
+ // the result unterminated.
+ }
+ return str;
+}
+
+// The following two functions only make sense if the the system
+// uses UTF-16 for wide string encoding. All supported systems
+// with 16 bit wchar_t (Windows, Cygwin, Symbian OS) do use UTF-16.
+
+// Determines if the arguments constitute UTF-16 surrogate pair
+// and thus should be combined into a single Unicode code point
+// using CreateCodePointFromUtf16SurrogatePair. 
+inline bool IsUtf16SurrogatePair(wchar_t first, wchar_t second) {
+ return sizeof(wchar_t) == 2 &&
+ (first & 0xFC00) == 0xD800 && (second & 0xFC00) == 0xDC00;
+}
+
+// Creates a Unicode code point from UTF16 surrogate pair.
+inline UInt32 CreateCodePointFromUtf16SurrogatePair(wchar_t first,
+ wchar_t second) {
+ const UInt32 mask = (1 << 10) - 1;
+ return (sizeof(wchar_t) == 2) ?
+ (((first & mask) << 10) | (second & mask)) + 0x10000 :
+ // This function should not be called when the condition is
+ // false, but we provide a sensible default in case it is.
+ static_cast<UInt32>(first);
+}
+
+// Converts a wide string to a narrow string in UTF-8 encoding.
+// The wide string is assumed to have the following encoding:
+// UTF-16 if sizeof(wchar_t) == 2 (on Windows, Cygwin, Symbian OS)
+// UTF-32 if sizeof(wchar_t) == 4 (on Linux)
+// Parameter str points to a null-terminated wide string.
+// Parameter num_chars may additionally limit the number
+// of wchar_t characters processed. -1 is used when the entire string
+// should be processed.
+// If the string contains code points that are not valid Unicode code points
+// (i.e. outside of Unicode range U+0 to U+10FFFF) they will be output
+// as '(Invalid Unicode 0xXXXXXXXX)'. If the string is in UTF16 encoding
+// and contains invalid UTF-16 surrogate pairs, values in those pairs
+// will be encoded as individual Unicode characters from Basic Normal Plane.
+String WideStringToUtf8(const wchar_t* str, int num_chars) {
+ if (num_chars == -1)
+ num_chars = static_cast<int>(wcslen(str));
+
+ ::std::stringstream stream;
+ for (int i = 0; i < num_chars; ++i) {
+ UInt32 unicode_code_point;
+
+ if (str[i] == L'\0') {
+ break;
+ } else if (i + 1 < num_chars && IsUtf16SurrogatePair(str[i], str[i + 1])) {
+ unicode_code_point = CreateCodePointFromUtf16SurrogatePair(str[i],
+ str[i + 1]);
+ i++;
+ } else {
+ unicode_code_point = static_cast<UInt32>(str[i]);
+ }
+
+ char buffer[32]; // CodePointToUtf8 requires a buffer this big. 
+ stream << CodePointToUtf8(unicode_code_point, buffer); + } + return StringStreamToString(&stream); +} + +// Converts a wide C string to a String using the UTF-8 encoding. +// NULL will be converted to "(null)". +String String::ShowWideCString(const wchar_t * wide_c_str) { + if (wide_c_str == NULL) return String("(null)"); + + return String(internal::WideStringToUtf8(wide_c_str, -1).c_str()); +} + +// Similar to ShowWideCString(), except that this function encloses +// the converted string in double quotes. +String String::ShowWideCStringQuoted(const wchar_t* wide_c_str) { + if (wide_c_str == NULL) return String("(null)"); + + return String::Format("L\"%s\"", + String::ShowWideCString(wide_c_str).c_str()); +} + +// Compares two wide C strings. Returns true iff they have the same +// content. +// +// Unlike wcscmp(), this function can handle NULL argument(s). A NULL +// C string is considered different to any non-NULL C string, +// including the empty string. +bool String::WideCStringEquals(const wchar_t * lhs, const wchar_t * rhs) { + if (lhs == NULL) return rhs == NULL; + + if (rhs == NULL) return false; + + return wcscmp(lhs, rhs) == 0; +} + +// Helper function for *_STREQ on wide strings. +AssertionResult CmpHelperSTREQ(const char* expected_expression, + const char* actual_expression, + const wchar_t* expected, + const wchar_t* actual) { + if (String::WideCStringEquals(expected, actual)) { + return AssertionSuccess(); + } + + return EqFailure(expected_expression, + actual_expression, + String::ShowWideCStringQuoted(expected), + String::ShowWideCStringQuoted(actual), + false); +} + +// Helper function for *_STRNE on wide strings. 
+AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, + const wchar_t* s2) { + if (!String::WideCStringEquals(s1, s2)) { + return AssertionSuccess(); + } + + return AssertionFailure() << "Expected: (" << s1_expression << ") != (" + << s2_expression << "), actual: " + << String::ShowWideCStringQuoted(s1) + << " vs " << String::ShowWideCStringQuoted(s2); +} + +// Compares two C strings, ignoring case. Returns true iff they have +// the same content. +// +// Unlike strcasecmp(), this function can handle NULL argument(s). A +// NULL C string is considered different to any non-NULL C string, +// including the empty string. +bool String::CaseInsensitiveCStringEquals(const char * lhs, const char * rhs) { + if (lhs == NULL) + return rhs == NULL; + if (rhs == NULL) + return false; + return posix::StrCaseCmp(lhs, rhs) == 0; +} + + // Compares two wide C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. +bool String::CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs) { + if (lhs == NULL) return rhs == NULL; + + if (rhs == NULL) return false; + +#if GTEST_OS_WINDOWS + return _wcsicmp(lhs, rhs) == 0; +#elif GTEST_OS_LINUX && !GTEST_OS_LINUX_ANDROID + return wcscasecmp(lhs, rhs) == 0; +#else + // Android, Mac OS X and Cygwin don't define wcscasecmp. + // Other unknown OSes may not define it either. 
+ wint_t left, right; + do { + left = towlower(*lhs++); + right = towlower(*rhs++); + } while (left && left == right); + return left == right; +#endif // OS selector +} + +// Compares this with another String. +// Returns < 0 if this is less than rhs, 0 if this is equal to rhs, or > 0 +// if this is greater than rhs. +int String::Compare(const String & rhs) const { + const char* const lhs_c_str = c_str(); + const char* const rhs_c_str = rhs.c_str(); + + if (lhs_c_str == NULL) { + return rhs_c_str == NULL ? 0 : -1; // NULL < anything except NULL + } else if (rhs_c_str == NULL) { + return 1; + } + + const size_t shorter_str_len = + length() <= rhs.length() ? length() : rhs.length(); + for (size_t i = 0; i != shorter_str_len; i++) { + if (lhs_c_str[i] < rhs_c_str[i]) { + return -1; + } else if (lhs_c_str[i] > rhs_c_str[i]) { + return 1; + } + } + return (length() < rhs.length()) ? -1 : + (length() > rhs.length()) ? 1 : 0; +} + +// Returns true iff this String ends with the given suffix. *Any* +// String is considered to end with a NULL or empty suffix. +bool String::EndsWith(const char* suffix) const { + if (suffix == NULL || CStringEquals(suffix, "")) return true; + + if (c_str() == NULL) return false; + + const size_t this_len = strlen(c_str()); + const size_t suffix_len = strlen(suffix); + return (this_len >= suffix_len) && + CStringEquals(c_str() + this_len - suffix_len, suffix); +} + +// Returns true iff this String ends with the given suffix, ignoring case. +// Any String is considered to end with a NULL or empty suffix. 
+bool String::EndsWithCaseInsensitive(const char* suffix) const { + if (suffix == NULL || CStringEquals(suffix, "")) return true; + + if (c_str() == NULL) return false; + + const size_t this_len = strlen(c_str()); + const size_t suffix_len = strlen(suffix); + return (this_len >= suffix_len) && + CaseInsensitiveCStringEquals(c_str() + this_len - suffix_len, suffix); +} + +// Formats a list of arguments to a String, using the same format +// spec string as for printf. +// +// We do not use the StringPrintf class as it is not universally +// available. +// +// The result is limited to 4096 characters (including the tailing 0). +// If 4096 characters are not enough to format the input, or if +// there's an error, "" is +// returned. +String String::Format(const char * format, ...) { + va_list args; + va_start(args, format); + + char buffer[4096]; + const int kBufferSize = sizeof(buffer)/sizeof(buffer[0]); + + // MSVC 8 deprecates vsnprintf(), so we want to suppress warning + // 4996 (deprecated function) there. +#ifdef _MSC_VER // We are using MSVC. +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4996) // Temporarily disables warning 4996. + + const int size = vsnprintf(buffer, kBufferSize, format, args); + +# pragma warning(pop) // Restores the warning state. +#else // We are not using MSVC. + const int size = vsnprintf(buffer, kBufferSize, format, args); +#endif // _MSC_VER + va_end(args); + + // vsnprintf()'s behavior is not portable. When the buffer is not + // big enough, it returns a negative value in MSVC, and returns the + // needed buffer size on Linux. When there is an output error, it + // always returns a negative value. For simplicity, we lump the two + // error cases together. + if (size < 0 || size >= kBufferSize) { + return String(""); + } else { + return String(buffer, size); + } +} + +// Converts the buffer in a stringstream to a String, converting NUL +// bytes to "\\0" along the way. 
+String StringStreamToString(::std::stringstream* ss) { + const ::std::string& str = ss->str(); + const char* const start = str.c_str(); + const char* const end = start + str.length(); + + // We need to use a helper stringstream to do this transformation + // because String doesn't support push_back(). + ::std::stringstream helper; + for (const char* ch = start; ch != end; ++ch) { + if (*ch == '\0') { + helper << "\\0"; // Replaces NUL with "\\0"; + } else { + helper.put(*ch); + } + } + + return String(helper.str().c_str()); +} + +// Appends the user-supplied message to the Google-Test-generated message. +String AppendUserMessage(const String& gtest_msg, + const Message& user_msg) { + // Appends the user message if it's non-empty. + const String user_msg_string = user_msg.GetString(); + if (user_msg_string.empty()) { + return gtest_msg; + } + + Message msg; + msg << gtest_msg << "\n" << user_msg_string; + + return msg.GetString(); +} + +} // namespace internal + +// class TestResult + +// Creates an empty TestResult. +TestResult::TestResult() + : death_test_count_(0), + elapsed_time_(0) { +} + +// D'tor. +TestResult::~TestResult() { +} + +// Returns the i-th test part result among all the results. i can +// range from 0 to total_part_count() - 1. If i is not in that range, +// aborts the program. +const TestPartResult& TestResult::GetTestPartResult(int i) const { + if (i < 0 || i >= total_part_count()) + internal::posix::Abort(); + return test_part_results_.at(i); +} + +// Returns the i-th test property. i can range from 0 to +// test_property_count() - 1. If i is not in that range, aborts the +// program. +const TestProperty& TestResult::GetTestProperty(int i) const { + if (i < 0 || i >= test_property_count()) + internal::posix::Abort(); + return test_properties_.at(i); +} + +// Clears the test part results. +void TestResult::ClearTestPartResults() { + test_part_results_.clear(); +} + +// Adds a test part result to the list. 
+void TestResult::AddTestPartResult(const TestPartResult& test_part_result) { + test_part_results_.push_back(test_part_result); +} + +// Adds a test property to the list. If a property with the same key as the +// supplied property is already represented, the value of this test_property +// replaces the old value for that key. +void TestResult::RecordProperty(const TestProperty& test_property) { + if (!ValidateTestProperty(test_property)) { + return; + } + internal::MutexLock lock(&test_properites_mutex_); + const std::vector::iterator property_with_matching_key = + std::find_if(test_properties_.begin(), test_properties_.end(), + internal::TestPropertyKeyIs(test_property.key())); + if (property_with_matching_key == test_properties_.end()) { + test_properties_.push_back(test_property); + return; + } + property_with_matching_key->SetValue(test_property.value()); +} + +// Adds a failure if the key is a reserved attribute of Google Test +// testcase tags. Returns true if the property is valid. +bool TestResult::ValidateTestProperty(const TestProperty& test_property) { + internal::String key(test_property.key()); + if (key == "name" || key == "status" || key == "time" || key == "classname") { + ADD_FAILURE() + << "Reserved key used in RecordProperty(): " + << key + << " ('name', 'status', 'time', and 'classname' are reserved by " + << GTEST_NAME_ << ")"; + return false; + } + return true; +} + +// Clears the object. +void TestResult::Clear() { + test_part_results_.clear(); + test_properties_.clear(); + death_test_count_ = 0; + elapsed_time_ = 0; +} + +// Returns true iff the test failed. +bool TestResult::Failed() const { + for (int i = 0; i < total_part_count(); ++i) { + if (GetTestPartResult(i).failed()) + return true; + } + return false; +} + +// Returns true iff the test part fatally failed. +static bool TestPartFatallyFailed(const TestPartResult& result) { + return result.fatally_failed(); +} + +// Returns true iff the test fatally failed. 
+bool TestResult::HasFatalFailure() const { + return CountIf(test_part_results_, TestPartFatallyFailed) > 0; +} + +// Returns true iff the test part non-fatally failed. +static bool TestPartNonfatallyFailed(const TestPartResult& result) { + return result.nonfatally_failed(); +} + +// Returns true iff the test has a non-fatal failure. +bool TestResult::HasNonfatalFailure() const { + return CountIf(test_part_results_, TestPartNonfatallyFailed) > 0; +} + +// Gets the number of all test parts. This is the sum of the number +// of successful test parts and the number of failed test parts. +int TestResult::total_part_count() const { + return static_cast(test_part_results_.size()); +} + +// Returns the number of the test properties. +int TestResult::test_property_count() const { + return static_cast(test_properties_.size()); +} + +// class Test + +// Creates a Test object. + +// The c'tor saves the values of all Google Test flags. +Test::Test() + : gtest_flag_saver_(new internal::GTestFlagSaver) { +} + +// The d'tor restores the values of all Google Test flags. +Test::~Test() { + delete gtest_flag_saver_; +} + +// Sets up the test fixture. +// +// A sub-class may override this. +void Test::SetUp() { +} + +// Tears down the test fixture. +// +// A sub-class may override this. +void Test::TearDown() { +} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const char* key, const char* value) { + UnitTest::GetInstance()->RecordPropertyForCurrentTest(key, value); +} + +// Allows user supplied key value pairs to be recorded for later output. +void Test::RecordProperty(const char* key, int value) { + Message value_message; + value_message << value; + RecordProperty(key, value_message.GetString().c_str()); +} + +namespace internal { + +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const String& message) { + // This function is a friend of UnitTest and as such has access to + // AddTestPartResult. 
+ UnitTest::GetInstance()->AddTestPartResult( + result_type, + NULL, // No info about the source file where the exception occurred. + -1, // We have no info on which line caused the exception. + message, + String()); // No stack trace, either. +} + +} // namespace internal + +// Google Test requires all tests in the same test case to use the same test +// fixture class. This function checks if the current test has the +// same fixture class as the first test in the current test case. If +// yes, it returns true; otherwise it generates a Google Test failure and +// returns false. +bool Test::HasSameFixtureClass() { + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + const TestCase* const test_case = impl->current_test_case(); + + // Info about the first test in the current test case. + const TestInfo* const first_test_info = test_case->test_info_list()[0]; + const internal::TypeId first_fixture_id = first_test_info->fixture_class_id_; + const char* const first_test_name = first_test_info->name(); + + // Info about the current test. + const TestInfo* const this_test_info = impl->current_test_info(); + const internal::TypeId this_fixture_id = this_test_info->fixture_class_id_; + const char* const this_test_name = this_test_info->name(); + + if (this_fixture_id != first_fixture_id) { + // Is the first test defined using TEST? + const bool first_is_TEST = first_fixture_id == internal::GetTestTypeId(); + // Is this test defined using TEST? + const bool this_is_TEST = this_fixture_id == internal::GetTestTypeId(); + + if (first_is_TEST || this_is_TEST) { + // The user mixed TEST and TEST_F in this test case - we'll tell + // him/her how to fix it. + + // Gets the name of the TEST and the name of the TEST_F. Note + // that first_is_TEST and this_is_TEST cannot both be true, as + // the fixture IDs are different for the two tests. + const char* const TEST_name = + first_is_TEST ? 
first_test_name : this_test_name; + const char* const TEST_F_name = + first_is_TEST ? this_test_name : first_test_name; + + ADD_FAILURE() + << "All tests in the same test case must use the same test fixture\n" + << "class, so mixing TEST_F and TEST in the same test case is\n" + << "illegal. In test case " << this_test_info->test_case_name() + << ",\n" + << "test " << TEST_F_name << " is defined using TEST_F but\n" + << "test " << TEST_name << " is defined using TEST. You probably\n" + << "want to change the TEST to TEST_F or move it to another test\n" + << "case."; + } else { + // The user defined two fixture classes with the same name in + // two namespaces - we'll tell him/her how to fix it. + ADD_FAILURE() + << "All tests in the same test case must use the same test fixture\n" + << "class. However, in test case " + << this_test_info->test_case_name() << ",\n" + << "you defined test " << first_test_name + << " and test " << this_test_name << "\n" + << "using two different test fixture classes. This can happen if\n" + << "the two classes are from different namespaces or translation\n" + << "units and have the same name. You should probably rename one\n" + << "of the classes to put the tests into different test cases."; + } + return false; + } + + return true; +} + +#if GTEST_HAS_SEH + +// Adds an "exception thrown" fatal failure to the current test. This +// function returns its result via an output parameter pointer because VC++ +// prohibits creation of objects with destructors on stack in functions +// using __try (see error C2712). 
+static internal::String* FormatSehExceptionMessage(DWORD exception_code, + const char* location) { + Message message; + message << "SEH exception with code 0x" << std::setbase(16) << + exception_code << std::setbase(10) << " thrown in " << location << "."; + + return new internal::String(message.GetString()); +} + +#endif // GTEST_HAS_SEH + +#if GTEST_HAS_EXCEPTIONS + +// Adds an "exception thrown" fatal failure to the current test. +static internal::String FormatCxxExceptionMessage(const char* description, + const char* location) { + Message message; + if (description != NULL) { + message << "C++ exception with description \"" << description << "\""; + } else { + message << "Unknown C++ exception"; + } + message << " thrown in " << location << "."; + + return message.GetString(); +} + +static internal::String PrintTestPartResultToString( + const TestPartResult& test_part_result); + +// A failed Google Test assertion will throw an exception of this type when +// GTEST_FLAG(throw_on_failure) is true (if exceptions are enabled). We +// derive it from std::runtime_error, which is for errors presumably +// detectable only at run time. Since std::runtime_error inherits from +// std::exception, many testing frameworks know how to extract and print the +// message inside it. +class GoogleTestFailureException : public ::std::runtime_error { + public: + explicit GoogleTestFailureException(const TestPartResult& failure) + : ::std::runtime_error(PrintTestPartResultToString(failure).c_str()) {} +}; +#endif // GTEST_HAS_EXCEPTIONS + +namespace internal { +// We put these helper functions in the internal namespace as IBM's xlC +// compiler rejects the code if they were declared static. + +// Runs the given method and handles SEH exceptions it throws, when +// SEH is supported; returns the 0-value for type Result in case of an +// SEH exception. (Microsoft compilers cannot handle SEH and C++ +// exceptions in the same function. 
Therefore, we provide a separate +// wrapper function for handling SEH exceptions.) +template +Result HandleSehExceptionsInMethodIfSupported( + T* object, Result (T::*method)(), const char* location) { +#if GTEST_HAS_SEH + __try { + return (object->*method)(); + } __except (internal::UnitTestOptions::GTestShouldProcessSEH( // NOLINT + GetExceptionCode())) { + // We create the exception message on the heap because VC++ prohibits + // creation of objects with destructors on stack in functions using __try + // (see error C2712). + internal::String* exception_message = FormatSehExceptionMessage( + GetExceptionCode(), location); + internal::ReportFailureInUnknownLocation(TestPartResult::kFatalFailure, + *exception_message); + delete exception_message; + return static_cast(0); + } +#else + (void)location; + return (object->*method)(); +#endif // GTEST_HAS_SEH +} + +// Runs the given method and catches and reports C++ and/or SEH-style +// exceptions, if they are supported; returns the 0-value for type +// Result in case of an SEH exception. +template +Result HandleExceptionsInMethodIfSupported( + T* object, Result (T::*method)(), const char* location) { + // NOTE: The user code can affect the way in which Google Test handles + // exceptions by setting GTEST_FLAG(catch_exceptions), but only before + // RUN_ALL_TESTS() starts. It is technically possible to check the flag + // after the exception is caught and either report or re-throw the + // exception based on the flag's value: + // + // try { + // // Perform the test method. + // } catch (...) { + // if (GTEST_FLAG(catch_exceptions)) + // // Report the exception as failure. + // else + // throw; // Re-throws the original exception. + // } + // + // However, the purpose of this flag is to allow the program to drop into + // the debugger when the exception is thrown. 
On most platforms, once the + // control enters the catch block, the exception origin information is + // lost and the debugger will stop the program at the point of the + // re-throw in this function -- instead of at the point of the original + // throw statement in the code under test. For this reason, we perform + // the check early, sacrificing the ability to affect Google Test's + // exception handling in the method where the exception is thrown. + if (internal::GetUnitTestImpl()->catch_exceptions()) { +#if GTEST_HAS_EXCEPTIONS + try { + return HandleSehExceptionsInMethodIfSupported(object, method, location); + } catch (const GoogleTestFailureException&) { // NOLINT + // This exception doesn't originate in code under test. It makes no + // sense to report it as a test failure. + throw; + } catch (const std::exception& e) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(e.what(), location)); + } catch (...) { // NOLINT + internal::ReportFailureInUnknownLocation( + TestPartResult::kFatalFailure, + FormatCxxExceptionMessage(NULL, location)); + } + return static_cast(0); +#else + return HandleSehExceptionsInMethodIfSupported(object, method, location); +#endif // GTEST_HAS_EXCEPTIONS + } else { + return (object->*method)(); + } +} + +} // namespace internal + +// Runs the test and updates the test result. +void Test::Run() { + if (!HasSameFixtureClass()) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported(this, &Test::SetUp, "SetUp()"); + // We will run the test only if SetUp() was successful. + if (!HasFatalFailure()) { + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &Test::TestBody, "the test body"); + } + + // However, we want to clean up as much as possible. 
Hence we will + // always call TearDown(), even if SetUp() or the test body has + // failed. + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &Test::TearDown, "TearDown()"); +} + +// Returns true iff the current test has a fatal failure. +bool Test::HasFatalFailure() { + return internal::GetUnitTestImpl()->current_test_result()->HasFatalFailure(); +} + +// Returns true iff the current test has a non-fatal failure. +bool Test::HasNonfatalFailure() { + return internal::GetUnitTestImpl()->current_test_result()-> + HasNonfatalFailure(); +} + +// class TestInfo + +// Constructs a TestInfo object. It assumes ownership of the test factory +// object. +// TODO(vladl@google.com): Make a_test_case_name and a_name const string&'s +// to signify they cannot be NULLs. +TestInfo::TestInfo(const char* a_test_case_name, + const char* a_name, + const char* a_type_param, + const char* a_value_param, + internal::TypeId fixture_class_id, + internal::TestFactoryBase* factory) + : test_case_name_(a_test_case_name), + name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : NULL), + value_param_(a_value_param ? new std::string(a_value_param) : NULL), + fixture_class_id_(fixture_class_id), + should_run_(false), + is_disabled_(false), + matches_filter_(false), + factory_(factory), + result_() {} + +// Destructs a TestInfo object. +TestInfo::~TestInfo() { delete factory_; } + +namespace internal { + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_case_name: name of the test case +// name: name of the test +// type_param: the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param: text representation of the test's value parameter, +// or NULL if this is not a value-parameterized test. 
+// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. +TestInfo* MakeAndRegisterTestInfo( + const char* test_case_name, const char* name, + const char* type_param, + const char* value_param, + TypeId fixture_class_id, + SetUpTestCaseFunc set_up_tc, + TearDownTestCaseFunc tear_down_tc, + TestFactoryBase* factory) { + TestInfo* const test_info = + new TestInfo(test_case_name, name, type_param, value_param, + fixture_class_id, factory); + GetUnitTestImpl()->AddTestInfo(set_up_tc, tear_down_tc, test_info); + return test_info; +} + +#if GTEST_HAS_PARAM_TEST +void ReportInvalidTestCaseType(const char* test_case_name, + const char* file, int line) { + Message errors; + errors + << "Attempted redefinition of test case " << test_case_name << ".\n" + << "All tests in the same test case must use the same test fixture\n" + << "class. However, in test case " << test_case_name << ", you tried\n" + << "to define a test using a fixture class different from the one\n" + << "used earlier. This can happen if the two fixture classes are\n" + << "from different namespaces and have the same name. You should\n" + << "probably rename one of the classes to put the tests into different\n" + << "test cases."; + + fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), + errors.GetString().c_str()); +} +#endif // GTEST_HAS_PARAM_TEST + +} // namespace internal + +namespace { + +// A predicate that checks the test name of a TestInfo against a known +// value. +// +// This is used for implementation of the TestCase class only. We put +// it in the anonymous namespace to prevent polluting the outer +// namespace. +// +// TestNameIs is copyable. +class TestNameIs { + public: + // Constructor. 
+ // + // TestNameIs has NO default constructor. + explicit TestNameIs(const char* name) + : name_(name) {} + + // Returns true iff the test name of test_info matches name_. + bool operator()(const TestInfo * test_info) const { + return test_info && internal::String(test_info->name()).Compare(name_) == 0; + } + + private: + internal::String name_; +}; + +} // namespace + +namespace internal { + +// This method expands all parameterized tests registered with macros TEST_P +// and INSTANTIATE_TEST_CASE_P into regular tests and registers those. +// This will be done just once during the program runtime. +void UnitTestImpl::RegisterParameterizedTests() { +#if GTEST_HAS_PARAM_TEST + if (!parameterized_tests_registered_) { + parameterized_test_registry_.RegisterTests(); + parameterized_tests_registered_ = true; + } +#endif +} + +} // namespace internal + +// Creates the test object, runs it, records its result, and then +// deletes it. +void TestInfo::Run() { + if (!should_run_) return; + + // Tells UnitTest where to store test result. + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_info(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + // Notifies the unit test event listeners that a test is about to start. + repeater->OnTestStart(*this); + + const TimeInMillis start = internal::GetTimeInMillis(); + + impl->os_stack_trace_getter()->UponLeavingGTest(); + + // Creates the test object. + Test* const test = internal::HandleExceptionsInMethodIfSupported( + factory_, &internal::TestFactoryBase::CreateTest, + "the test fixture's constructor"); + + // Runs the test only if the test object was created and its + // constructor didn't generate a fatal failure. + if ((test != NULL) && !Test::HasFatalFailure()) { + // This doesn't throw as all user code that can throw are wrapped into + // exception handling code. + test->Run(); + } + + // Deletes the test object. 
+ impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + test, &Test::DeleteSelf_, "the test fixture's destructor"); + + result_.set_elapsed_time(internal::GetTimeInMillis() - start); + + // Notifies the unit test event listener that a test has just finished. + repeater->OnTestEnd(*this); + + // Tells UnitTest to stop associating assertion results to this + // test. + impl->set_current_test_info(NULL); +} + +// class TestCase + +// Gets the number of successful tests in this test case. +int TestCase::successful_test_count() const { + return CountIf(test_info_list_, TestPassed); +} + +// Gets the number of failed tests in this test case. +int TestCase::failed_test_count() const { + return CountIf(test_info_list_, TestFailed); +} + +int TestCase::disabled_test_count() const { + return CountIf(test_info_list_, TestDisabled); +} + +// Get the number of tests in this test case that should run. +int TestCase::test_to_run_count() const { + return CountIf(test_info_list_, ShouldRunTest); +} + +// Gets the number of all tests. +int TestCase::total_test_count() const { + return static_cast(test_info_list_.size()); +} + +// Creates a TestCase with the given name. +// +// Arguments: +// +// name: name of the test case +// a_type_param: the name of the test case's type parameter, or NULL if +// this is not a typed or a type-parameterized test case. +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +TestCase::TestCase(const char* a_name, const char* a_type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc) + : name_(a_name), + type_param_(a_type_param ? new std::string(a_type_param) : NULL), + set_up_tc_(set_up_tc), + tear_down_tc_(tear_down_tc), + should_run_(false), + elapsed_time_(0) { +} + +// Destructor of TestCase. +TestCase::~TestCase() { + // Deletes every Test in the collection. 
+ ForEach(test_info_list_, internal::Delete); +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. +const TestInfo* TestCase::GetTestInfo(int i) const { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? NULL : test_info_list_[index]; +} + +// Returns the i-th test among all the tests. i can range from 0 to +// total_test_count() - 1. If i is not in that range, returns NULL. +TestInfo* TestCase::GetMutableTestInfo(int i) { + const int index = GetElementOr(test_indices_, i, -1); + return index < 0 ? NULL : test_info_list_[index]; +} + +// Adds a test to this test case. Will delete the test upon +// destruction of the TestCase object. +void TestCase::AddTestInfo(TestInfo * test_info) { + test_info_list_.push_back(test_info); + test_indices_.push_back(static_cast(test_indices_.size())); +} + +// Runs every test in this TestCase. +void TestCase::Run() { + if (!should_run_) return; + + internal::UnitTestImpl* const impl = internal::GetUnitTestImpl(); + impl->set_current_test_case(this); + + TestEventListener* repeater = UnitTest::GetInstance()->listeners().repeater(); + + repeater->OnTestCaseStart(*this); + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestCase::RunSetUpTestCase, "SetUpTestCase()"); + + const internal::TimeInMillis start = internal::GetTimeInMillis(); + for (int i = 0; i < total_test_count(); i++) { + GetMutableTestInfo(i)->Run(); + } + elapsed_time_ = internal::GetTimeInMillis() - start; + + impl->os_stack_trace_getter()->UponLeavingGTest(); + internal::HandleExceptionsInMethodIfSupported( + this, &TestCase::RunTearDownTestCase, "TearDownTestCase()"); + + repeater->OnTestCaseEnd(*this); + impl->set_current_test_case(NULL); +} + +// Clears the results of all tests in this test case. 
+void TestCase::ClearResult() { + ForEach(test_info_list_, TestInfo::ClearTestResult); +} + +// Shuffles the tests in this test case. +void TestCase::ShuffleTests(internal::Random* random) { + Shuffle(random, &test_indices_); +} + +// Restores the test order to before the first shuffle. +void TestCase::UnshuffleTests() { + for (size_t i = 0; i < test_indices_.size(); i++) { + test_indices_[i] = static_cast(i); + } +} + +// Formats a countable noun. Depending on its quantity, either the +// singular form or the plural form is used. e.g. +// +// FormatCountableNoun(1, "formula", "formuli") returns "1 formula". +// FormatCountableNoun(5, "book", "books") returns "5 books". +static internal::String FormatCountableNoun(int count, + const char * singular_form, + const char * plural_form) { + return internal::String::Format("%d %s", count, + count == 1 ? singular_form : plural_form); +} + +// Formats the count of tests. +static internal::String FormatTestCount(int test_count) { + return FormatCountableNoun(test_count, "test", "tests"); +} + +// Formats the count of test cases. +static internal::String FormatTestCaseCount(int test_case_count) { + return FormatCountableNoun(test_case_count, "test case", "test cases"); +} + +// Converts a TestPartResult::Type enum to human-friendly string +// representation. Both kNonFatalFailure and kFatalFailure are translated +// to "Failure", as the user usually doesn't care about the difference +// between the two when viewing the test result. +static const char * TestPartResultTypeToString(TestPartResult::Type type) { + switch (type) { + case TestPartResult::kSuccess: + return "Success"; + + case TestPartResult::kNonFatalFailure: + case TestPartResult::kFatalFailure: +#ifdef _MSC_VER + return "error: "; +#else + return "Failure\n"; +#endif + default: + return "Unknown result type"; + } +} + +// Prints a TestPartResult to a String. 
+static internal::String PrintTestPartResultToString( + const TestPartResult& test_part_result) { + return (Message() + << internal::FormatFileLocation(test_part_result.file_name(), + test_part_result.line_number()) + << " " << TestPartResultTypeToString(test_part_result.type()) + << test_part_result.message()).GetString(); +} + +// Prints a TestPartResult. +static void PrintTestPartResult(const TestPartResult& test_part_result) { + const internal::String& result = + PrintTestPartResultToString(test_part_result); + printf("%s\n", result.c_str()); + fflush(stdout); + // If the test program runs in Visual Studio or a debugger, the + // following statements add the test part result message to the Output + // window such that the user can double-click on it to jump to the + // corresponding source code location; otherwise they do nothing. +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + // We don't call OutputDebugString*() on Windows Mobile, as printing + // to stdout is done by OutputDebugString() there already - we don't + // want the same message printed twice. + ::OutputDebugStringA(result.c_str()); + ::OutputDebugStringA("\n"); +#endif +} + +// class PrettyUnitTestResultPrinter + +namespace internal { + +enum GTestColor { + COLOR_DEFAULT, + COLOR_RED, + COLOR_GREEN, + COLOR_YELLOW +}; + +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + +// Returns the character attribute for the given color. +WORD GetColorAttribute(GTestColor color) { + switch (color) { + case COLOR_RED: return FOREGROUND_RED; + case COLOR_GREEN: return FOREGROUND_GREEN; + case COLOR_YELLOW: return FOREGROUND_RED | FOREGROUND_GREEN; + default: return 0; + } +} + +#else + +// Returns the ANSI color code for the given color. COLOR_DEFAULT is +// an invalid input. 
+const char* GetAnsiColorCode(GTestColor color) { + switch (color) { + case COLOR_RED: return "1"; + case COLOR_GREEN: return "2"; + case COLOR_YELLOW: return "3"; + default: return NULL; + }; +} + +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + +// Returns true iff Google Test should use colors in the output. +bool ShouldUseColor(bool stdout_is_tty) { + const char* const gtest_color = GTEST_FLAG(color).c_str(); + + if (String::CaseInsensitiveCStringEquals(gtest_color, "auto")) { +#if GTEST_OS_WINDOWS + // On Windows the TERM variable is usually not set, but the + // console there does support colors. + return stdout_is_tty; +#else + // On non-Windows platforms, we rely on the TERM variable. + const char* const term = posix::GetEnv("TERM"); + const bool term_supports_color = + String::CStringEquals(term, "xterm") || + String::CStringEquals(term, "xterm-color") || + String::CStringEquals(term, "xterm-256color") || + String::CStringEquals(term, "screen") || + String::CStringEquals(term, "linux") || + String::CStringEquals(term, "cygwin"); + return stdout_is_tty && term_supports_color; +#endif // GTEST_OS_WINDOWS + } + + return String::CaseInsensitiveCStringEquals(gtest_color, "yes") || + String::CaseInsensitiveCStringEquals(gtest_color, "true") || + String::CaseInsensitiveCStringEquals(gtest_color, "t") || + String::CStringEquals(gtest_color, "1"); + // We take "yes", "true", "t", and "1" as meaning "yes". If the + // value is neither one of these nor "auto", we treat it as "no" to + // be conservative. +} + +// Helpers for printing colored strings to stdout. Note that on Windows, we +// cannot simply emit special characters and have the terminal change colors. +// This routine must actually emit the characters rather than return a string +// that would be colored when printed, as can be done on Linux. +void ColoredPrintf(GTestColor color, const char* fmt, ...) 
{ + va_list args; + va_start(args, fmt); + +#if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS + const bool use_color = false; +#else + static const bool in_color_mode = + ShouldUseColor(posix::IsATTY(posix::FileNo(stdout)) != 0); + const bool use_color = in_color_mode && (color != COLOR_DEFAULT); +#endif // GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN || GTEST_OS_ZOS + // The '!= 0' comparison is necessary to satisfy MSVC 7.1. + + if (!use_color) { + vprintf(fmt, args); + va_end(args); + return; + } + +#if GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + const HANDLE stdout_handle = GetStdHandle(STD_OUTPUT_HANDLE); + + // Gets the current text color. + CONSOLE_SCREEN_BUFFER_INFO buffer_info; + GetConsoleScreenBufferInfo(stdout_handle, &buffer_info); + const WORD old_color_attrs = buffer_info.wAttributes; + + // We need to flush the stream buffers into the console before each + // SetConsoleTextAttribute call lest it affect the text that is already + // printed but has not yet reached the console. + fflush(stdout); + SetConsoleTextAttribute(stdout_handle, + GetColorAttribute(color) | FOREGROUND_INTENSITY); + vprintf(fmt, args); + + fflush(stdout); + // Restores the text color. + SetConsoleTextAttribute(stdout_handle, old_color_attrs); +#else + printf("\033[0;3%sm", GetAnsiColorCode(color)); + vprintf(fmt, args); + printf("\033[m"); // Resets the terminal to default. +#endif // GTEST_OS_WINDOWS && !GTEST_OS_WINDOWS_MOBILE + va_end(args); +} + +void PrintFullTestCommentIfPresent(const TestInfo& test_info) { + const char* const type_param = test_info.type_param(); + const char* const value_param = test_info.value_param(); + + if (type_param != NULL || value_param != NULL) { + printf(", where "); + if (type_param != NULL) { + printf("TypeParam = %s", type_param); + if (value_param != NULL) + printf(" and "); + } + if (value_param != NULL) { + printf("GetParam() = %s", value_param); + } + } +} + +// This class implements the TestEventListener interface. 
+// +// Class PrettyUnitTestResultPrinter is copyable. +class PrettyUnitTestResultPrinter : public TestEventListener { + public: + PrettyUnitTestResultPrinter() {} + static void PrintTestName(const char * test_case, const char * test) { + printf("%s.%s", test_case, test); + } + + // The following methods override what's in the TestEventListener class. + virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); + virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); + virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestCaseStart(const TestCase& test_case); + virtual void OnTestStart(const TestInfo& test_info); + virtual void OnTestPartResult(const TestPartResult& result); + virtual void OnTestEnd(const TestInfo& test_info); + virtual void OnTestCaseEnd(const TestCase& test_case); + virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); + virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} + + private: + static void PrintFailedTests(const UnitTest& unit_test); + + internal::String test_case_name_; +}; + + // Fired before each iteration of tests starts. +void PrettyUnitTestResultPrinter::OnTestIterationStart( + const UnitTest& unit_test, int iteration) { + if (GTEST_FLAG(repeat) != 1) + printf("\nRepeating all tests (iteration %d) . . .\n\n", iteration + 1); + + const char* const filter = GTEST_FLAG(filter).c_str(); + + // Prints the filter if it's not *. This reminds the user that some + // tests may be skipped. 
+ if (!internal::String::CStringEquals(filter, kUniversalFilter)) { + ColoredPrintf(COLOR_YELLOW, + "Note: %s filter = %s\n", GTEST_NAME_, filter); + } + + if (internal::ShouldShard(kTestTotalShards, kTestShardIndex, false)) { + const Int32 shard_index = Int32FromEnvOrDie(kTestShardIndex, -1); + ColoredPrintf(COLOR_YELLOW, + "Note: This is test shard %d of %s.\n", + static_cast(shard_index) + 1, + internal::posix::GetEnv(kTestTotalShards)); + } + + if (GTEST_FLAG(shuffle)) { + ColoredPrintf(COLOR_YELLOW, + "Note: Randomizing tests' orders with a seed of %d .\n", + unit_test.random_seed()); + } + + ColoredPrintf(COLOR_GREEN, "[==========] "); + printf("Running %s from %s.\n", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnEnvironmentsSetUpStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("Global test environment set-up.\n"); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestCaseStart(const TestCase& test_case) { + test_case_name_ = test_case.name(); + const internal::String counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("%s from %s", counts.c_str(), test_case_name_.c_str()); + if (test_case.type_param() == NULL) { + printf("\n"); + } else { + printf(", where TypeParam = %s\n", test_case.type_param()); + } + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestStart(const TestInfo& test_info) { + ColoredPrintf(COLOR_GREEN, "[ RUN ] "); + PrintTestName(test_case_name_.c_str(), test_info.name()); + printf("\n"); + fflush(stdout); +} + +// Called after an assertion failure. +void PrettyUnitTestResultPrinter::OnTestPartResult( + const TestPartResult& result) { + // If the test part succeeded, we don't need to do anything. 
+ if (result.type() == TestPartResult::kSuccess) + return; + + // Print failure message from the assertion (e.g. expected this and got that). + PrintTestPartResult(result); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestEnd(const TestInfo& test_info) { + if (test_info.result()->Passed()) { + ColoredPrintf(COLOR_GREEN, "[ OK ] "); + } else { + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + } + PrintTestName(test_case_name_.c_str(), test_info.name()); + if (test_info.result()->Failed()) + PrintFullTestCommentIfPresent(test_info); + + if (GTEST_FLAG(print_time)) { + printf(" (%s ms)\n", internal::StreamableToString( + test_info.result()->elapsed_time()).c_str()); + } else { + printf("\n"); + } + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnTestCaseEnd(const TestCase& test_case) { + if (!GTEST_FLAG(print_time)) return; + + test_case_name_ = test_case.name(); + const internal::String counts = + FormatCountableNoun(test_case.test_to_run_count(), "test", "tests"); + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("%s from %s (%s ms total)\n\n", + counts.c_str(), test_case_name_.c_str(), + internal::StreamableToString(test_case.elapsed_time()).c_str()); + fflush(stdout); +} + +void PrettyUnitTestResultPrinter::OnEnvironmentsTearDownStart( + const UnitTest& /*unit_test*/) { + ColoredPrintf(COLOR_GREEN, "[----------] "); + printf("Global test environment tear-down\n"); + fflush(stdout); +} + +// Internal helper for printing the list of failed tests. 
+void PrettyUnitTestResultPrinter::PrintFailedTests(const UnitTest& unit_test) { + const int failed_test_count = unit_test.failed_test_count(); + if (failed_test_count == 0) { + return; + } + + for (int i = 0; i < unit_test.total_test_case_count(); ++i) { + const TestCase& test_case = *unit_test.GetTestCase(i); + if (!test_case.should_run() || (test_case.failed_test_count() == 0)) { + continue; + } + for (int j = 0; j < test_case.total_test_count(); ++j) { + const TestInfo& test_info = *test_case.GetTestInfo(j); + if (!test_info.should_run() || test_info.result()->Passed()) { + continue; + } + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + printf("%s.%s", test_case.name(), test_info.name()); + PrintFullTestCommentIfPresent(test_info); + printf("\n"); + } + } +} + +void PrettyUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + ColoredPrintf(COLOR_GREEN, "[==========] "); + printf("%s from %s ran.", + FormatTestCount(unit_test.test_to_run_count()).c_str(), + FormatTestCaseCount(unit_test.test_case_to_run_count()).c_str()); + if (GTEST_FLAG(print_time)) { + printf(" (%s ms total)", + internal::StreamableToString(unit_test.elapsed_time()).c_str()); + } + printf("\n"); + ColoredPrintf(COLOR_GREEN, "[ PASSED ] "); + printf("%s.\n", FormatTestCount(unit_test.successful_test_count()).c_str()); + + int num_failures = unit_test.failed_test_count(); + if (!unit_test.Passed()) { + const int failed_test_count = unit_test.failed_test_count(); + ColoredPrintf(COLOR_RED, "[ FAILED ] "); + printf("%s, listed below:\n", FormatTestCount(failed_test_count).c_str()); + PrintFailedTests(unit_test); + printf("\n%2d FAILED %s\n", num_failures, + num_failures == 1 ? "TEST" : "TESTS"); + } + + int num_disabled = unit_test.disabled_test_count(); + if (num_disabled && !GTEST_FLAG(also_run_disabled_tests)) { + if (!num_failures) { + printf("\n"); // Add a spacer if no FAILURE banner is displayed. 
+ } + ColoredPrintf(COLOR_YELLOW, + " YOU HAVE %d DISABLED %s\n\n", + num_disabled, + num_disabled == 1 ? "TEST" : "TESTS"); + } + // Ensure that Google Test output is printed before, e.g., heapchecker output. + fflush(stdout); +} + +// End PrettyUnitTestResultPrinter + +// class TestEventRepeater +// +// This class forwards events to other event listeners. +class TestEventRepeater : public TestEventListener { + public: + TestEventRepeater() : forwarding_enabled_(true) {} + virtual ~TestEventRepeater(); + void Append(TestEventListener *listener); + TestEventListener* Release(TestEventListener* listener); + + // Controls whether events will be forwarded to listeners_. Set to false + // in death test child processes. + bool forwarding_enabled() const { return forwarding_enabled_; } + void set_forwarding_enabled(bool enable) { forwarding_enabled_ = enable; } + + virtual void OnTestProgramStart(const UnitTest& unit_test); + virtual void OnTestIterationStart(const UnitTest& unit_test, int iteration); + virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test); + virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test); + virtual void OnTestCaseStart(const TestCase& test_case); + virtual void OnTestStart(const TestInfo& test_info); + virtual void OnTestPartResult(const TestPartResult& result); + virtual void OnTestEnd(const TestInfo& test_info); + virtual void OnTestCaseEnd(const TestCase& test_case); + virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test); + virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test); + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + virtual void OnTestProgramEnd(const UnitTest& unit_test); + + private: + // Controls whether events will be forwarded to listeners_. Set to false + // in death test child processes. + bool forwarding_enabled_; + // The list of listeners that receive events. 
+ std::vector listeners_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventRepeater); +}; + +TestEventRepeater::~TestEventRepeater() { + ForEach(listeners_, Delete); +} + +void TestEventRepeater::Append(TestEventListener *listener) { + listeners_.push_back(listener); +} + +// TODO(vladl@google.com): Factor the search functionality into Vector::Find. +TestEventListener* TestEventRepeater::Release(TestEventListener *listener) { + for (size_t i = 0; i < listeners_.size(); ++i) { + if (listeners_[i] == listener) { + listeners_.erase(listeners_.begin() + i); + return listener; + } + } + + return NULL; +} + +// Since most methods are very similar, use macros to reduce boilerplate. +// This defines a member that forwards the call to all listeners. +#define GTEST_REPEATER_METHOD_(Name, Type) \ +void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (size_t i = 0; i < listeners_.size(); i++) { \ + listeners_[i]->Name(parameter); \ + } \ + } \ +} +// This defines a member that forwards the call to all listeners in reverse +// order. 
+#define GTEST_REVERSE_REPEATER_METHOD_(Name, Type) \ +void TestEventRepeater::Name(const Type& parameter) { \ + if (forwarding_enabled_) { \ + for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { \ + listeners_[i]->Name(parameter); \ + } \ + } \ +} + +GTEST_REPEATER_METHOD_(OnTestProgramStart, UnitTest) +GTEST_REPEATER_METHOD_(OnEnvironmentsSetUpStart, UnitTest) +GTEST_REPEATER_METHOD_(OnTestCaseStart, TestCase) +GTEST_REPEATER_METHOD_(OnTestStart, TestInfo) +GTEST_REPEATER_METHOD_(OnTestPartResult, TestPartResult) +GTEST_REPEATER_METHOD_(OnEnvironmentsTearDownStart, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsSetUpEnd, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnEnvironmentsTearDownEnd, UnitTest) +GTEST_REVERSE_REPEATER_METHOD_(OnTestEnd, TestInfo) +GTEST_REVERSE_REPEATER_METHOD_(OnTestCaseEnd, TestCase) +GTEST_REVERSE_REPEATER_METHOD_(OnTestProgramEnd, UnitTest) + +#undef GTEST_REPEATER_METHOD_ +#undef GTEST_REVERSE_REPEATER_METHOD_ + +void TestEventRepeater::OnTestIterationStart(const UnitTest& unit_test, + int iteration) { + if (forwarding_enabled_) { + for (size_t i = 0; i < listeners_.size(); i++) { + listeners_[i]->OnTestIterationStart(unit_test, iteration); + } + } +} + +void TestEventRepeater::OnTestIterationEnd(const UnitTest& unit_test, + int iteration) { + if (forwarding_enabled_) { + for (int i = static_cast(listeners_.size()) - 1; i >= 0; i--) { + listeners_[i]->OnTestIterationEnd(unit_test, iteration); + } + } +} + +// End TestEventRepeater + +// This class generates an XML output file. +class XmlUnitTestResultPrinter : public EmptyTestEventListener { + public: + explicit XmlUnitTestResultPrinter(const char* output_file); + + virtual void OnTestIterationEnd(const UnitTest& unit_test, int iteration); + + private: + // Is c a whitespace character that is normalized to a space character + // when it appears in an XML attribute value? 
+ static bool IsNormalizableWhitespace(char c) { + return c == 0x9 || c == 0xA || c == 0xD; + } + + // May c appear in a well-formed XML document? + static bool IsValidXmlCharacter(char c) { + return IsNormalizableWhitespace(c) || c >= 0x20; + } + + // Returns an XML-escaped copy of the input string str. If + // is_attribute is true, the text is meant to appear as an attribute + // value, and normalizable whitespace is preserved by replacing it + // with character references. + static String EscapeXml(const char* str, bool is_attribute); + + // Returns the given string with all characters invalid in XML removed. + static string RemoveInvalidXmlCharacters(const string& str); + + // Convenience wrapper around EscapeXml when str is an attribute value. + static String EscapeXmlAttribute(const char* str) { + return EscapeXml(str, true); + } + + // Convenience wrapper around EscapeXml when str is not an attribute value. + static String EscapeXmlText(const char* str) { return EscapeXml(str, false); } + + // Streams an XML CDATA section, escaping invalid CDATA sequences as needed. + static void OutputXmlCDataSection(::std::ostream* stream, const char* data); + + // Streams an XML representation of a TestInfo object. + static void OutputXmlTestInfo(::std::ostream* stream, + const char* test_case_name, + const TestInfo& test_info); + + // Prints an XML representation of a TestCase object + static void PrintXmlTestCase(FILE* out, const TestCase& test_case); + + // Prints an XML summary of unit_test to output stream out. + static void PrintXmlUnitTest(FILE* out, const UnitTest& unit_test); + + // Produces a string representing the test properties in a result as space + // delimited XML attributes based on the property key="value" pairs. + // When the String is not empty, it includes a space at the beginning, + // to delimit this attribute from prior attributes. + static String TestPropertiesAsXmlAttributes(const TestResult& result); + + // The output file. 
+ const String output_file_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(XmlUnitTestResultPrinter); +}; + +// Creates a new XmlUnitTestResultPrinter. +XmlUnitTestResultPrinter::XmlUnitTestResultPrinter(const char* output_file) + : output_file_(output_file) { + if (output_file_.c_str() == NULL || output_file_.empty()) { + fprintf(stderr, "XML output file may not be null\n"); + fflush(stderr); + exit(EXIT_FAILURE); + } +} + +// Called after the unit test ends. +void XmlUnitTestResultPrinter::OnTestIterationEnd(const UnitTest& unit_test, + int /*iteration*/) { + FILE* xmlout = NULL; + FilePath output_file(output_file_); + FilePath output_dir(output_file.RemoveFileName()); + + if (output_dir.CreateDirectoriesRecursively()) { + xmlout = posix::FOpen(output_file_.c_str(), "w"); + } + if (xmlout == NULL) { + // TODO(wan): report the reason of the failure. + // + // We don't do it for now as: + // + // 1. There is no urgent need for it. + // 2. It's a bit involved to make the errno variable thread-safe on + // all three operating systems (Linux, Windows, and Mac OS). + // 3. To interpret the meaning of errno in a thread-safe way, + // we need the strerror_r() function, which is not available on + // Windows. + fprintf(stderr, + "Unable to open file \"%s\"\n", + output_file_.c_str()); + fflush(stderr); + exit(EXIT_FAILURE); + } + PrintXmlUnitTest(xmlout, unit_test); + fclose(xmlout); +} + +// Returns an XML-escaped copy of the input string str. If is_attribute +// is true, the text is meant to appear as an attribute value, and +// normalizable whitespace is preserved by replacing it with character +// references. +// +// Invalid XML characters in str, if any, are stripped from the output. +// It is expected that most, if not all, of the text processed by this +// module will consist of ordinary English text. +// If this module is ever modified to produce version 1.1 XML output, +// most invalid characters can be retained using character references. 
+// TODO(wan): It might be nice to have a minimally invasive, human-readable
+// escaping scheme for invalid characters, rather than dropping them.
+String XmlUnitTestResultPrinter::EscapeXml(const char* str, bool is_attribute) {
+  Message m;
+
+  if (str != NULL) {
+    for (const char* src = str; *src; ++src) {
+      switch (*src) {
+        // NOTE(review): the entity strings below were destroyed by markup
+        // stripping in extraction ('m << "<"' etc.); restored to the XML
+        // predefined entities, matching upstream googletest.
+        case '<':
+          m << "&lt;";
+          break;
+        case '>':
+          m << "&gt;";
+          break;
+        case '&':
+          m << "&amp;";
+          break;
+        case '\'':
+          if (is_attribute)
+            m << "&apos;";
+          else
+            m << '\'';
+          break;
+        case '"':
+          if (is_attribute)
+            m << "&quot;";
+          else
+            m << '"';
+          break;
+        default:
+          if (IsValidXmlCharacter(*src)) {
+            if (is_attribute && IsNormalizableWhitespace(*src))
+              // Preserve normalizable whitespace in attribute values by
+              // emitting a numeric character reference instead.
+              m << String::Format("&#x%02X;", unsigned(*src));
+            else
+              m << *src;
+          }
+          break;
+      }
+    }
+  }
+
+  return m.GetString();
+}
+
+// Returns the given string with all characters invalid in XML removed.
+// Currently invalid characters are dropped from the string.  An
+// alternative is to replace them with certain characters such as . or ?.
+string XmlUnitTestResultPrinter::RemoveInvalidXmlCharacters(const string& str) {
+  string output;
+  output.reserve(str.size());
+  for (string::const_iterator it = str.begin(); it != str.end(); ++it)
+    if (IsValidXmlCharacter(*it))
+      output.push_back(*it);
+
+  return output;
+}
+
+// The following routines generate an XML representation of a UnitTest
+// object.
+//
+// This is how Google Test concepts map to the DTD:
+//
+// <testsuites name="AllTests">        <-- corresponds to a UnitTest object
+//   <testsuite name="testcase-name">  <-- corresponds to a TestCase object
+//     <testcase name="test-name">     <-- corresponds to a TestInfo object
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//       <failure message="...">...</failure>
+//                                     <-- individual assertion failures
+//     </testcase>
+//   </testsuite>
+// </testsuites>
+
+// Formats the given time in milliseconds as seconds.
+std::string FormatTimeInMillisAsSeconds(TimeInMillis ms) {
+  ::std::stringstream ss;
+  ss << ms/1000.0;
+  return ss.str();
+}
+
+// Streams an XML CDATA section, escaping invalid CDATA sequences as needed.
+void XmlUnitTestResultPrinter::OutputXmlCDataSection(::std::ostream* stream,
+                                                     const char* data) {
+  // A CDATA section cannot contain the sequence "]]>"; when it occurs in
+  // data, close the section, emit an escaped "]]&gt;", and reopen.
+  // NOTE(review): the CDATA delimiters and the strstr() call were destroyed
+  // by markup stripping in extraction; restored from upstream googletest.
+  const char* segment = data;
+  *stream << "<![CDATA[";
+  for (;;) {
+    const char* const next_segment = strstr(segment, "]]>");
+    if (next_segment != NULL) {
+      stream->write(
+          segment, static_cast<std::streamsize>(next_segment - segment));
+      *stream << "]]>]]&gt;<![CDATA[";
+      segment = next_segment + strlen("]]>");
+    } else {
+      *stream << segment;
+      break;
+    }
+  }
+  *stream << "]]>";
+}
+
+// Prints an XML representation of a TestInfo object.
+// TODO(wan): There is also value in printing properties with the plain printer.
+void XmlUnitTestResultPrinter::OutputXmlTestInfo(::std::ostream* stream,
+                                                 const char* test_case_name,
+                                                 const TestInfo& test_info) {
+  const TestResult& result = *test_info.result();
+  *stream << "    <testcase name=\""
+          << EscapeXmlAttribute(test_info.name()).c_str()
+          << "\" status=\"run\""
+          << " time=\"" << FormatTimeInMillisAsSeconds(result.elapsed_time())
+          << "\" classname=\"" << EscapeXmlAttribute(test_case_name).c_str()
+          << "\"" << TestPropertiesAsXmlAttributes(result).c_str();
+
+  int failures = 0;
+  for (int i = 0; i < result.total_part_count(); ++i) {
+    const TestPartResult& part = result.GetTestPartResult(i);
+    if (part.failed()) {
+      if (++failures == 1)
+        *stream << ">\n";
+      *stream << "      <failure message=\""
+              << EscapeXmlAttribute(part.summary()).c_str()
+              << "\" type=\"\">";
+      const string location = internal::FormatCompilerIndependentFileLocation(
+          part.file_name(), part.line_number());
+      const string message = location + "\n" + part.message();
+      OutputXmlCDataSection(stream,
+                            RemoveInvalidXmlCharacters(message).c_str());
+      *stream << "</failure>\n";
+    }
+  }
+
+  // Self-closing element when there were no failures.
+  if (failures == 0)
+    *stream << " />\n";
+  else
+    *stream << "    </testcase>\n";
+}
+
+// Prints an XML representation of a TestCase object
+void XmlUnitTestResultPrinter::PrintXmlTestCase(FILE* out,
+                                                const TestCase& test_case) {
+  fprintf(out,
+          "  <testsuite name=\"%s\" tests=\"%d\" failures=\"%d\" "
+          "disabled=\"%d\" ",
+          EscapeXmlAttribute(test_case.name()).c_str(),
+          test_case.total_test_count(),
+          test_case.failed_test_count(),
+          test_case.disabled_test_count());
+  fprintf(out,
+          "errors=\"0\" time=\"%s\">\n",
+          FormatTimeInMillisAsSeconds(test_case.elapsed_time()).c_str());
+  for (int i = 0; i < test_case.total_test_count(); ++i) {
+    ::std::stringstream stream;
+    OutputXmlTestInfo(&stream, test_case.name(), *test_case.GetTestInfo(i));
+    fprintf(out, "%s", StringStreamToString(&stream).c_str());
+  }
+  fprintf(out, "  </testsuite>\n");
+}
+
+// Prints an XML summary of unit_test to output stream out.
+void XmlUnitTestResultPrinter::PrintXmlUnitTest(FILE* out, + const UnitTest& unit_test) { + fprintf(out, "\n"); + fprintf(out, + "\n"); + for (int i = 0; i < unit_test.total_test_case_count(); ++i) + PrintXmlTestCase(out, *unit_test.GetTestCase(i)); + fprintf(out, "\n"); +} + +// Produces a string representing the test properties in a result as space +// delimited XML attributes based on the property key="value" pairs. +String XmlUnitTestResultPrinter::TestPropertiesAsXmlAttributes( + const TestResult& result) { + Message attributes; + for (int i = 0; i < result.test_property_count(); ++i) { + const TestProperty& property = result.GetTestProperty(i); + attributes << " " << property.key() << "=" + << "\"" << EscapeXmlAttribute(property.value()) << "\""; + } + return attributes.GetString(); +} + +// End XmlUnitTestResultPrinter + +#if GTEST_CAN_STREAM_RESULTS_ + +// Streams test results to the given port on the given host machine. +class StreamingListener : public EmptyTestEventListener { + public: + // Escapes '=', '&', '%', and '\n' characters in str as "%xx". + static string UrlEncode(const char* str); + + StreamingListener(const string& host, const string& port) + : sockfd_(-1), host_name_(host), port_num_(port) { + MakeConnection(); + Send("gtest_streaming_protocol_version=1.0\n"); + } + + virtual ~StreamingListener() { + if (sockfd_ != -1) + CloseConnection(); + } + + void OnTestProgramStart(const UnitTest& /* unit_test */) { + Send("event=TestProgramStart\n"); + } + + void OnTestProgramEnd(const UnitTest& unit_test) { + // Note that Google Test current only report elapsed time for each + // test iteration, not for the entire test program. + Send(String::Format("event=TestProgramEnd&passed=%d\n", + unit_test.Passed())); + + // Notify the streaming server to stop. 
+ CloseConnection(); + } + + void OnTestIterationStart(const UnitTest& /* unit_test */, int iteration) { + Send(String::Format("event=TestIterationStart&iteration=%d\n", + iteration)); + } + + void OnTestIterationEnd(const UnitTest& unit_test, int /* iteration */) { + Send(String::Format("event=TestIterationEnd&passed=%d&elapsed_time=%sms\n", + unit_test.Passed(), + StreamableToString(unit_test.elapsed_time()).c_str())); + } + + void OnTestCaseStart(const TestCase& test_case) { + Send(String::Format("event=TestCaseStart&name=%s\n", test_case.name())); + } + + void OnTestCaseEnd(const TestCase& test_case) { + Send(String::Format("event=TestCaseEnd&passed=%d&elapsed_time=%sms\n", + test_case.Passed(), + StreamableToString(test_case.elapsed_time()).c_str())); + } + + void OnTestStart(const TestInfo& test_info) { + Send(String::Format("event=TestStart&name=%s\n", test_info.name())); + } + + void OnTestEnd(const TestInfo& test_info) { + Send(String::Format( + "event=TestEnd&passed=%d&elapsed_time=%sms\n", + (test_info.result())->Passed(), + StreamableToString((test_info.result())->elapsed_time()).c_str())); + } + + void OnTestPartResult(const TestPartResult& test_part_result) { + const char* file_name = test_part_result.file_name(); + if (file_name == NULL) + file_name = ""; + Send(String::Format("event=TestPartResult&file=%s&line=%d&message=", + UrlEncode(file_name).c_str(), + test_part_result.line_number())); + Send(UrlEncode(test_part_result.message()) + "\n"); + } + + private: + // Creates a client socket and connects to the server. + void MakeConnection(); + + // Closes the socket. + void CloseConnection() { + GTEST_CHECK_(sockfd_ != -1) + << "CloseConnection() can be called only when there is a connection."; + + close(sockfd_); + sockfd_ = -1; + } + + // Sends a string to the socket. 
+ void Send(const string& message) { + GTEST_CHECK_(sockfd_ != -1) + << "Send() can be called only when there is a connection."; + + const int len = static_cast(message.length()); + if (write(sockfd_, message.c_str(), len) != len) { + GTEST_LOG_(WARNING) + << "stream_result_to: failed to stream to " + << host_name_ << ":" << port_num_; + } + } + + int sockfd_; // socket file descriptor + const string host_name_; + const string port_num_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(StreamingListener); +}; // class StreamingListener + +// Checks if str contains '=', '&', '%' or '\n' characters. If yes, +// replaces them by "%xx" where xx is their hexadecimal value. For +// example, replaces "=" with "%3D". This algorithm is O(strlen(str)) +// in both time and space -- important as the input str may contain an +// arbitrarily long test failure message and stack trace. +string StreamingListener::UrlEncode(const char* str) { + string result; + result.reserve(strlen(str) + 1); + for (char ch = *str; ch != '\0'; ch = *++str) { + switch (ch) { + case '%': + case '=': + case '&': + case '\n': + result.append(String::Format("%%%02x", static_cast(ch))); + break; + default: + result.push_back(ch); + break; + } + } + return result; +} + +void StreamingListener::MakeConnection() { + GTEST_CHECK_(sockfd_ == -1) + << "MakeConnection() can't be called when there is already a connection."; + + addrinfo hints; + memset(&hints, 0, sizeof(hints)); + hints.ai_family = AF_UNSPEC; // To allow both IPv4 and IPv6 addresses. + hints.ai_socktype = SOCK_STREAM; + addrinfo* servinfo = NULL; + + // Use the getaddrinfo() to get a linked list of IP addresses for + // the given host name. + const int error_num = getaddrinfo( + host_name_.c_str(), port_num_.c_str(), &hints, &servinfo); + if (error_num != 0) { + GTEST_LOG_(WARNING) << "stream_result_to: getaddrinfo() failed: " + << gai_strerror(error_num); + } + + // Loop through all the results and connect to the first we can. 
+ for (addrinfo* cur_addr = servinfo; sockfd_ == -1 && cur_addr != NULL; + cur_addr = cur_addr->ai_next) { + sockfd_ = socket( + cur_addr->ai_family, cur_addr->ai_socktype, cur_addr->ai_protocol); + if (sockfd_ != -1) { + // Connect the client socket to the server socket. + if (connect(sockfd_, cur_addr->ai_addr, cur_addr->ai_addrlen) == -1) { + close(sockfd_); + sockfd_ = -1; + } + } + } + + freeaddrinfo(servinfo); // all done with this structure + + if (sockfd_ == -1) { + GTEST_LOG_(WARNING) << "stream_result_to: failed to connect to " + << host_name_ << ":" << port_num_; + } +} + +// End of class Streaming Listener +#endif // GTEST_CAN_STREAM_RESULTS__ + +// Class ScopedTrace + +// Pushes the given source file location and message onto a per-thread +// trace stack maintained by Google Test. +// L < UnitTest::mutex_ +ScopedTrace::ScopedTrace(const char* file, int line, const Message& message) { + TraceInfo trace; + trace.file = file; + trace.line = line; + trace.message = message.GetString(); + + UnitTest::GetInstance()->PushGTestTrace(trace); +} + +// Pops the info pushed by the c'tor. +// L < UnitTest::mutex_ +ScopedTrace::~ScopedTrace() { + UnitTest::GetInstance()->PopGTestTrace(); +} + + +// class OsStackTraceGetter + +// Returns the current OS stack trace as a String. Parameters: +// +// max_depth - the maximum number of stack frames to be included +// in the trace. +// skip_count - the number of top frames to be skipped; doesn't count +// against max_depth. +// +// L < mutex_ +// We use "L < mutex_" to denote that the function may acquire mutex_. +String OsStackTraceGetter::CurrentStackTrace(int, int) { + return String(""); +} + +// L < mutex_ +void OsStackTraceGetter::UponLeavingGTest() { +} + +const char* const +OsStackTraceGetter::kElidedFramesMarker = + "... 
" GTEST_NAME_ " internal frames ..."; + +} // namespace internal + +// class TestEventListeners + +TestEventListeners::TestEventListeners() + : repeater_(new internal::TestEventRepeater()), + default_result_printer_(NULL), + default_xml_generator_(NULL) { +} + +TestEventListeners::~TestEventListeners() { delete repeater_; } + +// Returns the standard listener responsible for the default console +// output. Can be removed from the listeners list to shut down default +// console output. Note that removing this object from the listener list +// with Release transfers its ownership to the user. +void TestEventListeners::Append(TestEventListener* listener) { + repeater_->Append(listener); +} + +// Removes the given event listener from the list and returns it. It then +// becomes the caller's responsibility to delete the listener. Returns +// NULL if the listener is not found in the list. +TestEventListener* TestEventListeners::Release(TestEventListener* listener) { + if (listener == default_result_printer_) + default_result_printer_ = NULL; + else if (listener == default_xml_generator_) + default_xml_generator_ = NULL; + return repeater_->Release(listener); +} + +// Returns repeater that broadcasts the TestEventListener events to all +// subscribers. +TestEventListener* TestEventListeners::repeater() { return repeater_; } + +// Sets the default_result_printer attribute to the provided listener. +// The listener is also added to the listener list and previous +// default_result_printer is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultResultPrinter(TestEventListener* listener) { + if (default_result_printer_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. 
+ delete Release(default_result_printer_); + default_result_printer_ = listener; + if (listener != NULL) + Append(listener); + } +} + +// Sets the default_xml_generator attribute to the provided listener. The +// listener is also added to the listener list and previous +// default_xml_generator is removed from it and deleted. The listener can +// also be NULL in which case it will not be added to the list. Does +// nothing if the previous and the current listener objects are the same. +void TestEventListeners::SetDefaultXmlGenerator(TestEventListener* listener) { + if (default_xml_generator_ != listener) { + // It is an error to pass this method a listener that is already in the + // list. + delete Release(default_xml_generator_); + default_xml_generator_ = listener; + if (listener != NULL) + Append(listener); + } +} + +// Controls whether events will be forwarded by the repeater to the +// listeners in the list. +bool TestEventListeners::EventForwardingEnabled() const { + return repeater_->forwarding_enabled(); +} + +void TestEventListeners::SuppressEventForwarding() { + repeater_->set_forwarding_enabled(false); +} + +// class UnitTest + +// Gets the singleton UnitTest object. The first time this method is +// called, a UnitTest object is constructed and returned. Consecutive +// calls will return the same object. +// +// We don't protect this under mutex_ as a user is not supposed to +// call this before main() starts, from which point on the return +// value will never change. +UnitTest * UnitTest::GetInstance() { + // When compiled with MSVC 7.1 in optimized mode, destroying the + // UnitTest object upon exiting the program messes up the exit code, + // causing successful tests to appear failed. We have to use a + // different implementation in this case to bypass the compiler bug. + // This implementation makes the compiler happy, at the cost of + // leaking the UnitTest object. 
+ + // CodeGear C++Builder insists on a public destructor for the + // default implementation. Use this implementation to keep good OO + // design with private destructor. + +#if (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) + static UnitTest* const instance = new UnitTest; + return instance; +#else + static UnitTest instance; + return &instance; +#endif // (_MSC_VER == 1310 && !defined(_DEBUG)) || defined(__BORLANDC__) +} + +// Gets the number of successful test cases. +int UnitTest::successful_test_case_count() const { + return impl()->successful_test_case_count(); +} + +// Gets the number of failed test cases. +int UnitTest::failed_test_case_count() const { + return impl()->failed_test_case_count(); +} + +// Gets the number of all test cases. +int UnitTest::total_test_case_count() const { + return impl()->total_test_case_count(); +} + +// Gets the number of all test cases that contain at least one test +// that should run. +int UnitTest::test_case_to_run_count() const { + return impl()->test_case_to_run_count(); +} + +// Gets the number of successful tests. +int UnitTest::successful_test_count() const { + return impl()->successful_test_count(); +} + +// Gets the number of failed tests. +int UnitTest::failed_test_count() const { return impl()->failed_test_count(); } + +// Gets the number of disabled tests. +int UnitTest::disabled_test_count() const { + return impl()->disabled_test_count(); +} + +// Gets the number of all tests. +int UnitTest::total_test_count() const { return impl()->total_test_count(); } + +// Gets the number of tests that should run. +int UnitTest::test_to_run_count() const { return impl()->test_to_run_count(); } + +// Gets the elapsed time, in milliseconds. +internal::TimeInMillis UnitTest::elapsed_time() const { + return impl()->elapsed_time(); +} + +// Returns true iff the unit test passed (i.e. all test cases passed). 
+bool UnitTest::Passed() const { return impl()->Passed(); }
+
+// Returns true iff the unit test failed (i.e. some test case failed
+// or something outside of all tests failed).
+bool UnitTest::Failed() const { return impl()->Failed(); }
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+const TestCase* UnitTest::GetTestCase(int i) const {
+  return impl()->GetTestCase(i);
+}
+
+// Gets the i-th test case among all the test cases. i can range from 0 to
+// total_test_case_count() - 1. If i is not in that range, returns NULL.
+TestCase* UnitTest::GetMutableTestCase(int i) {
+  return impl()->GetMutableTestCase(i);
+}
+
+// Returns the list of event listeners that can be used to track events
+// inside Google Test.
+TestEventListeners& UnitTest::listeners() {
+  return *impl()->listeners();
+}
+
+// Registers and returns a global test environment.  When a test
+// program is run, all global test environments will be set-up in the
+// order they were registered.  After all tests in the program have
+// finished, all global test environments will be torn-down in the
+// *reverse* order they were registered.
+//
+// The UnitTest object takes ownership of the given environment.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+Environment* UnitTest::AddEnvironment(Environment* env) {
+  if (env == NULL) {
+    return NULL;
+  }
+
+  impl_->environments().push_back(env);
+  return env;
+}
+
+// Adds a TestPartResult to the current TestResult object.  All Google Test
+// assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) eventually call
+// this to report their results.  The user code should use the
+// assertion macros instead of calling this directly.
+// L < mutex_
+void UnitTest::AddTestPartResult(TestPartResult::Type result_type,
+                                 const char* file_name,
+                                 int line_number,
+                                 const internal::String& message,
+                                 const internal::String& os_stack_trace) {
+  Message msg;
+  msg << message;
+
+  internal::MutexLock lock(&mutex_);
+  if (impl_->gtest_trace_stack().size() > 0) {
+    msg << "\n" << GTEST_NAME_ << " trace:";
+
+    for (int i = static_cast<int>(impl_->gtest_trace_stack().size());
+         i > 0; --i) {
+      const internal::TraceInfo& trace = impl_->gtest_trace_stack()[i - 1];
+      msg << "\n" << internal::FormatFileLocation(trace.file, trace.line)
+          << " " << trace.message;
+    }
+  }
+
+  if (os_stack_trace.c_str() != NULL && !os_stack_trace.empty()) {
+    msg << internal::kStackTraceMarker << os_stack_trace;
+  }
+
+  const TestPartResult result =
+    TestPartResult(result_type, file_name, line_number,
+                   msg.GetString().c_str());
+  impl_->GetTestPartResultReporterForCurrentThread()->
+      ReportTestPartResult(result);
+
+  if (result_type != TestPartResult::kSuccess) {
+    // gtest_break_on_failure takes precedence over
+    // gtest_throw_on_failure.  This allows a user to set the latter
+    // in the code (perhaps in order to use Google Test assertions
+    // with another testing framework) and specify the former on the
+    // command line for debugging.
+    if (GTEST_FLAG(break_on_failure)) {
+#if GTEST_OS_WINDOWS
+      // Using DebugBreak on Windows allows gtest to still break into a debugger
+      // when a failure happens and both the --gtest_break_on_failure and
+      // the --gtest_catch_exceptions flags are specified.
+      DebugBreak();
+#else
+      // Dereference NULL through a volatile pointer to prevent the compiler
+      // from removing. We use this rather than abort() or __builtin_trap() for
+      // portability: Symbian doesn't implement abort() well, and some debuggers
+      // don't correctly trap abort().
+      // NOTE(review): the cast's target type was lost in extraction;
+      // restored from upstream googletest.
+      *static_cast<volatile int*>(NULL) = 1;
+#endif  // GTEST_OS_WINDOWS
+    } else if (GTEST_FLAG(throw_on_failure)) {
+#if GTEST_HAS_EXCEPTIONS
+      throw GoogleTestFailureException(result);
+#else
+      // We cannot call abort() as it generates a pop-up in debug mode
+      // that cannot be suppressed in VC 7.1 or below.
+      exit(1);
+#endif
+    }
+  }
+}
+
+// Creates and adds a property to the current TestResult. If a property matching
+// the supplied value already exists, updates its value instead.
+void UnitTest::RecordPropertyForCurrentTest(const char* key,
+                                            const char* value) {
+  const TestProperty test_property(key, value);
+  impl_->current_test_result()->RecordProperty(test_property);
+}
+
+// Runs all tests in this UnitTest object and prints the result.
+// Returns 0 if successful, or 1 otherwise.
+//
+// We don't protect this under mutex_, as we only support calling it
+// from the main thread.
+int UnitTest::Run() {
+  // Captures the value of GTEST_FLAG(catch_exceptions).  This value will be
+  // used for the duration of the program.
+  impl()->set_catch_exceptions(GTEST_FLAG(catch_exceptions));
+
+#if GTEST_HAS_SEH
+  const bool in_death_test_child_process =
+      internal::GTEST_FLAG(internal_run_death_test).length() > 0;
+
+  // Either the user wants Google Test to catch exceptions thrown by the
+  // tests or this is executing in the context of death test child
+  // process. In either case the user does not want to see pop-up dialogs
+  // about crashes - they are expected.
+  if (impl()->catch_exceptions() || in_death_test_child_process) {
+
+# if !GTEST_OS_WINDOWS_MOBILE
+    // SetErrorMode doesn't exist on CE.
+    SetErrorMode(SEM_FAILCRITICALERRORS | SEM_NOALIGNMENTFAULTEXCEPT |
+                 SEM_NOGPFAULTERRORBOX | SEM_NOOPENFILEERRORBOX);
+# endif  // !GTEST_OS_WINDOWS_MOBILE
+
+# if (defined(_MSC_VER) || GTEST_OS_WINDOWS_MINGW) && !GTEST_OS_WINDOWS_MOBILE
+    // Death test children can be terminated with _abort().  On Windows,
+    // _abort() can show a dialog with a warning message.  This forces the
+    // abort message to go to stderr instead.
+    _set_error_mode(_OUT_TO_STDERR);
+# endif
+
+# if _MSC_VER >= 1400 && !GTEST_OS_WINDOWS_MOBILE
+    // In the debug version, Visual Studio pops up a separate dialog
+    // offering a choice to debug the aborted program. We need to suppress
+    // this dialog or it will pop up for every EXPECT/ASSERT_DEATH statement
+    // executed. Google Test will notify the user of any unexpected
+    // failure via stderr.
+    //
+    // VC++ doesn't define _set_abort_behavior() prior to the version 8.0.
+    // Users of prior VC versions shall suffer the agony and pain of
+    // clicking through the countless debug dialogs.
+    // TODO(vladl@google.com): find a way to suppress the abort dialog() in the
+    // debug mode when compiled with VC 7.1 or lower.
+    if (!GTEST_FLAG(break_on_failure))
+      _set_abort_behavior(
+          0x0,                                    // Clear the following flags:
+          _WRITE_ABORT_MSG | _CALL_REPORTFAULT);  // pop-up window, core dump.
+# endif
+
+  }
+#endif  // GTEST_HAS_SEH
+
+  return internal::HandleExceptionsInMethodIfSupported(
+      impl(),
+      &internal::UnitTestImpl::RunAllTests,
+      "auxiliary test code (environments or event listeners)") ? 0 : 1;
+}
+
+// Returns the working directory when the first TEST() or TEST_F() was
+// executed.
+const char* UnitTest::original_working_dir() const {
+  return impl_->original_working_dir_.c_str();
+}
+
+// Returns the TestCase object for the test that's currently running,
+// or NULL if no test is running.
+// L < mutex_
+const TestCase* UnitTest::current_test_case() const {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_case();
+}
+
+// Returns the TestInfo object for the test that's currently running,
+// or NULL if no test is running.
+// L < mutex_
+const TestInfo* UnitTest::current_test_info() const {
+  internal::MutexLock lock(&mutex_);
+  return impl_->current_test_info();
+}
+
+// Returns the random seed used at the start of the current test run.
+int UnitTest::random_seed() const { return impl_->random_seed(); }
+
+#if GTEST_HAS_PARAM_TEST
+// Returns ParameterizedTestCaseRegistry object used to keep track of
+// value-parameterized tests and instantiate and register them.
+// L < mutex_
+internal::ParameterizedTestCaseRegistry&
+    UnitTest::parameterized_test_registry() {
+  return impl_->parameterized_test_registry();
+}
+#endif  // GTEST_HAS_PARAM_TEST
+
+// Creates an empty UnitTest.
+UnitTest::UnitTest() {
+  impl_ = new internal::UnitTestImpl(this);
+}
+
+// Destructor of UnitTest.
+UnitTest::~UnitTest() {
+  delete impl_;
+}
+
+// Pushes a trace defined by SCOPED_TRACE() on to the per-thread
+// Google Test trace stack.
+// L < mutex_
+void UnitTest::PushGTestTrace(const internal::TraceInfo& trace) {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().push_back(trace);
+}
+
+// Pops a trace from the per-thread Google Test trace stack.
+// L < mutex_
+void UnitTest::PopGTestTrace() {
+  internal::MutexLock lock(&mutex_);
+  impl_->gtest_trace_stack().pop_back();
+}
+
+namespace internal {
+
+UnitTestImpl::UnitTestImpl(UnitTest* parent)
+    : parent_(parent),
+#ifdef _MSC_VER
+# pragma warning(push)                    // Saves the current warning state.
+# pragma warning(disable:4355)            // Temporarily disables warning 4355
+                                          // (using this in initializer).
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+# pragma warning(pop)                     // Restores the warning state again.
+#else
+      default_global_test_part_result_reporter_(this),
+      default_per_thread_test_part_result_reporter_(this),
+#endif  // _MSC_VER
+      global_test_part_result_repoter_(
+          &default_global_test_part_result_reporter_),
+      per_thread_test_part_result_reporter_(
+          &default_per_thread_test_part_result_reporter_),
+#if GTEST_HAS_PARAM_TEST
+      parameterized_test_registry_(),
+      parameterized_tests_registered_(false),
+#endif  // GTEST_HAS_PARAM_TEST
+      last_death_test_case_(-1),
+      current_test_case_(NULL),
+      current_test_info_(NULL),
+      ad_hoc_test_result_(),
+      os_stack_trace_getter_(NULL),
+      post_flag_parse_init_performed_(false),
+      random_seed_(0),  // Will be overridden by the flag before first use.
+      random_(0),  // Will be reseeded before first use.
+      elapsed_time_(0),
+#if GTEST_HAS_DEATH_TEST
+      internal_run_death_test_flag_(NULL),
+      death_test_factory_(new DefaultDeathTestFactory),
+#endif
+      // Will be overridden by the flag before first use.
+      catch_exceptions_(false) {
+  listeners()->SetDefaultResultPrinter(new PrettyUnitTestResultPrinter);
+}
+
+UnitTestImpl::~UnitTestImpl() {
+  // Deletes every TestCase.
+  // NOTE(review): the Delete<> template arguments below were lost in
+  // extraction; restored from upstream googletest.
+  ForEach(test_cases_, internal::Delete<TestCase>);
+
+  // Deletes every Environment.
+  ForEach(environments_, internal::Delete<Environment>);
+
+  delete os_stack_trace_getter_;
+}
+
+#if GTEST_HAS_DEATH_TEST
+// Disables event forwarding if the control is currently in a death test
+// subprocess. Must not be called before InitGoogleTest.
+void UnitTestImpl::SuppressTestEventsIfInSubprocess() {
+  if (internal_run_death_test_flag_.get() != NULL)
+    listeners()->SuppressEventForwarding();
+}
+#endif  // GTEST_HAS_DEATH_TEST
+
+// Initializes event listeners performing XML output as specified by
+// UnitTestOptions. Must not be called before InitGoogleTest.
+void UnitTestImpl::ConfigureXmlOutput() { + const String& output_format = UnitTestOptions::GetOutputFormat(); + if (output_format == "xml") { + listeners()->SetDefaultXmlGenerator(new XmlUnitTestResultPrinter( + UnitTestOptions::GetAbsolutePathToOutputFile().c_str())); + } else if (output_format != "") { + printf("WARNING: unrecognized output format \"%s\" ignored.\n", + output_format.c_str()); + fflush(stdout); + } +} + +#if GTEST_CAN_STREAM_RESULTS_ +// Initializes event listeners for streaming test results in String form. +// Must not be called before InitGoogleTest. +void UnitTestImpl::ConfigureStreamingOutput() { + const string& target = GTEST_FLAG(stream_result_to); + if (!target.empty()) { + const size_t pos = target.find(':'); + if (pos != string::npos) { + listeners()->Append(new StreamingListener(target.substr(0, pos), + target.substr(pos+1))); + } else { + printf("WARNING: unrecognized streaming target \"%s\" ignored.\n", + target.c_str()); + fflush(stdout); + } + } +} +#endif // GTEST_CAN_STREAM_RESULTS_ + +// Performs initialization dependent upon flag values obtained in +// ParseGoogleTestFlagsOnly. Is called from InitGoogleTest after the call to +// ParseGoogleTestFlagsOnly. In case a user neglects to call InitGoogleTest +// this function is also called from RunAllTests. Since this function can be +// called more than once, it has to be idempotent. +void UnitTestImpl::PostFlagParsingInit() { + // Ensures that this function does not execute more than once. + if (!post_flag_parse_init_performed_) { + post_flag_parse_init_performed_ = true; + +#if GTEST_HAS_DEATH_TEST + InitDeathTestSubprocessControlInfo(); + SuppressTestEventsIfInSubprocess(); +#endif // GTEST_HAS_DEATH_TEST + + // Registers parameterized tests. This makes parameterized tests + // available to the UnitTest reflection API without running + // RUN_ALL_TESTS. + RegisterParameterizedTests(); + + // Configures listeners for XML output. 
This makes it possible for users + // to shut down the default XML output before invoking RUN_ALL_TESTS. + ConfigureXmlOutput(); + +#if GTEST_CAN_STREAM_RESULTS_ + // Configures listeners for streaming test results to the specified server. + ConfigureStreamingOutput(); +#endif // GTEST_CAN_STREAM_RESULTS_ + } +} + +// A predicate that checks the name of a TestCase against a known +// value. +// +// This is used for implementation of the UnitTest class only. We put +// it in the anonymous namespace to prevent polluting the outer +// namespace. +// +// TestCaseNameIs is copyable. +class TestCaseNameIs { + public: + // Constructor. + explicit TestCaseNameIs(const String& name) + : name_(name) {} + + // Returns true iff the name of test_case matches name_. + bool operator()(const TestCase* test_case) const { + return test_case != NULL && strcmp(test_case->name(), name_.c_str()) == 0; + } + + private: + String name_; +}; + +// Finds and returns a TestCase with the given name. If one doesn't +// exist, creates one and returns it. It's the CALLER'S +// RESPONSIBILITY to ensure that this function is only called WHEN THE +// TESTS ARE NOT SHUFFLED. +// +// Arguments: +// +// test_case_name: name of the test case +// type_param: the name of the test case's type parameter, or NULL if +// this is not a typed or a type-parameterized test case. +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +TestCase* UnitTestImpl::GetTestCase(const char* test_case_name, + const char* type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc) { + // Can we find a TestCase with the given name? + const std::vector::const_iterator test_case = + std::find_if(test_cases_.begin(), test_cases_.end(), + TestCaseNameIs(test_case_name)); + + if (test_case != test_cases_.end()) + return *test_case; + + // No. Let's create one. 
+ TestCase* const new_test_case = + new TestCase(test_case_name, type_param, set_up_tc, tear_down_tc); + + // Is this a death test case? + if (internal::UnitTestOptions::MatchesFilter(String(test_case_name), + kDeathTestCaseFilter)) { + // Yes. Inserts the test case after the last death test case + // defined so far. This only works when the test cases haven't + // been shuffled. Otherwise we may end up running a death test + // after a non-death test. + ++last_death_test_case_; + test_cases_.insert(test_cases_.begin() + last_death_test_case_, + new_test_case); + } else { + // No. Appends to the end of the list. + test_cases_.push_back(new_test_case); + } + + test_case_indices_.push_back(static_cast(test_case_indices_.size())); + return new_test_case; +} + +// Helpers for setting up / tearing down the given environment. They +// are for use in the ForEach() function. +static void SetUpEnvironment(Environment* env) { env->SetUp(); } +static void TearDownEnvironment(Environment* env) { env->TearDown(); } + +// Runs all tests in this UnitTest object, prints the result, and +// returns true if all tests are successful. If any exception is +// thrown during a test, the test is considered to be failed, but the +// rest of the tests will still be run. +// +// When parameterized tests are enabled, it expands and registers +// parameterized tests first in RegisterParameterizedTests(). +// All other functions called from RunAllTests() may safely assume that +// parameterized tests are ready to be counted and run. +bool UnitTestImpl::RunAllTests() { + // Makes sure InitGoogleTest() was called. + if (!GTestIsInitialized()) { + printf("%s", + "\nThis test program did NOT call ::testing::InitGoogleTest " + "before calling RUN_ALL_TESTS(). Please fix it.\n"); + return false; + } + + // Do not run any test if the --help flag was specified. 
+ if (g_help_flag) + return true; + + // Repeats the call to the post-flag parsing initialization in case the + // user didn't call InitGoogleTest. + PostFlagParsingInit(); + + // Even if sharding is not on, test runners may want to use the + // GTEST_SHARD_STATUS_FILE to query whether the test supports the sharding + // protocol. + internal::WriteToShardStatusFileIfNeeded(); + + // True iff we are in a subprocess for running a thread-safe-style + // death test. + bool in_subprocess_for_death_test = false; + +#if GTEST_HAS_DEATH_TEST + in_subprocess_for_death_test = (internal_run_death_test_flag_.get() != NULL); +#endif // GTEST_HAS_DEATH_TEST + + const bool should_shard = ShouldShard(kTestTotalShards, kTestShardIndex, + in_subprocess_for_death_test); + + // Compares the full test names with the filter to decide which + // tests to run. + const bool has_tests_to_run = FilterTests(should_shard + ? HONOR_SHARDING_PROTOCOL + : IGNORE_SHARDING_PROTOCOL) > 0; + + // Lists the tests and exits if the --gtest_list_tests flag was specified. + if (GTEST_FLAG(list_tests)) { + // This must be called *after* FilterTests() has been called. + ListTestsMatchingFilter(); + return true; + } + + random_seed_ = GTEST_FLAG(shuffle) ? + GetRandomSeedFromFlag(GTEST_FLAG(random_seed)) : 0; + + // True iff at least one test has failed. + bool failed = false; + + TestEventListener* repeater = listeners()->repeater(); + + repeater->OnTestProgramStart(*parent_); + + // How many times to repeat the tests? We don't want to repeat them + // when we are inside the subprocess of a death test. + const int repeat = in_subprocess_for_death_test ? 1 : GTEST_FLAG(repeat); + // Repeats forever if the repeat count is negative. + const bool forever = repeat < 0; + for (int i = 0; forever || i != repeat; i++) { + // We want to preserve failures generated by ad-hoc test + // assertions executed before RUN_ALL_TESTS(). 
+ ClearNonAdHocTestResult(); + + const TimeInMillis start = GetTimeInMillis(); + + // Shuffles test cases and tests if requested. + if (has_tests_to_run && GTEST_FLAG(shuffle)) { + random()->Reseed(random_seed_); + // This should be done before calling OnTestIterationStart(), + // such that a test event listener can see the actual test order + // in the event. + ShuffleTests(); + } + + // Tells the unit test event listeners that the tests are about to start. + repeater->OnTestIterationStart(*parent_, i); + + // Runs each test case if there is at least one test to run. + if (has_tests_to_run) { + // Sets up all environments beforehand. + repeater->OnEnvironmentsSetUpStart(*parent_); + ForEach(environments_, SetUpEnvironment); + repeater->OnEnvironmentsSetUpEnd(*parent_); + + // Runs the tests only if there was no fatal failure during global + // set-up. + if (!Test::HasFatalFailure()) { + for (int test_index = 0; test_index < total_test_case_count(); + test_index++) { + GetMutableTestCase(test_index)->Run(); + } + } + + // Tears down all environments in reverse order afterwards. + repeater->OnEnvironmentsTearDownStart(*parent_); + std::for_each(environments_.rbegin(), environments_.rend(), + TearDownEnvironment); + repeater->OnEnvironmentsTearDownEnd(*parent_); + } + + elapsed_time_ = GetTimeInMillis() - start; + + // Tells the unit test event listener that the tests have just finished. + repeater->OnTestIterationEnd(*parent_, i); + + // Gets the result and clears it. + if (!Passed()) { + failed = true; + } + + // Restores the original test order after the iteration. This + // allows the user to quickly repro a failure that happens in the + // N-th iteration without repeating the first (N - 1) iterations. + // This is not enclosed in "if (GTEST_FLAG(shuffle)) { ... }", in + // case the user somehow changes the value of the flag somewhere + // (it's always safe to unshuffle the tests). 
+ UnshuffleTests(); + + if (GTEST_FLAG(shuffle)) { + // Picks a new random seed for each iteration. + random_seed_ = GetNextRandomSeed(random_seed_); + } + } + + repeater->OnTestProgramEnd(*parent_); + + return !failed; +} + +// Reads the GTEST_SHARD_STATUS_FILE environment variable, and creates the file +// if the variable is present. If a file already exists at this location, this +// function will write over it. If the variable is present, but the file cannot +// be created, prints an error and exits. +void WriteToShardStatusFileIfNeeded() { + const char* const test_shard_file = posix::GetEnv(kTestShardStatusFile); + if (test_shard_file != NULL) { + FILE* const file = posix::FOpen(test_shard_file, "w"); + if (file == NULL) { + ColoredPrintf(COLOR_RED, + "Could not write to the test shard status file \"%s\" " + "specified by the %s environment variable.\n", + test_shard_file, kTestShardStatusFile); + fflush(stdout); + exit(EXIT_FAILURE); + } + fclose(file); + } +} + +// Checks whether sharding is enabled by examining the relevant +// environment variable values. If the variables are present, +// but inconsistent (i.e., shard_index >= total_shards), prints +// an error and exits. If in_subprocess_for_death_test, sharding is +// disabled because it must only be applied to the original test +// process. Otherwise, we could filter out death tests we intended to execute. 
+bool ShouldShard(const char* total_shards_env, + const char* shard_index_env, + bool in_subprocess_for_death_test) { + if (in_subprocess_for_death_test) { + return false; + } + + const Int32 total_shards = Int32FromEnvOrDie(total_shards_env, -1); + const Int32 shard_index = Int32FromEnvOrDie(shard_index_env, -1); + + if (total_shards == -1 && shard_index == -1) { + return false; + } else if (total_shards == -1 && shard_index != -1) { + const Message msg = Message() + << "Invalid environment variables: you have " + << kTestShardIndex << " = " << shard_index + << ", but have left " << kTestTotalShards << " unset.\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (total_shards != -1 && shard_index == -1) { + const Message msg = Message() + << "Invalid environment variables: you have " + << kTestTotalShards << " = " << total_shards + << ", but have left " << kTestShardIndex << " unset.\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } else if (shard_index < 0 || shard_index >= total_shards) { + const Message msg = Message() + << "Invalid environment variables: we require 0 <= " + << kTestShardIndex << " < " << kTestTotalShards + << ", but you have " << kTestShardIndex << "=" << shard_index + << ", " << kTestTotalShards << "=" << total_shards << ".\n"; + ColoredPrintf(COLOR_RED, msg.GetString().c_str()); + fflush(stdout); + exit(EXIT_FAILURE); + } + + return total_shards > 1; +} + +// Parses the environment variable var as an Int32. If it is unset, +// returns default_val. If it is not an Int32, prints an error +// and aborts. 
+Int32 Int32FromEnvOrDie(const char* var, Int32 default_val) { + const char* str_val = posix::GetEnv(var); + if (str_val == NULL) { + return default_val; + } + + Int32 result; + if (!ParseInt32(Message() << "The value of environment variable " << var, + str_val, &result)) { + exit(EXIT_FAILURE); + } + return result; +} + +// Given the total number of shards, the shard index, and the test id, +// returns true iff the test should be run on this shard. The test id is +// some arbitrary but unique non-negative integer assigned to each test +// method. Assumes that 0 <= shard_index < total_shards. +bool ShouldRunTestOnShard(int total_shards, int shard_index, int test_id) { + return (test_id % total_shards) == shard_index; +} + +// Compares the name of each test with the user-specified filter to +// decide whether the test should be run, then records the result in +// each TestCase and TestInfo object. +// If shard_tests == true, further filters tests based on sharding +// variables in the environment - see +// http://code.google.com/p/googletest/wiki/GoogleTestAdvancedGuide. +// Returns the number of tests that should run. +int UnitTestImpl::FilterTests(ReactionToSharding shard_tests) { + const Int32 total_shards = shard_tests == HONOR_SHARDING_PROTOCOL ? + Int32FromEnvOrDie(kTestTotalShards, -1) : -1; + const Int32 shard_index = shard_tests == HONOR_SHARDING_PROTOCOL ? + Int32FromEnvOrDie(kTestShardIndex, -1) : -1; + + // num_runnable_tests are the number of tests that will + // run across all shards (i.e., match filter and are not disabled). + // num_selected_tests are the number of tests to be run on + // this shard. 
+ int num_runnable_tests = 0; + int num_selected_tests = 0; + for (size_t i = 0; i < test_cases_.size(); i++) { + TestCase* const test_case = test_cases_[i]; + const String &test_case_name = test_case->name(); + test_case->set_should_run(false); + + for (size_t j = 0; j < test_case->test_info_list().size(); j++) { + TestInfo* const test_info = test_case->test_info_list()[j]; + const String test_name(test_info->name()); + // A test is disabled if test case name or test name matches + // kDisableTestFilter. + const bool is_disabled = + internal::UnitTestOptions::MatchesFilter(test_case_name, + kDisableTestFilter) || + internal::UnitTestOptions::MatchesFilter(test_name, + kDisableTestFilter); + test_info->is_disabled_ = is_disabled; + + const bool matches_filter = + internal::UnitTestOptions::FilterMatchesTest(test_case_name, + test_name); + test_info->matches_filter_ = matches_filter; + + const bool is_runnable = + (GTEST_FLAG(also_run_disabled_tests) || !is_disabled) && + matches_filter; + + const bool is_selected = is_runnable && + (shard_tests == IGNORE_SHARDING_PROTOCOL || + ShouldRunTestOnShard(total_shards, shard_index, + num_runnable_tests)); + + num_runnable_tests += is_runnable; + num_selected_tests += is_selected; + + test_info->should_run_ = is_selected; + test_case->set_should_run(test_case->should_run() || is_selected); + } + } + return num_selected_tests; +} + +// Prints the names of the tests matching the user-specified filter flag. 
+void UnitTestImpl::ListTestsMatchingFilter() { + for (size_t i = 0; i < test_cases_.size(); i++) { + const TestCase* const test_case = test_cases_[i]; + bool printed_test_case_name = false; + + for (size_t j = 0; j < test_case->test_info_list().size(); j++) { + const TestInfo* const test_info = + test_case->test_info_list()[j]; + if (test_info->matches_filter_) { + if (!printed_test_case_name) { + printed_test_case_name = true; + printf("%s.\n", test_case->name()); + } + printf(" %s\n", test_info->name()); + } + } + } + fflush(stdout); +} + +// Sets the OS stack trace getter. +// +// Does nothing if the input and the current OS stack trace getter are +// the same; otherwise, deletes the old getter and makes the input the +// current getter. +void UnitTestImpl::set_os_stack_trace_getter( + OsStackTraceGetterInterface* getter) { + if (os_stack_trace_getter_ != getter) { + delete os_stack_trace_getter_; + os_stack_trace_getter_ = getter; + } +} + +// Returns the current OS stack trace getter if it is not NULL; +// otherwise, creates an OsStackTraceGetter, makes it the current +// getter, and returns it. +OsStackTraceGetterInterface* UnitTestImpl::os_stack_trace_getter() { + if (os_stack_trace_getter_ == NULL) { + os_stack_trace_getter_ = new OsStackTraceGetter; + } + + return os_stack_trace_getter_; +} + +// Returns the TestResult for the test that's currently running, or +// the TestResult for the ad hoc test if no test is running. +TestResult* UnitTestImpl::current_test_result() { + return current_test_info_ ? + &(current_test_info_->result_) : &ad_hoc_test_result_; +} + +// Shuffles all test cases, and the tests within each test case, +// making sure that death tests are still run first. +void UnitTestImpl::ShuffleTests() { + // Shuffles the death test cases. + ShuffleRange(random(), 0, last_death_test_case_ + 1, &test_case_indices_); + + // Shuffles the non-death test cases. 
+ ShuffleRange(random(), last_death_test_case_ + 1, + static_cast(test_cases_.size()), &test_case_indices_); + + // Shuffles the tests inside each test case. + for (size_t i = 0; i < test_cases_.size(); i++) { + test_cases_[i]->ShuffleTests(random()); + } +} + +// Restores the test cases and tests to their order before the first shuffle. +void UnitTestImpl::UnshuffleTests() { + for (size_t i = 0; i < test_cases_.size(); i++) { + // Unshuffles the tests in each test case. + test_cases_[i]->UnshuffleTests(); + // Resets the index of each test case. + test_case_indices_[i] = static_cast(i); + } +} + +// Returns the current OS stack trace as a String. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +String GetCurrentOsStackTraceExceptTop(UnitTest* /*unit_test*/, + int skip_count) { + // We pass skip_count + 1 to skip this wrapper function in addition + // to what the user really wants to skip. + return GetUnitTestImpl()->CurrentOsStackTraceExceptTop(skip_count + 1); +} + +// Used by the GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_ macro to +// suppress unreachable code warnings. +namespace { +class ClassUniqueToAlwaysTrue {}; +} + +bool IsTrue(bool condition) { return condition; } + +bool AlwaysTrue() { +#if GTEST_HAS_EXCEPTIONS + // This condition is always false so AlwaysTrue() never actually throws, + // but it makes the compiler think that it may throw. 
+ if (IsTrue(false)) + throw ClassUniqueToAlwaysTrue(); +#endif // GTEST_HAS_EXCEPTIONS + return true; +} + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +bool SkipPrefix(const char* prefix, const char** pstr) { + const size_t prefix_len = strlen(prefix); + if (strncmp(*pstr, prefix, prefix_len) == 0) { + *pstr += prefix_len; + return true; + } + return false; +} + +// Parses a string as a command line flag. The string should have +// the format "--flag=value". When def_optional is true, the "=value" +// part can be omitted. +// +// Returns the value of the flag, or NULL if the parsing failed. +const char* ParseFlagValue(const char* str, + const char* flag, + bool def_optional) { + // str and flag must not be NULL. + if (str == NULL || flag == NULL) return NULL; + + // The flag must start with "--" followed by GTEST_FLAG_PREFIX_. + const String flag_str = String::Format("--%s%s", GTEST_FLAG_PREFIX_, flag); + const size_t flag_len = flag_str.length(); + if (strncmp(str, flag_str.c_str(), flag_len) != 0) return NULL; + + // Skips the flag name. + const char* flag_end = str + flag_len; + + // When def_optional is true, it's OK to not have a "=value" part. + if (def_optional && (flag_end[0] == '\0')) { + return flag_end; + } + + // If def_optional is true and there are more characters after the + // flag name, or if def_optional is false, there must be a '=' after + // the flag name. + if (flag_end[0] != '=') return NULL; + + // Returns the string after "=". + return flag_end + 1; +} + +// Parses a string for a bool flag, in the form of either +// "--flag=value" or "--flag". +// +// In the former case, the value is taken as true as long as it does +// not start with '0', 'f', or 'F'. +// +// In the latter case, the value is taken as true. 
+// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseBoolFlag(const char* str, const char* flag, bool* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, true); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Converts the string value to a bool. + *value = !(*value_str == '0' || *value_str == 'f' || *value_str == 'F'); + return true; +} + +// Parses a string for an Int32 flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseInt32Flag(const char* str, const char* flag, Int32* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + return ParseInt32(Message() << "The value of flag --" << flag, + value_str, value); +} + +// Parses a string for a string flag, in the form of +// "--flag=value". +// +// On success, stores the value of the flag in *value, and returns +// true. On failure, returns false without changing *value. +bool ParseStringFlag(const char* str, const char* flag, String* value) { + // Gets the value of the flag as a string. + const char* const value_str = ParseFlagValue(str, flag, false); + + // Aborts if the parsing failed. + if (value_str == NULL) return false; + + // Sets *value to the value of the flag. + *value = value_str; + return true; +} + +// Determines whether a string has a prefix that Google Test uses for its +// flags, i.e., starts with GTEST_FLAG_PREFIX_ or GTEST_FLAG_PREFIX_DASH_. +// If Google Test detects that a command line flag has its prefix but is not +// recognized, it will print its help message. 
Flags starting with +// GTEST_INTERNAL_PREFIX_ followed by "internal_" are considered Google Test +// internal flags and do not trigger the help message. +static bool HasGoogleTestFlagPrefix(const char* str) { + return (SkipPrefix("--", &str) || + SkipPrefix("-", &str) || + SkipPrefix("/", &str)) && + !SkipPrefix(GTEST_FLAG_PREFIX_ "internal_", &str) && + (SkipPrefix(GTEST_FLAG_PREFIX_, &str) || + SkipPrefix(GTEST_FLAG_PREFIX_DASH_, &str)); +} + +// Prints a string containing code-encoded text. The following escape +// sequences can be used in the string to control the text color: +// +// @@ prints a single '@' character. +// @R changes the color to red. +// @G changes the color to green. +// @Y changes the color to yellow. +// @D changes to the default terminal text color. +// +// TODO(wan@google.com): Write tests for this once we add stdout +// capturing to Google Test. +static void PrintColorEncoded(const char* str) { + GTestColor color = COLOR_DEFAULT; // The current color. + + // Conceptually, we split the string into segments divided by escape + // sequences. Then we print one segment at a time. At the end of + // each iteration, the str pointer advances to the beginning of the + // next segment. + for (;;) { + const char* p = strchr(str, '@'); + if (p == NULL) { + ColoredPrintf(color, "%s", str); + return; + } + + ColoredPrintf(color, "%s", String(str, p - str).c_str()); + + const char ch = p[1]; + str = p + 2; + if (ch == '@') { + ColoredPrintf(color, "@"); + } else if (ch == 'D') { + color = COLOR_DEFAULT; + } else if (ch == 'R') { + color = COLOR_RED; + } else if (ch == 'G') { + color = COLOR_GREEN; + } else if (ch == 'Y') { + color = COLOR_YELLOW; + } else { + --str; + } + } +} + +static const char kColorEncodedHelpMessage[] = +"This program contains tests written using " GTEST_NAME_ ". 
You can use the\n" +"following command line flags to control its behavior:\n" +"\n" +"Test Selection:\n" +" @G--" GTEST_FLAG_PREFIX_ "list_tests@D\n" +" List the names of all tests instead of running them. The name of\n" +" TEST(Foo, Bar) is \"Foo.Bar\".\n" +" @G--" GTEST_FLAG_PREFIX_ "filter=@YPOSTIVE_PATTERNS" + "[@G-@YNEGATIVE_PATTERNS]@D\n" +" Run only the tests whose name matches one of the positive patterns but\n" +" none of the negative patterns. '?' matches any single character; '*'\n" +" matches any substring; ':' separates two patterns.\n" +" @G--" GTEST_FLAG_PREFIX_ "also_run_disabled_tests@D\n" +" Run all disabled tests too.\n" +"\n" +"Test Execution:\n" +" @G--" GTEST_FLAG_PREFIX_ "repeat=@Y[COUNT]@D\n" +" Run the tests repeatedly; use a negative count to repeat forever.\n" +" @G--" GTEST_FLAG_PREFIX_ "shuffle@D\n" +" Randomize tests' orders on every iteration.\n" +" @G--" GTEST_FLAG_PREFIX_ "random_seed=@Y[NUMBER]@D\n" +" Random number seed to use for shuffling test orders (between 1 and\n" +" 99999, or 0 to use a seed based on the current time).\n" +"\n" +"Test Output:\n" +" @G--" GTEST_FLAG_PREFIX_ "color=@Y(@Gyes@Y|@Gno@Y|@Gauto@Y)@D\n" +" Enable/disable colored output. The default is @Gauto@D.\n" +" -@G-" GTEST_FLAG_PREFIX_ "print_time=0@D\n" +" Don't print the elapsed time of each test.\n" +" @G--" GTEST_FLAG_PREFIX_ "output=xml@Y[@G:@YDIRECTORY_PATH@G" + GTEST_PATH_SEP_ "@Y|@G:@YFILE_PATH]@D\n" +" Generate an XML report in the given directory or with the given file\n" +" name. 
@YFILE_PATH@D defaults to @Gtest_details.xml@D.\n" +#if GTEST_CAN_STREAM_RESULTS_ +" @G--" GTEST_FLAG_PREFIX_ "stream_result_to=@YHOST@G:@YPORT@D\n" +" Stream test results to the given server.\n" +#endif // GTEST_CAN_STREAM_RESULTS_ +"\n" +"Assertion Behavior:\n" +#if GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +" @G--" GTEST_FLAG_PREFIX_ "death_test_style=@Y(@Gfast@Y|@Gthreadsafe@Y)@D\n" +" Set the default death test style.\n" +#endif // GTEST_HAS_DEATH_TEST && !GTEST_OS_WINDOWS +" @G--" GTEST_FLAG_PREFIX_ "break_on_failure@D\n" +" Turn assertion failures into debugger break-points.\n" +" @G--" GTEST_FLAG_PREFIX_ "throw_on_failure@D\n" +" Turn assertion failures into C++ exceptions.\n" +" @G--" GTEST_FLAG_PREFIX_ "catch_exceptions=0@D\n" +" Do not report exceptions as test failures. Instead, allow them\n" +" to crash the program or throw a pop-up (on Windows).\n" +"\n" +"Except for @G--" GTEST_FLAG_PREFIX_ "list_tests@D, you can alternatively set " + "the corresponding\n" +"environment variable of a flag (all letters in upper-case). For example, to\n" +"disable colored text output, you can either specify @G--" GTEST_FLAG_PREFIX_ + "color=no@D or set\n" +"the @G" GTEST_FLAG_PREFIX_UPPER_ "COLOR@D environment variable to @Gno@D.\n" +"\n" +"For more information, please read the " GTEST_NAME_ " documentation at\n" +"@G" GTEST_PROJECT_URL_ "@D. If you find a bug in " GTEST_NAME_ "\n" +"(not one in your own code or tests), please report it to\n" +"@G<" GTEST_DEV_EMAIL_ ">@D.\n"; + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. The type parameter CharType can be +// instantiated to either char or wchar_t. 
+template +void ParseGoogleTestFlagsOnlyImpl(int* argc, CharType** argv) { + for (int i = 1; i < *argc; i++) { + const String arg_string = StreamableToString(argv[i]); + const char* const arg = arg_string.c_str(); + + using internal::ParseBoolFlag; + using internal::ParseInt32Flag; + using internal::ParseStringFlag; + + // Do we see a Google Test flag? + if (ParseBoolFlag(arg, kAlsoRunDisabledTestsFlag, + >EST_FLAG(also_run_disabled_tests)) || + ParseBoolFlag(arg, kBreakOnFailureFlag, + >EST_FLAG(break_on_failure)) || + ParseBoolFlag(arg, kCatchExceptionsFlag, + >EST_FLAG(catch_exceptions)) || + ParseStringFlag(arg, kColorFlag, >EST_FLAG(color)) || + ParseStringFlag(arg, kDeathTestStyleFlag, + >EST_FLAG(death_test_style)) || + ParseBoolFlag(arg, kDeathTestUseFork, + >EST_FLAG(death_test_use_fork)) || + ParseStringFlag(arg, kFilterFlag, >EST_FLAG(filter)) || + ParseStringFlag(arg, kInternalRunDeathTestFlag, + >EST_FLAG(internal_run_death_test)) || + ParseBoolFlag(arg, kListTestsFlag, >EST_FLAG(list_tests)) || + ParseStringFlag(arg, kOutputFlag, >EST_FLAG(output)) || + ParseBoolFlag(arg, kPrintTimeFlag, >EST_FLAG(print_time)) || + ParseInt32Flag(arg, kRandomSeedFlag, >EST_FLAG(random_seed)) || + ParseInt32Flag(arg, kRepeatFlag, >EST_FLAG(repeat)) || + ParseBoolFlag(arg, kShuffleFlag, >EST_FLAG(shuffle)) || + ParseInt32Flag(arg, kStackTraceDepthFlag, + >EST_FLAG(stack_trace_depth)) || + ParseStringFlag(arg, kStreamResultToFlag, + >EST_FLAG(stream_result_to)) || + ParseBoolFlag(arg, kThrowOnFailureFlag, + >EST_FLAG(throw_on_failure)) + ) { + // Yes. Shift the remainder of the argv list left by one. Note + // that argv has (*argc + 1) elements, the last one always being + // NULL. The following loop moves the trailing NULL element as + // well. + for (int j = i; j != *argc; j++) { + argv[j] = argv[j + 1]; + } + + // Decrements the argument count. + (*argc)--; + + // We also need to decrement the iterator as we just removed + // an element. 
+ i--; + } else if (arg_string == "--help" || arg_string == "-h" || + arg_string == "-?" || arg_string == "/?" || + HasGoogleTestFlagPrefix(arg)) { + // Both help flag and unrecognized Google Test flags (excluding + // internal ones) trigger help display. + g_help_flag = true; + } + } + + if (g_help_flag) { + // We print the help here instead of in RUN_ALL_TESTS(), as the + // latter may not be called at all if the user is using Google + // Test with another testing framework. + PrintColorEncoded(kColorEncodedHelpMessage); + } +} + +// Parses the command line for Google Test flags, without initializing +// other parts of Google Test. +void ParseGoogleTestFlagsOnly(int* argc, char** argv) { + ParseGoogleTestFlagsOnlyImpl(argc, argv); +} +void ParseGoogleTestFlagsOnly(int* argc, wchar_t** argv) { + ParseGoogleTestFlagsOnlyImpl(argc, argv); +} + +// The internal implementation of InitGoogleTest(). +// +// The type parameter CharType can be instantiated to either char or +// wchar_t. +template +void InitGoogleTestImpl(int* argc, CharType** argv) { + g_init_gtest_count++; + + // We don't want to run the initialization code twice. + if (g_init_gtest_count != 1) return; + + if (*argc <= 0) return; + + internal::g_executable_path = internal::StreamableToString(argv[0]); + +#if GTEST_HAS_DEATH_TEST + + g_argvs.clear(); + for (int i = 0; i != *argc; i++) { + g_argvs.push_back(StreamableToString(argv[i])); + } + +#endif // GTEST_HAS_DEATH_TEST + + ParseGoogleTestFlagsOnly(argc, argv); + GetUnitTestImpl()->PostFlagParsingInit(); +} + +} // namespace internal + +// Initializes Google Test. This must be called before calling +// RUN_ALL_TESTS(). In particular, it parses a command line for the +// flags that Google Test recognizes. Whenever a Google Test flag is +// seen, it is removed from argv, and *argc is decremented. +// +// No value is returned. Instead, the Google Test flag variables are +// updated. 
+// +// Calling the function for the second time has no user-visible effect. +void InitGoogleTest(int* argc, char** argv) { + internal::InitGoogleTestImpl(argc, argv); +} + +// This overloaded version can be used in Windows programs compiled in +// UNICODE mode. +void InitGoogleTest(int* argc, wchar_t** argv) { + internal::InitGoogleTestImpl(argc, argv); +} + +} // namespace testing +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Author: wan@google.com (Zhanyong Wan), vladl@google.com (Vlad Losev) +// +// This file implements death tests. + + +#if GTEST_HAS_DEATH_TEST + +# if GTEST_OS_MAC +# include +# endif // GTEST_OS_MAC + +# include +# include +# include +# include + +# if GTEST_OS_WINDOWS +# include +# else +# include +# include +# endif // GTEST_OS_WINDOWS + +#endif // GTEST_HAS_DEATH_TEST + + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick is to +// prevent a user from accidentally including gtest-internal-inl.h in +// his code. +#define GTEST_IMPLEMENTATION_ 1 +#undef GTEST_IMPLEMENTATION_ + +namespace testing { + +// Constants. + +// The default death test style. +static const char kDefaultDeathTestStyle[] = "fast"; + +GTEST_DEFINE_string_( + death_test_style, + internal::StringFromGTestEnv("death_test_style", kDefaultDeathTestStyle), + "Indicates how to run a death test in a forked child process: " + "\"threadsafe\" (child process re-executes the test binary " + "from the beginning, running only the specific death test) or " + "\"fast\" (child process runs the death test immediately " + "after forking)."); + +GTEST_DEFINE_bool_( + death_test_use_fork, + internal::BoolFromGTestEnv("death_test_use_fork", false), + "Instructs to use fork()/_exit() instead of clone() in death tests. " + "Ignored and always uses fork() on POSIX systems where clone() is not " + "implemented. Useful when running under valgrind or similar tools if " + "those do not support clone(). Valgrind 3.3.1 will just fail if " + "it sees an unsupported combination of clone() flags. " + "It is not recommended to use this flag w/o valgrind though it will " + "work in 99% of the cases. 
Once valgrind is fixed, this flag will " + "most likely be removed."); + +namespace internal { +GTEST_DEFINE_string_( + internal_run_death_test, "", + "Indicates the file, line number, temporal index of " + "the single death test to run, and a file descriptor to " + "which a success code may be sent, all separated by " + "colons. This flag is specified if and only if the current " + "process is a sub-process launched for running a thread-safe " + "death test. FOR INTERNAL USE ONLY."); +} // namespace internal + +#if GTEST_HAS_DEATH_TEST + +// ExitedWithCode constructor. +ExitedWithCode::ExitedWithCode(int exit_code) : exit_code_(exit_code) { +} + +// ExitedWithCode function-call operator. +bool ExitedWithCode::operator()(int exit_status) const { +# if GTEST_OS_WINDOWS + + return exit_status == exit_code_; + +# else + + return WIFEXITED(exit_status) && WEXITSTATUS(exit_status) == exit_code_; + +# endif // GTEST_OS_WINDOWS +} + +# if !GTEST_OS_WINDOWS +// KilledBySignal constructor. +KilledBySignal::KilledBySignal(int signum) : signum_(signum) { +} + +// KilledBySignal function-call operator. +bool KilledBySignal::operator()(int exit_status) const { + return WIFSIGNALED(exit_status) && WTERMSIG(exit_status) == signum_; +} +# endif // !GTEST_OS_WINDOWS + +namespace internal { + +// Utilities needed for death tests. + +// Generates a textual description of a given exit code, in the format +// specified by wait(2). 
static String ExitSummary(int exit_code) {
  Message m;

# if GTEST_OS_WINDOWS

  m << "Exited with exit status " << exit_code;

# else

  if (WIFEXITED(exit_code)) {
    m << "Exited with exit status " << WEXITSTATUS(exit_code);
  } else if (WIFSIGNALED(exit_code)) {
    m << "Terminated by signal " << WTERMSIG(exit_code);
  }
# ifdef WCOREDUMP
  // WCOREDUMP is a non-standard extension; only report core dumps where
  // the platform provides it.
  if (WCOREDUMP(exit_code)) {
    m << " (core dumped)";
  }
# endif
# endif  // GTEST_OS_WINDOWS

  return m.GetString();
}

// Returns true if exit_status describes a process that was terminated
// by a signal, or exited normally with a nonzero exit code.
bool ExitedUnsuccessfully(int exit_status) {
  return !ExitedWithCode(0)(exit_status);
}

# if !GTEST_OS_WINDOWS
// Generates a textual failure message when a death test finds more than
// one thread running, or cannot determine the number of threads, prior
// to executing the given statement. It is the responsibility of the
// caller not to pass a thread_count of 1.
static String DeathTestThreadWarning(size_t thread_count) {
  Message msg;
  msg << "Death tests use fork(), which is unsafe particularly"
      << " in a threaded context. For this test, " << GTEST_NAME_ << " ";
  // thread_count == 0 means the count could not be determined.
  if (thread_count == 0)
    msg << "couldn't detect the number of threads.";
  else
    msg << "detected " << thread_count << " threads.";
  return msg.GetString();
}
# endif  // !GTEST_OS_WINDOWS

// Flag characters for reporting a death test that did not die.
static const char kDeathTestLived = 'L';
static const char kDeathTestReturned = 'R';
static const char kDeathTestThrew = 'T';
static const char kDeathTestInternalError = 'I';

// An enumeration describing all of the possible ways that a death test can
// conclude.  DIED means that the process died while executing the test
// code; LIVED means that process lived beyond the end of the test code;
// RETURNED means that the test statement attempted to execute a return
// statement, which is not allowed; THREW means that the test statement
// returned control by throwing an exception.  IN_PROGRESS means the test
// has not yet concluded.
// TODO(vladl@google.com): Unify names and possibly values for
// AbortReason, DeathTestOutcome, and flag characters above.
enum DeathTestOutcome { IN_PROGRESS, DIED, LIVED, RETURNED, THREW };

// Routine for aborting the program which is safe to call from an
// exec-style death test child process, in which case the error
// message is propagated back to the parent process.  Otherwise, the
// message is simply printed to stderr.  In either case, the program
// then exits with status 1.
void DeathTestAbort(const String& message) {
  // On a POSIX system, this function may be called from a threadsafe-style
  // death test child process, which operates on a very small stack.  Use
  // the heap for any additional non-minuscule memory requirements.
  const InternalRunDeathTestFlag* const flag =
      GetUnitTestImpl()->internal_run_death_test_flag();
  if (flag != NULL) {
    // We are in a death test child: send the internal-error marker byte
    // followed by the message to the parent over the status pipe.
    FILE* parent = posix::FDOpen(flag->write_fd(), "w");
    fputc(kDeathTestInternalError, parent);
    fprintf(parent, "%s", message.c_str());
    fflush(parent);
    _exit(1);
  } else {
    fprintf(stderr, "%s", message.c_str());
    fflush(stderr);
    posix::Abort();
  }
}

// A replacement for CHECK that calls DeathTestAbort if the assertion
// fails.
+# define GTEST_DEATH_TEST_CHECK_(expression) \ + do { \ + if (!::testing::internal::IsTrue(expression)) { \ + DeathTestAbort(::testing::internal::String::Format( \ + "CHECK failed: File %s, line %d: %s", \ + __FILE__, __LINE__, #expression)); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// This macro is similar to GTEST_DEATH_TEST_CHECK_, but it is meant for +// evaluating any system call that fulfills two conditions: it must return +// -1 on failure, and set errno to EINTR when it is interrupted and +// should be tried again. The macro expands to a loop that repeatedly +// evaluates the expression as long as it evaluates to -1 and sets +// errno to EINTR. If the expression evaluates to -1 but errno is +// something other than EINTR, DeathTestAbort is called. +# define GTEST_DEATH_TEST_CHECK_SYSCALL_(expression) \ + do { \ + int gtest_retval; \ + do { \ + gtest_retval = (expression); \ + } while (gtest_retval == -1 && errno == EINTR); \ + if (gtest_retval == -1) { \ + DeathTestAbort(::testing::internal::String::Format( \ + "CHECK failed: File %s, line %d: %s != -1", \ + __FILE__, __LINE__, #expression)); \ + } \ + } while (::testing::internal::AlwaysFalse()) + +// Returns the message describing the last system error in errno. +String GetLastErrnoDescription() { + return String(errno == 0 ? "" : posix::StrError(errno)); +} + +// This is called from a death test parent process to read a failure +// message from the death test child process and log it with the FATAL +// severity. On Windows, the message is read from a pipe handle. On other +// platforms, it is read from a file descriptor. 
static void FailFromInternalError(int fd) {
  Message error;
  char buffer[256];
  int num_read;

  do {
    // Reading 255 bytes at a time keeps room for the terminating '\0'
    // in buffer[256].
    while ((num_read = posix::Read(fd, buffer, 255)) > 0) {
      buffer[num_read] = '\0';
      error << buffer;
    }
  } while (num_read == -1 && errno == EINTR);  // Retry on interruption.

  if (num_read == 0) {
    // EOF: the whole message was read; report it verbatim.
    GTEST_LOG_(FATAL) << error.GetString();
  } else {
    const int last_error = errno;
    GTEST_LOG_(FATAL) << "Error while reading death test internal: "
                      << GetLastErrnoDescription() << " [" << last_error << "]";
  }
}

// Death test constructor.  Increments the running death test count
// for the current test.
DeathTest::DeathTest() {
  TestInfo* const info = GetUnitTestImpl()->current_test_info();
  if (info == NULL) {
    DeathTestAbort("Cannot run a death test outside of a TEST or "
                   "TEST_F construct");
  }
}

// Creates and returns a death test by dispatching to the current
// death test factory.
bool DeathTest::Create(const char* statement, const RE* regex,
                       const char* file, int line, DeathTest** test) {
  return GetUnitTestImpl()->death_test_factory()->Create(
      statement, regex, file, line, test);
}

// Returns the message recorded by the last death test that ran.
const char* DeathTest::LastMessage() {
  return last_death_test_message_.c_str();
}

void DeathTest::set_last_death_test_message(const String& message) {
  last_death_test_message_ = message;
}

String DeathTest::last_death_test_message_;

// Provides cross platform implementation for some death functionality.
class DeathTestImpl : public DeathTest {
 protected:
  DeathTestImpl(const char* a_statement, const RE* a_regex)
      : statement_(a_statement),
        regex_(a_regex),
        spawned_(false),
        status_(-1),
        outcome_(IN_PROGRESS),
        read_fd_(-1),
        write_fd_(-1) {}

  // read_fd_ is expected to be closed and cleared by a derived class.
  ~DeathTestImpl() { GTEST_DEATH_TEST_CHECK_(read_fd_ == -1); }

  void Abort(AbortReason reason);
  virtual bool Passed(bool status_ok);

  // Accessors for the private state below.
  const char* statement() const { return statement_; }
  const RE* regex() const { return regex_; }
  bool spawned() const { return spawned_; }
  void set_spawned(bool is_spawned) { spawned_ = is_spawned; }
  int status() const { return status_; }
  void set_status(int a_status) { status_ = a_status; }
  DeathTestOutcome outcome() const { return outcome_; }
  void set_outcome(DeathTestOutcome an_outcome) { outcome_ = an_outcome; }
  int read_fd() const { return read_fd_; }
  void set_read_fd(int fd) { read_fd_ = fd; }
  int write_fd() const { return write_fd_; }
  void set_write_fd(int fd) { write_fd_ = fd; }

  // Called in the parent process only. Reads the result code of the death
  // test child process via a pipe, interprets it to set the outcome_
  // member, and closes read_fd_. Outputs diagnostics and terminates in
  // case of unexpected codes.
  void ReadAndInterpretStatusByte();

 private:
  // The textual content of the code this object is testing.  This class
  // doesn't own this string and should not attempt to delete it.
  const char* const statement_;
  // The regular expression which test output must match.  DeathTestImpl
  // doesn't own this object and should not attempt to delete it.
  const RE* const regex_;
  // True if the death test child process has been successfully spawned.
  bool spawned_;
  // The exit status of the child process.
  int status_;
  // How the death test concluded.
  DeathTestOutcome outcome_;
  // Descriptor to the read end of the pipe to the child process.  It is
  // always -1 in the child process.  The child keeps its write end of the
  // pipe in write_fd_.
  int read_fd_;
  // Descriptor to the child's write end of the pipe to the parent process.
  // It is always -1 in the parent process.  The parent keeps its end of the
  // pipe in read_fd_.
+ int write_fd_; +}; + +// Called in the parent process only. Reads the result code of the death +// test child process via a pipe, interprets it to set the outcome_ +// member, and closes read_fd_. Outputs diagnostics and terminates in +// case of unexpected codes. +void DeathTestImpl::ReadAndInterpretStatusByte() { + char flag; + int bytes_read; + + // The read() here blocks until data is available (signifying the + // failure of the death test) or until the pipe is closed (signifying + // its success), so it's okay to call this in the parent before + // the child process has exited. + do { + bytes_read = posix::Read(read_fd(), &flag, 1); + } while (bytes_read == -1 && errno == EINTR); + + if (bytes_read == 0) { + set_outcome(DIED); + } else if (bytes_read == 1) { + switch (flag) { + case kDeathTestReturned: + set_outcome(RETURNED); + break; + case kDeathTestThrew: + set_outcome(THREW); + break; + case kDeathTestLived: + set_outcome(LIVED); + break; + case kDeathTestInternalError: + FailFromInternalError(read_fd()); // Does not return. + break; + default: + GTEST_LOG_(FATAL) << "Death test child process reported " + << "unexpected status byte (" + << static_cast(flag) << ")"; + } + } else { + GTEST_LOG_(FATAL) << "Read from death test child process failed: " + << GetLastErrnoDescription(); + } + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Close(read_fd())); + set_read_fd(-1); +} + +// Signals that the death test code which should have exited, didn't. +// Should be called only in a death test child process. +// Writes a status byte to the child's status file descriptor, then +// calls _exit(1). +void DeathTestImpl::Abort(AbortReason reason) { + // The parent process considers the death test to be a failure if + // it finds any data in our pipe. So, here we write a single flag byte + // to the pipe, then exit. + const char status_ch = + reason == TEST_DID_NOT_DIE ? kDeathTestLived : + reason == TEST_THREW_EXCEPTION ? 
kDeathTestThrew : kDeathTestReturned; + + GTEST_DEATH_TEST_CHECK_SYSCALL_(posix::Write(write_fd(), &status_ch, 1)); + // We are leaking the descriptor here because on some platforms (i.e., + // when built as Windows DLL), destructors of global objects will still + // run after calling _exit(). On such systems, write_fd_ will be + // indirectly closed from the destructor of UnitTestImpl, causing double + // close if it is also closed here. On debug configurations, double close + // may assert. As there are no in-process buffers to flush here, we are + // relying on the OS to close the descriptor after the process terminates + // when the destructors are not run. + _exit(1); // Exits w/o any normal exit hooks (we were supposed to crash) +} + +// Returns an indented copy of stderr output for a death test. +// This makes distinguishing death test output lines from regular log lines +// much easier. +static ::std::string FormatDeathTestOutput(const ::std::string& output) { + ::std::string ret; + for (size_t at = 0; ; ) { + const size_t line_end = output.find('\n', at); + ret += "[ DEATH ] "; + if (line_end == ::std::string::npos) { + ret += output.substr(at); + break; + } + ret += output.substr(at, line_end + 1 - at); + at = line_end + 1; + } + return ret; +} + +// Assesses the success or failure of a death test, using both private +// members which have previously been set, and one argument: +// +// Private data members: +// outcome: An enumeration describing how the death test +// concluded: DIED, LIVED, THREW, or RETURNED. The death test +// fails in the latter three cases. +// status: The exit status of the child process. On *nix, it is in the +// in the format specified by wait(2). On Windows, this is the +// value supplied to the ExitProcess() API or a numeric code +// of the exception that terminated the program. +// regex: A regular expression object to be applied to +// the test's captured standard error output; the death test +// fails if it does not match. 
+// +// Argument: +// status_ok: true if exit_status is acceptable in the context of +// this particular death test, which fails if it is false +// +// Returns true iff all of the above conditions are met. Otherwise, the +// first failing condition, in the order given above, is the one that is +// reported. Also sets the last death test message string. +bool DeathTestImpl::Passed(bool status_ok) { + if (!spawned()) + return false; + + const String error_message = GetCapturedStderr(); + + bool success = false; + Message buffer; + + buffer << "Death test: " << statement() << "\n"; + switch (outcome()) { + case LIVED: + buffer << " Result: failed to die.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case THREW: + buffer << " Result: threw an exception.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case RETURNED: + buffer << " Result: illegal return in test statement.\n" + << " Error msg:\n" << FormatDeathTestOutput(error_message); + break; + case DIED: + if (status_ok) { + const bool matched = RE::PartialMatch(error_message.c_str(), *regex()); + if (matched) { + success = true; + } else { + buffer << " Result: died but not with expected error.\n" + << " Expected: " << regex()->pattern() << "\n" + << "Actual msg:\n" << FormatDeathTestOutput(error_message); + } + } else { + buffer << " Result: died but not with expected exit code:\n" + << " " << ExitSummary(status()) << "\n" + << "Actual msg:\n" << FormatDeathTestOutput(error_message); + } + break; + case IN_PROGRESS: + default: + GTEST_LOG_(FATAL) + << "DeathTest::Passed somehow called before conclusion of test"; + } + + DeathTest::set_last_death_test_message(buffer.GetString()); + return success; +} + +# if GTEST_OS_WINDOWS +// WindowsDeathTest implements death tests on Windows. 
Due to the +// specifics of starting new processes on Windows, death tests there are +// always threadsafe, and Google Test considers the +// --gtest_death_test_style=fast setting to be equivalent to +// --gtest_death_test_style=threadsafe there. +// +// A few implementation notes: Like the Linux version, the Windows +// implementation uses pipes for child-to-parent communication. But due to +// the specifics of pipes on Windows, some extra steps are required: +// +// 1. The parent creates a communication pipe and stores handles to both +// ends of it. +// 2. The parent starts the child and provides it with the information +// necessary to acquire the handle to the write end of the pipe. +// 3. The child acquires the write end of the pipe and signals the parent +// using a Windows event. +// 4. Now the parent can release the write end of the pipe on its side. If +// this is done before step 3, the object's reference count goes down to +// 0 and it is destroyed, preventing the child from acquiring it. The +// parent now has to release it, or read operations on the read end of +// the pipe will not return when the child terminates. +// 5. The parent reads child's output through the pipe (outcome code and +// any possible error messages) from the pipe, and its stderr and then +// determines whether to fail the test. +// +// Note: to distinguish Win32 API calls from the local method and function +// calls, the former are explicitly resolved in the global namespace. +// +class WindowsDeathTest : public DeathTestImpl { + public: + WindowsDeathTest(const char* a_statement, + const RE* a_regex, + const char* file, + int line) + : DeathTestImpl(a_statement, a_regex), file_(file), line_(line) {} + + // All of these virtual functions are inherited from DeathTest. + virtual int Wait(); + virtual TestRole AssumeRole(); + + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. 
+ const int line_; + // Handle to the write end of the pipe to the child process. + AutoHandle write_handle_; + // Child process handle. + AutoHandle child_handle_; + // Event the child process uses to signal the parent that it has + // acquired the handle to the write end of the pipe. After seeing this + // event the parent can release its own handles to make sure its + // ReadFile() calls return when the child terminates. + AutoHandle event_handle_; +}; + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int WindowsDeathTest::Wait() { + if (!spawned()) + return 0; + + // Wait until the child either signals that it has acquired the write end + // of the pipe or it dies. + const HANDLE wait_handles[2] = { child_handle_.Get(), event_handle_.Get() }; + switch (::WaitForMultipleObjects(2, + wait_handles, + FALSE, // Waits for any of the handles. + INFINITE)) { + case WAIT_OBJECT_0: + case WAIT_OBJECT_0 + 1: + break; + default: + GTEST_DEATH_TEST_CHECK_(false); // Should not get here. + } + + // The child has acquired the write end of the pipe or exited. + // We release the handle on our side and continue. + write_handle_.Reset(); + event_handle_.Reset(); + + ReadAndInterpretStatusByte(); + + // Waits for the child process to exit if it haven't already. This + // returns immediately if the child has already exited, regardless of + // whether previous calls to WaitForMultipleObjects synchronized on this + // handle or not. + GTEST_DEATH_TEST_CHECK_( + WAIT_OBJECT_0 == ::WaitForSingleObject(child_handle_.Get(), + INFINITE)); + DWORD status_code; + GTEST_DEATH_TEST_CHECK_( + ::GetExitCodeProcess(child_handle_.Get(), &status_code) != FALSE); + child_handle_.Reset(); + set_status(static_cast(status_code)); + return status(); +} + +// The AssumeRole process for a Windows death test. 
It creates a child +// process with the same executable as the current process to run the +// death test. The child process is given the --gtest_filter and +// --gtest_internal_run_death_test flags such that it knows to run the +// current death test only. +DeathTest::TestRole WindowsDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != NULL) { + // ParseInternalRunDeathTestFlag() has performed all the necessary + // processing. + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + // WindowsDeathTest uses an anonymous pipe to communicate results of + // a death test. + SECURITY_ATTRIBUTES handles_are_inheritable = { + sizeof(SECURITY_ATTRIBUTES), NULL, TRUE }; + HANDLE read_handle, write_handle; + GTEST_DEATH_TEST_CHECK_( + ::CreatePipe(&read_handle, &write_handle, &handles_are_inheritable, + 0) // Default buffer size. + != FALSE); + set_read_fd(::_open_osfhandle(reinterpret_cast(read_handle), + O_RDONLY)); + write_handle_.Reset(write_handle); + event_handle_.Reset(::CreateEvent( + &handles_are_inheritable, + TRUE, // The event will automatically reset to non-signaled state. + FALSE, // The initial state is non-signalled. + NULL)); // The even is unnamed. + GTEST_DEATH_TEST_CHECK_(event_handle_.Get() != NULL); + const String filter_flag = String::Format("--%s%s=%s.%s", + GTEST_FLAG_PREFIX_, kFilterFlag, + info->test_case_name(), + info->name()); + const String internal_flag = String::Format( + "--%s%s=%s|%d|%d|%u|%Iu|%Iu", + GTEST_FLAG_PREFIX_, + kInternalRunDeathTestFlag, + file_, line_, + death_test_index, + static_cast(::GetCurrentProcessId()), + // size_t has the same with as pointers on both 32-bit and 64-bit + // Windows platforms. + // See http://msdn.microsoft.com/en-us/library/tcxf1dw6.aspx. 
+ reinterpret_cast(write_handle), + reinterpret_cast(event_handle_.Get())); + + char executable_path[_MAX_PATH + 1]; // NOLINT + GTEST_DEATH_TEST_CHECK_( + _MAX_PATH + 1 != ::GetModuleFileNameA(NULL, + executable_path, + _MAX_PATH)); + + String command_line = String::Format("%s %s \"%s\"", + ::GetCommandLineA(), + filter_flag.c_str(), + internal_flag.c_str()); + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // Flush the log buffers since the log streams are shared with the child. + FlushInfoLog(); + + // The child process will share the standard handles with the parent. + STARTUPINFOA startup_info; + memset(&startup_info, 0, sizeof(STARTUPINFO)); + startup_info.dwFlags = STARTF_USESTDHANDLES; + startup_info.hStdInput = ::GetStdHandle(STD_INPUT_HANDLE); + startup_info.hStdOutput = ::GetStdHandle(STD_OUTPUT_HANDLE); + startup_info.hStdError = ::GetStdHandle(STD_ERROR_HANDLE); + + PROCESS_INFORMATION process_info; + GTEST_DEATH_TEST_CHECK_(::CreateProcessA( + executable_path, + const_cast(command_line.c_str()), + NULL, // Retuned process handle is not inheritable. + NULL, // Retuned thread handle is not inheritable. + TRUE, // Child inherits all inheritable handles (for write_handle_). + 0x0, // Default creation flags. + NULL, // Inherit the parent's environment. + UnitTest::GetInstance()->original_working_dir(), + &startup_info, + &process_info) != FALSE); + child_handle_.Reset(process_info.hProcess); + ::CloseHandle(process_info.hThread); + set_spawned(true); + return OVERSEE_TEST; +} +# else // We are not on Windows. + +// ForkingDeathTest provides implementations for most of the abstract +// methods of the DeathTest interface. Only the AssumeRole method is +// left undefined. +class ForkingDeathTest : public DeathTestImpl { + public: + ForkingDeathTest(const char* statement, const RE* regex); + + // All of these virtual functions are inherited from DeathTest. 
+ virtual int Wait(); + + protected: + void set_child_pid(pid_t child_pid) { child_pid_ = child_pid; } + + private: + // PID of child process during death test; 0 in the child process itself. + pid_t child_pid_; +}; + +// Constructs a ForkingDeathTest. +ForkingDeathTest::ForkingDeathTest(const char* a_statement, const RE* a_regex) + : DeathTestImpl(a_statement, a_regex), + child_pid_(-1) {} + +// Waits for the child in a death test to exit, returning its exit +// status, or 0 if no child process exists. As a side effect, sets the +// outcome data member. +int ForkingDeathTest::Wait() { + if (!spawned()) + return 0; + + ReadAndInterpretStatusByte(); + + int status_value; + GTEST_DEATH_TEST_CHECK_SYSCALL_(waitpid(child_pid_, &status_value, 0)); + set_status(status_value); + return status_value; +} + +// A concrete death test class that forks, then immediately runs the test +// in the child process. +class NoExecDeathTest : public ForkingDeathTest { + public: + NoExecDeathTest(const char* a_statement, const RE* a_regex) : + ForkingDeathTest(a_statement, a_regex) { } + virtual TestRole AssumeRole(); +}; + +// The AssumeRole process for a fork-and-run death test. It implements a +// straightforward fork, with a simple pipe to transmit the status byte. +DeathTest::TestRole NoExecDeathTest::AssumeRole() { + const size_t thread_count = GetThreadCount(); + if (thread_count != 1) { + GTEST_LOG_(WARNING) << DeathTestThreadWarning(thread_count); + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + + DeathTest::set_last_death_test_message(""); + CaptureStderr(); + // When we fork the process below, the log file buffers are copied, but the + // file descriptors are shared. We flush all log files here so that closing + // the file descriptors in the child process doesn't throw off the + // synchronization between descriptors and buffers in the parent process. 
+ // This is as close to the fork as possible to avoid a race condition in case + // there are multiple threads running before the death test, and another + // thread writes to the log file. + FlushInfoLog(); + + const pid_t child_pid = fork(); + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + set_child_pid(child_pid); + if (child_pid == 0) { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[0])); + set_write_fd(pipe_fd[1]); + // Redirects all logging to stderr in the child process to prevent + // concurrent writes to the log files. We capture stderr in the parent + // process and append the child process' output to a log. + LogToStderr(); + // Event forwarding to the listeners of event listener API mush be shut + // down in death test subprocesses. + GetUnitTestImpl()->listeners()->SuppressEventForwarding(); + return EXECUTE_TEST; + } else { + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; + } +} + +// A concrete death test class that forks and re-executes the main +// program from the beginning, with command-line flags set that cause +// only this specific death test to be run. +class ExecDeathTest : public ForkingDeathTest { + public: + ExecDeathTest(const char* a_statement, const RE* a_regex, + const char* file, int line) : + ForkingDeathTest(a_statement, a_regex), file_(file), line_(line) { } + virtual TestRole AssumeRole(); + private: + // The name of the file in which the death test is located. + const char* const file_; + // The line number on which the death test is located. + const int line_; +}; + +// Utility class for accumulating command-line arguments. 
+class Arguments { + public: + Arguments() { + args_.push_back(NULL); + } + + ~Arguments() { + for (std::vector::iterator i = args_.begin(); i != args_.end(); + ++i) { + free(*i); + } + } + void AddArgument(const char* argument) { + args_.insert(args_.end() - 1, posix::StrDup(argument)); + } + + template + void AddArguments(const ::std::vector& arguments) { + for (typename ::std::vector::const_iterator i = arguments.begin(); + i != arguments.end(); + ++i) { + args_.insert(args_.end() - 1, posix::StrDup(i->c_str())); + } + } + char* const* Argv() { + return &args_[0]; + } + private: + std::vector args_; +}; + +// A struct that encompasses the arguments to the child process of a +// threadsafe-style death test process. +struct ExecDeathTestArgs { + char* const* argv; // Command-line arguments for the child's call to exec + int close_fd; // File descriptor to close; the read end of a pipe +}; + +# if GTEST_OS_MAC +inline char** GetEnviron() { + // When Google Test is built as a framework on MacOS X, the environ variable + // is unavailable. Apple's documentation (man environ) recommends using + // _NSGetEnviron() instead. + return *_NSGetEnviron(); +} +# else +// Some POSIX platforms expect you to declare environ. extern "C" makes +// it reside in the global namespace. +extern "C" char** environ; +inline char** GetEnviron() { return environ; } +# endif // GTEST_OS_MAC + +// The main function for a threadsafe-style death test child process. +// This function is called in a clone()-ed process and thus must avoid +// any potentially unsafe operations like malloc or libc functions. +static int ExecDeathTestChildMain(void* child_arg) { + ExecDeathTestArgs* const args = static_cast(child_arg); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(args->close_fd)); + + // We need to execute the test program in the same environment where + // it was originally invoked. Therefore we change to the original + // working directory first. 
+ const char* const original_dir = + UnitTest::GetInstance()->original_working_dir(); + // We can safely call chdir() as it's a direct system call. + if (chdir(original_dir) != 0) { + DeathTestAbort(String::Format("chdir(\"%s\") failed: %s", + original_dir, + GetLastErrnoDescription().c_str())); + return EXIT_FAILURE; + } + + // We can safely call execve() as it's a direct system call. We + // cannot use execvp() as it's a libc function and thus potentially + // unsafe. Since execve() doesn't search the PATH, the user must + // invoke the test program via a valid path that contains at least + // one path separator. + execve(args->argv[0], args->argv, GetEnviron()); + DeathTestAbort(String::Format("execve(%s, ...) in %s failed: %s", + args->argv[0], + original_dir, + GetLastErrnoDescription().c_str())); + return EXIT_FAILURE; +} + +// Two utility routines that together determine the direction the stack +// grows. +// This could be accomplished more elegantly by a single recursive +// function, but we want to guard against the unlikely possibility of +// a smart compiler optimizing the recursion away. +// +// GTEST_NO_INLINE_ is required to prevent GCC 4.6 from inlining +// StackLowerThanAddress into StackGrowsDown, which then doesn't give +// correct answer. +bool StackLowerThanAddress(const void* ptr) GTEST_NO_INLINE_; +bool StackLowerThanAddress(const void* ptr) { + int dummy; + return &dummy < ptr; +} + +bool StackGrowsDown() { + int dummy; + return StackLowerThanAddress(&dummy); +} + +// A threadsafe implementation of fork(2) for threadsafe-style death tests +// that uses clone(2). It dies with an error message if anything goes +// wrong. 
+static pid_t ExecDeathTestFork(char* const* argv, int close_fd) { + ExecDeathTestArgs args = { argv, close_fd }; + pid_t child_pid = -1; + +# if GTEST_HAS_CLONE + const bool use_fork = GTEST_FLAG(death_test_use_fork); + + if (!use_fork) { + static const bool stack_grows_down = StackGrowsDown(); + const size_t stack_size = getpagesize(); + // MMAP_ANONYMOUS is not defined on Mac, so we use MAP_ANON instead. + void* const stack = mmap(NULL, stack_size, PROT_READ | PROT_WRITE, + MAP_ANON | MAP_PRIVATE, -1, 0); + GTEST_DEATH_TEST_CHECK_(stack != MAP_FAILED); + void* const stack_top = + static_cast(stack) + (stack_grows_down ? stack_size : 0); + + child_pid = clone(&ExecDeathTestChildMain, stack_top, SIGCHLD, &args); + + GTEST_DEATH_TEST_CHECK_(munmap(stack, stack_size) != -1); + } +# else + const bool use_fork = true; +# endif // GTEST_HAS_CLONE + + if (use_fork && (child_pid = fork()) == 0) { + ExecDeathTestChildMain(&args); + _exit(0); + } + + GTEST_DEATH_TEST_CHECK_(child_pid != -1); + return child_pid; +} + +// The AssumeRole process for a fork-and-exec death test. It re-executes the +// main program from the beginning, setting the --gtest_filter +// and --gtest_internal_run_death_test flags to cause only the current +// death test to be re-run. 
+DeathTest::TestRole ExecDeathTest::AssumeRole() { + const UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const TestInfo* const info = impl->current_test_info(); + const int death_test_index = info->result()->death_test_count(); + + if (flag != NULL) { + set_write_fd(flag->write_fd()); + return EXECUTE_TEST; + } + + int pipe_fd[2]; + GTEST_DEATH_TEST_CHECK_(pipe(pipe_fd) != -1); + // Clear the close-on-exec flag on the write end of the pipe, lest + // it be closed when the child process does an exec: + GTEST_DEATH_TEST_CHECK_(fcntl(pipe_fd[1], F_SETFD, 0) != -1); + + const String filter_flag = + String::Format("--%s%s=%s.%s", + GTEST_FLAG_PREFIX_, kFilterFlag, + info->test_case_name(), info->name()); + const String internal_flag = + String::Format("--%s%s=%s|%d|%d|%d", + GTEST_FLAG_PREFIX_, kInternalRunDeathTestFlag, + file_, line_, death_test_index, pipe_fd[1]); + Arguments args; + args.AddArguments(GetArgvs()); + args.AddArgument(filter_flag.c_str()); + args.AddArgument(internal_flag.c_str()); + + DeathTest::set_last_death_test_message(""); + + CaptureStderr(); + // See the comment in NoExecDeathTest::AssumeRole for why the next line + // is necessary. + FlushInfoLog(); + + const pid_t child_pid = ExecDeathTestFork(args.Argv(), pipe_fd[0]); + GTEST_DEATH_TEST_CHECK_SYSCALL_(close(pipe_fd[1])); + set_child_pid(child_pid); + set_read_fd(pipe_fd[0]); + set_spawned(true); + return OVERSEE_TEST; +} + +# endif // !GTEST_OS_WINDOWS + +// Creates a concrete DeathTest-derived class that depends on the +// --gtest_death_test_style flag, and sets the pointer pointed to +// by the "test" argument to its address. If the test should be +// skipped, sets that pointer to NULL. Returns true, unless the +// flag is set to an invalid value. 
+bool DefaultDeathTestFactory::Create(const char* statement, const RE* regex, + const char* file, int line, + DeathTest** test) { + UnitTestImpl* const impl = GetUnitTestImpl(); + const InternalRunDeathTestFlag* const flag = + impl->internal_run_death_test_flag(); + const int death_test_index = impl->current_test_info() + ->increment_death_test_count(); + + if (flag != NULL) { + if (death_test_index > flag->index()) { + DeathTest::set_last_death_test_message(String::Format( + "Death test count (%d) somehow exceeded expected maximum (%d)", + death_test_index, flag->index())); + return false; + } + + if (!(flag->file() == file && flag->line() == line && + flag->index() == death_test_index)) { + *test = NULL; + return true; + } + } + +# if GTEST_OS_WINDOWS + + if (GTEST_FLAG(death_test_style) == "threadsafe" || + GTEST_FLAG(death_test_style) == "fast") { + *test = new WindowsDeathTest(statement, regex, file, line); + } + +# else + + if (GTEST_FLAG(death_test_style) == "threadsafe") { + *test = new ExecDeathTest(statement, regex, file, line); + } else if (GTEST_FLAG(death_test_style) == "fast") { + *test = new NoExecDeathTest(statement, regex); + } + +# endif // GTEST_OS_WINDOWS + + else { // NOLINT - this is more readable than unbalanced brackets inside #if. + DeathTest::set_last_death_test_message(String::Format( + "Unknown death test style \"%s\" encountered", + GTEST_FLAG(death_test_style).c_str())); + return false; + } + + return true; +} + +// Splits a given string on a given delimiter, populating a given +// vector with the fields. GTEST_HAS_DEATH_TEST implies that we have +// ::std::string, so we can use it here. 
+static void SplitString(const ::std::string& str, char delimiter, + ::std::vector< ::std::string>* dest) { + ::std::vector< ::std::string> parsed; + ::std::string::size_type pos = 0; + while (::testing::internal::AlwaysTrue()) { + const ::std::string::size_type colon = str.find(delimiter, pos); + if (colon == ::std::string::npos) { + parsed.push_back(str.substr(pos)); + break; + } else { + parsed.push_back(str.substr(pos, colon - pos)); + pos = colon + 1; + } + } + dest->swap(parsed); +} + +# if GTEST_OS_WINDOWS +// Recreates the pipe and event handles from the provided parameters, +// signals the event, and returns a file descriptor wrapped around the pipe +// handle. This function is called in the child process only. +int GetStatusFileDescriptor(unsigned int parent_process_id, + size_t write_handle_as_size_t, + size_t event_handle_as_size_t) { + AutoHandle parent_process_handle(::OpenProcess(PROCESS_DUP_HANDLE, + FALSE, // Non-inheritable. + parent_process_id)); + if (parent_process_handle.Get() == INVALID_HANDLE_VALUE) { + DeathTestAbort(String::Format("Unable to open parent process %u", + parent_process_id)); + } + + // TODO(vladl@google.com): Replace the following check with a + // compile-time assertion when available. + GTEST_CHECK_(sizeof(HANDLE) <= sizeof(size_t)); + + const HANDLE write_handle = + reinterpret_cast(write_handle_as_size_t); + HANDLE dup_write_handle; + + // The newly initialized handle is accessible only in in the parent + // process. To obtain one accessible within the child, we need to use + // DuplicateHandle. + if (!::DuplicateHandle(parent_process_handle.Get(), write_handle, + ::GetCurrentProcess(), &dup_write_handle, + 0x0, // Requested privileges ignored since + // DUPLICATE_SAME_ACCESS is used. + FALSE, // Request non-inheritable handler. 
+ DUPLICATE_SAME_ACCESS)) { + DeathTestAbort(String::Format( + "Unable to duplicate the pipe handle %Iu from the parent process %u", + write_handle_as_size_t, parent_process_id)); + } + + const HANDLE event_handle = reinterpret_cast(event_handle_as_size_t); + HANDLE dup_event_handle; + + if (!::DuplicateHandle(parent_process_handle.Get(), event_handle, + ::GetCurrentProcess(), &dup_event_handle, + 0x0, + FALSE, + DUPLICATE_SAME_ACCESS)) { + DeathTestAbort(String::Format( + "Unable to duplicate the event handle %Iu from the parent process %u", + event_handle_as_size_t, parent_process_id)); + } + + const int write_fd = + ::_open_osfhandle(reinterpret_cast(dup_write_handle), O_APPEND); + if (write_fd == -1) { + DeathTestAbort(String::Format( + "Unable to convert pipe handle %Iu to a file descriptor", + write_handle_as_size_t)); + } + + // Signals the parent that the write end of the pipe has been acquired + // so the parent can release its own write end. + ::SetEvent(dup_event_handle); + + return write_fd; +} +# endif // GTEST_OS_WINDOWS + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag() { + if (GTEST_FLAG(internal_run_death_test) == "") return NULL; + + // GTEST_HAS_DEATH_TEST implies that we have ::std::string, so we + // can use it here. 
+ int line = -1; + int index = -1; + ::std::vector< ::std::string> fields; + SplitString(GTEST_FLAG(internal_run_death_test).c_str(), '|', &fields); + int write_fd = -1; + +# if GTEST_OS_WINDOWS + + unsigned int parent_process_id = 0; + size_t write_handle_as_size_t = 0; + size_t event_handle_as_size_t = 0; + + if (fields.size() != 6 + || !ParseNaturalNumber(fields[1], &line) + || !ParseNaturalNumber(fields[2], &index) + || !ParseNaturalNumber(fields[3], &parent_process_id) + || !ParseNaturalNumber(fields[4], &write_handle_as_size_t) + || !ParseNaturalNumber(fields[5], &event_handle_as_size_t)) { + DeathTestAbort(String::Format( + "Bad --gtest_internal_run_death_test flag: %s", + GTEST_FLAG(internal_run_death_test).c_str())); + } + write_fd = GetStatusFileDescriptor(parent_process_id, + write_handle_as_size_t, + event_handle_as_size_t); +# else + + if (fields.size() != 4 + || !ParseNaturalNumber(fields[1], &line) + || !ParseNaturalNumber(fields[2], &index) + || !ParseNaturalNumber(fields[3], &write_fd)) { + DeathTestAbort(String::Format( + "Bad --gtest_internal_run_death_test flag: %s", + GTEST_FLAG(internal_run_death_test).c_str())); + } + +# endif // GTEST_OS_WINDOWS + + return new InternalRunDeathTestFlag(fields[0], line, index, write_fd); +} + +} // namespace internal + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace testing +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: keith.ray@gmail.com (Keith Ray) + + +#include + +#if GTEST_OS_WINDOWS_MOBILE +# include +#elif GTEST_OS_WINDOWS +# include +# include +#elif GTEST_OS_SYMBIAN || GTEST_OS_NACL +// Symbian OpenC and NaCl have PATH_MAX in sys/syslimits.h +# include +#else +# include +# include // Some Linux distributions define PATH_MAX here. +#endif // GTEST_OS_WINDOWS_MOBILE + +#if GTEST_OS_WINDOWS +# define GTEST_PATH_MAX_ _MAX_PATH +#elif defined(PATH_MAX) +# define GTEST_PATH_MAX_ PATH_MAX +#elif defined(_XOPEN_PATH_MAX) +# define GTEST_PATH_MAX_ _XOPEN_PATH_MAX +#else +# define GTEST_PATH_MAX_ _POSIX_PATH_MAX +#endif // GTEST_OS_WINDOWS + + +namespace testing { +namespace internal { + +#if GTEST_OS_WINDOWS +// On Windows, '\\' is the standard path separator, but many tools and the +// Windows API also accept '/' as an alternate path separator. Unless otherwise +// noted, a file path can contain either kind of path separators, or a mixture +// of them. 
+const char kPathSeparator = '\\'; +const char kAlternatePathSeparator = '/'; +const char kPathSeparatorString[] = "\\"; +const char kAlternatePathSeparatorString[] = "/"; +# if GTEST_OS_WINDOWS_MOBILE +// Windows CE doesn't have a current directory. You should not use +// the current directory in tests on Windows CE, but this at least +// provides a reasonable fallback. +const char kCurrentDirectoryString[] = "\\"; +// Windows CE doesn't define INVALID_FILE_ATTRIBUTES +const DWORD kInvalidFileAttributes = 0xffffffff; +# else +const char kCurrentDirectoryString[] = ".\\"; +# endif // GTEST_OS_WINDOWS_MOBILE +#else +const char kPathSeparator = '/'; +const char kPathSeparatorString[] = "/"; +const char kCurrentDirectoryString[] = "./"; +#endif // GTEST_OS_WINDOWS + +// Returns whether the given character is a valid path separator. +static bool IsPathSeparator(char c) { +#if GTEST_HAS_ALT_PATH_SEP_ + return (c == kPathSeparator) || (c == kAlternatePathSeparator); +#else + return c == kPathSeparator; +#endif +} + +// Returns the current working directory, or "" if unsuccessful. +FilePath FilePath::GetCurrentDir() { +#if GTEST_OS_WINDOWS_MOBILE + // Windows CE doesn't have a current directory, so we just return + // something reasonable. + return FilePath(kCurrentDirectoryString); +#elif GTEST_OS_WINDOWS + char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + return FilePath(_getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); +#else + char cwd[GTEST_PATH_MAX_ + 1] = { '\0' }; + return FilePath(getcwd(cwd, sizeof(cwd)) == NULL ? "" : cwd); +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Returns a copy of the FilePath with the case-insensitive extension removed. +// Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns +// FilePath("dir/file"). If a case-insensitive extension is not +// found, returns a copy of the original FilePath. 
+FilePath FilePath::RemoveExtension(const char* extension) const { + String dot_extension(String::Format(".%s", extension)); + if (pathname_.EndsWithCaseInsensitive(dot_extension.c_str())) { + return FilePath(String(pathname_.c_str(), pathname_.length() - 4)); + } + return *this; +} + +// Returns a pointer to the last occurence of a valid path separator in +// the FilePath. On Windows, for example, both '/' and '\' are valid path +// separators. Returns NULL if no path separator was found. +const char* FilePath::FindLastPathSeparator() const { + const char* const last_sep = strrchr(c_str(), kPathSeparator); +#if GTEST_HAS_ALT_PATH_SEP_ + const char* const last_alt_sep = strrchr(c_str(), kAlternatePathSeparator); + // Comparing two pointers of which only one is NULL is undefined. + if (last_alt_sep != NULL && + (last_sep == NULL || last_alt_sep > last_sep)) { + return last_alt_sep; + } +#endif + return last_sep; +} + +// Returns a copy of the FilePath with the directory part removed. +// Example: FilePath("path/to/file").RemoveDirectoryName() returns +// FilePath("file"). If there is no directory part ("just_a_file"), it returns +// the FilePath unmodified. If there is no file part ("just_a_dir/") it +// returns an empty FilePath (""). +// On Windows platform, '\' is the path separator, otherwise it is '/'. +FilePath FilePath::RemoveDirectoryName() const { + const char* const last_sep = FindLastPathSeparator(); + return last_sep ? FilePath(String(last_sep + 1)) : *this; +} + +// RemoveFileName returns the directory path with the filename removed. +// Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". +// If the FilePath is "a_file" or "/a_file", RemoveFileName returns +// FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does +// not have a file, like "just/a/dir/", it returns the FilePath unmodified. +// On Windows platform, '\' is the path separator, otherwise it is '/'. 
+FilePath FilePath::RemoveFileName() const { + const char* const last_sep = FindLastPathSeparator(); + String dir; + if (last_sep) { + dir = String(c_str(), last_sep + 1 - c_str()); + } else { + dir = kCurrentDirectoryString; + } + return FilePath(dir); +} + +// Helper functions for naming files in a directory for xml output. + +// Given directory = "dir", base_name = "test", number = 0, +// extension = "xml", returns "dir/test.xml". If number is greater +// than zero (e.g., 12), returns "dir/test_12.xml". +// On Windows platform, uses \ as the separator rather than /. +FilePath FilePath::MakeFileName(const FilePath& directory, + const FilePath& base_name, + int number, + const char* extension) { + String file; + if (number == 0) { + file = String::Format("%s.%s", base_name.c_str(), extension); + } else { + file = String::Format("%s_%d.%s", base_name.c_str(), number, extension); + } + return ConcatPaths(directory, FilePath(file)); +} + +// Given directory = "dir", relative_path = "test.xml", returns "dir/test.xml". +// On Windows, uses \ as the separator rather than /. +FilePath FilePath::ConcatPaths(const FilePath& directory, + const FilePath& relative_path) { + if (directory.IsEmpty()) + return relative_path; + const FilePath dir(directory.RemoveTrailingPathSeparator()); + return FilePath(String::Format("%s%c%s", dir.c_str(), kPathSeparator, + relative_path.c_str())); +} + +// Returns true if pathname describes something findable in the file-system, +// either a file, directory, or whatever. 
+bool FilePath::FileOrDirectoryExists() const { +#if GTEST_OS_WINDOWS_MOBILE + LPCWSTR unicode = String::AnsiToUtf16(pathname_.c_str()); + const DWORD attributes = GetFileAttributes(unicode); + delete [] unicode; + return attributes != kInvalidFileAttributes; +#else + posix::StatStruct file_stat; + return posix::Stat(pathname_.c_str(), &file_stat) == 0; +#endif // GTEST_OS_WINDOWS_MOBILE +} + +// Returns true if pathname describes a directory in the file-system +// that exists. +bool FilePath::DirectoryExists() const { + bool result = false; +#if GTEST_OS_WINDOWS + // Don't strip off trailing separator if path is a root directory on + // Windows (like "C:\\"). + const FilePath& path(IsRootDirectory() ? *this : + RemoveTrailingPathSeparator()); +#else + const FilePath& path(*this); +#endif + +#if GTEST_OS_WINDOWS_MOBILE + LPCWSTR unicode = String::AnsiToUtf16(path.c_str()); + const DWORD attributes = GetFileAttributes(unicode); + delete [] unicode; + if ((attributes != kInvalidFileAttributes) && + (attributes & FILE_ATTRIBUTE_DIRECTORY)) { + result = true; + } +#else + posix::StatStruct file_stat; + result = posix::Stat(path.c_str(), &file_stat) == 0 && + posix::IsDir(file_stat); +#endif // GTEST_OS_WINDOWS_MOBILE + + return result; +} + +// Returns true if pathname describes a root directory. (Windows has one +// root directory per disk drive.) +bool FilePath::IsRootDirectory() const { +#if GTEST_OS_WINDOWS + // TODO(wan@google.com): on Windows a network share like + // \\server\share can be a root directory, although it cannot be the + // current directory. Handle this properly. + return pathname_.length() == 3 && IsAbsolutePath(); +#else + return pathname_.length() == 1 && IsPathSeparator(pathname_.c_str()[0]); +#endif +} + +// Returns true if pathname describes an absolute path. 
+bool FilePath::IsAbsolutePath() const { + const char* const name = pathname_.c_str(); +#if GTEST_OS_WINDOWS + return pathname_.length() >= 3 && + ((name[0] >= 'a' && name[0] <= 'z') || + (name[0] >= 'A' && name[0] <= 'Z')) && + name[1] == ':' && + IsPathSeparator(name[2]); +#else + return IsPathSeparator(name[0]); +#endif +} + +// Returns a pathname for a file that does not currently exist. The pathname +// will be directory/base_name.extension or +// directory/base_name_.extension if directory/base_name.extension +// already exists. The number will be incremented until a pathname is found +// that does not already exist. +// Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. +// There could be a race condition if two or more processes are calling this +// function at the same time -- they could both pick the same filename. +FilePath FilePath::GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension) { + FilePath full_pathname; + int number = 0; + do { + full_pathname.Set(MakeFileName(directory, base_name, number++, extension)); + } while (full_pathname.FileOrDirectoryExists()); + return full_pathname; +} + +// Returns true if FilePath ends with a path separator, which indicates that +// it is intended to represent a directory. Returns false otherwise. +// This does NOT check that a directory (or file) actually exists. +bool FilePath::IsDirectory() const { + return !pathname_.empty() && + IsPathSeparator(pathname_.c_str()[pathname_.length() - 1]); +} + +// Create directories so that path exists. Returns true if successful or if +// the directories already exist; returns false if unable to create directories +// for any reason. 
+bool FilePath::CreateDirectoriesRecursively() const { + if (!this->IsDirectory()) { + return false; + } + + if (pathname_.length() == 0 || this->DirectoryExists()) { + return true; + } + + const FilePath parent(this->RemoveTrailingPathSeparator().RemoveFileName()); + return parent.CreateDirectoriesRecursively() && this->CreateFolder(); +} + +// Create the directory so that path exists. Returns true if successful or +// if the directory already exists; returns false if unable to create the +// directory for any reason, including if the parent directory does not +// exist. Not named "CreateDirectory" because that's a macro on Windows. +bool FilePath::CreateFolder() const { +#if GTEST_OS_WINDOWS_MOBILE + FilePath removed_sep(this->RemoveTrailingPathSeparator()); + LPCWSTR unicode = String::AnsiToUtf16(removed_sep.c_str()); + int result = CreateDirectory(unicode, NULL) ? 0 : -1; + delete [] unicode; +#elif GTEST_OS_WINDOWS + int result = _mkdir(pathname_.c_str()); +#else + int result = mkdir(pathname_.c_str(), 0777); +#endif // GTEST_OS_WINDOWS_MOBILE + + if (result == -1) { + return this->DirectoryExists(); // An error is OK if the directory exists. + } + return true; // No error. +} + +// If input name has a trailing separator character, remove it and return the +// name, otherwise return the name string unmodified. +// On Windows platform, uses \ as the separator, other platforms use /. +FilePath FilePath::RemoveTrailingPathSeparator() const { + return IsDirectory() + ? FilePath(String(pathname_.c_str(), pathname_.length() - 1)) + : *this; +} + +// Removes any redundant separators that might be in the pathname. +// For example, "bar///foo" becomes "bar/foo". Does not eliminate other +// redundancies that might be in a pathname involving "." or "..". +// TODO(wan@google.com): handle Windows network shares (e.g. \\server\share). 
+void FilePath::Normalize() { + if (pathname_.c_str() == NULL) { + pathname_ = ""; + return; + } + const char* src = pathname_.c_str(); + char* const dest = new char[pathname_.length() + 1]; + char* dest_ptr = dest; + memset(dest_ptr, 0, pathname_.length() + 1); + + while (*src != '\0') { + *dest_ptr = *src; + if (!IsPathSeparator(*src)) { + src++; + } else { +#if GTEST_HAS_ALT_PATH_SEP_ + if (*dest_ptr == kAlternatePathSeparator) { + *dest_ptr = kPathSeparator; + } +#endif + while (IsPathSeparator(*src)) + src++; + } + dest_ptr++; + } + *dest_ptr = '\0'; + pathname_ = dest; + delete[] dest; +} + +} // namespace internal +} // namespace testing +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + + +#include +#include +#include +#include + +#if GTEST_OS_WINDOWS_MOBILE +# include // For TerminateProcess() +#elif GTEST_OS_WINDOWS +# include +# include +#else +# include +#endif // GTEST_OS_WINDOWS_MOBILE + +#if GTEST_OS_MAC +# include +# include +# include +#endif // GTEST_OS_MAC + + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick is to +// prevent a user from accidentally including gtest-internal-inl.h in +// his code. +#define GTEST_IMPLEMENTATION_ 1 +#undef GTEST_IMPLEMENTATION_ + +namespace testing { +namespace internal { + +#if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC and C++Builder do not provide a definition of STDERR_FILENO. +const int kStdOutFileno = 1; +const int kStdErrFileno = 2; +#else +const int kStdOutFileno = STDOUT_FILENO; +const int kStdErrFileno = STDERR_FILENO; +#endif // _MSC_VER + +#if GTEST_OS_MAC + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. 
+size_t GetThreadCount() { + const task_t task = mach_task_self(); + mach_msg_type_number_t thread_count; + thread_act_array_t thread_list; + const kern_return_t status = task_threads(task, &thread_list, &thread_count); + if (status == KERN_SUCCESS) { + // task_threads allocates resources in thread_list and we need to free them + // to avoid leaks. + vm_deallocate(task, + reinterpret_cast(thread_list), + sizeof(thread_t) * thread_count); + return static_cast(thread_count); + } else { + return 0; + } +} + +#else + +size_t GetThreadCount() { + // There's no portable way to detect the number of threads, so we just + // return 0 to indicate that we cannot detect it. + return 0; +} + +#endif // GTEST_OS_MAC + +#if GTEST_USES_POSIX_RE + +// Implements RE. Currently only needed for death tests. + +RE::~RE() { + if (is_valid_) { + // regfree'ing an invalid regex might crash because the content + // of the regex is undefined. Since the regex's are essentially + // the same, one cannot be valid (or invalid) without the other + // being so too. + regfree(&partial_regex_); + regfree(&full_regex_); + } + free(const_cast(pattern_)); +} + +// Returns true iff regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + if (!re.is_valid_) return false; + + regmatch_t match; + return regexec(&re.full_regex_, str, 1, &match, 0) == 0; +} + +// Returns true iff regular expression re matches a substring of str +// (including str itself). +bool RE::PartialMatch(const char* str, const RE& re) { + if (!re.is_valid_) return false; + + regmatch_t match; + return regexec(&re.partial_regex_, str, 1, &match, 0) == 0; +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = posix::StrDup(regex); + + // Reserves enough bytes to hold the regular expression used for a + // full match. 
+ const size_t full_regex_len = strlen(regex) + 10; + char* const full_pattern = new char[full_regex_len]; + + snprintf(full_pattern, full_regex_len, "^(%s)$", regex); + is_valid_ = regcomp(&full_regex_, full_pattern, REG_EXTENDED) == 0; + // We want to call regcomp(&partial_regex_, ...) even if the + // previous expression returns false. Otherwise partial_regex_ may + // not be properly initialized can may cause trouble when it's + // freed. + // + // Some implementation of POSIX regex (e.g. on at least some + // versions of Cygwin) doesn't accept the empty string as a valid + // regex. We change it to an equivalent form "()" to be safe. + if (is_valid_) { + const char* const partial_regex = (*regex == '\0') ? "()" : regex; + is_valid_ = regcomp(&partial_regex_, partial_regex, REG_EXTENDED) == 0; + } + EXPECT_TRUE(is_valid_) + << "Regular expression \"" << regex + << "\" is not a valid POSIX Extended regular expression."; + + delete[] full_pattern; +} + +#elif GTEST_USES_SIMPLE_RE + +// Returns true iff ch appears anywhere in str (excluding the +// terminating '\0' character). +bool IsInSet(char ch, const char* str) { + return ch != '\0' && strchr(str, ch) != NULL; +} + +// Returns true iff ch belongs to the given classification. Unlike +// similar functions in , these aren't affected by the +// current locale. +bool IsAsciiDigit(char ch) { return '0' <= ch && ch <= '9'; } +bool IsAsciiPunct(char ch) { + return IsInSet(ch, "^-!\"#$%&'()*+,./:;<=>?@[\\]_`{|}~"); +} +bool IsRepeat(char ch) { return IsInSet(ch, "?*+"); } +bool IsAsciiWhiteSpace(char ch) { return IsInSet(ch, " \f\n\r\t\v"); } +bool IsAsciiWordChar(char ch) { + return ('a' <= ch && ch <= 'z') || ('A' <= ch && ch <= 'Z') || + ('0' <= ch && ch <= '9') || ch == '_'; +} + +// Returns true iff "\\c" is a supported escape sequence. +bool IsValidEscape(char c) { + return (IsAsciiPunct(c) || IsInSet(c, "dDfnrsStvwW")); +} + +// Returns true iff the given atom (specified by escaped and pattern) +// matches ch. 
The result is undefined if the atom is invalid. +bool AtomMatchesChar(bool escaped, char pattern_char, char ch) { + if (escaped) { // "\\p" where p is pattern_char. + switch (pattern_char) { + case 'd': return IsAsciiDigit(ch); + case 'D': return !IsAsciiDigit(ch); + case 'f': return ch == '\f'; + case 'n': return ch == '\n'; + case 'r': return ch == '\r'; + case 's': return IsAsciiWhiteSpace(ch); + case 'S': return !IsAsciiWhiteSpace(ch); + case 't': return ch == '\t'; + case 'v': return ch == '\v'; + case 'w': return IsAsciiWordChar(ch); + case 'W': return !IsAsciiWordChar(ch); + } + return IsAsciiPunct(pattern_char) && pattern_char == ch; + } + + return (pattern_char == '.' && ch != '\n') || pattern_char == ch; +} + +// Helper function used by ValidateRegex() to format error messages. +String FormatRegexSyntaxError(const char* regex, int index) { + return (Message() << "Syntax error at index " << index + << " in simple regular expression \"" << regex << "\": ").GetString(); +} + +// Generates non-fatal failures and returns false if regex is invalid; +// otherwise returns true. +bool ValidateRegex(const char* regex) { + if (regex == NULL) { + // TODO(wan@google.com): fix the source file location in the + // assertion failures to match where the regex is used in user + // code. + ADD_FAILURE() << "NULL is not a valid simple regular expression."; + return false; + } + + bool is_valid = true; + + // True iff ?, *, or + can follow the previous atom. + bool prev_repeatable = false; + for (int i = 0; regex[i]; i++) { + if (regex[i] == '\\') { // An escape sequence + i++; + if (regex[i] == '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "'\\' cannot appear at the end."; + return false; + } + + if (!IsValidEscape(regex[i])) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i - 1) + << "invalid escape sequence \"\\" << regex[i] << "\"."; + is_valid = false; + } + prev_repeatable = true; + } else { // Not an escape sequence. 
+ const char ch = regex[i]; + + if (ch == '^' && i > 0) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'^' can only appear at the beginning."; + is_valid = false; + } else if (ch == '$' && regex[i + 1] != '\0') { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'$' can only appear at the end."; + is_valid = false; + } else if (IsInSet(ch, "()[]{}|")) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'" << ch << "' is unsupported."; + is_valid = false; + } else if (IsRepeat(ch) && !prev_repeatable) { + ADD_FAILURE() << FormatRegexSyntaxError(regex, i) + << "'" << ch << "' can only follow a repeatable token."; + is_valid = false; + } + + prev_repeatable = !IsInSet(ch, "^$?*+"); + } + } + + return is_valid; +} + +// Matches a repeated regex atom followed by a valid simple regular +// expression. The regex atom is defined as c if escaped is false, +// or \c otherwise. repeat is the repetition meta character (?, *, +// or +). The behavior is undefined if str contains too many +// characters to be indexable by size_t, in which case the test will +// probably time out anyway. We are fine with this limitation as +// std::string has it too. +bool MatchRepetitionAndRegexAtHead( + bool escaped, char c, char repeat, const char* regex, + const char* str) { + const size_t min_count = (repeat == '+') ? 1 : 0; + const size_t max_count = (repeat == '?') ? 1 : + static_cast(-1) - 1; + // We cannot call numeric_limits::max() as it conflicts with the + // max() macro on Windows. + + for (size_t i = 0; i <= max_count; ++i) { + // We know that the atom matches each of the first i characters in str. + if (i >= min_count && MatchRegexAtHead(regex, str + i)) { + // We have enough matches at the head, and the tail matches too. + // Since we only care about *whether* the pattern matches str + // (as opposed to *how* it matches), there is no need to find a + // greedy match. 
+ return true; + } + if (str[i] == '\0' || !AtomMatchesChar(escaped, c, str[i])) + return false; + } + return false; +} + +// Returns true iff regex matches a prefix of str. regex must be a +// valid simple regular expression and not start with "^", or the +// result is undefined. +bool MatchRegexAtHead(const char* regex, const char* str) { + if (*regex == '\0') // An empty regex matches a prefix of anything. + return true; + + // "$" only matches the end of a string. Note that regex being + // valid guarantees that there's nothing after "$" in it. + if (*regex == '$') + return *str == '\0'; + + // Is the first thing in regex an escape sequence? + const bool escaped = *regex == '\\'; + if (escaped) + ++regex; + if (IsRepeat(regex[1])) { + // MatchRepetitionAndRegexAtHead() calls MatchRegexAtHead(), so + // here's an indirect recursion. It terminates as the regex gets + // shorter in each recursion. + return MatchRepetitionAndRegexAtHead( + escaped, regex[0], regex[1], regex + 2, str); + } else { + // regex isn't empty, isn't "$", and doesn't start with a + // repetition. We match the first atom of regex with the first + // character of str and recurse. + return (*str != '\0') && AtomMatchesChar(escaped, *regex, *str) && + MatchRegexAtHead(regex + 1, str + 1); + } +} + +// Returns true iff regex matches any substring of str. regex must be +// a valid simple regular expression, or the result is undefined. +// +// The algorithm is recursive, but the recursion depth doesn't exceed +// the regex length, so we won't need to worry about running out of +// stack space normally. In rare cases the time complexity can be +// exponential with respect to the regex length + the string length, +// but usually it's must faster (often close to linear). +bool MatchRegexAnywhere(const char* regex, const char* str) { + if (regex == NULL || str == NULL) + return false; + + if (*regex == '^') + return MatchRegexAtHead(regex + 1, str); + + // A successful match can be anywhere in str. 
+ do { + if (MatchRegexAtHead(regex, str)) + return true; + } while (*str++ != '\0'); + return false; +} + +// Implements the RE class. + +RE::~RE() { + free(const_cast(pattern_)); + free(const_cast(full_pattern_)); +} + +// Returns true iff regular expression re matches the entire str. +bool RE::FullMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.full_pattern_, str); +} + +// Returns true iff regular expression re matches a substring of str +// (including str itself). +bool RE::PartialMatch(const char* str, const RE& re) { + return re.is_valid_ && MatchRegexAnywhere(re.pattern_, str); +} + +// Initializes an RE from its string representation. +void RE::Init(const char* regex) { + pattern_ = full_pattern_ = NULL; + if (regex != NULL) { + pattern_ = posix::StrDup(regex); + } + + is_valid_ = ValidateRegex(regex); + if (!is_valid_) { + // No need to calculate the full pattern when the regex is invalid. + return; + } + + const size_t len = strlen(regex); + // Reserves enough bytes to hold the regular expression used for a + // full match: we need space to prepend a '^', append a '$', and + // terminate the string with '\0'. + char* buffer = static_cast(malloc(len + 3)); + full_pattern_ = buffer; + + if (*regex != '^') + *buffer++ = '^'; // Makes sure full_pattern_ starts with '^'. + + // We don't use snprintf or strncpy, as they trigger a warning when + // compiled with VC++ 8.0. + memcpy(buffer, regex, len); + buffer += len; + + if (len == 0 || regex[len - 1] != '$') + *buffer++ = '$'; // Makes sure full_pattern_ ends with '$'. + + *buffer = '\0'; +} + +#endif // GTEST_USES_POSIX_RE + +const char kUnknownFile[] = "unknown file"; + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line) { + const char* const file_name = file == NULL ? 
kUnknownFile : file; + + if (line < 0) { + return String::Format("%s:", file_name).c_str(); + } +#ifdef _MSC_VER + return String::Format("%s(%d):", file_name, line).c_str(); +#else + return String::Format("%s:%d:", file_name, line).c_str(); +#endif // _MSC_VER +} + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +// Note that FormatCompilerIndependentFileLocation() does NOT append colon +// to the file location it produces, unlike FormatFileLocation(). +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation( + const char* file, int line) { + const char* const file_name = file == NULL ? kUnknownFile : file; + + if (line < 0) + return file_name; + else + return String::Format("%s:%d", file_name, line).c_str(); +} + + +GTestLog::GTestLog(GTestLogSeverity severity, const char* file, int line) + : severity_(severity) { + const char* const marker = + severity == GTEST_INFO ? "[ INFO ]" : + severity == GTEST_WARNING ? "[WARNING]" : + severity == GTEST_ERROR ? "[ ERROR ]" : "[ FATAL ]"; + GetStream() << ::std::endl << marker << " " + << FormatFileLocation(file, line).c_str() << ": "; +} + +// Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. +GTestLog::~GTestLog() { + GetStream() << ::std::endl; + if (severity_ == GTEST_FATAL) { + fflush(stderr); + posix::Abort(); + } +} +// Disable Microsoft deprecation warnings for POSIX functions called from +// this class (creat, dup, dup2, and close) +#ifdef _MSC_VER +# pragma warning(push) +# pragma warning(disable: 4996) +#endif // _MSC_VER + +#if GTEST_HAS_STREAM_REDIRECTION + +// Object that captures an output stream (stdout/stderr). +class CapturedStream { + public: + // The ctor redirects the stream to a temporary file. 
+ CapturedStream(int fd) : fd_(fd), uncaptured_fd_(dup(fd)) { + +# if GTEST_OS_WINDOWS + char temp_dir_path[MAX_PATH + 1] = { '\0' }; // NOLINT + char temp_file_path[MAX_PATH + 1] = { '\0' }; // NOLINT + + ::GetTempPathA(sizeof(temp_dir_path), temp_dir_path); + const UINT success = ::GetTempFileNameA(temp_dir_path, + "gtest_redir", + 0, // Generate unique file name. + temp_file_path); + GTEST_CHECK_(success != 0) + << "Unable to create a temporary file in " << temp_dir_path; + const int captured_fd = creat(temp_file_path, _S_IREAD | _S_IWRITE); + GTEST_CHECK_(captured_fd != -1) << "Unable to open temporary file " + << temp_file_path; + filename_ = temp_file_path; +# else + // There's no guarantee that a test has write access to the + // current directory, so we create the temporary file in the /tmp + // directory instead. + char name_template[] = "/tmp/captured_stream.XXXXXX"; + const int captured_fd = mkstemp(name_template); + filename_ = name_template; +# endif // GTEST_OS_WINDOWS + fflush(NULL); + dup2(captured_fd, fd_); + close(captured_fd); + } + + ~CapturedStream() { + remove(filename_.c_str()); + } + + String GetCapturedString() { + if (uncaptured_fd_ != -1) { + // Restores the original stream. + fflush(NULL); + dup2(uncaptured_fd_, fd_); + close(uncaptured_fd_); + uncaptured_fd_ = -1; + } + + FILE* const file = posix::FOpen(filename_.c_str(), "r"); + const String content = ReadEntireFile(file); + posix::FClose(file); + return content; + } + + private: + // Reads the entire content of a file as a String. + static String ReadEntireFile(FILE* file); + + // Returns the size (in bytes) of a file. + static size_t GetFileSize(FILE* file); + + const int fd_; // A stream to capture. + int uncaptured_fd_; + // Name of the temporary file holding the stderr output. + ::std::string filename_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(CapturedStream); +}; + +// Returns the size (in bytes) of a file. 
+size_t CapturedStream::GetFileSize(FILE* file) { + fseek(file, 0, SEEK_END); + return static_cast(ftell(file)); +} + +// Reads the entire content of a file as a string. +String CapturedStream::ReadEntireFile(FILE* file) { + const size_t file_size = GetFileSize(file); + char* const buffer = new char[file_size]; + + size_t bytes_last_read = 0; // # of bytes read in the last fread() + size_t bytes_read = 0; // # of bytes read so far + + fseek(file, 0, SEEK_SET); + + // Keeps reading the file until we cannot read further or the + // pre-determined file size is reached. + do { + bytes_last_read = fread(buffer+bytes_read, 1, file_size-bytes_read, file); + bytes_read += bytes_last_read; + } while (bytes_last_read > 0 && bytes_read < file_size); + + const String content(buffer, bytes_read); + delete[] buffer; + + return content; +} + +# ifdef _MSC_VER +# pragma warning(pop) +# endif // _MSC_VER + +static CapturedStream* g_captured_stderr = NULL; +static CapturedStream* g_captured_stdout = NULL; + +// Starts capturing an output stream (stdout/stderr). +void CaptureStream(int fd, const char* stream_name, CapturedStream** stream) { + if (*stream != NULL) { + GTEST_LOG_(FATAL) << "Only one " << stream_name + << " capturer can exist at a time."; + } + *stream = new CapturedStream(fd); +} + +// Stops capturing the output stream and returns the captured string. +String GetCapturedStream(CapturedStream** captured_stream) { + const String content = (*captured_stream)->GetCapturedString(); + + delete *captured_stream; + *captured_stream = NULL; + + return content; +} + +// Starts capturing stdout. +void CaptureStdout() { + CaptureStream(kStdOutFileno, "stdout", &g_captured_stdout); +} + +// Starts capturing stderr. +void CaptureStderr() { + CaptureStream(kStdErrFileno, "stderr", &g_captured_stderr); +} + +// Stops capturing stdout and returns the captured string. 
+String GetCapturedStdout() { return GetCapturedStream(&g_captured_stdout); } + +// Stops capturing stderr and returns the captured string. +String GetCapturedStderr() { return GetCapturedStream(&g_captured_stderr); } + +#endif // GTEST_HAS_STREAM_REDIRECTION + +#if GTEST_HAS_DEATH_TEST + +// A copy of all command line arguments. Set by InitGoogleTest(). +::std::vector g_argvs; + +// Returns the command line as a vector of strings. +const ::std::vector& GetArgvs() { return g_argvs; } + +#endif // GTEST_HAS_DEATH_TEST + +#if GTEST_OS_WINDOWS_MOBILE +namespace posix { +void Abort() { + DebugBreak(); + TerminateProcess(GetCurrentProcess(), 1); +} +} // namespace posix +#endif // GTEST_OS_WINDOWS_MOBILE + +// Returns the name of the environment variable corresponding to the +// given flag. For example, FlagToEnvVar("foo") will return +// "GTEST_FOO" in the open-source version. +static String FlagToEnvVar(const char* flag) { + const String full_flag = + (Message() << GTEST_FLAG_PREFIX_ << flag).GetString(); + + Message env_var; + for (size_t i = 0; i != full_flag.length(); i++) { + env_var << ToUpper(full_flag.c_str()[i]); + } + + return env_var.GetString(); +} + +// Parses 'str' for a 32-bit signed integer. If successful, writes +// the result to *value and returns true; otherwise leaves *value +// unchanged and returns false. +bool ParseInt32(const Message& src_text, const char* str, Int32* value) { + // Parses the environment variable as a decimal integer. + char* end = NULL; + const long long_value = strtol(str, &end, 10); // NOLINT + + // Has strtol() consumed all characters in the string? + if (*end != '\0') { + // No - an invalid character was encountered. + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value \"" << str << "\".\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + // Is the parsed value in the range of an Int32? 
+ const Int32 result = static_cast(long_value); + if (long_value == LONG_MAX || long_value == LONG_MIN || + // The parsed value overflows as a long. (strtol() returns + // LONG_MAX or LONG_MIN when the input overflows.) + result != long_value + // The parsed value overflows as an Int32. + ) { + Message msg; + msg << "WARNING: " << src_text + << " is expected to be a 32-bit integer, but actually" + << " has value " << str << ", which overflows.\n"; + printf("%s", msg.GetString().c_str()); + fflush(stdout); + return false; + } + + *value = result; + return true; +} + +// Reads and returns the Boolean environment variable corresponding to +// the given flag; if it's not set, returns default_value. +// +// The value is considered true iff it's not "0". +bool BoolFromGTestEnv(const char* flag, bool default_value) { + const String env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + return string_value == NULL ? + default_value : strcmp(string_value, "0") != 0; +} + +// Reads and returns a 32-bit integer stored in the environment +// variable corresponding to the given flag; if it isn't set or +// doesn't represent a valid 32-bit integer, returns default_value. +Int32 Int32FromGTestEnv(const char* flag, Int32 default_value) { + const String env_var = FlagToEnvVar(flag); + const char* const string_value = posix::GetEnv(env_var.c_str()); + if (string_value == NULL) { + // The environment variable is not set. + return default_value; + } + + Int32 result = default_value; + if (!ParseInt32(Message() << "Environment variable " << env_var, + string_value, &result)) { + printf("The default value %s is used.\n", + (Message() << default_value).GetString().c_str()); + fflush(stdout); + return default_value; + } + + return result; +} + +// Reads and returns the string environment variable corresponding to +// the given flag; if it's not set, returns default_value. 
+const char* StringFromGTestEnv(const char* flag, const char* default_value) { + const String env_var = FlagToEnvVar(flag); + const char* const value = posix::GetEnv(env_var.c_str()); + return value == NULL ? default_value : value; +} + +} // namespace internal +} // namespace testing +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Author: wan@google.com (Zhanyong Wan) + +// Google Test - The Google C++ Testing Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// It uses the << operator when possible, and prints the bytes in the +// object otherwise. A user can override its behavior for a class +// type Foo by defining either operator<<(::std::ostream&, const Foo&) +// or void PrintTo(const Foo&, ::std::ostream*) in the namespace that +// defines Foo. + +#include +#include +#include // NOLINT +#include + +namespace testing { + +namespace { + +using ::std::ostream; + +#if GTEST_OS_WINDOWS_MOBILE // Windows CE does not define _snprintf_s. +# define snprintf _snprintf +#elif _MSC_VER >= 1400 // VC 8.0 and later deprecate snprintf and _snprintf. +# define snprintf _snprintf_s +#elif _MSC_VER +# define snprintf _snprintf +#endif // GTEST_OS_WINDOWS_MOBILE + +// Prints a segment of bytes in the given object. +void PrintByteSegmentInObjectTo(const unsigned char* obj_bytes, size_t start, + size_t count, ostream* os) { + char text[5] = ""; + for (size_t i = 0; i != count; i++) { + const size_t j = start + i; + if (i != 0) { + // Organizes the bytes into groups of 2 for easy parsing by + // human. + if ((j % 2) == 0) + *os << ' '; + else + *os << '-'; + } + snprintf(text, sizeof(text), "%02X", obj_bytes[j]); + *os << text; + } +} + +// Prints the bytes in the given value to the given ostream. +void PrintBytesInObjectToImpl(const unsigned char* obj_bytes, size_t count, + ostream* os) { + // Tells the user how big the object is. + *os << count << "-byte object <"; + + const size_t kThreshold = 132; + const size_t kChunkSize = 64; + // If the object size is bigger than kThreshold, we'll have to omit + // some details by printing only the first and the last kChunkSize + // bytes. + // TODO(wan): let the user control the threshold using a flag. 
+ if (count < kThreshold) { + PrintByteSegmentInObjectTo(obj_bytes, 0, count, os); + } else { + PrintByteSegmentInObjectTo(obj_bytes, 0, kChunkSize, os); + *os << " ... "; + // Rounds up to 2-byte boundary. + const size_t resume_pos = (count - kChunkSize + 1)/2*2; + PrintByteSegmentInObjectTo(obj_bytes, resume_pos, count - resume_pos, os); + } + *os << ">"; +} + +} // namespace + +namespace internal2 { + +// Delegates to PrintBytesInObjectToImpl() to print the bytes in the +// given object. The delegation simplifies the implementation, which +// uses the << operator and thus is easier done outside of the +// ::testing::internal namespace, which contains a << operator that +// sometimes conflicts with the one in STL. +void PrintBytesInObjectTo(const unsigned char* obj_bytes, size_t count, + ostream* os) { + PrintBytesInObjectToImpl(obj_bytes, count, os); +} + +} // namespace internal2 + +namespace internal { + +// Depending on the value of a char (or wchar_t), we print it in one +// of three formats: +// - as is if it's a printable ASCII (e.g. 'a', '2', ' '), +// - as a hexidecimal escape sequence (e.g. '\x7F'), or +// - as a special escape sequence (e.g. '\r', '\n'). +enum CharFormat { + kAsIs, + kHexEscape, + kSpecialEscape +}; + +// Returns true if c is a printable ASCII character. We test the +// value of c directly instead of calling isprint(), which is buggy on +// Windows Mobile. +inline bool IsPrintableAscii(wchar_t c) { + return 0x20 <= c && c <= 0x7E; +} + +// Prints a wide or narrow char c as a character literal without the +// quotes, escaping it when necessary; returns how c was formatted. +// The template argument UnsignedChar is the unsigned version of Char, +// which is the type of c. 
+// NOTE(review): the template parameter lists and cast type arguments in
+// this section were stripped by an earlier text-extraction pass;
+// restored from upstream gtest-printers.cc.
+template <typename UnsignedChar, typename Char>
+static CharFormat PrintAsCharLiteralTo(Char c, ostream* os) {
+  switch (static_cast<wchar_t>(c)) {
+    case L'\0':
+      *os << "\\0";
+      break;
+    case L'\'':
+      *os << "\\'";
+      break;
+    case L'\\':
+      *os << "\\\\";
+      break;
+    case L'\a':
+      *os << "\\a";
+      break;
+    case L'\b':
+      *os << "\\b";
+      break;
+    case L'\f':
+      *os << "\\f";
+      break;
+    case L'\n':
+      *os << "\\n";
+      break;
+    case L'\r':
+      *os << "\\r";
+      break;
+    case L'\t':
+      *os << "\\t";
+      break;
+    case L'\v':
+      *os << "\\v";
+      break;
+    default:
+      if (IsPrintableAscii(c)) {
+        *os << static_cast<char>(c);
+        return kAsIs;
+      } else {
+        *os << String::Format("\\x%X", static_cast<UnsignedChar>(c));
+        return kHexEscape;
+      }
+  }
+  return kSpecialEscape;
+}
+
+// Prints a char c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsWideStringLiteralTo(wchar_t c, ostream* os) {
+  switch (c) {
+    case L'\'':
+      *os << "'";
+      return kAsIs;
+    case L'"':
+      *os << "\\\"";
+      return kSpecialEscape;
+    default:
+      return PrintAsCharLiteralTo<wchar_t>(c, os);
+  }
+}
+
+// Prints a char c as if it's part of a string literal, escaping it when
+// necessary; returns how c was formatted.
+static CharFormat PrintAsNarrowStringLiteralTo(char c, ostream* os) {
+  return PrintAsWideStringLiteralTo(static_cast<unsigned char>(c), os);
+}
+
+// Prints a wide or narrow character c and its code.  '\0' is printed
+// as "'\\0'", other unprintable characters are also properly escaped
+// using the standard C++ escape sequence.  The template argument
+// UnsignedChar is the unsigned version of Char, which is the type of c.
+template <typename UnsignedChar, typename Char>
+void PrintCharAndCodeTo(Char c, ostream* os) {
+  // First, print c as a literal in the most readable form we can find.
+  *os << ((sizeof(c) > 1) ? "L'" : "'");
+  const CharFormat format = PrintAsCharLiteralTo<UnsignedChar>(c, os);
+  *os << "'";
+
+  // To aid user debugging, we also print c's code in decimal, unless
+  // it's 0 (in which case c was printed as '\\0', making the code
+  // obvious).
+  if (c == 0)
+    return;
+  *os << " (" << String::Format("%d", c).c_str();
+
+  // For more convenience, we print c's code again in hexadecimal,
+  // unless c was already printed in the form '\x##' or the code is in
+  // [1, 9].
+  if (format == kHexEscape || (1 <= c && c <= 9)) {
+    // Do nothing.
+  } else {
+    *os << String::Format(", 0x%X",
+                          static_cast<UnsignedChar>(c)).c_str();
+  }
+  *os << ")";
+}
+
+void PrintTo(unsigned char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+void PrintTo(signed char c, ::std::ostream* os) {
+  PrintCharAndCodeTo<unsigned char>(c, os);
+}
+
+// Prints a wchar_t as a symbol if it is printable or as its internal
+// code otherwise and also as its code.  L'\0' is printed as "L'\\0'".
+void PrintTo(wchar_t wc, ostream* os) {
+  PrintCharAndCodeTo<wchar_t>(wc, os);
+}
+
+// Prints the given array of characters to the ostream.
+// The array starts at *begin, the length is len, it may include '\0' characters
+// and may not be null-terminated.
+static void PrintCharsAsStringTo(const char* begin, size_t len, ostream* os) {
+  *os << "\"";
+  bool is_previous_hex = false;
+  for (size_t index = 0; index < len; ++index) {
+    const char cur = begin[index];
+    if (is_previous_hex && IsXDigit(cur)) {
+      // Previous character is of '\x..' form and this character can be
+      // interpreted as another hexadecimal digit in its number. Break string to
+      // disambiguate.
+      *os << "\" \"";
+    }
+    is_previous_hex = PrintAsNarrowStringLiteralTo(cur, os) == kHexEscape;
+  }
+  *os << "\"";
+}
+
+// Prints a (const) char array of 'len' elements, starting at address 'begin'.
+void UniversalPrintArray(const char* begin, size_t len, ostream* os) {
+  PrintCharsAsStringTo(begin, len, os);
+}
+
+// Prints the given array of wide characters to the ostream.
+// The array starts at *begin, the length is len, it may include L'\0'
+// characters and may not be null-terminated.
+static void PrintWideCharsAsStringTo(const wchar_t* begin, size_t len, + ostream* os) { + *os << "L\""; + bool is_previous_hex = false; + for (size_t index = 0; index < len; ++index) { + const wchar_t cur = begin[index]; + if (is_previous_hex && isascii(cur) && IsXDigit(static_cast(cur))) { + // Previous character is of '\x..' form and this character can be + // interpreted as another hexadecimal digit in its number. Break string to + // disambiguate. + *os << "\" L\""; + } + is_previous_hex = PrintAsWideStringLiteralTo(cur, os) == kHexEscape; + } + *os << "\""; +} + +// Prints the given C string to the ostream. +void PrintTo(const char* s, ostream* os) { + if (s == NULL) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintCharsAsStringTo(s, strlen(s), os); + } +} + +// MSVC compiler can be configured to define whar_t as a typedef +// of unsigned short. Defining an overload for const wchar_t* in that case +// would cause pointers to unsigned shorts be printed as wide strings, +// possibly accessing more memory than intended and causing invalid +// memory accesses. MSVC defines _NATIVE_WCHAR_T_DEFINED symbol when +// wchar_t is implemented as a native type. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Prints the given wide C string to the ostream. +void PrintTo(const wchar_t* s, ostream* os) { + if (s == NULL) { + *os << "NULL"; + } else { + *os << ImplicitCast_(s) << " pointing to "; + PrintWideCharsAsStringTo(s, wcslen(s), os); + } +} +#endif // wchar_t is native + +// Prints a ::string object. +#if GTEST_HAS_GLOBAL_STRING +void PrintStringTo(const ::string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_GLOBAL_STRING + +void PrintStringTo(const ::std::string& s, ostream* os) { + PrintCharsAsStringTo(s.data(), s.size(), os); +} + +// Prints a ::wstring object. 
+#if GTEST_HAS_GLOBAL_WSTRING +void PrintWideStringTo(const ::wstring& s, ostream* os) { + PrintWideCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +#if GTEST_HAS_STD_WSTRING +void PrintWideStringTo(const ::std::wstring& s, ostream* os) { + PrintWideCharsAsStringTo(s.data(), s.size(), os); +} +#endif // GTEST_HAS_STD_WSTRING + +} // namespace internal + +} // namespace testing +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+// +// Author: mheule@google.com (Markus Heule) +// +// The Google C++ Testing Framework (Google Test) + + +// Indicates that this translation unit is part of Google Test's +// implementation. It must come before gtest-internal-inl.h is +// included, or there will be a compiler error. This trick is to +// prevent a user from accidentally including gtest-internal-inl.h in +// his code. +#define GTEST_IMPLEMENTATION_ 1 +#undef GTEST_IMPLEMENTATION_ + +namespace testing { + +using internal::GetUnitTestImpl; + +// Gets the summary of the failure message by omitting the stack trace +// in it. +internal::String TestPartResult::ExtractSummary(const char* message) { + const char* const stack_trace = strstr(message, internal::kStackTraceMarker); + return stack_trace == NULL ? internal::String(message) : + internal::String(message, stack_trace - message); +} + +// Prints a TestPartResult object. +std::ostream& operator<<(std::ostream& os, const TestPartResult& result) { + return os + << result.file_name() << ":" << result.line_number() << ": " + << (result.type() == TestPartResult::kSuccess ? "Success" : + result.type() == TestPartResult::kFatalFailure ? "Fatal failure" : + "Non-fatal failure") << ":\n" + << result.message() << std::endl; +} + +// Appends a TestPartResult to the array. +void TestPartResultArray::Append(const TestPartResult& result) { + array_.push_back(result); +} + +// Returns the TestPartResult at the given index (0-based). +const TestPartResult& TestPartResultArray::GetTestPartResult(int index) const { + if (index < 0 || index >= size()) { + printf("\nInvalid index (%d) into TestPartResultArray.\n", index); + internal::posix::Abort(); + } + + return array_[index]; +} + +// Returns the number of TestPartResult objects in the array. 
+int TestPartResultArray::size() const { + return static_cast(array_.size()); +} + +namespace internal { + +HasNewFatalFailureHelper::HasNewFatalFailureHelper() + : has_new_fatal_failure_(false), + original_reporter_(GetUnitTestImpl()-> + GetTestPartResultReporterForCurrentThread()) { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread(this); +} + +HasNewFatalFailureHelper::~HasNewFatalFailureHelper() { + GetUnitTestImpl()->SetTestPartResultReporterForCurrentThread( + original_reporter_); +} + +void HasNewFatalFailureHelper::ReportTestPartResult( + const TestPartResult& result) { + if (result.fatally_failed()) + has_new_fatal_failure_ = true; + original_reporter_->ReportTestPartResult(result); +} + +} // namespace internal + +} // namespace testing +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + + +namespace testing { +namespace internal { + +#if GTEST_HAS_TYPED_TEST_P + +// Skips to the first non-space char in str. Returns an empty string if str +// contains only whitespace characters. +static const char* SkipSpaces(const char* str) { + while (IsSpace(*str)) + str++; + return str; +} + +// Verifies that registered_tests match the test names in +// defined_test_names_; returns registered_tests if successful, or +// aborts the program otherwise. +const char* TypedTestCasePState::VerifyRegisteredTestNames( + const char* file, int line, const char* registered_tests) { + typedef ::std::set::const_iterator DefinedTestIter; + registered_ = true; + + // Skip initial whitespace in registered_tests since some + // preprocessors prefix stringizied literals with whitespace. 
+ registered_tests = SkipSpaces(registered_tests); + + Message errors; + ::std::set tests; + for (const char* names = registered_tests; names != NULL; + names = SkipComma(names)) { + const String name = GetPrefixUntilComma(names); + if (tests.count(name) != 0) { + errors << "Test " << name << " is listed more than once.\n"; + continue; + } + + bool found = false; + for (DefinedTestIter it = defined_test_names_.begin(); + it != defined_test_names_.end(); + ++it) { + if (name == *it) { + found = true; + break; + } + } + + if (found) { + tests.insert(name); + } else { + errors << "No test named " << name + << " can be found in this test case.\n"; + } + } + + for (DefinedTestIter it = defined_test_names_.begin(); + it != defined_test_names_.end(); + ++it) { + if (tests.count(*it) == 0) { + errors << "You forgot to list test " << *it << ".\n"; + } + } + + const String& errors_str = errors.GetString(); + if (errors_str != "") { + fprintf(stderr, "%s %s", FormatFileLocation(file, line).c_str(), + errors_str.c_str()); + fflush(stderr); + posix::Abort(); + } + + return registered_tests; +} + +#endif // GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing diff --git a/tests/gtest/common/gtest.h b/tests/gtest/common/gtest.h new file mode 100644 index 0000000..3143bd6 --- /dev/null +++ b/tests/gtest/common/gtest.h @@ -0,0 +1,19537 @@ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. 
nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the public API for Google Test. It should be +// included by any test program that uses Google Test. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! +// +// Acknowledgment: Google Test borrowed the idea of automatic test +// registration from Barthelemy Dagenais' (barthelemy@prologique.com) +// easyUnit framework. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_H_ + +#include +#include + +// Copyright 2005, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file declares functions and macros used internally by +// Google Test. They are subject to change without notice. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ + +// Copyright 2005, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan) +// +// Low-level types and utilities for porting Google Test to various +// platforms. They are subject to change without notice. DO NOT USE +// THEM IN USER CODE. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +// The user can define the following macros in the build script to +// control Google Test's behavior. 
If the user doesn't define a macro +// in this list, Google Test will define it. +// +// GTEST_HAS_CLONE - Define it to 1/0 to indicate that clone(2) +// is/isn't available. +// GTEST_HAS_EXCEPTIONS - Define it to 1/0 to indicate that exceptions +// are enabled. +// GTEST_HAS_GLOBAL_STRING - Define it to 1/0 to indicate that ::string +// is/isn't available (some systems define +// ::string, which is different to std::string). +// GTEST_HAS_GLOBAL_WSTRING - Define it to 1/0 to indicate that ::string +// is/isn't available (some systems define +// ::wstring, which is different to std::wstring). +// GTEST_HAS_POSIX_RE - Define it to 1/0 to indicate that POSIX regular +// expressions are/aren't available. +// GTEST_HAS_PTHREAD - Define it to 1/0 to indicate that +// is/isn't available. +// GTEST_HAS_RTTI - Define it to 1/0 to indicate that RTTI is/isn't +// enabled. +// GTEST_HAS_STD_WSTRING - Define it to 1/0 to indicate that +// std::wstring does/doesn't work (Google Test can +// be used where std::wstring is unavailable). +// GTEST_HAS_TR1_TUPLE - Define it to 1/0 to indicate tr1::tuple +// is/isn't available. +// GTEST_HAS_SEH - Define it to 1/0 to indicate whether the +// compiler supports Microsoft's "Structured +// Exception Handling". +// GTEST_HAS_STREAM_REDIRECTION +// - Define it to 1/0 to indicate whether the +// platform supports I/O stream redirection using +// dup() and dup2(). +// GTEST_USE_OWN_TR1_TUPLE - Define it to 1/0 to indicate whether Google +// Test's own tr1 tuple implementation should be +// used. Unused when the user sets +// GTEST_HAS_TR1_TUPLE to 0. +// GTEST_LINKED_AS_SHARED_LIBRARY +// - Define to 1 when compiling tests that use +// Google Test as a shared library (known as +// DLL on Windows). +// GTEST_CREATE_SHARED_LIBRARY +// - Define to 1 when compiling Google Test itself +// as a shared library. 
+ +// This header defines the following utilities: +// +// Macros indicating the current platform (defined to 1 if compiled on +// the given platform; otherwise undefined): +// GTEST_OS_AIX - IBM AIX +// GTEST_OS_CYGWIN - Cygwin +// GTEST_OS_HPUX - HP-UX +// GTEST_OS_LINUX - Linux +// GTEST_OS_LINUX_ANDROID - Google Android +// GTEST_OS_MAC - Mac OS X +// GTEST_OS_NACL - Google Native Client (NaCl) +// GTEST_OS_SOLARIS - Sun Solaris +// GTEST_OS_SYMBIAN - Symbian +// GTEST_OS_WINDOWS - Windows (Desktop, MinGW, or Mobile) +// GTEST_OS_WINDOWS_DESKTOP - Windows Desktop +// GTEST_OS_WINDOWS_MINGW - MinGW +// GTEST_OS_WINDOWS_MOBILE - Windows Mobile +// GTEST_OS_ZOS - z/OS +// +// Among the platforms, Cygwin, Linux, Max OS X, and Windows have the +// most stable support. Since core members of the Google Test project +// don't have access to other platforms, support for them may be less +// stable. If you notice any problems on your platform, please notify +// googletestframework@googlegroups.com (patches for fixing them are +// even more welcome!). +// +// Note that it is possible that none of the GTEST_OS_* macros are defined. +// +// Macros indicating available Google Test features (defined to 1 if +// the corresponding feature is supported; otherwise undefined): +// GTEST_HAS_COMBINE - the Combine() function (for value-parameterized +// tests) +// GTEST_HAS_DEATH_TEST - death tests +// GTEST_HAS_PARAM_TEST - value-parameterized tests +// GTEST_HAS_TYPED_TEST - typed tests +// GTEST_HAS_TYPED_TEST_P - type-parameterized tests +// GTEST_USES_POSIX_RE - enhanced POSIX regex is used. Do not confuse with +// GTEST_HAS_POSIX_RE (see above) which users can +// define themselves. +// GTEST_USES_SIMPLE_RE - our own simple regex is used; +// the above two are mutually exclusive. +// GTEST_CAN_COMPARE_NULL - accepts untyped NULL in EXPECT_EQ(). +// +// Macros for basic C++ coding: +// GTEST_AMBIGUOUS_ELSE_BLOCKER_ - for disabling a gcc warning. 
+// GTEST_ATTRIBUTE_UNUSED_ - declares that a class' instances or a +// variable don't have to be used. +// GTEST_DISALLOW_ASSIGN_ - disables operator=. +// GTEST_DISALLOW_COPY_AND_ASSIGN_ - disables copy ctor and operator=. +// GTEST_MUST_USE_RESULT_ - declares that a function's result must be used. +// +// Synchronization: +// Mutex, MutexLock, ThreadLocal, GetThreadCount() +// - synchronization primitives. +// GTEST_IS_THREADSAFE - defined to 1 to indicate that the above +// synchronization primitives have real implementations +// and Google Test is thread-safe; or 0 otherwise. +// +// Template meta programming: +// is_pointer - as in TR1; needed on Symbian and IBM XL C/C++ only. +// IteratorTraits - partial implementation of std::iterator_traits, which +// is not available in libCstd when compiled with Sun C++. +// +// Smart pointers: +// scoped_ptr - as in TR2. +// +// Regular expressions: +// RE - a simple regular expression class using the POSIX +// Extended Regular Expression syntax on UNIX-like +// platforms, or a reduced regular exception syntax on +// other platforms, including Windows. +// +// Logging: +// GTEST_LOG_() - logs messages at the specified severity level. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. +// +// Stdout and stderr capturing: +// CaptureStdout() - starts capturing stdout. +// GetCapturedStdout() - stops capturing stdout and returns the captured +// string. +// CaptureStderr() - starts capturing stderr. +// GetCapturedStderr() - stops capturing stderr and returns the captured +// string. +// +// Integer types: +// TypeWithSize - maps an integer to a int type. +// Int32, UInt32, Int64, UInt64, TimeInMillis +// - integers of known sizes. +// BiggestInt - the biggest signed integer type. +// +// Command-line utilities: +// GTEST_FLAG() - references a flag. +// GTEST_DECLARE_*() - declares a flag. +// GTEST_DEFINE_*() - defines a flag. 
+// GetArgvs() - returns the command line as a vector of strings. +// +// Environment variable utilities: +// GetEnv() - gets the value of an environment variable. +// BoolFromGTestEnv() - parses a bool environment variable. +// Int32FromGTestEnv() - parses an Int32 environment variable. +// StringFromGTestEnv() - parses a string environment variable. + +#include // for isspace, etc +#include // for ptrdiff_t +#include +#include +#include +#ifndef _WIN32_WCE +# include +# include +#endif // !_WIN32_WCE + +#include // NOLINT +#include // NOLINT +#include // NOLINT + +#define GTEST_DEV_EMAIL_ "googletestframework@@googlegroups.com" +#define GTEST_FLAG_PREFIX_ "gtest_" +#define GTEST_FLAG_PREFIX_DASH_ "gtest-" +#define GTEST_FLAG_PREFIX_UPPER_ "GTEST_" +#define GTEST_NAME_ "Google Test" +#define GTEST_PROJECT_URL_ "http://code.google.com/p/googletest/" + +// Determines the version of gcc that is used to compile this. +#ifdef __GNUC__ +// 40302 means version 4.3.2. +# define GTEST_GCC_VER_ \ + (__GNUC__*10000 + __GNUC_MINOR__*100 + __GNUC_PATCHLEVEL__) +#endif // __GNUC__ + +// Determines the platform on which Google Test is compiled. 
+#ifdef __CYGWIN__ +# define GTEST_OS_CYGWIN 1 +#elif defined __SYMBIAN32__ +# define GTEST_OS_SYMBIAN 1 +#elif defined _WIN32 +# define GTEST_OS_WINDOWS 1 +# ifdef _WIN32_WCE +# define GTEST_OS_WINDOWS_MOBILE 1 +# elif defined(__MINGW__) || defined(__MINGW32__) +# define GTEST_OS_WINDOWS_MINGW 1 +# else +# define GTEST_OS_WINDOWS_DESKTOP 1 +# endif // _WIN32_WCE +#elif defined __APPLE__ +# define GTEST_OS_MAC 1 +#elif defined __linux__ +# define GTEST_OS_LINUX 1 +# ifdef ANDROID +# define GTEST_OS_LINUX_ANDROID 1 +# endif // ANDROID +#elif defined __MVS__ +# define GTEST_OS_ZOS 1 +#elif defined(__sun) && defined(__SVR4) +# define GTEST_OS_SOLARIS 1 +#elif defined(_AIX) +# define GTEST_OS_AIX 1 +#elif defined(__hpux) +# define GTEST_OS_HPUX 1 +#elif defined __native_client__ +# define GTEST_OS_NACL 1 +#endif // __CYGWIN__ + +// Brings in definitions for functions used in the testing::internal::posix +// namespace (read, write, close, chdir, isatty, stat). We do not currently +// use them on Windows Mobile. +#if !GTEST_OS_WINDOWS +// This assumes that non-Windows OSes provide unistd.h. For OSes where this +// is not the case, we need to include headers that provide the functions +// mentioned above. +# include +# if !GTEST_OS_NACL +// TODO(vladl@google.com): Remove this condition when Native Client SDK adds +// strings.h (tracked in +// http://code.google.com/p/nativeclient/issues/detail?id=1175). +# include // Native Client doesn't provide strings.h. +# endif +#elif !GTEST_OS_WINDOWS_MOBILE +# include +# include +#endif + +// Defines this to true iff Google Test can use POSIX regular expressions. +#ifndef GTEST_HAS_POSIX_RE +# define GTEST_HAS_POSIX_RE (!GTEST_OS_WINDOWS) +#endif + +#if GTEST_HAS_POSIX_RE + +// On some platforms, needs someone to define size_t, and +// won't compile otherwise. We can #include it here as we already +// included , which is guaranteed to define size_t through +// . 
+# include // NOLINT + +# define GTEST_USES_POSIX_RE 1 + +#elif GTEST_OS_WINDOWS + +// is not available on Windows. Use our own simple regex +// implementation instead. +# define GTEST_USES_SIMPLE_RE 1 + +#else + +// may not be available on this platform. Use our own +// simple regex implementation instead. +# define GTEST_USES_SIMPLE_RE 1 + +#endif // GTEST_HAS_POSIX_RE + +#ifndef GTEST_HAS_EXCEPTIONS +// The user didn't tell us whether exceptions are enabled, so we need +// to figure it out. +# if defined(_MSC_VER) || defined(__BORLANDC__) +// MSVC's and C++Builder's implementations of the STL use the _HAS_EXCEPTIONS +// macro to enable exceptions, so we'll do the same. +// Assumes that exceptions are enabled by default. +# ifndef _HAS_EXCEPTIONS +# define _HAS_EXCEPTIONS 1 +# endif // _HAS_EXCEPTIONS +# define GTEST_HAS_EXCEPTIONS _HAS_EXCEPTIONS +# elif defined(__GNUC__) && __EXCEPTIONS +// gcc defines __EXCEPTIONS to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__SUNPRO_CC) +// Sun Pro CC supports exceptions. However, there is no compile-time way of +// detecting whether they are enabled or not. Therefore, we assume that +// they are enabled unless the user tells us otherwise. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__IBMCPP__) && __EXCEPTIONS +// xlC defines __EXCEPTIONS to 1 iff exceptions are enabled. +# define GTEST_HAS_EXCEPTIONS 1 +# elif defined(__HP_aCC) +// Exception handling is in effect by default in HP aCC compiler. It has to +// be turned of by +noeh compiler option if desired. +# define GTEST_HAS_EXCEPTIONS 1 +# else +// For other compilers, we assume exceptions are disabled to be +// conservative. +# define GTEST_HAS_EXCEPTIONS 0 +# endif // defined(_MSC_VER) || defined(__BORLANDC__) +#endif // GTEST_HAS_EXCEPTIONS + +#if !defined(GTEST_HAS_STD_STRING) +// Even though we don't use this macro any longer, we keep it in case +// some clients still depend on it. 
+# define GTEST_HAS_STD_STRING 1 +#elif !GTEST_HAS_STD_STRING +// The user told us that ::std::string isn't available. +# error "Google Test cannot be used where ::std::string isn't available." +#endif // !defined(GTEST_HAS_STD_STRING) + +#ifndef GTEST_HAS_GLOBAL_STRING +// The user didn't tell us whether ::string is available, so we need +// to figure it out. + +# define GTEST_HAS_GLOBAL_STRING 0 + +#endif // GTEST_HAS_GLOBAL_STRING + +#ifndef GTEST_HAS_STD_WSTRING +// The user didn't tell us whether ::std::wstring is available, so we need +// to figure it out. +// TODO(wan@google.com): uses autoconf to detect whether ::std::wstring +// is available. + +// Cygwin 1.7 and below doesn't support ::std::wstring. +// Solaris' libc++ doesn't support it either. Android has +// no support for it at least as recent as Froyo (2.2). +# define GTEST_HAS_STD_WSTRING \ + (!(GTEST_OS_LINUX_ANDROID || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS)) + +#endif // GTEST_HAS_STD_WSTRING + +#ifndef GTEST_HAS_GLOBAL_WSTRING +// The user didn't tell us whether ::wstring is available, so we need +// to figure it out. +# define GTEST_HAS_GLOBAL_WSTRING \ + (GTEST_HAS_STD_WSTRING && GTEST_HAS_GLOBAL_STRING) +#endif // GTEST_HAS_GLOBAL_WSTRING + +// Determines whether RTTI is available. +#ifndef GTEST_HAS_RTTI +// The user didn't tell us whether RTTI is enabled, so we need to +// figure it out. + +# ifdef _MSC_VER + +# ifdef _CPPRTTI // MSVC defines this macro iff RTTI is enabled. +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif + +// Starting with version 4.3.2, gcc defines __GXX_RTTI iff RTTI is enabled. +# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40302) + +# ifdef __GXX_RTTI +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif // __GXX_RTTI + +// Starting with version 9.0 IBM Visual Age defines __RTTI_ALL__ to 1 if +// both the typeid and dynamic_cast features are present. 
+# elif defined(__IBMCPP__) && (__IBMCPP__ >= 900) + +# ifdef __RTTI_ALL__ +# define GTEST_HAS_RTTI 1 +# else +# define GTEST_HAS_RTTI 0 +# endif + +# else + +// For all other compilers, we assume RTTI is enabled. +# define GTEST_HAS_RTTI 1 + +# endif // _MSC_VER + +#endif // GTEST_HAS_RTTI + +// It's this header's responsibility to #include when RTTI +// is enabled. +#if GTEST_HAS_RTTI +# include +#endif + +// Determines whether Google Test can use the pthreads library. +#ifndef GTEST_HAS_PTHREAD +// The user didn't tell us explicitly, so we assume pthreads support is +// available on Linux and Mac. +// +// To disable threading support in Google Test, add -DGTEST_HAS_PTHREAD=0 +// to your compiler flags. +# define GTEST_HAS_PTHREAD (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_HPUX) +#endif // GTEST_HAS_PTHREAD + +#if GTEST_HAS_PTHREAD +// gtest-port.h guarantees to #include when GTEST_HAS_PTHREAD is +// true. +# include // NOLINT + +// For timespec and nanosleep, used below. +# include // NOLINT +#endif + +// Determines whether Google Test can use tr1/tuple. You can define +// this macro to 0 to prevent Google Test from using tuple (any +// feature depending on tuple with be disabled in this mode). +#ifndef GTEST_HAS_TR1_TUPLE +// The user didn't tell us not to do it, so we assume it's OK. +# define GTEST_HAS_TR1_TUPLE 1 +#endif // GTEST_HAS_TR1_TUPLE + +// Determines whether Google Test's own tr1 tuple implementation +// should be used. +#ifndef GTEST_USE_OWN_TR1_TUPLE +// The user didn't tell us, so we need to figure it out. + +// We use our own TR1 tuple if we aren't sure the user has an +// implementation of it already. At this time, GCC 4.0.0+ and MSVC +// 2010 are the only mainstream compilers that come with a TR1 tuple +// implementation. NVIDIA's CUDA NVCC compiler pretends to be GCC by +// defining __GNUC__ and friends, but cannot compile GCC's tuple +// implementation. 
MSVC 2008 (9.0) provides TR1 tuple in a 323 MB +// Feature Pack download, which we cannot assume the user has. +# if (defined(__GNUC__) && !defined(__CUDACC__) && (GTEST_GCC_VER_ >= 40000)) \ + || _MSC_VER >= 1600 +# define GTEST_USE_OWN_TR1_TUPLE 0 +# else +# define GTEST_USE_OWN_TR1_TUPLE 1 +# endif + +#endif // GTEST_USE_OWN_TR1_TUPLE + +// To avoid conditional compilation everywhere, we make it +// gtest-port.h's responsibility to #include the header implementing +// tr1/tuple. +#if GTEST_HAS_TR1_TUPLE + +# if GTEST_USE_OWN_TR1_TUPLE +// This file was GENERATED by a script. DO NOT EDIT BY HAND!!! + +// Copyright 2009 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Implements a subset of TR1 tuple needed by Google Test and Google Mock. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ + +#include // For ::std::pair. + +// The compiler used in Symbian has a bug that prevents us from declaring the +// tuple template as a friend (it complains that tuple is redefined). This +// hack bypasses the bug by declaring the members that should otherwise be +// private as public. +// Sun Studio versions < 12 also have the above bug. +#if defined(__SYMBIAN32__) || (defined(__SUNPRO_CC) && __SUNPRO_CC < 0x590) +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ public: +#else +# define GTEST_DECLARE_TUPLE_AS_FRIEND_ \ + template friend class tuple; \ + private: +#endif + +// GTEST_n_TUPLE_(T) is the type of an n-tuple. +#define GTEST_0_TUPLE_(T) tuple<> +#define GTEST_1_TUPLE_(T) tuple +#define GTEST_2_TUPLE_(T) tuple +#define GTEST_3_TUPLE_(T) tuple +#define GTEST_4_TUPLE_(T) tuple +#define GTEST_5_TUPLE_(T) tuple +#define GTEST_6_TUPLE_(T) tuple +#define GTEST_7_TUPLE_(T) tuple +#define GTEST_8_TUPLE_(T) tuple +#define GTEST_9_TUPLE_(T) tuple +#define GTEST_10_TUPLE_(T) tuple + +// GTEST_n_TYPENAMES_(T) declares a list of n typenames. 
+#define GTEST_0_TYPENAMES_(T) +#define GTEST_1_TYPENAMES_(T) typename T##0 +#define GTEST_2_TYPENAMES_(T) typename T##0, typename T##1 +#define GTEST_3_TYPENAMES_(T) typename T##0, typename T##1, typename T##2 +#define GTEST_4_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3 +#define GTEST_5_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4 +#define GTEST_6_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5 +#define GTEST_7_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6 +#define GTEST_8_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, typename T##7 +#define GTEST_9_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, \ + typename T##7, typename T##8 +#define GTEST_10_TYPENAMES_(T) typename T##0, typename T##1, typename T##2, \ + typename T##3, typename T##4, typename T##5, typename T##6, \ + typename T##7, typename T##8, typename T##9 + +// In theory, defining stuff in the ::std namespace is undefined +// behavior. We can do this as we are playing the role of a standard +// library vendor. +namespace std { +namespace tr1 { + +template +class tuple; + +// Anything in namespace gtest_internal is Google Test's INTERNAL +// IMPLEMENTATION DETAIL and MUST NOT BE USED DIRECTLY in user code. +namespace gtest_internal { + +// ByRef::type is T if T is a reference; otherwise it's const T&. +template +struct ByRef { typedef const T& type; }; // NOLINT +template +struct ByRef { typedef T& type; }; // NOLINT + +// A handy wrapper for ByRef. +#define GTEST_BY_REF_(T) typename ::std::tr1::gtest_internal::ByRef::type + +// AddRef::type is T if T is a reference; otherwise it's T&. This +// is the same as tr1::add_reference::type. 
+template +struct AddRef { typedef T& type; }; // NOLINT +template +struct AddRef { typedef T& type; }; // NOLINT + +// A handy wrapper for AddRef. +#define GTEST_ADD_REF_(T) typename ::std::tr1::gtest_internal::AddRef::type + +// A helper for implementing get(). +template class Get; + +// A helper for implementing tuple_element. kIndexValid is true +// iff k < the number of fields in tuple type T. +template +struct TupleElement; + +template +struct TupleElement { typedef T0 type; }; + +template +struct TupleElement { typedef T1 type; }; + +template +struct TupleElement { typedef T2 type; }; + +template +struct TupleElement { typedef T3 type; }; + +template +struct TupleElement { typedef T4 type; }; + +template +struct TupleElement { typedef T5 type; }; + +template +struct TupleElement { typedef T6 type; }; + +template +struct TupleElement { typedef T7 type; }; + +template +struct TupleElement { typedef T8 type; }; + +template +struct TupleElement { typedef T9 type; }; + +} // namespace gtest_internal + +template <> +class tuple<> { + public: + tuple() {} + tuple(const tuple& /* t */) {} + tuple& operator=(const tuple& /* t */) { return *this; } +}; + +template +class GTEST_1_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0) : f0_(f0) {} + + tuple(const tuple& t) : f0_(t.f0_) {} + + template + tuple(const GTEST_1_TUPLE_(U)& t) : f0_(t.f0_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_1_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_1_TUPLE_(U)& t) { + f0_ = t.f0_; + return *this; + } + + T0 f0_; +}; + +template +class GTEST_2_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1) : f0_(f0), + f1_(f1) {} + + tuple(const tuple& t) : f0_(t.f0_), 
f1_(t.f1_) {} + + template + tuple(const GTEST_2_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_) {} + template + tuple(const ::std::pair& p) : f0_(p.first), f1_(p.second) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_2_TUPLE_(U)& t) { + return CopyFrom(t); + } + template + tuple& operator=(const ::std::pair& p) { + f0_ = p.first; + f1_ = p.second; + return *this; + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_2_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + return *this; + } + + T0 f0_; + T1 f1_; +}; + +template +class GTEST_3_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2) : f0_(f0), f1_(f1), f2_(f2) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} + + template + tuple(const GTEST_3_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_3_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_3_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; +}; + +template +class GTEST_4_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_) {} + + template + tuple(const GTEST_4_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_4_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ 
+ + template + tuple& CopyFrom(const GTEST_4_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; +}; + +template +class GTEST_5_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, + GTEST_BY_REF_(T4) f4) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_) {} + + template + tuple(const GTEST_5_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_5_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_5_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; +}; + +template +class GTEST_6_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_) {} + + template + tuple(const GTEST_6_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_6_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_6_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = 
t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; +}; + +template +class GTEST_7_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3), f4_(f4), f5_(f5), f6_(f6) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} + + template + tuple(const GTEST_7_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_7_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_7_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; +}; + +template +class GTEST_8_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, + GTEST_BY_REF_(T7) f7) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5), f6_(f6), f7_(f7) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} + + template + tuple(const GTEST_8_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_) {} + + tuple& operator=(const 
tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_8_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_8_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; +}; + +template +class GTEST_9_TUPLE_(T) { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, + GTEST_BY_REF_(T8) f8) : f0_(f0), f1_(f1), f2_(f2), f3_(f3), f4_(f4), + f5_(f5), f6_(f6), f7_(f7), f8_(f8) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} + + template + tuple(const GTEST_9_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_9_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_9_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + f8_ = t.f8_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; + T8 f8_; +}; + +template +class tuple { + public: + template friend class gtest_internal::Get; + + tuple() : f0_(), f1_(), f2_(), f3_(), f4_(), f5_(), f6_(), f7_(), f8_(), + f9_() {} + + explicit tuple(GTEST_BY_REF_(T0) f0, GTEST_BY_REF_(T1) f1, + GTEST_BY_REF_(T2) f2, 
GTEST_BY_REF_(T3) f3, GTEST_BY_REF_(T4) f4, + GTEST_BY_REF_(T5) f5, GTEST_BY_REF_(T6) f6, GTEST_BY_REF_(T7) f7, + GTEST_BY_REF_(T8) f8, GTEST_BY_REF_(T9) f9) : f0_(f0), f1_(f1), f2_(f2), + f3_(f3), f4_(f4), f5_(f5), f6_(f6), f7_(f7), f8_(f8), f9_(f9) {} + + tuple(const tuple& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), f3_(t.f3_), + f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), f9_(t.f9_) {} + + template + tuple(const GTEST_10_TUPLE_(U)& t) : f0_(t.f0_), f1_(t.f1_), f2_(t.f2_), + f3_(t.f3_), f4_(t.f4_), f5_(t.f5_), f6_(t.f6_), f7_(t.f7_), f8_(t.f8_), + f9_(t.f9_) {} + + tuple& operator=(const tuple& t) { return CopyFrom(t); } + + template + tuple& operator=(const GTEST_10_TUPLE_(U)& t) { + return CopyFrom(t); + } + + GTEST_DECLARE_TUPLE_AS_FRIEND_ + + template + tuple& CopyFrom(const GTEST_10_TUPLE_(U)& t) { + f0_ = t.f0_; + f1_ = t.f1_; + f2_ = t.f2_; + f3_ = t.f3_; + f4_ = t.f4_; + f5_ = t.f5_; + f6_ = t.f6_; + f7_ = t.f7_; + f8_ = t.f8_; + f9_ = t.f9_; + return *this; + } + + T0 f0_; + T1 f1_; + T2 f2_; + T3 f3_; + T4 f4_; + T5 f5_; + T6 f6_; + T7 f7_; + T8 f8_; + T9 f9_; +}; + +// 6.1.3.2 Tuple creation functions. + +// Known limitations: we don't support passing an +// std::tr1::reference_wrapper to make_tuple(). And we don't +// implement tie(). 
+ +inline tuple<> make_tuple() { return tuple<>(); } + +template +inline GTEST_1_TUPLE_(T) make_tuple(const T0& f0) { + return GTEST_1_TUPLE_(T)(f0); +} + +template +inline GTEST_2_TUPLE_(T) make_tuple(const T0& f0, const T1& f1) { + return GTEST_2_TUPLE_(T)(f0, f1); +} + +template +inline GTEST_3_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2) { + return GTEST_3_TUPLE_(T)(f0, f1, f2); +} + +template +inline GTEST_4_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3) { + return GTEST_4_TUPLE_(T)(f0, f1, f2, f3); +} + +template +inline GTEST_5_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4) { + return GTEST_5_TUPLE_(T)(f0, f1, f2, f3, f4); +} + +template +inline GTEST_6_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5) { + return GTEST_6_TUPLE_(T)(f0, f1, f2, f3, f4, f5); +} + +template +inline GTEST_7_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6) { + return GTEST_7_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6); +} + +template +inline GTEST_8_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7) { + return GTEST_8_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7); +} + +template +inline GTEST_9_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, + const T8& f8) { + return GTEST_9_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8); +} + +template +inline GTEST_10_TUPLE_(T) make_tuple(const T0& f0, const T1& f1, const T2& f2, + const T3& f3, const T4& f4, const T5& f5, const T6& f6, const T7& f7, + const T8& f8, const T9& f9) { + return GTEST_10_TUPLE_(T)(f0, f1, f2, f3, f4, f5, f6, f7, f8, f9); +} + +// 6.1.3.3 Tuple helper classes. 
+ +template struct tuple_size; + +template +struct tuple_size { static const int value = 0; }; + +template +struct tuple_size { static const int value = 1; }; + +template +struct tuple_size { static const int value = 2; }; + +template +struct tuple_size { static const int value = 3; }; + +template +struct tuple_size { static const int value = 4; }; + +template +struct tuple_size { static const int value = 5; }; + +template +struct tuple_size { static const int value = 6; }; + +template +struct tuple_size { static const int value = 7; }; + +template +struct tuple_size { static const int value = 8; }; + +template +struct tuple_size { static const int value = 9; }; + +template +struct tuple_size { static const int value = 10; }; + +template +struct tuple_element { + typedef typename gtest_internal::TupleElement< + k < (tuple_size::value), k, Tuple>::type type; +}; + +#define GTEST_TUPLE_ELEMENT_(k, Tuple) typename tuple_element::type + +// 6.1.3.4 Element access. + +namespace gtest_internal { + +template <> +class Get<0> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) + Field(Tuple& t) { return t.f0_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(0, Tuple)) + ConstField(const Tuple& t) { return t.f0_; } +}; + +template <> +class Get<1> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) + Field(Tuple& t) { return t.f1_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(1, Tuple)) + ConstField(const Tuple& t) { return t.f1_; } +}; + +template <> +class Get<2> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) + Field(Tuple& t) { return t.f2_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(2, Tuple)) + ConstField(const Tuple& t) { return t.f2_; } +}; + +template <> +class Get<3> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) + Field(Tuple& t) { return t.f3_; } // NOLINT + + template + static 
GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(3, Tuple)) + ConstField(const Tuple& t) { return t.f3_; } +}; + +template <> +class Get<4> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) + Field(Tuple& t) { return t.f4_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(4, Tuple)) + ConstField(const Tuple& t) { return t.f4_; } +}; + +template <> +class Get<5> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) + Field(Tuple& t) { return t.f5_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(5, Tuple)) + ConstField(const Tuple& t) { return t.f5_; } +}; + +template <> +class Get<6> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) + Field(Tuple& t) { return t.f6_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(6, Tuple)) + ConstField(const Tuple& t) { return t.f6_; } +}; + +template <> +class Get<7> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) + Field(Tuple& t) { return t.f7_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(7, Tuple)) + ConstField(const Tuple& t) { return t.f7_; } +}; + +template <> +class Get<8> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) + Field(Tuple& t) { return t.f8_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(8, Tuple)) + ConstField(const Tuple& t) { return t.f8_; } +}; + +template <> +class Get<9> { + public: + template + static GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) + Field(Tuple& t) { return t.f9_; } // NOLINT + + template + static GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(9, Tuple)) + ConstField(const Tuple& t) { return t.f9_; } +}; + +} // namespace gtest_internal + +template +GTEST_ADD_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) +get(GTEST_10_TUPLE_(T)& t) { + return gtest_internal::Get::Field(t); +} + +template +GTEST_BY_REF_(GTEST_TUPLE_ELEMENT_(k, GTEST_10_TUPLE_(T))) +get(const 
GTEST_10_TUPLE_(T)& t) { + return gtest_internal::Get::ConstField(t); +} + +// 6.1.3.5 Relational operators + +// We only implement == and !=, as we don't have a need for the rest yet. + +namespace gtest_internal { + +// SameSizeTuplePrefixComparator::Eq(t1, t2) returns true if the +// first k fields of t1 equals the first k fields of t2. +// SameSizeTuplePrefixComparator(k1, k2) would be a compiler error if +// k1 != k2. +template +struct SameSizeTuplePrefixComparator; + +template <> +struct SameSizeTuplePrefixComparator<0, 0> { + template + static bool Eq(const Tuple1& /* t1 */, const Tuple2& /* t2 */) { + return true; + } +}; + +template +struct SameSizeTuplePrefixComparator { + template + static bool Eq(const Tuple1& t1, const Tuple2& t2) { + return SameSizeTuplePrefixComparator::Eq(t1, t2) && + ::std::tr1::get(t1) == ::std::tr1::get(t2); + } +}; + +} // namespace gtest_internal + +template +inline bool operator==(const GTEST_10_TUPLE_(T)& t, + const GTEST_10_TUPLE_(U)& u) { + return gtest_internal::SameSizeTuplePrefixComparator< + tuple_size::value, + tuple_size::value>::Eq(t, u); +} + +template +inline bool operator!=(const GTEST_10_TUPLE_(T)& t, + const GTEST_10_TUPLE_(U)& u) { return !(t == u); } + +// 6.1.4 Pairs. +// Unimplemented. 
+ +} // namespace tr1 +} // namespace std + +#undef GTEST_0_TUPLE_ +#undef GTEST_1_TUPLE_ +#undef GTEST_2_TUPLE_ +#undef GTEST_3_TUPLE_ +#undef GTEST_4_TUPLE_ +#undef GTEST_5_TUPLE_ +#undef GTEST_6_TUPLE_ +#undef GTEST_7_TUPLE_ +#undef GTEST_8_TUPLE_ +#undef GTEST_9_TUPLE_ +#undef GTEST_10_TUPLE_ + +#undef GTEST_0_TYPENAMES_ +#undef GTEST_1_TYPENAMES_ +#undef GTEST_2_TYPENAMES_ +#undef GTEST_3_TYPENAMES_ +#undef GTEST_4_TYPENAMES_ +#undef GTEST_5_TYPENAMES_ +#undef GTEST_6_TYPENAMES_ +#undef GTEST_7_TYPENAMES_ +#undef GTEST_8_TYPENAMES_ +#undef GTEST_9_TYPENAMES_ +#undef GTEST_10_TYPENAMES_ + +#undef GTEST_DECLARE_TUPLE_AS_FRIEND_ +#undef GTEST_BY_REF_ +#undef GTEST_ADD_REF_ +#undef GTEST_TUPLE_ELEMENT_ + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TUPLE_H_ +# elif GTEST_OS_SYMBIAN + +// On Symbian, BOOST_HAS_TR1_TUPLE causes Boost's TR1 tuple library to +// use STLport's tuple implementation, which unfortunately doesn't +// work as the copy of STLport distributed with Symbian is incomplete. +// By making sure BOOST_HAS_TR1_TUPLE is undefined, we force Boost to +// use its own tuple implementation. +# ifdef BOOST_HAS_TR1_TUPLE +# undef BOOST_HAS_TR1_TUPLE +# endif // BOOST_HAS_TR1_TUPLE + +// This prevents , which defines +// BOOST_HAS_TR1_TUPLE, from being #included by Boost's . +# define BOOST_TR1_DETAIL_CONFIG_HPP_INCLUDED +# include + +# elif defined(__GNUC__) && (GTEST_GCC_VER_ >= 40000) +// GCC 4.0+ implements tr1/tuple in the header. This does +// not conform to the TR1 spec, which requires the header to be . + +# if !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 +// Until version 4.3.2, gcc has a bug that causes , +// which is #included by , to not compile when RTTI is +// disabled. _TR1_FUNCTIONAL is the header guard for +// . Hence the following #define is a hack to prevent +// from being included. +# define _TR1_FUNCTIONAL 1 +# include +# undef _TR1_FUNCTIONAL // Allows the user to #include + // if he chooses to. 
+# else +# include // NOLINT +# endif // !GTEST_HAS_RTTI && GTEST_GCC_VER_ < 40302 + +# else +// If the compiler is not GCC 4.0+, we assume the user is using a +// spec-conforming TR1 implementation. +# include // NOLINT +# endif // GTEST_USE_OWN_TR1_TUPLE + +#endif // GTEST_HAS_TR1_TUPLE + +// Determines whether clone(2) is supported. +// Usually it will only be available on Linux, excluding +// Linux on the Itanium architecture. +// Also see http://linux.die.net/man/2/clone. +#ifndef GTEST_HAS_CLONE +// The user didn't tell us, so we need to figure it out. + +# if GTEST_OS_LINUX && !defined(__ia64__) +# define GTEST_HAS_CLONE 1 +# else +# define GTEST_HAS_CLONE 0 +# endif // GTEST_OS_LINUX && !defined(__ia64__) + +#endif // GTEST_HAS_CLONE + +// Determines whether to support stream redirection. This is used to test +// output correctness and to implement death tests. +#ifndef GTEST_HAS_STREAM_REDIRECTION +// By default, we assume that stream redirection is supported on all +// platforms except known mobile ones. +# if GTEST_OS_WINDOWS_MOBILE || GTEST_OS_SYMBIAN +# define GTEST_HAS_STREAM_REDIRECTION 0 +# else +# define GTEST_HAS_STREAM_REDIRECTION 1 +# endif // !GTEST_OS_WINDOWS_MOBILE && !GTEST_OS_SYMBIAN +#endif // GTEST_HAS_STREAM_REDIRECTION + +// Determines whether to support death tests. +// Google Test does not support death tests for VC 7.1 and earlier as +// abort() in a VC 7.1 application compiled as GUI in debug config +// pops up a dialog window that cannot be suppressed programmatically. +#if (GTEST_OS_LINUX || GTEST_OS_MAC || GTEST_OS_CYGWIN || GTEST_OS_SOLARIS || \ + (GTEST_OS_WINDOWS_DESKTOP && _MSC_VER >= 1400) || \ + GTEST_OS_WINDOWS_MINGW || GTEST_OS_AIX || GTEST_OS_HPUX) +# define GTEST_HAS_DEATH_TEST 1 +# include // NOLINT +#endif + +// We don't support MSVC 7.1 with exceptions disabled now. Therefore +// all the compilers we care about are adequate for supporting +// value-parameterized tests. 
+#define GTEST_HAS_PARAM_TEST 1 + +// Determines whether to support type-driven tests. + +// Typed tests need and variadic macros, which GCC, VC++ 8.0, +// Sun Pro CC, IBM Visual Age, and HP aCC support. +#if defined(__GNUC__) || (_MSC_VER >= 1400) || defined(__SUNPRO_CC) || \ + defined(__IBMCPP__) || defined(__HP_aCC) +# define GTEST_HAS_TYPED_TEST 1 +# define GTEST_HAS_TYPED_TEST_P 1 +#endif + +// Determines whether to support Combine(). This only makes sense when +// value-parameterized tests are enabled. The implementation doesn't +// work on Sun Studio since it doesn't understand templated conversion +// operators. +#if GTEST_HAS_PARAM_TEST && GTEST_HAS_TR1_TUPLE && !defined(__SUNPRO_CC) +# define GTEST_HAS_COMBINE 1 +#endif + +// Determines whether the system compiler uses UTF-16 for encoding wide strings. +#define GTEST_WIDE_STRING_USES_UTF16_ \ + (GTEST_OS_WINDOWS || GTEST_OS_CYGWIN || GTEST_OS_SYMBIAN || GTEST_OS_AIX) + +// Determines whether test results can be streamed to a socket. +#if GTEST_OS_LINUX +# define GTEST_CAN_STREAM_RESULTS_ 1 +#endif + +// Defines some utility macros. + +// The GNU compiler emits a warning if nested "if" statements are followed by +// an "else" statement and braces are not used to explicitly disambiguate the +// "else" binding. This leads to problems with code like: +// +// if (gate) +// ASSERT_*(condition) << "Some message"; +// +// The "switch (0) case 0:" idiom is used to suppress this. +#ifdef __INTEL_COMPILER +# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ +#else +# define GTEST_AMBIGUOUS_ELSE_BLOCKER_ switch (0) case 0: default: // NOLINT +#endif + +// Use this annotation at the end of a struct/class definition to +// prevent the compiler from optimizing away instances that are never +// used. This is useful when all interesting logic happens inside the +// c'tor and / or d'tor. Example: +// +// struct Foo { +// Foo() { ... 
} +// } GTEST_ATTRIBUTE_UNUSED_; +// +// Also use it after a variable or parameter declaration to tell the +// compiler the variable/parameter does not have to be used. +#if defined(__GNUC__) && !defined(COMPILER_ICC) +# define GTEST_ATTRIBUTE_UNUSED_ __attribute__ ((unused)) +#else +# define GTEST_ATTRIBUTE_UNUSED_ +#endif + +// A macro to disallow operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_ASSIGN_(type)\ + void operator=(type const &) + +// A macro to disallow copy constructor and operator= +// This should be used in the private: declarations for a class. +#define GTEST_DISALLOW_COPY_AND_ASSIGN_(type)\ + type(type const &);\ + GTEST_DISALLOW_ASSIGN_(type) + +// Tell the compiler to warn about unused return values for functions declared +// with this macro. The macro should be used on function declarations +// following the argument list: +// +// Sprocket* AllocateSprocket() GTEST_MUST_USE_RESULT_; +#if defined(__GNUC__) && (GTEST_GCC_VER_ >= 30400) && !defined(COMPILER_ICC) +# define GTEST_MUST_USE_RESULT_ __attribute__ ((warn_unused_result)) +#else +# define GTEST_MUST_USE_RESULT_ +#endif // __GNUC__ && (GTEST_GCC_VER_ >= 30400) && !COMPILER_ICC + +// Determine whether the compiler supports Microsoft's Structured Exception +// Handling. This is supported by several Windows compilers but generally +// does not exist on any other system. +#ifndef GTEST_HAS_SEH +// The user didn't tell us, so we need to figure it out. + +# if defined(_MSC_VER) || defined(__BORLANDC__) +// These two compilers are known to support SEH. +# define GTEST_HAS_SEH 1 +# else +// Assume no SEH. 
+# define GTEST_HAS_SEH 0 +# endif + +#endif // GTEST_HAS_SEH + +#ifdef _MSC_VER + +# if GTEST_LINKED_AS_SHARED_LIBRARY +# define GTEST_API_ __declspec(dllimport) +# elif GTEST_CREATE_SHARED_LIBRARY +# define GTEST_API_ __declspec(dllexport) +# endif + +#endif // _MSC_VER + +#ifndef GTEST_API_ +# define GTEST_API_ +#endif + +#ifdef __GNUC__ +// Ask the compiler to never inline a given function. +# define GTEST_NO_INLINE_ __attribute__((noinline)) +#else +# define GTEST_NO_INLINE_ +#endif + +namespace testing { + +class Message; + +namespace internal { + +class String; + +// The GTEST_COMPILE_ASSERT_ macro can be used to verify that a compile time +// expression is true. For example, you could use it to verify the +// size of a static array: +// +// GTEST_COMPILE_ASSERT_(ARRAYSIZE(content_type_names) == CONTENT_NUM_TYPES, +// content_type_names_incorrect_size); +// +// or to make sure a struct is smaller than a certain size: +// +// GTEST_COMPILE_ASSERT_(sizeof(foo) < 128, foo_too_large); +// +// The second argument to the macro is the name of the variable. If +// the expression is false, most compilers will issue a warning/error +// containing the name of the variable. + +template +struct CompileAssert { +}; + +#define GTEST_COMPILE_ASSERT_(expr, msg) \ + typedef ::testing::internal::CompileAssert<(bool(expr))> \ + msg[bool(expr) ? 1 : -1] + +// Implementation details of GTEST_COMPILE_ASSERT_: +// +// - GTEST_COMPILE_ASSERT_ works by defining an array type that has -1 +// elements (and thus is invalid) when the expression is false. +// +// - The simpler definition +// +// #define GTEST_COMPILE_ASSERT_(expr, msg) typedef char msg[(expr) ? 1 : -1] +// +// does not work, as gcc supports variable-length arrays whose sizes +// are determined at run-time (this is gcc's extension and not part +// of the C++ standard). 
As a result, gcc fails to reject the +// following code with the simple definition: +// +// int foo; +// GTEST_COMPILE_ASSERT_(foo, msg); // not supposed to compile as foo is +// // not a compile-time constant. +// +// - By using the type CompileAssert<(bool(expr))>, we ensures that +// expr is a compile-time constant. (Template arguments must be +// determined at compile-time.) +// +// - The outter parentheses in CompileAssert<(bool(expr))> are necessary +// to work around a bug in gcc 3.4.4 and 4.0.1. If we had written +// +// CompileAssert +// +// instead, these compilers will refuse to compile +// +// GTEST_COMPILE_ASSERT_(5 > 0, some_message); +// +// (They seem to think the ">" in "5 > 0" marks the end of the +// template argument list.) +// +// - The array size is (bool(expr) ? 1 : -1), instead of simply +// +// ((expr) ? 1 : -1). +// +// This is to avoid running into a bug in MS VC 7.1, which +// causes ((0.0) ? 1 : -1) to incorrectly evaluate to 1. + +// StaticAssertTypeEqHelper is used by StaticAssertTypeEq defined in gtest.h. +// +// This template is declared, but intentionally undefined. +template +struct StaticAssertTypeEqHelper; + +template +struct StaticAssertTypeEqHelper {}; + +#if GTEST_HAS_GLOBAL_STRING +typedef ::string string; +#else +typedef ::std::string string; +#endif // GTEST_HAS_GLOBAL_STRING + +#if GTEST_HAS_GLOBAL_WSTRING +typedef ::wstring wstring; +#elif GTEST_HAS_STD_WSTRING +typedef ::std::wstring wstring; +#endif // GTEST_HAS_GLOBAL_WSTRING + +// A helper for suppressing warnings on constant condition. It just +// returns 'condition'. +GTEST_API_ bool IsTrue(bool condition); + +// Defines scoped_ptr. + +// This implementation of scoped_ptr is PARTIAL - it only contains +// enough stuff to satisfy Google Test's need. 
+template +class scoped_ptr { + public: + typedef T element_type; + + explicit scoped_ptr(T* p = NULL) : ptr_(p) {} + ~scoped_ptr() { reset(); } + + T& operator*() const { return *ptr_; } + T* operator->() const { return ptr_; } + T* get() const { return ptr_; } + + T* release() { + T* const ptr = ptr_; + ptr_ = NULL; + return ptr; + } + + void reset(T* p = NULL) { + if (p != ptr_) { + if (IsTrue(sizeof(T) > 0)) { // Makes sure T is a complete type. + delete ptr_; + } + ptr_ = p; + } + } + private: + T* ptr_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(scoped_ptr); +}; + +// Defines RE. + +// A simple C++ wrapper for . It uses the POSIX Extended +// Regular Expression syntax. +class GTEST_API_ RE { + public: + // A copy constructor is required by the Standard to initialize object + // references from r-values. + RE(const RE& other) { Init(other.pattern()); } + + // Constructs an RE from a string. + RE(const ::std::string& regex) { Init(regex.c_str()); } // NOLINT + +#if GTEST_HAS_GLOBAL_STRING + + RE(const ::string& regex) { Init(regex.c_str()); } // NOLINT + +#endif // GTEST_HAS_GLOBAL_STRING + + RE(const char* regex) { Init(regex); } // NOLINT + ~RE(); + + // Returns the string representation of the regex. + const char* pattern() const { return pattern_; } + + // FullMatch(str, re) returns true iff regular expression re matches + // the entire str. + // PartialMatch(str, re) returns true iff regular expression re + // matches a substring of str (including str itself). + // + // TODO(wan@google.com): make FullMatch() and PartialMatch() work + // when str contains NUL characters. 
+ static bool FullMatch(const ::std::string& str, const RE& re) { + return FullMatch(str.c_str(), re); + } + static bool PartialMatch(const ::std::string& str, const RE& re) { + return PartialMatch(str.c_str(), re); + } + +#if GTEST_HAS_GLOBAL_STRING + + static bool FullMatch(const ::string& str, const RE& re) { + return FullMatch(str.c_str(), re); + } + static bool PartialMatch(const ::string& str, const RE& re) { + return PartialMatch(str.c_str(), re); + } + +#endif // GTEST_HAS_GLOBAL_STRING + + static bool FullMatch(const char* str, const RE& re); + static bool PartialMatch(const char* str, const RE& re); + + private: + void Init(const char* regex); + + // We use a const char* instead of a string, as Google Test may be used + // where string is not available. We also do not use Google Test's own + // String type here, in order to simplify dependencies between the + // files. + const char* pattern_; + bool is_valid_; + +#if GTEST_USES_POSIX_RE + + regex_t full_regex_; // For FullMatch(). + regex_t partial_regex_; // For PartialMatch(). + +#else // GTEST_USES_SIMPLE_RE + + const char* full_pattern_; // For FullMatch(); + +#endif + + GTEST_DISALLOW_ASSIGN_(RE); +}; + +// Formats a source file path and a line number as they would appear +// in an error message from the compiler used to compile this code. +GTEST_API_ ::std::string FormatFileLocation(const char* file, int line); + +// Formats a file location for compiler-independent XML output. +// Although this function is not platform dependent, we put it next to +// FormatFileLocation in order to contrast the two functions. +GTEST_API_ ::std::string FormatCompilerIndependentFileLocation(const char* file, + int line); + +// Defines logging utilities: +// GTEST_LOG_(severity) - logs messages at the specified severity level. The +// message itself is streamed into the macro. +// LogToStderr() - directs all log messages to stderr. +// FlushInfoLog() - flushes informational log messages. 
+ +enum GTestLogSeverity { + GTEST_INFO, + GTEST_WARNING, + GTEST_ERROR, + GTEST_FATAL +}; + +// Formats log entry severity, provides a stream object for streaming the +// log message, and terminates the message with a newline when going out of +// scope. +class GTEST_API_ GTestLog { + public: + GTestLog(GTestLogSeverity severity, const char* file, int line); + + // Flushes the buffers and, if severity is GTEST_FATAL, aborts the program. + ~GTestLog(); + + ::std::ostream& GetStream() { return ::std::cerr; } + + private: + const GTestLogSeverity severity_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestLog); +}; + +#define GTEST_LOG_(severity) \ + ::testing::internal::GTestLog(::testing::internal::GTEST_##severity, \ + __FILE__, __LINE__).GetStream() + +inline void LogToStderr() {} +inline void FlushInfoLog() { fflush(NULL); } + +// INTERNAL IMPLEMENTATION - DO NOT USE. +// +// GTEST_CHECK_ is an all-mode assert. It aborts the program if the condition +// is not satisfied. +// Synopsys: +// GTEST_CHECK_(boolean_condition); +// or +// GTEST_CHECK_(boolean_condition) << "Additional message"; +// +// This checks the condition and if the condition is not satisfied +// it prints message about the condition violation, including the +// condition itself, plus additional message streamed into it, if any, +// and then it aborts the program. It aborts the program irrespective of +// whether it is built in the debug mode or not. +#define GTEST_CHECK_(condition) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::IsTrue(condition)) \ + ; \ + else \ + GTEST_LOG_(FATAL) << "Condition " #condition " failed. " + +// An all-mode assert to verify that the given POSIX-style function +// call returns 0 (indicating success). Known limitation: this +// doesn't expand to a balanced 'if' statement, so enclose the macro +// in {} if you need to use it as the only statement in an 'if' +// branch. 
+#define GTEST_CHECK_POSIX_SUCCESS_(posix_call) \ + if (const int gtest_error = (posix_call)) \ + GTEST_LOG_(FATAL) << #posix_call << "failed with error " \ + << gtest_error + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Use ImplicitCast_ as a safe version of static_cast for upcasting in +// the type hierarchy (e.g. casting a Foo* to a SuperclassOfFoo* or a +// const Foo*). When you use ImplicitCast_, the compiler checks that +// the cast is safe. Such explicit ImplicitCast_s are necessary in +// surprisingly many situations where C++ demands an exact type match +// instead of an argument type convertable to a target type. +// +// The syntax for using ImplicitCast_ is the same as for static_cast: +// +// ImplicitCast_(expr) +// +// ImplicitCast_ would have been part of the C++ standard library, +// but the proposal was submitted too late. It will probably make +// its way into the language in the future. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., implicit_cast). The internal +// namespace alone is not enough because the function can be found by ADL. +template +inline To ImplicitCast_(To x) { return x; } + +// When you upcast (that is, cast a pointer from type Foo to type +// SuperclassOfFoo), it's fine to use ImplicitCast_<>, since upcasts +// always succeed. When you downcast (that is, cast a pointer from +// type Foo to type SubclassOfFoo), static_cast<> isn't safe, because +// how do you know the pointer is really of type SubclassOfFoo? It +// could be a bare Foo, or of type DifferentSubclassOfFoo. Thus, +// when you downcast, you should use this macro. In debug mode, we +// use dynamic_cast<> to double-check the downcast is legal (we die +// if it's not). In normal mode, we do the efficient static_cast<> +// instead. Thus, it's important to test in debug mode to make sure +// the cast is legal! +// This is the only place in the code we should use dynamic_cast<>. 
+// In particular, you SHOULDN'T be using dynamic_cast<> in order to +// do RTTI (eg code like this: +// if (dynamic_cast(foo)) HandleASubclass1Object(foo); +// if (dynamic_cast(foo)) HandleASubclass2Object(foo); +// You should design the code some other way not to need this. +// +// This relatively ugly name is intentional. It prevents clashes with +// similar functions users may have (e.g., down_cast). The internal +// namespace alone is not enough because the function can be found by ADL. +template // use like this: DownCast_(foo); +inline To DownCast_(From* f) { // so we only accept pointers + // Ensures that To is a sub-type of From *. This test is here only + // for compile-time type checking, and has no overhead in an + // optimized build at run-time, as it will be optimized away + // completely. + if (false) { + const To to = NULL; + ::testing::internal::ImplicitCast_(to); + } + +#if GTEST_HAS_RTTI + // RTTI: debug mode only! + GTEST_CHECK_(f == NULL || dynamic_cast(f) != NULL); +#endif + return static_cast(f); +} + +// Downcasts the pointer of type Base to Derived. +// Derived must be a subclass of Base. The parameter MUST +// point to a class of type Derived, not any subclass of it. +// When RTTI is available, the function performs a runtime +// check to enforce this. +template +Derived* CheckedDowncastToActualType(Base* base) { +#if GTEST_HAS_RTTI + GTEST_CHECK_(typeid(*base) == typeid(Derived)); + return dynamic_cast(base); // NOLINT +#else + return static_cast(base); // Poor man's downcast. +#endif +} + +#if GTEST_HAS_STREAM_REDIRECTION + +// Defines the stderr capturer: +// CaptureStdout - starts capturing stdout. +// GetCapturedStdout - stops capturing stdout and returns the captured string. +// CaptureStderr - starts capturing stderr. +// GetCapturedStderr - stops capturing stderr and returns the captured string. 
+// +GTEST_API_ void CaptureStdout(); +GTEST_API_ String GetCapturedStdout(); +GTEST_API_ void CaptureStderr(); +GTEST_API_ String GetCapturedStderr(); + +#endif // GTEST_HAS_STREAM_REDIRECTION + + +#if GTEST_HAS_DEATH_TEST + +// A copy of all command line arguments. Set by InitGoogleTest(). +extern ::std::vector g_argvs; + +// GTEST_HAS_DEATH_TEST implies we have ::std::string. +const ::std::vector& GetArgvs(); + +#endif // GTEST_HAS_DEATH_TEST + +// Defines synchronization primitives. + +#if GTEST_HAS_PTHREAD + +// Sleeps for (roughly) n milli-seconds. This function is only for +// testing Google Test's own constructs. Don't use it in user tests, +// either directly or indirectly. +inline void SleepMilliseconds(int n) { + const timespec time = { + 0, // 0 seconds. + n * 1000L * 1000L, // And n ms. + }; + nanosleep(&time, NULL); +} + +// Allows a controller thread to pause execution of newly created +// threads until notified. Instances of this class must be created +// and destroyed in the controller thread. +// +// This class is only for testing Google Test's own constructs. Do not +// use it in user tests, either directly or indirectly. +class Notification { + public: + Notification() : notified_(false) {} + + // Notifies all threads created with this notification to start. Must + // be called from the controller thread. + void Notify() { notified_ = true; } + + // Blocks until the controller thread notifies. Must be called from a test + // thread. + void WaitForNotification() { + while(!notified_) { + SleepMilliseconds(10); + } + } + + private: + volatile bool notified_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(Notification); +}; + +// As a C-function, ThreadFuncWithCLinkage cannot be templated itself. +// Consequently, it cannot select a correct instantiation of ThreadWithParam +// in order to call its Run(). Introducing ThreadWithParamBase as a +// non-templated base class for ThreadWithParam allows us to bypass this +// problem. 
+class ThreadWithParamBase { + public: + virtual ~ThreadWithParamBase() {} + virtual void Run() = 0; +}; + +// pthread_create() accepts a pointer to a function type with the C linkage. +// According to the Standard (7.5/1), function types with different linkages +// are different even if they are otherwise identical. Some compilers (for +// example, SunStudio) treat them as different types. Since class methods +// cannot be defined with C-linkage we need to define a free C-function to +// pass into pthread_create(). +extern "C" inline void* ThreadFuncWithCLinkage(void* thread) { + static_cast(thread)->Run(); + return NULL; +} + +// Helper class for testing Google Test's multi-threading constructs. +// To use it, write: +// +// void ThreadFunc(int param) { /* Do things with param */ } +// Notification thread_can_start; +// ... +// // The thread_can_start parameter is optional; you can supply NULL. +// ThreadWithParam thread(&ThreadFunc, 5, &thread_can_start); +// thread_can_start.Notify(); +// +// These classes are only for testing Google Test's own constructs. Do +// not use them in user tests, either directly or indirectly. +template +class ThreadWithParam : public ThreadWithParamBase { + public: + typedef void (*UserThreadFunc)(T); + + ThreadWithParam( + UserThreadFunc func, T param, Notification* thread_can_start) + : func_(func), + param_(param), + thread_can_start_(thread_can_start), + finished_(false) { + ThreadWithParamBase* const base = this; + // The thread can be created only after all fields except thread_ + // have been initialized. 
+ GTEST_CHECK_POSIX_SUCCESS_( + pthread_create(&thread_, 0, &ThreadFuncWithCLinkage, base)); + } + ~ThreadWithParam() { Join(); } + + void Join() { + if (!finished_) { + GTEST_CHECK_POSIX_SUCCESS_(pthread_join(thread_, 0)); + finished_ = true; + } + } + + virtual void Run() { + if (thread_can_start_ != NULL) + thread_can_start_->WaitForNotification(); + func_(param_); + } + + private: + const UserThreadFunc func_; // User-supplied thread function. + const T param_; // User-supplied parameter to the thread function. + // When non-NULL, used to block execution until the controller thread + // notifies. + Notification* const thread_can_start_; + bool finished_; // true iff we know that the thread function has finished. + pthread_t thread_; // The native thread object. + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadWithParam); +}; + +// MutexBase and Mutex implement mutex on pthreads-based platforms. They +// are used in conjunction with class MutexLock: +// +// Mutex mutex; +// ... +// MutexLock lock(&mutex); // Acquires the mutex and releases it at the end +// // of the current scope. +// +// MutexBase implements behavior for both statically and dynamically +// allocated mutexes. Do not use MutexBase directly. Instead, write +// the following to define a static mutex: +// +// GTEST_DEFINE_STATIC_MUTEX_(g_some_mutex); +// +// You can forward declare a static mutex like this: +// +// GTEST_DECLARE_STATIC_MUTEX_(g_some_mutex); +// +// To create a dynamic mutex, just define an object of type Mutex. +class MutexBase { + public: + // Acquires this mutex. + void Lock() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_lock(&mutex_)); + owner_ = pthread_self(); + } + + // Releases this mutex. + void Unlock() { + // We don't protect writing to owner_ here, as it's the caller's + // responsibility to ensure that the current thread holds the + // mutex when this is called. 
+ owner_ = 0; + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_unlock(&mutex_)); + } + + // Does nothing if the current thread holds the mutex. Otherwise, crashes + // with high probability. + void AssertHeld() const { + GTEST_CHECK_(owner_ == pthread_self()) + << "The current thread is not holding the mutex @" << this; + } + + // A static mutex may be used before main() is entered. It may even + // be used before the dynamic initialization stage. Therefore we + // must be able to initialize a static mutex object at link time. + // This means MutexBase has to be a POD and its member variables + // have to be public. + public: + pthread_mutex_t mutex_; // The underlying pthread mutex. + pthread_t owner_; // The thread holding the mutex; 0 means no one holds it. +}; + +// Forward-declares a static mutex. +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::MutexBase mutex + +// Defines and statically (i.e. at link time) initializes a static mutex. +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) \ + ::testing::internal::MutexBase mutex = { PTHREAD_MUTEX_INITIALIZER, 0 } + +// The Mutex class can only be used for mutexes created at runtime. It +// shares its API with MutexBase otherwise. +class Mutex : public MutexBase { + public: + Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_init(&mutex_, NULL)); + owner_ = 0; + } + ~Mutex() { + GTEST_CHECK_POSIX_SUCCESS_(pthread_mutex_destroy(&mutex_)); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(Mutex); +}; + +// We cannot name this class MutexLock as the ctor declaration would +// conflict with a macro named MutexLock, which is defined on some +// platforms. Hence the typedef trick below. 
+class GTestMutexLock { + public: + explicit GTestMutexLock(MutexBase* mutex) + : mutex_(mutex) { mutex_->Lock(); } + + ~GTestMutexLock() { mutex_->Unlock(); } + + private: + MutexBase* const mutex_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(GTestMutexLock); +}; + +typedef GTestMutexLock MutexLock; + +// Helpers for ThreadLocal. + +// pthread_key_create() requires DeleteThreadLocalValue() to have +// C-linkage. Therefore it cannot be templatized to access +// ThreadLocal. Hence the need for class +// ThreadLocalValueHolderBase. +class ThreadLocalValueHolderBase { + public: + virtual ~ThreadLocalValueHolderBase() {} +}; + +// Called by pthread to delete thread-local data stored by +// pthread_setspecific(). +extern "C" inline void DeleteThreadLocalValue(void* value_holder) { + delete static_cast(value_holder); +} + +// Implements thread-local storage on pthreads-based systems. +// +// // Thread 1 +// ThreadLocal tl(100); // 100 is the default value for each thread. +// +// // Thread 2 +// tl.set(150); // Changes the value for thread 2 only. +// EXPECT_EQ(150, tl.get()); +// +// // Thread 1 +// EXPECT_EQ(100, tl.get()); // In thread 1, tl has the original value. +// tl.set(200); +// EXPECT_EQ(200, tl.get()); +// +// The template type argument T must have a public copy constructor. +// In addition, the default ThreadLocal constructor requires T to have +// a public default constructor. +// +// An object managed for a thread by a ThreadLocal instance is deleted +// when the thread exits. Or, if the ThreadLocal instance dies in +// that thread, when the ThreadLocal dies. It's the user's +// responsibility to ensure that all other threads using a ThreadLocal +// have exited when it dies, or the per-thread objects for those +// threads will not be deleted. +// +// Google Test only uses global ThreadLocal objects. That means they +// will die after main() has returned. 
Therefore, no per-thread +// object managed by Google Test will be leaked as long as all threads +// using Google Test have exited when main() returns. +template +class ThreadLocal { + public: + ThreadLocal() : key_(CreateKey()), + default_() {} + explicit ThreadLocal(const T& value) : key_(CreateKey()), + default_(value) {} + + ~ThreadLocal() { + // Destroys the managed object for the current thread, if any. + DeleteThreadLocalValue(pthread_getspecific(key_)); + + // Releases resources associated with the key. This will *not* + // delete managed objects for other threads. + GTEST_CHECK_POSIX_SUCCESS_(pthread_key_delete(key_)); + } + + T* pointer() { return GetOrCreateValue(); } + const T* pointer() const { return GetOrCreateValue(); } + const T& get() const { return *pointer(); } + void set(const T& value) { *pointer() = value; } + + private: + // Holds a value of type T. + class ValueHolder : public ThreadLocalValueHolderBase { + public: + explicit ValueHolder(const T& value) : value_(value) {} + + T* pointer() { return &value_; } + + private: + T value_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ValueHolder); + }; + + static pthread_key_t CreateKey() { + pthread_key_t key; + // When a thread exits, DeleteThreadLocalValue() will be called on + // the object managed for that thread. + GTEST_CHECK_POSIX_SUCCESS_( + pthread_key_create(&key, &DeleteThreadLocalValue)); + return key; + } + + T* GetOrCreateValue() const { + ThreadLocalValueHolderBase* const holder = + static_cast(pthread_getspecific(key_)); + if (holder != NULL) { + return CheckedDowncastToActualType(holder)->pointer(); + } + + ValueHolder* const new_holder = new ValueHolder(default_); + ThreadLocalValueHolderBase* const holder_base = new_holder; + GTEST_CHECK_POSIX_SUCCESS_(pthread_setspecific(key_, holder_base)); + return new_holder->pointer(); + } + + // A key pthreads uses for looking up per-thread values. + const pthread_key_t key_; + const T default_; // The default value for each thread. 
+ + GTEST_DISALLOW_COPY_AND_ASSIGN_(ThreadLocal); +}; + +# define GTEST_IS_THREADSAFE 1 + +#else // GTEST_HAS_PTHREAD + +// A dummy implementation of synchronization primitives (mutex, lock, +// and thread-local variable). Necessary for compiling Google Test where +// mutex is not supported - using Google Test in multiple threads is not +// supported on such platforms. + +class Mutex { + public: + Mutex() {} + void AssertHeld() const {} +}; + +# define GTEST_DECLARE_STATIC_MUTEX_(mutex) \ + extern ::testing::internal::Mutex mutex + +# define GTEST_DEFINE_STATIC_MUTEX_(mutex) ::testing::internal::Mutex mutex + +class GTestMutexLock { + public: + explicit GTestMutexLock(Mutex*) {} // NOLINT +}; + +typedef GTestMutexLock MutexLock; + +template +class ThreadLocal { + public: + ThreadLocal() : value_() {} + explicit ThreadLocal(const T& value) : value_(value) {} + T* pointer() { return &value_; } + const T* pointer() const { return &value_; } + const T& get() const { return value_; } + void set(const T& value) { value_ = value; } + private: + T value_; +}; + +// The above synchronization primitives have dummy implementations. +// Therefore Google Test is not thread-safe. +# define GTEST_IS_THREADSAFE 0 + +#endif // GTEST_HAS_PTHREAD + +// Returns the number of threads running in the process, or 0 to indicate that +// we cannot detect it. +GTEST_API_ size_t GetThreadCount(); + +// Passing non-POD classes through ellipsis (...) crashes the ARM +// compiler and generates a warning in Sun Studio. The Nokia Symbian +// and the IBM XL C/C++ compiler try to instantiate a copy constructor +// for objects passed through ellipsis (...), failing for uncopyable +// objects. We define this to ensure that only POD is passed through +// ellipsis on these systems. +#if defined(__SYMBIAN32__) || defined(__IBMCPP__) || defined(__SUNPRO_CC) +// We lose support for NULL detection where the compiler doesn't like +// passing non-POD classes through ellipsis (...). 
+# define GTEST_ELLIPSIS_NEEDS_POD_ 1 +#else +# define GTEST_CAN_COMPARE_NULL 1 +#endif + +// The Nokia Symbian and IBM XL C/C++ compilers cannot decide between +// const T& and const T* in a function template. These compilers +// _can_ decide between class template specializations for T and T*, +// so a tr1::type_traits-like is_pointer works. +#if defined(__SYMBIAN32__) || defined(__IBMCPP__) +# define GTEST_NEEDS_IS_POINTER_ 1 +#endif + +template +struct bool_constant { + typedef bool_constant type; + static const bool value = bool_value; +}; +template const bool bool_constant::value; + +typedef bool_constant false_type; +typedef bool_constant true_type; + +template +struct is_pointer : public false_type {}; + +template +struct is_pointer : public true_type {}; + +template +struct IteratorTraits { + typedef typename Iterator::value_type value_type; +}; + +template +struct IteratorTraits { + typedef T value_type; +}; + +template +struct IteratorTraits { + typedef T value_type; +}; + +#if GTEST_OS_WINDOWS +# define GTEST_PATH_SEP_ "\\" +# define GTEST_HAS_ALT_PATH_SEP_ 1 +// The biggest signed integer type the compiler supports. +typedef __int64 BiggestInt; +#else +# define GTEST_PATH_SEP_ "/" +# define GTEST_HAS_ALT_PATH_SEP_ 0 +typedef long long BiggestInt; // NOLINT +#endif // GTEST_OS_WINDOWS + +// Utilities for char. + +// isspace(int ch) and friends accept an unsigned char or EOF. char +// may be signed, depending on the compiler (or compiler flags). +// Therefore we need to cast a char to unsigned char before calling +// isspace(), etc. 
+ +inline bool IsAlpha(char ch) { + return isalpha(static_cast(ch)) != 0; +} +inline bool IsAlNum(char ch) { + return isalnum(static_cast(ch)) != 0; +} +inline bool IsDigit(char ch) { + return isdigit(static_cast(ch)) != 0; +} +inline bool IsLower(char ch) { + return islower(static_cast(ch)) != 0; +} +inline bool IsSpace(char ch) { + return isspace(static_cast(ch)) != 0; +} +inline bool IsUpper(char ch) { + return isupper(static_cast(ch)) != 0; +} +inline bool IsXDigit(char ch) { + return isxdigit(static_cast(ch)) != 0; +} + +inline char ToLower(char ch) { + return static_cast(tolower(static_cast(ch))); +} +inline char ToUpper(char ch) { + return static_cast(toupper(static_cast(ch))); +} + +// The testing::internal::posix namespace holds wrappers for common +// POSIX functions. These wrappers hide the differences between +// Windows/MSVC and POSIX systems. Since some compilers define these +// standard functions as macros, the wrapper cannot have the same name +// as the wrapped function. + +namespace posix { + +// Functions with a different name on Windows. + +#if GTEST_OS_WINDOWS + +typedef struct _stat StatStruct; + +# ifdef __BORLANDC__ +inline int IsATTY(int fd) { return isatty(fd); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +# else // !__BORLANDC__ +# if GTEST_OS_WINDOWS_MOBILE +inline int IsATTY(int /* fd */) { return 0; } +# else +inline int IsATTY(int fd) { return _isatty(fd); } +# endif // GTEST_OS_WINDOWS_MOBILE +inline int StrCaseCmp(const char* s1, const char* s2) { + return _stricmp(s1, s2); +} +inline char* StrDup(const char* src) { return _strdup(src); } +# endif // __BORLANDC__ + +# if GTEST_OS_WINDOWS_MOBILE +inline int FileNo(FILE* file) { return reinterpret_cast(_fileno(file)); } +// Stat(), RmDir(), and IsDir() are not needed on Windows CE at this +// time and thus not defined there. 
+# else +inline int FileNo(FILE* file) { return _fileno(file); } +inline int Stat(const char* path, StatStruct* buf) { return _stat(path, buf); } +inline int RmDir(const char* dir) { return _rmdir(dir); } +inline bool IsDir(const StatStruct& st) { + return (_S_IFDIR & st.st_mode) != 0; +} +# endif // GTEST_OS_WINDOWS_MOBILE + +#else + +typedef struct stat StatStruct; + +inline int FileNo(FILE* file) { return fileno(file); } +inline int IsATTY(int fd) { return isatty(fd); } +inline int Stat(const char* path, StatStruct* buf) { return stat(path, buf); } +inline int StrCaseCmp(const char* s1, const char* s2) { + return strcasecmp(s1, s2); +} +inline char* StrDup(const char* src) { return strdup(src); } +inline int RmDir(const char* dir) { return rmdir(dir); } +inline bool IsDir(const StatStruct& st) { return S_ISDIR(st.st_mode); } + +#endif // GTEST_OS_WINDOWS + +// Functions deprecated by MSVC 8.0. + +#ifdef _MSC_VER +// Temporarily disable warning 4996 (deprecated function). +# pragma warning(push) +# pragma warning(disable:4996) +#endif + +inline const char* StrNCpy(char* dest, const char* src, size_t n) { + return strncpy(dest, src, n); +} + +// ChDir(), FReopen(), FDOpen(), Read(), Write(), Close(), and +// StrError() aren't needed on Windows CE at this time and thus not +// defined there. 
+ +#if !GTEST_OS_WINDOWS_MOBILE +inline int ChDir(const char* dir) { return chdir(dir); } +#endif +inline FILE* FOpen(const char* path, const char* mode) { + return fopen(path, mode); +} +#if !GTEST_OS_WINDOWS_MOBILE +inline FILE *FReopen(const char* path, const char* mode, FILE* stream) { + return freopen(path, mode, stream); +} +inline FILE* FDOpen(int fd, const char* mode) { return fdopen(fd, mode); } +#endif +inline int FClose(FILE* fp) { return fclose(fp); } +#if !GTEST_OS_WINDOWS_MOBILE +inline int Read(int fd, void* buf, unsigned int count) { + return static_cast(read(fd, buf, count)); +} +inline int Write(int fd, const void* buf, unsigned int count) { + return static_cast(write(fd, buf, count)); +} +inline int Close(int fd) { return close(fd); } +inline const char* StrError(int errnum) { return strerror(errnum); } +#endif +inline const char* GetEnv(const char* name) { +#if GTEST_OS_WINDOWS_MOBILE + // We are on Windows CE, which has no environment variables. + return NULL; +#elif defined(__BORLANDC__) || defined(__SunOS_5_8) || defined(__SunOS_5_9) + // Environment variables which we programmatically clear will be set to the + // empty string rather than unset (NULL). Handle that case. + const char* const env = getenv(name); + return (env != NULL && env[0] != '\0') ? env : NULL; +#else + return getenv(name); +#endif +} + +#ifdef _MSC_VER +# pragma warning(pop) // Restores the warning state. +#endif + +#if GTEST_OS_WINDOWS_MOBILE +// Windows CE has no C library. The abort() function is used in +// several places in Google Test. This implementation provides a reasonable +// imitation of standard behaviour. +void Abort(); +#else +inline void Abort() { abort(); } +#endif // GTEST_OS_WINDOWS_MOBILE + +} // namespace posix + +// The maximum number a BiggestInt can represent. This definition +// works no matter BiggestInt is represented in one's complement or +// two's complement. 
+// +// We cannot rely on numeric_limits in STL, as __int64 and long long +// are not part of standard C++ and numeric_limits doesn't need to be +// defined for them. +const BiggestInt kMaxBiggestInt = + ~(static_cast(1) << (8*sizeof(BiggestInt) - 1)); + +// This template class serves as a compile-time function from size to +// type. It maps a size in bytes to a primitive type with that +// size. e.g. +// +// TypeWithSize<4>::UInt +// +// is typedef-ed to be unsigned int (unsigned integer made up of 4 +// bytes). +// +// Such functionality should belong to STL, but I cannot find it +// there. +// +// Google Test uses this class in the implementation of floating-point +// comparison. +// +// For now it only handles UInt (unsigned int) as that's all Google Test +// needs. Other types can be easily added in the future if need +// arises. +template +class TypeWithSize { + public: + // This prevents the user from using TypeWithSize with incorrect + // values of N. + typedef void UInt; +}; + +// The specialization for size 4. +template <> +class TypeWithSize<4> { + public: + // unsigned int has size 4 in both gcc and MSVC. + // + // As base/basictypes.h doesn't compile on Windows, we cannot use + // uint32, uint64, and etc here. + typedef int Int; + typedef unsigned int UInt; +}; + +// The specialization for size 8. +template <> +class TypeWithSize<8> { + public: + +#if GTEST_OS_WINDOWS + typedef __int64 Int; + typedef unsigned __int64 UInt; +#else + typedef long long Int; // NOLINT + typedef unsigned long long UInt; // NOLINT +#endif // GTEST_OS_WINDOWS +}; + +// Integer types of known sizes. +typedef TypeWithSize<4>::Int Int32; +typedef TypeWithSize<4>::UInt UInt32; +typedef TypeWithSize<8>::Int Int64; +typedef TypeWithSize<8>::UInt UInt64; +typedef TypeWithSize<8>::Int TimeInMillis; // Represents time in milliseconds. + +// Utilities for command line flags and environment variables. + +// Macro for referencing flags. 
+#define GTEST_FLAG(name) FLAGS_gtest_##name + +// Macros for declaring flags. +#define GTEST_DECLARE_bool_(name) GTEST_API_ extern bool GTEST_FLAG(name) +#define GTEST_DECLARE_int32_(name) \ + GTEST_API_ extern ::testing::internal::Int32 GTEST_FLAG(name) +#define GTEST_DECLARE_string_(name) \ + GTEST_API_ extern ::testing::internal::String GTEST_FLAG(name) + +// Macros for defining flags. +#define GTEST_DEFINE_bool_(name, default_val, doc) \ + GTEST_API_ bool GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_int32_(name, default_val, doc) \ + GTEST_API_ ::testing::internal::Int32 GTEST_FLAG(name) = (default_val) +#define GTEST_DEFINE_string_(name, default_val, doc) \ + GTEST_API_ ::testing::internal::String GTEST_FLAG(name) = (default_val) + +// Parses 'str' for a 32-bit signed integer. If successful, writes the result +// to *value and returns true; otherwise leaves *value unchanged and returns +// false. +// TODO(chandlerc): Find a better way to refactor flag and environment parsing +// out of both gtest-port.cc and gtest.cc to avoid exporting this utility +// function. +bool ParseInt32(const Message& src_text, const char* str, Int32* value); + +// Parses a bool/Int32/string from the environment variable +// corresponding to the given Google Test flag. +bool BoolFromGTestEnv(const char* flag, bool default_val); +GTEST_API_ Int32 Int32FromGTestEnv(const char* flag, Int32 default_val); +const char* StringFromGTestEnv(const char* flag, const char* default_val); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PORT_H_ + +#if GTEST_OS_LINUX +# include +# include +# include +# include +#endif // GTEST_OS_LINUX + +#include +#include +#include +#include +#include + +// Copyright 2005, Google Inc. +// All rights reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file declares the String class and functions used internally by +// Google Test. They are subject to change without notice. They should not used +// by code external to Google Test. +// +// This header file is #included by . +// It should not be #included by other files. 
+ +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ + +#ifdef __BORLANDC__ +// string.h is not guaranteed to provide strcpy on C++ Builder. +# include +#endif + +#include + +#include + +namespace testing { +namespace internal { + +// String - a UTF-8 string class. +// +// For historic reasons, we don't use std::string. +// +// TODO(wan@google.com): replace this class with std::string or +// implement it in terms of the latter. +// +// Note that String can represent both NULL and the empty string, +// while std::string cannot represent NULL. +// +// NULL and the empty string are considered different. NULL is less +// than anything (including the empty string) except itself. +// +// This class only provides minimum functionality necessary for +// implementing Google Test. We do not intend to implement a full-fledged +// string class here. +// +// Since the purpose of this class is to provide a substitute for +// std::string on platforms where it cannot be used, we define a copy +// constructor and assignment operators such that we don't need +// conditional compilation in a lot of places. +// +// In order to make the representation efficient, the d'tor of String +// is not virtual. Therefore DO NOT INHERIT FROM String. +class GTEST_API_ String { + public: + // Static utility methods + + // Returns the input enclosed in double quotes if it's not NULL; + // otherwise returns "(null)". For example, "\"Hello\"" is returned + // for input "Hello". + // + // This is useful for printing a C string in the syntax of a literal. + // + // Known issue: escape sequences are not handled yet. + static String ShowCStringQuoted(const char* c_str); + + // Clones a 0-terminated C string, allocating memory using new. The + // caller is responsible for deleting the return value using + // delete[]. Returns the cloned string, or NULL if the input is + // NULL. 
+ // + // This is different from strdup() in string.h, which allocates + // memory using malloc(). + static const char* CloneCString(const char* c_str); + +#if GTEST_OS_WINDOWS_MOBILE + // Windows CE does not have the 'ANSI' versions of Win32 APIs. To be + // able to pass strings to Win32 APIs on CE we need to convert them + // to 'Unicode', UTF-16. + + // Creates a UTF-16 wide string from the given ANSI string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the wide string, or NULL if the + // input is NULL. + // + // The wide string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static LPCWSTR AnsiToUtf16(const char* c_str); + + // Creates an ANSI string from the given wide string, allocating + // memory using new. The caller is responsible for deleting the return + // value using delete[]. Returns the ANSI string, or NULL if the + // input is NULL. + // + // The returned string is created using the ANSI codepage (CP_ACP) to + // match the behaviour of the ANSI versions of Win32 calls and the + // C runtime. + static const char* Utf16ToAnsi(LPCWSTR utf16_str); +#endif + + // Compares two C strings. Returns true iff they have the same content. + // + // Unlike strcmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CStringEquals(const char* lhs, const char* rhs); + + // Converts a wide C string to a String using the UTF-8 encoding. + // NULL will be converted to "(null)". If an error occurred during + // the conversion, "(failed to convert from wide string)" is + // returned. + static String ShowWideCString(const wchar_t* wide_c_str); + + // Similar to ShowWideCString(), except that this function encloses + // the converted string in double quotes. 
+ static String ShowWideCStringQuoted(const wchar_t* wide_c_str); + + // Compares two wide C strings. Returns true iff they have the same + // content. + // + // Unlike wcscmp(), this function can handle NULL argument(s). A + // NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool WideCStringEquals(const wchar_t* lhs, const wchar_t* rhs); + + // Compares two C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike strcasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL C string, + // including the empty string. + static bool CaseInsensitiveCStringEquals(const char* lhs, + const char* rhs); + + // Compares two wide C strings, ignoring case. Returns true iff they + // have the same content. + // + // Unlike wcscasecmp(), this function can handle NULL argument(s). + // A NULL C string is considered different to any non-NULL wide C string, + // including the empty string. + // NB: The implementations on different platforms slightly differ. + // On windows, this method uses _wcsicmp which compares according to LC_CTYPE + // environment variable. On GNU platform this method uses wcscasecmp + // which compares according to LC_CTYPE category of the current locale. + // On MacOS X, it uses towlower, which also uses LC_CTYPE category of the + // current locale. + static bool CaseInsensitiveWideCStringEquals(const wchar_t* lhs, + const wchar_t* rhs); + + // Formats a list of arguments to a String, using the same format + // spec string as for printf. + // + // We do not use the StringPrintf class as it is not universally + // available. + // + // The result is limited to 4096 characters (including the tailing + // 0). If 4096 characters are not enough to format the input, + // "" is returned. + static String Format(const char* format, ...); + + // C'tors + + // The default c'tor constructs a NULL string. 
+ String() : c_str_(NULL), length_(0) {} + + // Constructs a String by cloning a 0-terminated C string. + String(const char* a_c_str) { // NOLINT + if (a_c_str == NULL) { + c_str_ = NULL; + length_ = 0; + } else { + ConstructNonNull(a_c_str, strlen(a_c_str)); + } + } + + // Constructs a String by copying a given number of chars from a + // buffer. E.g. String("hello", 3) creates the string "hel", + // String("a\0bcd", 4) creates "a\0bc", String(NULL, 0) creates "", + // and String(NULL, 1) results in access violation. + String(const char* buffer, size_t a_length) { + ConstructNonNull(buffer, a_length); + } + + // The copy c'tor creates a new copy of the string. The two + // String objects do not share content. + String(const String& str) : c_str_(NULL), length_(0) { *this = str; } + + // D'tor. String is intended to be a final class, so the d'tor + // doesn't need to be virtual. + ~String() { delete[] c_str_; } + + // Allows a String to be implicitly converted to an ::std::string or + // ::string, and vice versa. Converting a String containing a NULL + // pointer to ::std::string or ::string is undefined behavior. + // Converting a ::std::string or ::string containing an embedded NUL + // character to a String will result in the prefix up to the first + // NUL character. + String(const ::std::string& str) { + ConstructNonNull(str.c_str(), str.length()); + } + + operator ::std::string() const { return ::std::string(c_str(), length()); } + +#if GTEST_HAS_GLOBAL_STRING + String(const ::string& str) { + ConstructNonNull(str.c_str(), str.length()); + } + + operator ::string() const { return ::string(c_str(), length()); } +#endif // GTEST_HAS_GLOBAL_STRING + + // Returns true iff this is an empty string (i.e. ""). + bool empty() const { return (c_str() != NULL) && (length() == 0); } + + // Compares this with another String. + // Returns < 0 if this is less than rhs, 0 if this is equal to rhs, or > 0 + // if this is greater than rhs. 
+ int Compare(const String& rhs) const; + + // Returns true iff this String equals the given C string. A NULL + // string and a non-NULL string are considered not equal. + bool operator==(const char* a_c_str) const { return Compare(a_c_str) == 0; } + + // Returns true iff this String is less than the given String. A + // NULL string is considered less than "". + bool operator<(const String& rhs) const { return Compare(rhs) < 0; } + + // Returns true iff this String doesn't equal the given C string. A NULL + // string and a non-NULL string are considered not equal. + bool operator!=(const char* a_c_str) const { return !(*this == a_c_str); } + + // Returns true iff this String ends with the given suffix. *Any* + // String is considered to end with a NULL or empty suffix. + bool EndsWith(const char* suffix) const; + + // Returns true iff this String ends with the given suffix, not considering + // case. Any String is considered to end with a NULL or empty suffix. + bool EndsWithCaseInsensitive(const char* suffix) const; + + // Returns the length of the encapsulated string, or 0 if the + // string is NULL. + size_t length() const { return length_; } + + // Gets the 0-terminated C string this String object represents. + // The String object still owns the string. Therefore the caller + // should NOT delete the return value. + const char* c_str() const { return c_str_; } + + // Assigns a C string to this object. Self-assignment works. + const String& operator=(const char* a_c_str) { + return *this = String(a_c_str); + } + + // Assigns a String object to this object. Self-assignment works. + const String& operator=(const String& rhs) { + if (this != &rhs) { + delete[] c_str_; + if (rhs.c_str() == NULL) { + c_str_ = NULL; + length_ = 0; + } else { + ConstructNonNull(rhs.c_str(), rhs.length()); + } + } + + return *this; + } + + private: + // Constructs a non-NULL String from the given content. This + // function can only be called when c_str_ has not been allocated. 
+ // ConstructNonNull(NULL, 0) results in an empty string (""). + // ConstructNonNull(NULL, non_zero) is undefined behavior. + void ConstructNonNull(const char* buffer, size_t a_length) { + char* const str = new char[a_length + 1]; + memcpy(str, buffer, a_length); + str[a_length] = '\0'; + c_str_ = str; + length_ = a_length; + } + + const char* c_str_; + size_t length_; +}; // class String + +// Streams a String to an ostream. Each '\0' character in the String +// is replaced with "\\0". +inline ::std::ostream& operator<<(::std::ostream& os, const String& str) { + if (str.c_str() == NULL) { + os << "(null)"; + } else { + const char* const c_str = str.c_str(); + for (size_t i = 0; i != str.length(); i++) { + if (c_str[i] == '\0') { + os << "\\0"; + } else { + os << c_str[i]; + } + } + } + return os; +} + +// Gets the content of the stringstream's buffer as a String. Each '\0' +// character in the buffer is replaced with "\\0". +GTEST_API_ String StringStreamToString(::std::stringstream* stream); + +// Converts a streamable value to a String. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". + +// Declared here but defined in gtest.h, so that it has access +// to the definition of the Message class, required by the ARM +// compiler. +template +String StreamableToString(const T& streamable); + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_STRING_H_ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: keith.ray@gmail.com (Keith Ray) +// +// Google Test filepath utilities +// +// This header file declares classes and functions used internally by +// Google Test. They are subject to change without notice. +// +// This file is #included in . +// Do not include this header file separately! + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ + + +namespace testing { +namespace internal { + +// FilePath - a class for file and directory pathname manipulation which +// handles platform-specific conventions (like the pathname separator). +// Used for helper functions for naming files in a directory for xml output. 
+// Except for Set methods, all methods are const or static, which provides an +// "immutable value object" -- useful for peace of mind. +// A FilePath with a value ending in a path separator ("like/this/") represents +// a directory, otherwise it is assumed to represent a file. In either case, +// it may or may not represent an actual file or directory in the file system. +// Names are NOT checked for syntax correctness -- no checking for illegal +// characters, malformed paths, etc. + +class GTEST_API_ FilePath { + public: + FilePath() : pathname_("") { } + FilePath(const FilePath& rhs) : pathname_(rhs.pathname_) { } + + explicit FilePath(const char* pathname) : pathname_(pathname) { + Normalize(); + } + + explicit FilePath(const String& pathname) : pathname_(pathname) { + Normalize(); + } + + FilePath& operator=(const FilePath& rhs) { + Set(rhs); + return *this; + } + + void Set(const FilePath& rhs) { + pathname_ = rhs.pathname_; + } + + String ToString() const { return pathname_; } + const char* c_str() const { return pathname_.c_str(); } + + // Returns the current working directory, or "" if unsuccessful. + static FilePath GetCurrentDir(); + + // Given directory = "dir", base_name = "test", number = 0, + // extension = "xml", returns "dir/test.xml". If number is greater + // than zero (e.g., 12), returns "dir/test_12.xml". + // On Windows platform, uses \ as the separator rather than /. + static FilePath MakeFileName(const FilePath& directory, + const FilePath& base_name, + int number, + const char* extension); + + // Given directory = "dir", relative_path = "test.xml", + // returns "dir/test.xml". + // On Windows, uses \ as the separator rather than /. + static FilePath ConcatPaths(const FilePath& directory, + const FilePath& relative_path); + + // Returns a pathname for a file that does not currently exist. The pathname + // will be directory/base_name.extension or + // directory/base_name_.extension if directory/base_name.extension + // already exists. 
The number will be incremented until a pathname is found + // that does not already exist. + // Examples: 'dir/foo_test.xml' or 'dir/foo_test_1.xml'. + // There could be a race condition if two or more processes are calling this + // function at the same time -- they could both pick the same filename. + static FilePath GenerateUniqueFileName(const FilePath& directory, + const FilePath& base_name, + const char* extension); + + // Returns true iff the path is NULL or "". + bool IsEmpty() const { return c_str() == NULL || *c_str() == '\0'; } + + // If input name has a trailing separator character, removes it and returns + // the name, otherwise return the name string unmodified. + // On Windows platform, uses \ as the separator, other platforms use /. + FilePath RemoveTrailingPathSeparator() const; + + // Returns a copy of the FilePath with the directory part removed. + // Example: FilePath("path/to/file").RemoveDirectoryName() returns + // FilePath("file"). If there is no directory part ("just_a_file"), it returns + // the FilePath unmodified. If there is no file part ("just_a_dir/") it + // returns an empty FilePath (""). + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveDirectoryName() const; + + // RemoveFileName returns the directory path with the filename removed. + // Example: FilePath("path/to/file").RemoveFileName() returns "path/to/". + // If the FilePath is "a_file" or "/a_file", RemoveFileName returns + // FilePath("./") or, on Windows, FilePath(".\\"). If the filepath does + // not have a file, like "just/a/dir/", it returns the FilePath unmodified. + // On Windows platform, '\' is the path separator, otherwise it is '/'. + FilePath RemoveFileName() const; + + // Returns a copy of the FilePath with the case-insensitive extension removed. + // Example: FilePath("dir/file.exe").RemoveExtension("EXE") returns + // FilePath("dir/file"). 
If a case-insensitive extension is not + // found, returns a copy of the original FilePath. + FilePath RemoveExtension(const char* extension) const; + + // Creates directories so that path exists. Returns true if successful or if + // the directories already exist; returns false if unable to create + // directories for any reason. Will also return false if the FilePath does + // not represent a directory (that is, it doesn't end with a path separator). + bool CreateDirectoriesRecursively() const; + + // Create the directory so that path exists. Returns true if successful or + // if the directory already exists; returns false if unable to create the + // directory for any reason, including if the parent directory does not + // exist. Not named "CreateDirectory" because that's a macro on Windows. + bool CreateFolder() const; + + // Returns true if FilePath describes something in the file-system, + // either a file, directory, or whatever, and that something exists. + bool FileOrDirectoryExists() const; + + // Returns true if pathname describes a directory in the file-system + // that exists. + bool DirectoryExists() const; + + // Returns true if FilePath ends with a path separator, which indicates that + // it is intended to represent a directory. Returns false otherwise. + // This does NOT check that a directory (or file) actually exists. + bool IsDirectory() const; + + // Returns true if pathname describes a root directory. (Windows has one + // root directory per disk drive.) + bool IsRootDirectory() const; + + // Returns true if pathname describes an absolute path. + bool IsAbsolutePath() const; + + private: + // Replaces multiple consecutive separators with a single separator. + // For example, "bar///foo" becomes "bar/foo". Does not eliminate other + // redundancies that might be in a pathname involving "." or "..". 
+ // + // A pathname with multiple consecutive separators may occur either through + // user error or as a result of some scripts or APIs that generate a pathname + // with a trailing separator. On other platforms the same API or script + // may NOT generate a pathname with a trailing "/". Then elsewhere that + // pathname may have another "/" and pathname components added to it, + // without checking for the separator already being there. + // The script language and operating system may allow paths like "foo//bar" + // but some of the functions in FilePath will not handle that correctly. In + // particular, RemoveTrailingPathSeparator() only removes one separator, and + // it is called in CreateDirectoriesRecursively() assuming that it will change + // a pathname from directory syntax (trailing separator) to filename syntax. + // + // On Windows this method also replaces the alternate path separator '/' with + // the primary path separator '\\', so that for example "bar\\/\\foo" becomes + // "bar\\foo". + + void Normalize(); + + // Returns a pointer to the last occurence of a valid path separator in + // the FilePath. On Windows, for example, both '/' and '\' are valid path + // separators. Returns NULL if no path separator was found. + const char* FindLastPathSeparator() const; + + String pathname_; +}; // class FilePath + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_FILEPATH_H_ +// This file was GENERATED by command: +// pump.py gtest-type-util.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Type utilities needed for implementing typed and type-parameterized +// tests. This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently we support at most 50 types in a list, and at most 50 +// type-parameterized tests in one type-parameterized test case. +// Please contact googletestframework@googlegroups.com if you need +// more. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + + +// #ifdef __GNUC__ is too general here. It is possible to use gcc without using +// libstdc++ (which is where cxxabi.h comes from). 
+# ifdef __GLIBCXX__ +# include +# elif defined(__HP_aCC) +# include +# endif // __GLIBCXX__ + +namespace testing { +namespace internal { + +// GetTypeName() returns a human-readable name of type T. +// NB: This function is also used in Google Mock, so don't move it inside of +// the typed-test-only section below. +template +String GetTypeName() { +# if GTEST_HAS_RTTI + + const char* const name = typeid(T).name(); +# if defined(__GLIBCXX__) || defined(__HP_aCC) + int status = 0; + // gcc's implementation of typeid(T).name() mangles the type name, + // so we have to demangle it. +# ifdef __GLIBCXX__ + using abi::__cxa_demangle; +# endif // __GLIBCXX__ + char* const readable_name = __cxa_demangle(name, 0, 0, &status); + const String name_str(status == 0 ? readable_name : name); + free(readable_name); + return name_str; +# else + return name; +# endif // __GLIBCXX__ || __HP_aCC + +# else + + return ""; + +# endif // GTEST_HAS_RTTI +} + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// AssertyTypeEq::type is defined iff T1 and T2 are the same +// type. This can be used as a compile-time assertion to ensure that +// two types are equal. + +template +struct AssertTypeEq; + +template +struct AssertTypeEq { + typedef bool type; +}; + +// A unique type used as the default value for the arguments of class +// template Types. This allows us to simulate variadic templates +// (e.g. Types, Type, and etc), which C++ doesn't +// support directly. +struct None {}; + +// The following family of struct and struct templates are used to +// represent type lists. In particular, TypesN +// represents a type list with N types (T1, T2, ..., and TN) in it. +// Except for Types0, every struct in the family has two member types: +// Head for the first type in the list, and Tail for the rest of the +// list. + +// The empty type list. +struct Types0 {}; + +// Type lists of length 1, 2, 3, and so on. 
+ +template +struct Types1 { + typedef T1 Head; + typedef Types0 Tail; +}; +template +struct Types2 { + typedef T1 Head; + typedef Types1 Tail; +}; + +template +struct Types3 { + typedef T1 Head; + typedef Types2 Tail; +}; + +template +struct Types4 { + typedef T1 Head; + typedef Types3 Tail; +}; + +template +struct Types5 { + typedef T1 Head; + typedef Types4 Tail; +}; + +template +struct Types6 { + typedef T1 Head; + typedef Types5 Tail; +}; + +template +struct Types7 { + typedef T1 Head; + typedef Types6 Tail; +}; + +template +struct Types8 { + typedef T1 Head; + typedef Types7 Tail; +}; + +template +struct Types9 { + typedef T1 Head; + typedef Types8 Tail; +}; + +template +struct Types10 { + typedef T1 Head; + typedef Types9 Tail; +}; + +template +struct Types11 { + typedef T1 Head; + typedef Types10 Tail; +}; + +template +struct Types12 { + typedef T1 Head; + typedef Types11 Tail; +}; + +template +struct Types13 { + typedef T1 Head; + typedef Types12 Tail; +}; + +template +struct Types14 { + typedef T1 Head; + typedef Types13 Tail; +}; + +template +struct Types15 { + typedef T1 Head; + typedef Types14 Tail; +}; + +template +struct Types16 { + typedef T1 Head; + typedef Types15 Tail; +}; + +template +struct Types17 { + typedef T1 Head; + typedef Types16 Tail; +}; + +template +struct Types18 { + typedef T1 Head; + typedef Types17 Tail; +}; + +template +struct Types19 { + typedef T1 Head; + typedef Types18 Tail; +}; + +template +struct Types20 { + typedef T1 Head; + typedef Types19 Tail; +}; + +template +struct Types21 { + typedef T1 Head; + typedef Types20 Tail; +}; + +template +struct Types22 { + typedef T1 Head; + typedef Types21 Tail; +}; + +template +struct Types23 { + typedef T1 Head; + typedef Types22 Tail; +}; + +template +struct Types24 { + typedef T1 Head; + typedef Types23 Tail; +}; + +template +struct Types25 { + typedef T1 Head; + typedef Types24 Tail; +}; + +template +struct Types26 { + typedef T1 Head; + typedef Types25 Tail; +}; + +template 
+struct Types27 { + typedef T1 Head; + typedef Types26 Tail; +}; + +template +struct Types28 { + typedef T1 Head; + typedef Types27 Tail; +}; + +template +struct Types29 { + typedef T1 Head; + typedef Types28 Tail; +}; + +template +struct Types30 { + typedef T1 Head; + typedef Types29 Tail; +}; + +template +struct Types31 { + typedef T1 Head; + typedef Types30 Tail; +}; + +template +struct Types32 { + typedef T1 Head; + typedef Types31 Tail; +}; + +template +struct Types33 { + typedef T1 Head; + typedef Types32 Tail; +}; + +template +struct Types34 { + typedef T1 Head; + typedef Types33 Tail; +}; + +template +struct Types35 { + typedef T1 Head; + typedef Types34 Tail; +}; + +template +struct Types36 { + typedef T1 Head; + typedef Types35 Tail; +}; + +template +struct Types37 { + typedef T1 Head; + typedef Types36 Tail; +}; + +template +struct Types38 { + typedef T1 Head; + typedef Types37 Tail; +}; + +template +struct Types39 { + typedef T1 Head; + typedef Types38 Tail; +}; + +template +struct Types40 { + typedef T1 Head; + typedef Types39 Tail; +}; + +template +struct Types41 { + typedef T1 Head; + typedef Types40 Tail; +}; + +template +struct Types42 { + typedef T1 Head; + typedef Types41 Tail; +}; + +template +struct Types43 { + typedef T1 Head; + typedef Types42 Tail; +}; + +template +struct Types44 { + typedef T1 Head; + typedef Types43 Tail; +}; + +template +struct Types45 { + typedef T1 Head; + typedef Types44 Tail; +}; + +template +struct Types46 { + typedef T1 Head; + typedef Types45 Tail; +}; + +template +struct Types47 { + typedef T1 Head; + typedef Types46 Tail; +}; + +template +struct Types48 { + typedef T1 Head; + typedef Types47 Tail; +}; + +template +struct Types49 { + typedef T1 Head; + typedef Types48 Tail; +}; + +template +struct Types50 { + typedef T1 Head; + typedef Types49 Tail; +}; + + +} // namespace internal + +// We don't want to require the users to write TypesN<...> directly, +// as that would require them to count the length. 
Types<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Types +// will appear as Types in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Types, and Google Test will translate +// that to TypesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Types template. +template +struct Types { + typedef internal::Types50 type; +}; + +template <> +struct Types { + typedef internal::Types0 type; +}; +template +struct Types { + typedef internal::Types1 type; +}; +template +struct Types { + typedef internal::Types2 type; +}; +template +struct Types { + typedef internal::Types3 type; +}; +template +struct Types { + typedef internal::Types4 type; +}; +template +struct Types { + typedef internal::Types5 type; +}; +template +struct Types { + typedef internal::Types6 type; +}; +template +struct Types { + typedef internal::Types7 type; +}; +template +struct Types { + typedef internal::Types8 type; +}; +template +struct Types { + typedef internal::Types9 type; +}; +template +struct Types { + typedef internal::Types10 type; +}; +template +struct Types { + typedef internal::Types11 type; +}; +template +struct Types { + typedef internal::Types12 type; +}; +template +struct Types { + typedef internal::Types13 type; +}; +template +struct Types { + typedef internal::Types14 type; +}; +template +struct Types { + typedef internal::Types15 type; +}; +template +struct Types { + typedef internal::Types16 type; +}; +template +struct Types { + typedef internal::Types17 type; +}; +template +struct Types { + typedef internal::Types18 type; +}; +template +struct Types { + typedef internal::Types19 type; +}; +template +struct Types { + typedef internal::Types20 type; +}; +template +struct Types { + typedef 
internal::Types21 type; +}; +template +struct Types { + typedef internal::Types22 type; +}; +template +struct Types { + typedef internal::Types23 type; +}; +template +struct Types { + typedef internal::Types24 type; +}; +template +struct Types { + typedef internal::Types25 type; +}; +template +struct Types { + typedef internal::Types26 type; +}; +template +struct Types { + typedef internal::Types27 type; +}; +template +struct Types { + typedef internal::Types28 type; +}; +template +struct Types { + typedef internal::Types29 type; +}; +template +struct Types { + typedef internal::Types30 type; +}; +template +struct Types { + typedef internal::Types31 type; +}; +template +struct Types { + typedef internal::Types32 type; +}; +template +struct Types { + typedef internal::Types33 type; +}; +template +struct Types { + typedef internal::Types34 type; +}; +template +struct Types { + typedef internal::Types35 type; +}; +template +struct Types { + typedef internal::Types36 type; +}; +template +struct Types { + typedef internal::Types37 type; +}; +template +struct Types { + typedef internal::Types38 type; +}; +template +struct Types { + typedef internal::Types39 type; +}; +template +struct Types { + typedef internal::Types40 type; +}; +template +struct Types { + typedef internal::Types41 type; +}; +template +struct Types { + typedef internal::Types42 type; +}; +template +struct Types { + typedef internal::Types43 type; +}; +template +struct Types { + typedef internal::Types44 type; +}; +template +struct Types { + typedef internal::Types45 type; +}; +template +struct Types { + typedef internal::Types46 type; +}; +template +struct Types { + typedef internal::Types47 type; +}; +template +struct Types { + typedef internal::Types48 type; +}; +template +struct Types { + typedef internal::Types49 type; +}; + +namespace internal { + +# define GTEST_TEMPLATE_ template class + +// The template "selector" struct TemplateSel is used to +// represent Tmpl, which must be a class template 
with one type +// parameter, as a type. TemplateSel::Bind::type is defined +// as the type Tmpl. This allows us to actually instantiate the +// template "selected" by TemplateSel. +// +// This trick is necessary for simulating typedef for class templates, +// which C++ doesn't support directly. +template +struct TemplateSel { + template + struct Bind { + typedef Tmpl type; + }; +}; + +# define GTEST_BIND_(TmplSel, T) \ + TmplSel::template Bind::type + +// A unique struct template used as the default value for the +// arguments of class template Templates. This allows us to simulate +// variadic templates (e.g. Templates, Templates, +// and etc), which C++ doesn't support directly. +template +struct NoneT {}; + +// The following family of struct and struct templates are used to +// represent template lists. In particular, TemplatesN represents a list of N templates (T1, T2, ..., and TN). Except +// for Templates0, every struct in the family has two member types: +// Head for the selector of the first template in the list, and Tail +// for the rest of the list. + +// The empty template list. +struct Templates0 {}; + +// Template lists of length 1, 2, 3, and so on. 
+ +template +struct Templates1 { + typedef TemplateSel Head; + typedef Templates0 Tail; +}; +template +struct Templates2 { + typedef TemplateSel Head; + typedef Templates1 Tail; +}; + +template +struct Templates3 { + typedef TemplateSel Head; + typedef Templates2 Tail; +}; + +template +struct Templates4 { + typedef TemplateSel Head; + typedef Templates3 Tail; +}; + +template +struct Templates5 { + typedef TemplateSel Head; + typedef Templates4 Tail; +}; + +template +struct Templates6 { + typedef TemplateSel Head; + typedef Templates5 Tail; +}; + +template +struct Templates7 { + typedef TemplateSel Head; + typedef Templates6 Tail; +}; + +template +struct Templates8 { + typedef TemplateSel Head; + typedef Templates7 Tail; +}; + +template +struct Templates9 { + typedef TemplateSel Head; + typedef Templates8 Tail; +}; + +template +struct Templates10 { + typedef TemplateSel Head; + typedef Templates9 Tail; +}; + +template +struct Templates11 { + typedef TemplateSel Head; + typedef Templates10 Tail; +}; + +template +struct Templates12 { + typedef TemplateSel Head; + typedef Templates11 Tail; +}; + +template +struct Templates13 { + typedef TemplateSel Head; + typedef Templates12 Tail; +}; + +template +struct Templates14 { + typedef TemplateSel Head; + typedef Templates13 Tail; +}; + +template +struct Templates15 { + typedef TemplateSel Head; + typedef Templates14 Tail; +}; + +template +struct Templates16 { + typedef TemplateSel Head; + typedef Templates15 Tail; +}; + +template +struct Templates17 { + typedef TemplateSel Head; + typedef Templates16 Tail; +}; + +template +struct Templates18 { + typedef TemplateSel Head; + typedef Templates17 Tail; +}; + +template +struct Templates19 { + typedef TemplateSel Head; + typedef Templates18 Tail; +}; + +template +struct Templates20 { + typedef TemplateSel Head; + typedef Templates19 Tail; +}; + +template +struct Templates21 { + typedef TemplateSel Head; + typedef Templates20 Tail; +}; + +template +struct Templates22 { + typedef 
TemplateSel Head; + typedef Templates21 Tail; +}; + +template +struct Templates23 { + typedef TemplateSel Head; + typedef Templates22 Tail; +}; + +template +struct Templates24 { + typedef TemplateSel Head; + typedef Templates23 Tail; +}; + +template +struct Templates25 { + typedef TemplateSel Head; + typedef Templates24 Tail; +}; + +template +struct Templates26 { + typedef TemplateSel Head; + typedef Templates25 Tail; +}; + +template +struct Templates27 { + typedef TemplateSel Head; + typedef Templates26 Tail; +}; + +template +struct Templates28 { + typedef TemplateSel Head; + typedef Templates27 Tail; +}; + +template +struct Templates29 { + typedef TemplateSel Head; + typedef Templates28 Tail; +}; + +template +struct Templates30 { + typedef TemplateSel Head; + typedef Templates29 Tail; +}; + +template +struct Templates31 { + typedef TemplateSel Head; + typedef Templates30 Tail; +}; + +template +struct Templates32 { + typedef TemplateSel Head; + typedef Templates31 Tail; +}; + +template +struct Templates33 { + typedef TemplateSel Head; + typedef Templates32 Tail; +}; + +template +struct Templates34 { + typedef TemplateSel Head; + typedef Templates33 Tail; +}; + +template +struct Templates35 { + typedef TemplateSel Head; + typedef Templates34 Tail; +}; + +template +struct Templates36 { + typedef TemplateSel Head; + typedef Templates35 Tail; +}; + +template +struct Templates37 { + typedef TemplateSel Head; + typedef Templates36 Tail; +}; + +template +struct Templates38 { + typedef TemplateSel Head; + typedef Templates37 Tail; +}; + +template +struct Templates39 { + typedef TemplateSel Head; + typedef Templates38 Tail; +}; + +template +struct Templates40 { + typedef TemplateSel Head; + typedef Templates39 Tail; +}; + +template +struct Templates41 { + typedef TemplateSel Head; + typedef Templates40 Tail; +}; + +template +struct Templates42 { + typedef TemplateSel Head; + typedef Templates41 Tail; +}; + +template +struct Templates43 { + typedef TemplateSel Head; + 
typedef Templates42 Tail; +}; + +template +struct Templates44 { + typedef TemplateSel Head; + typedef Templates43 Tail; +}; + +template +struct Templates45 { + typedef TemplateSel Head; + typedef Templates44 Tail; +}; + +template +struct Templates46 { + typedef TemplateSel Head; + typedef Templates45 Tail; +}; + +template +struct Templates47 { + typedef TemplateSel Head; + typedef Templates46 Tail; +}; + +template +struct Templates48 { + typedef TemplateSel Head; + typedef Templates47 Tail; +}; + +template +struct Templates49 { + typedef TemplateSel Head; + typedef Templates48 Tail; +}; + +template +struct Templates50 { + typedef TemplateSel Head; + typedef Templates49 Tail; +}; + + +// We don't want to require the users to write TemplatesN<...> directly, +// as that would require them to count the length. Templates<...> is much +// easier to write, but generates horrible messages when there is a +// compiler error, as gcc insists on printing out each template +// argument, even if it has the default value (this means Templates +// will appear as Templates in the compiler +// errors). +// +// Our solution is to combine the best part of the two approaches: a +// user would write Templates, and Google Test will translate +// that to TemplatesN internally to make error messages +// readable. The translation is done by the 'type' member of the +// Templates template. 
+template +struct Templates { + typedef Templates50 type; +}; + +template <> +struct Templates { + typedef Templates0 type; +}; +template +struct Templates { + typedef Templates1 type; +}; +template +struct Templates { + typedef Templates2 type; +}; +template +struct Templates { + typedef Templates3 type; +}; +template +struct Templates { + typedef Templates4 type; +}; +template +struct Templates { + typedef Templates5 type; +}; +template +struct Templates { + typedef Templates6 type; +}; +template +struct Templates { + typedef Templates7 type; +}; +template +struct Templates { + typedef Templates8 type; +}; +template +struct Templates { + typedef Templates9 type; +}; +template +struct Templates { + typedef Templates10 type; +}; +template +struct Templates { + typedef Templates11 type; +}; +template +struct Templates { + typedef Templates12 type; +}; +template +struct Templates { + typedef Templates13 type; +}; +template +struct Templates { + typedef Templates14 type; +}; +template +struct Templates { + typedef Templates15 type; +}; +template +struct Templates { + typedef Templates16 type; +}; +template +struct Templates { + typedef Templates17 type; +}; +template +struct Templates { + typedef Templates18 type; +}; +template +struct Templates { + typedef Templates19 type; +}; +template +struct Templates { + typedef Templates20 type; +}; +template +struct Templates { + typedef Templates21 type; +}; +template +struct Templates { + typedef Templates22 type; +}; +template +struct Templates { + typedef Templates23 type; +}; +template +struct Templates { + typedef Templates24 type; +}; +template +struct Templates { + typedef Templates25 type; +}; +template +struct Templates { + typedef Templates26 type; +}; +template +struct Templates { + typedef Templates27 type; +}; +template +struct Templates { + typedef Templates28 type; +}; +template +struct Templates { + typedef Templates29 type; +}; +template +struct Templates { + typedef Templates30 type; +}; +template +struct 
Templates { + typedef Templates31 type; +}; +template +struct Templates { + typedef Templates32 type; +}; +template +struct Templates { + typedef Templates33 type; +}; +template +struct Templates { + typedef Templates34 type; +}; +template +struct Templates { + typedef Templates35 type; +}; +template +struct Templates { + typedef Templates36 type; +}; +template +struct Templates { + typedef Templates37 type; +}; +template +struct Templates { + typedef Templates38 type; +}; +template +struct Templates { + typedef Templates39 type; +}; +template +struct Templates { + typedef Templates40 type; +}; +template +struct Templates { + typedef Templates41 type; +}; +template +struct Templates { + typedef Templates42 type; +}; +template +struct Templates { + typedef Templates43 type; +}; +template +struct Templates { + typedef Templates44 type; +}; +template +struct Templates { + typedef Templates45 type; +}; +template +struct Templates { + typedef Templates46 type; +}; +template +struct Templates { + typedef Templates47 type; +}; +template +struct Templates { + typedef Templates48 type; +}; +template +struct Templates { + typedef Templates49 type; +}; + +// The TypeList template makes it possible to use either a single type +// or a Types<...> list in TYPED_TEST_CASE() and +// INSTANTIATE_TYPED_TEST_CASE_P(). + +template +struct TypeList { typedef Types1 type; }; + +template +struct TypeList > { + typedef typename Types::type type; +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_TYPE_UTIL_H_ + +// Due to C++ preprocessor weirdness, we need double indirection to +// concatenate two tokens when one of them is __LINE__. Writing +// +// foo ## __LINE__ +// +// will result in the token foo__LINE__, instead of foo followed by +// the current line number. 
For more details, see +// http://www.parashift.com/c++-faq-lite/misc-technical-issues.html#faq-39.6 +#define GTEST_CONCAT_TOKEN_(foo, bar) GTEST_CONCAT_TOKEN_IMPL_(foo, bar) +#define GTEST_CONCAT_TOKEN_IMPL_(foo, bar) foo ## bar + +// Google Test defines the testing::Message class to allow construction of +// test messages via the << operator. The idea is that anything +// streamable to std::ostream can be streamed to a testing::Message. +// This allows a user to use his own types in Google Test assertions by +// overloading the << operator. +// +// util/gtl/stl_logging-inl.h overloads << for STL containers. These +// overloads cannot be defined in the std namespace, as that will be +// undefined behavior. Therefore, they are defined in the global +// namespace instead. +// +// C++'s symbol lookup rule (i.e. Koenig lookup) says that these +// overloads are visible in either the std namespace or the global +// namespace, but not other namespaces, including the testing +// namespace which Google Test's Message class is in. +// +// To allow STL containers (and other types that has a << operator +// defined in the global namespace) to be used in Google Test assertions, +// testing::Message must access the custom << operator from the global +// namespace. Hence this helper function. +// +// Note: Jeffrey Yasskin suggested an alternative fix by "using +// ::operator<<;" in the definition of Message's operator<<. That fix +// doesn't require a helper function, but unfortunately doesn't +// compile with MSVC. +template +inline void GTestStreamToHelper(std::ostream* os, const T& val) { + *os << val; +} + +class ProtocolMessage; +namespace proto2 { class Message; } + +namespace testing { + +// Forward declarations. + +class AssertionResult; // Result of an assertion. +class Message; // Represents a failure message. +class Test; // Represents a test. +class TestInfo; // Information about a test. +class TestPartResult; // Result of a test part. 
+class UnitTest; // A collection of test cases. + +template +::std::string PrintToString(const T& value); + +namespace internal { + +struct TraceInfo; // Information about a trace point. +class ScopedTrace; // Implements scoped trace. +class TestInfoImpl; // Opaque implementation of TestInfo +class UnitTestImpl; // Opaque implementation of UnitTest + +// How many times InitGoogleTest() has been called. +extern int g_init_gtest_count; + +// The text used in failure messages to indicate the start of the +// stack trace. +GTEST_API_ extern const char kStackTraceMarker[]; + +// A secret type that Google Test users don't know about. It has no +// definition on purpose. Therefore it's impossible to create a +// Secret object, which is what we want. +class Secret; + +// Two overloaded helpers for checking at compile time whether an +// expression is a null pointer literal (i.e. NULL or any 0-valued +// compile-time integral constant). Their return values have +// different sizes, so we can use sizeof() to test which version is +// picked by the compiler. These helpers have no implementations, as +// we only need their signatures. +// +// Given IsNullLiteralHelper(x), the compiler will pick the first +// version if x can be implicitly converted to Secret*, and pick the +// second version otherwise. Since Secret is a secret and incomplete +// type, the only expression a user can write that has type Secret* is +// a null pointer literal. Therefore, we know that x is a null +// pointer literal if and only if the first version is picked by the +// compiler. +char IsNullLiteralHelper(Secret* p); +char (&IsNullLiteralHelper(...))[2]; // NOLINT + +// A compile-time bool constant that is true if and only if x is a +// null pointer literal (i.e. NULL or any 0-valued compile-time +// integral constant). +#ifdef GTEST_ELLIPSIS_NEEDS_POD_ +// We lose support for NULL detection where the compiler doesn't like +// passing non-POD classes through ellipsis (...). 
+# define GTEST_IS_NULL_LITERAL_(x) false +#else +# define GTEST_IS_NULL_LITERAL_(x) \ + (sizeof(::testing::internal::IsNullLiteralHelper(x)) == 1) +#endif // GTEST_ELLIPSIS_NEEDS_POD_ + +// Appends the user-supplied message to the Google-Test-generated message. +GTEST_API_ String AppendUserMessage(const String& gtest_msg, + const Message& user_msg); + +// A helper class for creating scoped traces in user programs. +class GTEST_API_ ScopedTrace { + public: + // The c'tor pushes the given source file location and message onto + // a trace stack maintained by Google Test. + ScopedTrace(const char* file, int line, const Message& message); + + // The d'tor pops the info pushed by the c'tor. + // + // Note that the d'tor is not virtual in order to be efficient. + // Don't inherit from ScopedTrace! + ~ScopedTrace(); + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ScopedTrace); +} GTEST_ATTRIBUTE_UNUSED_; // A ScopedTrace object does its job in its + // c'tor and d'tor. Therefore it doesn't + // need to be used otherwise. + +// Converts a streamable value to a String. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". +// Declared here but defined in gtest.h, so that it has access +// to the definition of the Message class, required by the ARM +// compiler. +template +String StreamableToString(const T& streamable); + +// The Symbian compiler has a bug that prevents it from selecting the +// correct overload of FormatForComparisonFailureMessage (see below) +// unless we pass the first argument by reference. If we do that, +// however, Visual Age C++ 10.1 generates a compiler error. Therefore +// we only apply the work-around for Symbian. 
+#if defined(__SYMBIAN32__) +# define GTEST_CREF_WORKAROUND_ const& +#else +# define GTEST_CREF_WORKAROUND_ +#endif + +// When this operand is a const char* or char*, if the other operand +// is a ::std::string or ::string, we print this operand as a C string +// rather than a pointer (we do the same for wide strings); otherwise +// we print it as a pointer to be safe. + +// This internal macro is used to avoid duplicated code. +#define GTEST_FORMAT_IMPL_(operand2_type, operand1_printer)\ +inline String FormatForComparisonFailureMessage(\ + operand2_type::value_type* GTEST_CREF_WORKAROUND_ str, \ + const operand2_type& /*operand2*/) {\ + return operand1_printer(str);\ +}\ +inline String FormatForComparisonFailureMessage(\ + const operand2_type::value_type* GTEST_CREF_WORKAROUND_ str, \ + const operand2_type& /*operand2*/) {\ + return operand1_printer(str);\ +} + +GTEST_FORMAT_IMPL_(::std::string, String::ShowCStringQuoted) +#if GTEST_HAS_STD_WSTRING +GTEST_FORMAT_IMPL_(::std::wstring, String::ShowWideCStringQuoted) +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_GLOBAL_STRING +GTEST_FORMAT_IMPL_(::string, String::ShowCStringQuoted) +#endif // GTEST_HAS_GLOBAL_STRING +#if GTEST_HAS_GLOBAL_WSTRING +GTEST_FORMAT_IMPL_(::wstring, String::ShowWideCStringQuoted) +#endif // GTEST_HAS_GLOBAL_WSTRING + +#undef GTEST_FORMAT_IMPL_ + +// The next four overloads handle the case where the operand being +// printed is a char/wchar_t pointer and the other operand is not a +// string/wstring object. In such cases, we just print the operand as +// a pointer to be safe. 
+#define GTEST_FORMAT_CHAR_PTR_IMPL_(CharType) \ + template \ + String FormatForComparisonFailureMessage(CharType* GTEST_CREF_WORKAROUND_ p, \ + const T&) { \ + return PrintToString(static_cast(p)); \ + } + +GTEST_FORMAT_CHAR_PTR_IMPL_(char) +GTEST_FORMAT_CHAR_PTR_IMPL_(const char) +GTEST_FORMAT_CHAR_PTR_IMPL_(wchar_t) +GTEST_FORMAT_CHAR_PTR_IMPL_(const wchar_t) + +#undef GTEST_FORMAT_CHAR_PTR_IMPL_ + +// Constructs and returns the message for an equality assertion +// (e.g. ASSERT_EQ, EXPECT_STREQ, etc) failure. +// +// The first four parameters are the expressions used in the assertion +// and their values, as strings. For example, for ASSERT_EQ(foo, bar) +// where foo is 5 and bar is 6, we have: +// +// expected_expression: "foo" +// actual_expression: "bar" +// expected_value: "5" +// actual_value: "6" +// +// The ignoring_case parameter is true iff the assertion is a +// *_STRCASEEQ*. When it's true, the string " (ignoring case)" will +// be inserted into the message. +GTEST_API_ AssertionResult EqFailure(const char* expected_expression, + const char* actual_expression, + const String& expected_value, + const String& actual_value, + bool ignoring_case); + +// Constructs a failure message for Boolean assertions such as EXPECT_TRUE. +GTEST_API_ String GetBoolAssertionFailureMessage( + const AssertionResult& assertion_result, + const char* expression_text, + const char* actual_predicate_value, + const char* expected_predicate_value); + +// This template class represents an IEEE floating-point number +// (either single-precision or double-precision, depending on the +// template parameters). +// +// The purpose of this class is to do more sophisticated number +// comparison. (Due to round-off error, etc, it's very unlikely that +// two floating-points will be equal exactly. Hence a naive +// comparison by the == operation often doesn't work.) 
+// +// Format of IEEE floating-point: +// +// The most-significant bit being the leftmost, an IEEE +// floating-point looks like +// +// sign_bit exponent_bits fraction_bits +// +// Here, sign_bit is a single bit that designates the sign of the +// number. +// +// For float, there are 8 exponent bits and 23 fraction bits. +// +// For double, there are 11 exponent bits and 52 fraction bits. +// +// More details can be found at +// http://en.wikipedia.org/wiki/IEEE_floating-point_standard. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +template +class FloatingPoint { + public: + // Defines the unsigned integer type that has the same size as the + // floating point number. + typedef typename TypeWithSize::UInt Bits; + + // Constants. + + // # of bits in a number. + static const size_t kBitCount = 8*sizeof(RawType); + + // # of fraction bits in a number. + static const size_t kFractionBitCount = + std::numeric_limits::digits - 1; + + // # of exponent bits in a number. + static const size_t kExponentBitCount = kBitCount - 1 - kFractionBitCount; + + // The mask for the sign bit. + static const Bits kSignBitMask = static_cast(1) << (kBitCount - 1); + + // The mask for the fraction bits. + static const Bits kFractionBitMask = + ~static_cast(0) >> (kExponentBitCount + 1); + + // The mask for the exponent bits. + static const Bits kExponentBitMask = ~(kSignBitMask | kFractionBitMask); + + // How many ULP's (Units in the Last Place) we want to tolerate when + // comparing two numbers. The larger the value, the more error we + // allow. A 0 value means that two numbers must be exactly the same + // to be considered equal. + // + // The maximum error of a single floating-point operation is 0.5 + // units in the last place. On Intel CPU's, all floating-point + // calculations are done with 80-bit precision, while double has 64 + // bits. Therefore, 4 should be enough for ordinary use. 
+ // + // See the following article for more details on ULP: + // http://www.cygnus-software.com/papers/comparingfloats/comparingfloats.htm. + static const size_t kMaxUlps = 4; + + // Constructs a FloatingPoint from a raw floating-point number. + // + // On an Intel CPU, passing a non-normalized NAN (Not a Number) + // around may change its bits, although the new value is guaranteed + // to be also a NAN. Therefore, don't expect this constructor to + // preserve the bits in x when x is a NAN. + explicit FloatingPoint(const RawType& x) { u_.value_ = x; } + + // Static methods + + // Reinterprets a bit pattern as a floating-point number. + // + // This function is needed to test the AlmostEquals() method. + static RawType ReinterpretBits(const Bits bits) { + FloatingPoint fp(0); + fp.u_.bits_ = bits; + return fp.u_.value_; + } + + // Returns the floating-point number that represent positive infinity. + static RawType Infinity() { + return ReinterpretBits(kExponentBitMask); + } + + // Non-static methods + + // Returns the bits that represents this number. + const Bits &bits() const { return u_.bits_; } + + // Returns the exponent bits of this number. + Bits exponent_bits() const { return kExponentBitMask & u_.bits_; } + + // Returns the fraction bits of this number. + Bits fraction_bits() const { return kFractionBitMask & u_.bits_; } + + // Returns the sign bit of this number. + Bits sign_bit() const { return kSignBitMask & u_.bits_; } + + // Returns true iff this is NAN (not a number). + bool is_nan() const { + // It's a NAN if the exponent bits are all ones and the fraction + // bits are not entirely zeros. + return (exponent_bits() == kExponentBitMask) && (fraction_bits() != 0); + } + + // Returns true iff this number is at most kMaxUlps ULP's away from + // rhs. In particular, this function: + // + // - returns false if either number is (or both are) NAN. + // - treats really large numbers as almost equal to infinity. + // - thinks +0.0 and -0.0 are 0 DLP's apart. 
+ bool AlmostEquals(const FloatingPoint& rhs) const { + // The IEEE standard says that any comparison operation involving + // a NAN must return false. + if (is_nan() || rhs.is_nan()) return false; + + return DistanceBetweenSignAndMagnitudeNumbers(u_.bits_, rhs.u_.bits_) + <= kMaxUlps; + } + + private: + // The data type used to store the actual floating-point number. + union FloatingPointUnion { + RawType value_; // The raw floating-point number. + Bits bits_; // The bits that represent the number. + }; + + // Converts an integer from the sign-and-magnitude representation to + // the biased representation. More precisely, let N be 2 to the + // power of (kBitCount - 1), an integer x is represented by the + // unsigned number x + N. + // + // For instance, + // + // -N + 1 (the most negative number representable using + // sign-and-magnitude) is represented by 1; + // 0 is represented by N; and + // N - 1 (the biggest number representable using + // sign-and-magnitude) is represented by 2N - 1. + // + // Read http://en.wikipedia.org/wiki/Signed_number_representations + // for more details on signed number representations. + static Bits SignAndMagnitudeToBiased(const Bits &sam) { + if (kSignBitMask & sam) { + // sam represents a negative number. + return ~sam + 1; + } else { + // sam represents a positive number. + return kSignBitMask | sam; + } + } + + // Given two numbers in the sign-and-magnitude representation, + // returns the distance between them as an unsigned number. + static Bits DistanceBetweenSignAndMagnitudeNumbers(const Bits &sam1, + const Bits &sam2) { + const Bits biased1 = SignAndMagnitudeToBiased(sam1); + const Bits biased2 = SignAndMagnitudeToBiased(sam2); + return (biased1 >= biased2) ? (biased1 - biased2) : (biased2 - biased1); + } + + FloatingPointUnion u_; +}; + +// Typedefs the instances of the FloatingPoint template class that we +// care to use. 
+typedef FloatingPoint Float; +typedef FloatingPoint Double; + +// In order to catch the mistake of putting tests that use different +// test fixture classes in the same test case, we need to assign +// unique IDs to fixture classes and compare them. The TypeId type is +// used to hold such IDs. The user should treat TypeId as an opaque +// type: the only operation allowed on TypeId values is to compare +// them for equality using the == operator. +typedef const void* TypeId; + +template +class TypeIdHelper { + public: + // dummy_ must not have a const type. Otherwise an overly eager + // compiler (e.g. MSVC 7.1 & 8.0) may try to merge + // TypeIdHelper::dummy_ for different Ts as an "optimization". + static bool dummy_; +}; + +template +bool TypeIdHelper::dummy_ = false; + +// GetTypeId() returns the ID of type T. Different values will be +// returned for different types. Calling the function twice with the +// same type argument is guaranteed to return the same ID. +template +TypeId GetTypeId() { + // The compiler is required to allocate a different + // TypeIdHelper::dummy_ variable for each T used to instantiate + // the template. Therefore, the address of dummy_ is guaranteed to + // be unique. + return &(TypeIdHelper::dummy_); +} + +// Returns the type ID of ::testing::Test. Always call this instead +// of GetTypeId< ::testing::Test>() to get the type ID of +// ::testing::Test, as the latter may give the wrong result due to a +// suspected linker bug when compiling Google Test as a Mac OS X +// framework. +GTEST_API_ TypeId GetTestTypeId(); + +// Defines the abstract factory interface that creates instances +// of a Test object. +class TestFactoryBase { + public: + virtual ~TestFactoryBase() {} + + // Creates a test instance to run. 
The instance is both created and destroyed + // within TestInfoImpl::Run() + virtual Test* CreateTest() = 0; + + protected: + TestFactoryBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestFactoryBase); +}; + +// This class provides implementation of TeastFactoryBase interface. +// It is used in TEST and TEST_F macros. +template +class TestFactoryImpl : public TestFactoryBase { + public: + virtual Test* CreateTest() { return new TestClass; } +}; + +#if GTEST_OS_WINDOWS + +// Predicate-formatters for implementing the HRESULT checking macros +// {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED} +// We pass a long instead of HRESULT to avoid causing an +// include dependency for the HRESULT type. +GTEST_API_ AssertionResult IsHRESULTSuccess(const char* expr, + long hr); // NOLINT +GTEST_API_ AssertionResult IsHRESULTFailure(const char* expr, + long hr); // NOLINT + +#endif // GTEST_OS_WINDOWS + +// Types of SetUpTestCase() and TearDownTestCase() functions. +typedef void (*SetUpTestCaseFunc)(); +typedef void (*TearDownTestCaseFunc)(); + +// Creates a new TestInfo object and registers it with Google Test; +// returns the created object. +// +// Arguments: +// +// test_case_name: name of the test case +// name: name of the test +// type_param the name of the test's type parameter, or NULL if +// this is not a typed or a type-parameterized test. +// value_param text representation of the test's value parameter, +// or NULL if this is not a type-parameterized test. +// fixture_class_id: ID of the test fixture class +// set_up_tc: pointer to the function that sets up the test case +// tear_down_tc: pointer to the function that tears down the test case +// factory: pointer to the factory that creates a test object. +// The newly created TestInfo instance will assume +// ownership of the factory object. 
+GTEST_API_ TestInfo* MakeAndRegisterTestInfo( + const char* test_case_name, const char* name, + const char* type_param, + const char* value_param, + TypeId fixture_class_id, + SetUpTestCaseFunc set_up_tc, + TearDownTestCaseFunc tear_down_tc, + TestFactoryBase* factory); + +// If *pstr starts with the given prefix, modifies *pstr to be right +// past the prefix and returns true; otherwise leaves *pstr unchanged +// and returns false. None of pstr, *pstr, and prefix can be NULL. +GTEST_API_ bool SkipPrefix(const char* prefix, const char** pstr); + +#if GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// State of the definition of a type-parameterized test case. +class GTEST_API_ TypedTestCasePState { + public: + TypedTestCasePState() : registered_(false) {} + + // Adds the given test name to defined_test_names_ and return true + // if the test case hasn't been registered; otherwise aborts the + // program. + bool AddTestName(const char* file, int line, const char* case_name, + const char* test_name) { + if (registered_) { + fprintf(stderr, "%s Test %s must be defined before " + "REGISTER_TYPED_TEST_CASE_P(%s, ...).\n", + FormatFileLocation(file, line).c_str(), test_name, case_name); + fflush(stderr); + posix::Abort(); + } + defined_test_names_.insert(test_name); + return true; + } + + // Verifies that registered_tests match the test names in + // defined_test_names_; returns registered_tests if successful, or + // aborts the program otherwise. + const char* VerifyRegisteredTestNames( + const char* file, int line, const char* registered_tests); + + private: + bool registered_; + ::std::set defined_test_names_; +}; + +// Skips to the first non-space char after the first comma in 'str'; +// returns NULL if no comma is found in 'str'. 
+inline const char* SkipComma(const char* str) { + const char* comma = strchr(str, ','); + if (comma == NULL) { + return NULL; + } + while (IsSpace(*(++comma))) {} + return comma; +} + +// Returns the prefix of 'str' before the first comma in it; returns +// the entire string if it contains no comma. +inline String GetPrefixUntilComma(const char* str) { + const char* comma = strchr(str, ','); + return comma == NULL ? String(str) : String(str, comma - str); +} + +// TypeParameterizedTest::Register() +// registers a list of type-parameterized tests with Google Test. The +// return value is insignificant - we just need to return something +// such that we can call this function in a namespace scope. +// +// Implementation note: The GTEST_TEMPLATE_ macro declares a template +// template parameter. It's defined in gtest-type-util.h. +template +class TypeParameterizedTest { + public: + // 'index' is the index of the test in the type list 'Types' + // specified in INSTANTIATE_TYPED_TEST_CASE_P(Prefix, TestCase, + // Types). Valid values for 'index' are [0, N - 1] where N is the + // length of Types. + static bool Register(const char* prefix, const char* case_name, + const char* test_names, int index) { + typedef typename Types::Head Type; + typedef Fixture FixtureClass; + typedef typename GTEST_BIND_(TestSel, Type) TestClass; + + // First, registers the first type-parameterized test in the type + // list. + MakeAndRegisterTestInfo( + String::Format("%s%s%s/%d", prefix, prefix[0] == '\0' ? "" : "/", + case_name, index).c_str(), + GetPrefixUntilComma(test_names).c_str(), + GetTypeName().c_str(), + NULL, // No value parameter. + GetTypeId(), + TestClass::SetUpTestCase, + TestClass::TearDownTestCase, + new TestFactoryImpl); + + // Next, recurses (at compile time) with the tail of the type list. + return TypeParameterizedTest + ::Register(prefix, case_name, test_names, index + 1); + } +}; + +// The base case for the compile time recursion. 
+template +class TypeParameterizedTest { + public: + static bool Register(const char* /*prefix*/, const char* /*case_name*/, + const char* /*test_names*/, int /*index*/) { + return true; + } +}; + +// TypeParameterizedTestCase::Register() +// registers *all combinations* of 'Tests' and 'Types' with Google +// Test. The return value is insignificant - we just need to return +// something such that we can call this function in a namespace scope. +template +class TypeParameterizedTestCase { + public: + static bool Register(const char* prefix, const char* case_name, + const char* test_names) { + typedef typename Tests::Head Head; + + // First, register the first test in 'Test' for each type in 'Types'. + TypeParameterizedTest::Register( + prefix, case_name, test_names, 0); + + // Next, recurses (at compile time) with the tail of the test list. + return TypeParameterizedTestCase + ::Register(prefix, case_name, SkipComma(test_names)); + } +}; + +// The base case for the compile time recursion. +template +class TypeParameterizedTestCase { + public: + static bool Register(const char* /*prefix*/, const char* /*case_name*/, + const char* /*test_names*/) { + return true; + } +}; + +#endif // GTEST_HAS_TYPED_TEST || GTEST_HAS_TYPED_TEST_P + +// Returns the current OS stack trace as a String. +// +// The maximum number of stack frames to be included is specified by +// the gtest_stack_trace_depth flag. The skip_count parameter +// specifies the number of top frames to be skipped, which doesn't +// count against the number of frames to be included. +// +// For example, if Foo() calls Bar(), which in turn calls +// GetCurrentOsStackTraceExceptTop(..., 1), Foo() will be included in +// the trace but Bar() and GetCurrentOsStackTraceExceptTop() won't. +GTEST_API_ String GetCurrentOsStackTraceExceptTop(UnitTest* unit_test, + int skip_count); + +// Helpers for suppressing warnings on unreachable code or constant +// condition. + +// Always returns true. 
+GTEST_API_ bool AlwaysTrue(); + +// Always returns false. +inline bool AlwaysFalse() { return !AlwaysTrue(); } + +// Helper for suppressing false warning from Clang on a const char* +// variable declared in a conditional expression always being NULL in +// the else branch. +struct GTEST_API_ ConstCharPtr { + ConstCharPtr(const char* str) : value(str) {} + operator bool() const { return true; } + const char* value; +}; + +// A simple Linear Congruential Generator for generating random +// numbers with a uniform distribution. Unlike rand() and srand(), it +// doesn't use global state (and therefore can't interfere with user +// code). Unlike rand_r(), it's portable. An LCG isn't very random, +// but it's good enough for our purposes. +class GTEST_API_ Random { + public: + static const UInt32 kMaxRange = 1u << 31; + + explicit Random(UInt32 seed) : state_(seed) {} + + void Reseed(UInt32 seed) { state_ = seed; } + + // Generates a random number from [0, range). Crashes if 'range' is + // 0 or greater than kMaxRange. + UInt32 Generate(UInt32 range); + + private: + UInt32 state_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(Random); +}; + +// Defining a variable of type CompileAssertTypesEqual will cause a +// compiler error iff T1 and T2 are different types. +template +struct CompileAssertTypesEqual; + +template +struct CompileAssertTypesEqual { +}; + +// Removes the reference from a type if it is a reference type, +// otherwise leaves it unchanged. This is the same as +// tr1::remove_reference, which is not widely available yet. +template +struct RemoveReference { typedef T type; }; // NOLINT +template +struct RemoveReference { typedef T type; }; // NOLINT + +// A handy wrapper around RemoveReference that works when the argument +// T depends on template parameters. +#define GTEST_REMOVE_REFERENCE_(T) \ + typename ::testing::internal::RemoveReference::type + +// Removes const from a type if it is a const type, otherwise leaves +// it unchanged. 
This is the same as tr1::remove_const, which is not +// widely available yet. +template +struct RemoveConst { typedef T type; }; // NOLINT +template +struct RemoveConst { typedef T type; }; // NOLINT + +// MSVC 8.0, Sun C++, and IBM XL C++ have a bug which causes the above +// definition to fail to remove the const in 'const int[3]' and 'const +// char[3][4]'. The following specialization works around the bug. +// However, it causes trouble with GCC and thus needs to be +// conditionally compiled. +#if defined(_MSC_VER) || defined(__SUNPRO_CC) || defined(__IBMCPP__) +template +struct RemoveConst { + typedef typename RemoveConst::type type[N]; +}; +#endif + +// A handy wrapper around RemoveConst that works when the argument +// T depends on template parameters. +#define GTEST_REMOVE_CONST_(T) \ + typename ::testing::internal::RemoveConst::type + +// Turns const U&, U&, const U, and U all into U. +#define GTEST_REMOVE_REFERENCE_AND_CONST_(T) \ + GTEST_REMOVE_CONST_(GTEST_REMOVE_REFERENCE_(T)) + +// Adds reference to a type if it is not a reference type, +// otherwise leaves it unchanged. This is the same as +// tr1::add_reference, which is not widely available yet. +template +struct AddReference { typedef T& type; }; // NOLINT +template +struct AddReference { typedef T& type; }; // NOLINT + +// A handy wrapper around AddReference that works when the argument T +// depends on template parameters. +#define GTEST_ADD_REFERENCE_(T) \ + typename ::testing::internal::AddReference::type + +// Adds a reference to const on top of T as necessary. For example, +// it transforms +// +// char ==> const char& +// const char ==> const char& +// char& ==> const char& +// const char& ==> const char& +// +// The argument T must depend on some template parameters. 
+#define GTEST_REFERENCE_TO_CONST_(T) \ + GTEST_ADD_REFERENCE_(const GTEST_REMOVE_REFERENCE_(T)) + +// ImplicitlyConvertible::value is a compile-time bool +// constant that's true iff type From can be implicitly converted to +// type To. +template +class ImplicitlyConvertible { + private: + // We need the following helper functions only for their types. + // They have no implementations. + + // MakeFrom() is an expression whose type is From. We cannot simply + // use From(), as the type From may not have a public default + // constructor. + static From MakeFrom(); + + // These two functions are overloaded. Given an expression + // Helper(x), the compiler will pick the first version if x can be + // implicitly converted to type To; otherwise it will pick the + // second version. + // + // The first version returns a value of size 1, and the second + // version returns a value of size 2. Therefore, by checking the + // size of Helper(x), which can be done at compile time, we can tell + // which version of Helper() is used, and hence whether x can be + // implicitly converted to type To. + static char Helper(To); + static char (&Helper(...))[2]; // NOLINT + + // We have to put the 'public' section after the 'private' section, + // or MSVC refuses to compile the code. + public: + // MSVC warns about implicitly converting from double to int for + // possible loss of data, so we need to temporarily disable the + // warning. +#ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4244) // Temporarily disables warning 4244. + + static const bool value = + sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; +# pragma warning(pop) // Restores the warning state. +#elif defined(__BORLANDC__) + // C++Builder cannot use member overload resolution during template + // instantiation. The simplest workaround is to use its C++0x type traits + // functions (C++Builder 2009 and above only). 
+ static const bool value = __is_convertible(From, To); +#else + static const bool value = + sizeof(Helper(ImplicitlyConvertible::MakeFrom())) == 1; +#endif // _MSV_VER +}; +template +const bool ImplicitlyConvertible::value; + +// IsAProtocolMessage::value is a compile-time bool constant that's +// true iff T is type ProtocolMessage, proto2::Message, or a subclass +// of those. +template +struct IsAProtocolMessage + : public bool_constant< + ImplicitlyConvertible::value || + ImplicitlyConvertible::value> { +}; + +// When the compiler sees expression IsContainerTest(0), if C is an +// STL-style container class, the first overload of IsContainerTest +// will be viable (since both C::iterator* and C::const_iterator* are +// valid types and NULL can be implicitly converted to them). It will +// be picked over the second overload as 'int' is a perfect match for +// the type of argument 0. If C::iterator or C::const_iterator is not +// a valid type, the first overload is not viable, and the second +// overload will be picked. Therefore, we can determine whether C is +// a container class by checking the type of IsContainerTest(0). +// The value of the expression is insignificant. +// +// Note that we look for both C::iterator and C::const_iterator. The +// reason is that C++ injects the name of a class as a member of the +// class itself (e.g. you can refer to class iterator as either +// 'iterator' or 'iterator::iterator'). If we look for C::iterator +// only, for example, we would mistakenly think that a class named +// iterator is an STL container. +// +// Also note that the simpler approach of overloading +// IsContainerTest(typename C::const_iterator*) and +// IsContainerTest(...) doesn't work with Visual Age C++ and Sun C++. 
+typedef int IsContainer; +template +IsContainer IsContainerTest(int /* dummy */, + typename C::iterator* /* it */ = NULL, + typename C::const_iterator* /* const_it */ = NULL) { + return 0; +} + +typedef char IsNotContainer; +template +IsNotContainer IsContainerTest(long /* dummy */) { return '\0'; } + +// EnableIf::type is void when 'Cond' is true, and +// undefined when 'Cond' is false. To use SFINAE to make a function +// overload only apply when a particular expression is true, add +// "typename EnableIf::type* = 0" as the last parameter. +template struct EnableIf; +template<> struct EnableIf { typedef void type; }; // NOLINT + +// Utilities for native arrays. + +// ArrayEq() compares two k-dimensional native arrays using the +// elements' operator==, where k can be any integer >= 0. When k is +// 0, ArrayEq() degenerates into comparing a single pair of values. + +template +bool ArrayEq(const T* lhs, size_t size, const U* rhs); + +// This generic version is used when k is 0. +template +inline bool ArrayEq(const T& lhs, const U& rhs) { return lhs == rhs; } + +// This overload is used when k >= 1. +template +inline bool ArrayEq(const T(&lhs)[N], const U(&rhs)[N]) { + return internal::ArrayEq(lhs, N, rhs); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous ArrayEq() function, arrays with different sizes would +// lead to different copies of the template code. +template +bool ArrayEq(const T* lhs, size_t size, const U* rhs) { + for (size_t i = 0; i != size; i++) { + if (!internal::ArrayEq(lhs[i], rhs[i])) + return false; + } + return true; +} + +// Finds the first element in the iterator range [begin, end) that +// equals elem. Element may be a native array type itself. 
+template +Iter ArrayAwareFind(Iter begin, Iter end, const Element& elem) { + for (Iter it = begin; it != end; ++it) { + if (internal::ArrayEq(*it, elem)) + return it; + } + return end; +} + +// CopyArray() copies a k-dimensional native array using the elements' +// operator=, where k can be any integer >= 0. When k is 0, +// CopyArray() degenerates into copying a single value. + +template +void CopyArray(const T* from, size_t size, U* to); + +// This generic version is used when k is 0. +template +inline void CopyArray(const T& from, U* to) { *to = from; } + +// This overload is used when k >= 1. +template +inline void CopyArray(const T(&from)[N], U(*to)[N]) { + internal::CopyArray(from, N, *to); +} + +// This helper reduces code bloat. If we instead put its logic inside +// the previous CopyArray() function, arrays with different sizes +// would lead to different copies of the template code. +template +void CopyArray(const T* from, size_t size, U* to) { + for (size_t i = 0; i != size; i++) { + internal::CopyArray(from[i], to + i); + } +} + +// The relation between an NativeArray object (see below) and the +// native array it represents. +enum RelationToSource { + kReference, // The NativeArray references the native array. + kCopy // The NativeArray makes a copy of the native array and + // owns the copy. +}; + +// Adapts a native array to a read-only STL-style container. Instead +// of the complete STL container concept, this adaptor only implements +// members useful for Google Mock's container matchers. New members +// should be added as needed. To simplify the implementation, we only +// support Element being a raw type (i.e. having no top-level const or +// reference modifier). It's the client's responsibility to satisfy +// this requirement. Element can be an array type itself (hence +// multi-dimensional arrays are supported). +template +class NativeArray { + public: + // STL-style container typedefs. 
+ typedef Element value_type; + typedef Element* iterator; + typedef const Element* const_iterator; + + // Constructs from a native array. + NativeArray(const Element* array, size_t count, RelationToSource relation) { + Init(array, count, relation); + } + + // Copy constructor. + NativeArray(const NativeArray& rhs) { + Init(rhs.array_, rhs.size_, rhs.relation_to_source_); + } + + ~NativeArray() { + // Ensures that the user doesn't instantiate NativeArray with a + // const or reference type. + static_cast(StaticAssertTypeEqHelper()); + if (relation_to_source_ == kCopy) + delete[] array_; + } + + // STL-style container methods. + size_t size() const { return size_; } + const_iterator begin() const { return array_; } + const_iterator end() const { return array_ + size_; } + bool operator==(const NativeArray& rhs) const { + return size() == rhs.size() && + ArrayEq(begin(), size(), rhs.begin()); + } + + private: + // Initializes this object; makes a copy of the input array if + // 'relation' is kCopy. 
+ void Init(const Element* array, size_t a_size, RelationToSource relation) { + if (relation == kReference) { + array_ = array; + } else { + Element* const copy = new Element[a_size]; + CopyArray(array, a_size, copy); + array_ = copy; + } + size_ = a_size; + relation_to_source_ = relation; + } + + const Element* array_; + size_t size_; + RelationToSource relation_to_source_; + + GTEST_DISALLOW_ASSIGN_(NativeArray); +}; + +} // namespace internal +} // namespace testing + +#define GTEST_MESSAGE_AT_(file, line, message, result_type) \ + ::testing::internal::AssertHelper(result_type, file, line, message) \ + = ::testing::Message() + +#define GTEST_MESSAGE_(message, result_type) \ + GTEST_MESSAGE_AT_(__FILE__, __LINE__, message, result_type) + +#define GTEST_FATAL_FAILURE_(message) \ + return GTEST_MESSAGE_(message, ::testing::TestPartResult::kFatalFailure) + +#define GTEST_NONFATAL_FAILURE_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kNonFatalFailure) + +#define GTEST_SUCCESS_(message) \ + GTEST_MESSAGE_(message, ::testing::TestPartResult::kSuccess) + +// Suppresses MSVC warnings 4072 (unreachable code) for the code following +// statement if it returns or throws (or doesn't return or throw in some +// situations). +#define GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) \ + if (::testing::internal::AlwaysTrue()) { statement; } + +#define GTEST_TEST_THROW_(statement, expected_exception, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::ConstCharPtr gtest_msg = "") { \ + bool gtest_caught_expected = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (expected_exception const&) { \ + gtest_caught_expected = true; \ + } \ + catch (...) 
{ \ + gtest_msg.value = \ + "Expected: " #statement " throws an exception of type " \ + #expected_exception ".\n Actual: it throws a different type."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + if (!gtest_caught_expected) { \ + gtest_msg.value = \ + "Expected: " #statement " throws an exception of type " \ + #expected_exception ".\n Actual: it throws nothing."; \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testthrow_, __LINE__): \ + fail(gtest_msg.value) + +#define GTEST_TEST_NO_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (...) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnothrow_, __LINE__): \ + fail("Expected: " #statement " doesn't throw an exception.\n" \ + " Actual: it throws.") + +#define GTEST_TEST_ANY_THROW_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + bool gtest_caught_any = false; \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } \ + catch (...) { \ + gtest_caught_any = true; \ + } \ + if (!gtest_caught_any) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testanythrow_, __LINE__): \ + fail("Expected: " #statement " throws an exception.\n" \ + " Actual: it doesn't.") + + +// Implements Boolean test assertions such as EXPECT_TRUE. expression can be +// either a boolean expression or an AssertionResult. text is a textual +// represenation of expression as it was passed into the EXPECT_TRUE. 
+#define GTEST_TEST_BOOLEAN_(expression, text, actual, expected, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar_ = \ + ::testing::AssertionResult(expression)) \ + ; \ + else \ + fail(::testing::internal::GetBoolAssertionFailureMessage(\ + gtest_ar_, text, #actual, #expected).c_str()) + +#define GTEST_TEST_NO_FATAL_FAILURE_(statement, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + ::testing::internal::HasNewFatalFailureHelper gtest_fatal_failure_checker; \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + if (gtest_fatal_failure_checker.has_new_fatal_failure()) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__); \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_testnofatal_, __LINE__): \ + fail("Expected: " #statement " doesn't generate new fatal " \ + "failures in the current thread.\n" \ + " Actual: it does.") + +// Expands to the name of the class that implements the given test. +#define GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + test_case_name##_##test_name##_Test + +// Helper macro for defining tests. 
+#define GTEST_TEST_(test_case_name, test_name, parent_class, parent_id)\ +class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) : public parent_class {\ + public:\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {}\ + private:\ + virtual void TestBody();\ + static ::testing::TestInfo* const test_info_ GTEST_ATTRIBUTE_UNUSED_;\ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name));\ +};\ +\ +::testing::TestInfo* const GTEST_TEST_CLASS_NAME_(test_case_name, test_name)\ + ::test_info_ =\ + ::testing::internal::MakeAndRegisterTestInfo(\ + #test_case_name, #test_name, NULL, NULL, \ + (parent_id), \ + parent_class::SetUpTestCase, \ + parent_class::TearDownTestCase, \ + new ::testing::internal::TestFactoryImpl<\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>);\ +void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_INTERNAL_H_ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the public API for death tests. It is +// #included by gtest.h so a user doesn't need to include this +// directly. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ + +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: wan@google.com (Zhanyong Wan), eefacm@gmail.com (Sean Mcafee) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines internal utilities needed for implementing +// death tests. They are subject to change without notice. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ + + +#include + +namespace testing { +namespace internal { + +GTEST_DECLARE_string_(internal_run_death_test); + +// Names of the flags (needed for parsing Google Test flags). +const char kDeathTestStyleFlag[] = "death_test_style"; +const char kDeathTestUseFork[] = "death_test_use_fork"; +const char kInternalRunDeathTestFlag[] = "internal_run_death_test"; + +#if GTEST_HAS_DEATH_TEST + +// DeathTest is a class that hides much of the complexity of the +// GTEST_DEATH_TEST_ macro. It is abstract; its static Create method +// returns a concrete class that depends on the prevailing death test +// style, as defined by the --gtest_death_test_style and/or +// --gtest_internal_run_death_test flags. 
+ +// In describing the results of death tests, these terms are used with +// the corresponding definitions: +// +// exit status: The integer exit information in the format specified +// by wait(2) +// exit code: The integer code passed to exit(3), _exit(2), or +// returned from main() +class GTEST_API_ DeathTest { + public: + // Create returns false if there was an error determining the + // appropriate action to take for the current death test; for example, + // if the gtest_death_test_style flag is set to an invalid value. + // The LastMessage method will return a more detailed message in that + // case. Otherwise, the DeathTest pointer pointed to by the "test" + // argument is set. If the death test should be skipped, the pointer + // is set to NULL; otherwise, it is set to the address of a new concrete + // DeathTest object that controls the execution of the current test. + static bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test); + DeathTest(); + virtual ~DeathTest() { } + + // A helper class that aborts a death test when it's deleted. + class ReturnSentinel { + public: + explicit ReturnSentinel(DeathTest* test) : test_(test) { } + ~ReturnSentinel() { test_->Abort(TEST_ENCOUNTERED_RETURN_STATEMENT); } + private: + DeathTest* const test_; + GTEST_DISALLOW_COPY_AND_ASSIGN_(ReturnSentinel); + } GTEST_ATTRIBUTE_UNUSED_; + + // An enumeration of possible roles that may be taken when a death + // test is encountered. EXECUTE means that the death test logic should + // be executed immediately. OVERSEE means that the program should prepare + // the appropriate environment for a child process to execute the death + // test, then wait for it to complete. + enum TestRole { OVERSEE_TEST, EXECUTE_TEST }; + + // An enumeration of the three reasons that a test might be aborted. + enum AbortReason { + TEST_ENCOUNTERED_RETURN_STATEMENT, + TEST_THREW_EXCEPTION, + TEST_DID_NOT_DIE + }; + + // Assumes one of the above roles. 
+ virtual TestRole AssumeRole() = 0; + + // Waits for the death test to finish and returns its status. + virtual int Wait() = 0; + + // Returns true if the death test passed; that is, the test process + // exited during the test, its exit status matches a user-supplied + // predicate, and its stderr output matches a user-supplied regular + // expression. + // The user-supplied predicate may be a macro expression rather + // than a function pointer or functor, or else Wait and Passed could + // be combined. + virtual bool Passed(bool exit_status_ok) = 0; + + // Signals that the death test did not die as expected. + virtual void Abort(AbortReason reason) = 0; + + // Returns a human-readable outcome message regarding the outcome of + // the last death test. + static const char* LastMessage(); + + static void set_last_death_test_message(const String& message); + + private: + // A string containing a description of the outcome of the last death test. + static String last_death_test_message_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(DeathTest); +}; + +// Factory interface for death tests. May be mocked out for testing. +class DeathTestFactory { + public: + virtual ~DeathTestFactory() { } + virtual bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test) = 0; +}; + +// A concrete DeathTestFactory implementation for normal use. +class DefaultDeathTestFactory : public DeathTestFactory { + public: + virtual bool Create(const char* statement, const RE* regex, + const char* file, int line, DeathTest** test); +}; + +// Returns true if exit_status describes a process that was terminated +// by a signal, or exited normally with a nonzero exit code. +GTEST_API_ bool ExitedUnsuccessfully(int exit_status); + +// Traps C++ exceptions escaping statement and reports them as test +// failures. Note that trapping SEH exceptions is not implemented here. 
+# if GTEST_HAS_EXCEPTIONS +# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + try { \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + } catch (const ::std::exception& gtest_exception) { \ + fprintf(\ + stderr, \ + "\n%s: Caught std::exception-derived exception escaping the " \ + "death test statement. Exception message: %s\n", \ + ::testing::internal::FormatFileLocation(__FILE__, __LINE__).c_str(), \ + gtest_exception.what()); \ + fflush(stderr); \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } catch (...) { \ + death_test->Abort(::testing::internal::DeathTest::TEST_THREW_EXCEPTION); \ + } + +# else +# define GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, death_test) \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement) + +# endif + +// This macro is for implementing ASSERT_DEATH*, EXPECT_DEATH*, +// ASSERT_EXIT*, and EXPECT_EXIT*. +# define GTEST_DEATH_TEST_(statement, predicate, regex, fail) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + const ::testing::internal::RE& gtest_regex = (regex); \ + ::testing::internal::DeathTest* gtest_dt; \ + if (!::testing::internal::DeathTest::Create(#statement, >est_regex, \ + __FILE__, __LINE__, >est_dt)) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + if (gtest_dt != NULL) { \ + ::testing::internal::scoped_ptr< ::testing::internal::DeathTest> \ + gtest_dt_ptr(gtest_dt); \ + switch (gtest_dt->AssumeRole()) { \ + case ::testing::internal::DeathTest::OVERSEE_TEST: \ + if (!gtest_dt->Passed(predicate(gtest_dt->Wait()))) { \ + goto GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__); \ + } \ + break; \ + case ::testing::internal::DeathTest::EXECUTE_TEST: { \ + ::testing::internal::DeathTest::ReturnSentinel \ + gtest_sentinel(gtest_dt); \ + GTEST_EXECUTE_DEATH_TEST_STATEMENT_(statement, gtest_dt); \ + gtest_dt->Abort(::testing::internal::DeathTest::TEST_DID_NOT_DIE); \ + break; \ + } \ + default: \ + break; \ 
+ } \ + } \ + } else \ + GTEST_CONCAT_TOKEN_(gtest_label_, __LINE__): \ + fail(::testing::internal::DeathTest::LastMessage()) +// The symbol "fail" here expands to something into which a message +// can be streamed. + +// A class representing the parsed contents of the +// --gtest_internal_run_death_test flag, as it existed when +// RUN_ALL_TESTS was called. +class InternalRunDeathTestFlag { + public: + InternalRunDeathTestFlag(const String& a_file, + int a_line, + int an_index, + int a_write_fd) + : file_(a_file), line_(a_line), index_(an_index), + write_fd_(a_write_fd) {} + + ~InternalRunDeathTestFlag() { + if (write_fd_ >= 0) + posix::Close(write_fd_); + } + + String file() const { return file_; } + int line() const { return line_; } + int index() const { return index_; } + int write_fd() const { return write_fd_; } + + private: + String file_; + int line_; + int index_; + int write_fd_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(InternalRunDeathTestFlag); +}; + +// Returns a newly created InternalRunDeathTestFlag object with fields +// initialized from the GTEST_FLAG(internal_run_death_test) flag if +// the flag is specified; otherwise returns NULL. +InternalRunDeathTestFlag* ParseInternalRunDeathTestFlag(); + +#else // GTEST_HAS_DEATH_TEST + +// This macro is used for implementing macros such as +// EXPECT_DEATH_IF_SUPPORTED and ASSERT_DEATH_IF_SUPPORTED on systems where +// death tests are not supported. Those macros must compile on such systems +// iff EXPECT_DEATH and ASSERT_DEATH compile with the same parameters on +// systems that support death tests. This allows one to write such a macro +// on a system that does not support death tests and be sure that it will +// compile on a death-test supporting system. +// +// Parameters: +// statement - A statement that a macro such as EXPECT_DEATH would test +// for program termination. 
This macro has to make sure this +// statement is compiled but not executed, to ensure that +// EXPECT_DEATH_IF_SUPPORTED compiles with a certain +// parameter iff EXPECT_DEATH compiles with it. +// regex - A regex that a macro such as EXPECT_DEATH would use to test +// the output of statement. This parameter has to be +// compiled but not evaluated by this macro, to ensure that +// this macro only accepts expressions that a macro such as +// EXPECT_DEATH would accept. +// terminator - Must be an empty statement for EXPECT_DEATH_IF_SUPPORTED +// and a return statement for ASSERT_DEATH_IF_SUPPORTED. +// This ensures that ASSERT_DEATH_IF_SUPPORTED will not +// compile inside functions where ASSERT_DEATH doesn't +// compile. +// +// The branch that has an always false condition is used to ensure that +// statement and regex are compiled (and thus syntactically correct) but +// never executed. The unreachable code macro protects the terminator +// statement from generating an 'unreachable code' warning in case +// statement unconditionally returns or throws. The Message constructor at +// the end allows the syntax of streaming additional messages into the +// macro, for compilational compatibility with EXPECT_DEATH/ASSERT_DEATH. +# define GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, terminator) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (::testing::internal::AlwaysTrue()) { \ + GTEST_LOG_(WARNING) \ + << "Death tests are not supported on this platform.\n" \ + << "Statement '" #statement "' cannot be verified."; \ + } else if (::testing::internal::AlwaysFalse()) { \ + ::testing::internal::RE::PartialMatch(".*", (regex)); \ + GTEST_SUPPRESS_UNREACHABLE_CODE_WARNING_BELOW_(statement); \ + terminator; \ + } else \ + ::testing::Message() + +#endif // GTEST_HAS_DEATH_TEST + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_DEATH_TEST_INTERNAL_H_ + +namespace testing { + +// This flag controls the style of death tests. 
Valid values are "threadsafe",
+// meaning that the death test child process will re-execute the test binary
+// from the start, running only a single death test, or "fast",
+// meaning that the child process will execute the test logic immediately
+// after forking.
+GTEST_DECLARE_string_(death_test_style);
+
+#if GTEST_HAS_DEATH_TEST
+
+// The following macros are useful for writing death tests.
+
+// Here's what happens when an ASSERT_DEATH* or EXPECT_DEATH* is
+// executed:
+//
+// 1. It generates a warning if there is more than one active
+// thread. This is because it's safe to fork() or clone() only
+// when there is a single thread.
+//
+// 2. The parent process clone()s a sub-process and runs the death
+// test in it; the sub-process exits with code 0 at the end of the
+// death test, if it hasn't exited already.
+//
+// 3. The parent process waits for the sub-process to terminate.
+//
+// 4. The parent process checks the exit code and error message of
+// the sub-process.
+//
+// Examples:
+//
+// ASSERT_DEATH(server.SendMessage(56, "Hello"), "Invalid port number");
+// for (int i = 0; i < 5; i++) {
+// EXPECT_DEATH(server.ProcessRequest(i),
+// "Invalid request .* in ProcessRequest()")
+// << "Failed to die on request " << i);
+// }
+//
+// ASSERT_EXIT(server.ExitNow(), ::testing::ExitedWithCode(0), "Exiting");
+//
+// bool KilledBySIGHUP(int exit_code) {
+// return WIFSIGNALED(exit_code) && WTERMSIG(exit_code) == SIGHUP;
+// }
+//
+// ASSERT_EXIT(client.HangUpServer(), KilledBySIGHUP, "Hanging up!");
+//
+// On the regular expressions used in death tests:
+//
+// On POSIX-compliant systems (*nix), we use the <regex.h> library,
+// which uses the POSIX extended regex syntax.
+//
+// On other platforms (e.g. Windows), we only support a simple regex
+// syntax implemented as part of Google Test. 
This limited +// implementation should be enough most of the time when writing +// death tests; though it lacks many features you can find in PCRE +// or POSIX extended regex syntax. For example, we don't support +// union ("x|y"), grouping ("(xy)"), brackets ("[xy]"), and +// repetition count ("x{5,7}"), among others. +// +// Below is the syntax that we do support. We chose it to be a +// subset of both PCRE and POSIX extended regex, so it's easy to +// learn wherever you come from. In the following: 'A' denotes a +// literal character, period (.), or a single \\ escape sequence; +// 'x' and 'y' denote regular expressions; 'm' and 'n' are for +// natural numbers. +// +// c matches any literal character c +// \\d matches any decimal digit +// \\D matches any character that's not a decimal digit +// \\f matches \f +// \\n matches \n +// \\r matches \r +// \\s matches any ASCII whitespace, including \n +// \\S matches any character that's not a whitespace +// \\t matches \t +// \\v matches \v +// \\w matches any letter, _, or decimal digit +// \\W matches any character that \\w doesn't match +// \\c matches any literal character c, which must be a punctuation +// . matches any single character except \n +// A? matches 0 or 1 occurrences of A +// A* matches 0 or many occurrences of A +// A+ matches 1 or many occurrences of A +// ^ matches the beginning of a string (not that of each line) +// $ matches the end of a string (not that of each line) +// xy matches x followed by y +// +// If you accidentally use PCRE or POSIX extended regex features +// not implemented by us, you will get a run-time failure. In that +// case, please try to rewrite your regular expression within the +// above syntax. +// +// This implementation is *not* meant to be as highly tuned or robust +// as a compiled regex library, but should perform well enough for a +// death test, which already incurs significant overhead by launching +// a child process. 
+// +// Known caveats: +// +// A "threadsafe" style death test obtains the path to the test +// program from argv[0] and re-executes it in the sub-process. For +// simplicity, the current implementation doesn't search the PATH +// when launching the sub-process. This means that the user must +// invoke the test program via a path that contains at least one +// path separator (e.g. path/to/foo_test and +// /absolute/path/to/bar_test are fine, but foo_test is not). This +// is rarely a problem as people usually don't put the test binary +// directory in PATH. +// +// TODO(wan@google.com): make thread-safe death tests search the PATH. + +// Asserts that a given statement causes the program to exit, with an +// integer exit status that satisfies predicate, and emitting error output +// that matches regex. +# define ASSERT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_FATAL_FAILURE_) + +// Like ASSERT_EXIT, but continues on to successive tests in the +// test case, if any: +# define EXPECT_EXIT(statement, predicate, regex) \ + GTEST_DEATH_TEST_(statement, predicate, regex, GTEST_NONFATAL_FAILURE_) + +// Asserts that a given statement causes the program to exit, either by +// explicitly exiting with a nonzero exit code or being killed by a +// signal, and emitting error output that matches regex. +# define ASSERT_DEATH(statement, regex) \ + ASSERT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Like ASSERT_DEATH, but continues on to successive tests in the +// test case, if any: +# define EXPECT_DEATH(statement, regex) \ + EXPECT_EXIT(statement, ::testing::internal::ExitedUnsuccessfully, regex) + +// Two predicate classes that can be used in {ASSERT,EXPECT}_EXIT*: + +// Tests that an exit code describes a normal exit with a given exit code. 
+class GTEST_API_ ExitedWithCode { + public: + explicit ExitedWithCode(int exit_code); + bool operator()(int exit_status) const; + private: + // No implementation - assignment is unsupported. + void operator=(const ExitedWithCode& other); + + const int exit_code_; +}; + +# if !GTEST_OS_WINDOWS +// Tests that an exit code describes an exit due to termination by a +// given signal. +class GTEST_API_ KilledBySignal { + public: + explicit KilledBySignal(int signum); + bool operator()(int exit_status) const; + private: + const int signum_; +}; +# endif // !GTEST_OS_WINDOWS + +// EXPECT_DEBUG_DEATH asserts that the given statements die in debug mode. +// The death testing framework causes this to have interesting semantics, +// since the sideeffects of the call are only visible in opt mode, and not +// in debug mode. +// +// In practice, this can be used to test functions that utilize the +// LOG(DFATAL) macro using the following style: +// +// int DieInDebugOr12(int* sideeffect) { +// if (sideeffect) { +// *sideeffect = 12; +// } +// LOG(DFATAL) << "death"; +// return 12; +// } +// +// TEST(TestCase, TestDieOr12WorksInDgbAndOpt) { +// int sideeffect = 0; +// // Only asserts in dbg. +// EXPECT_DEBUG_DEATH(DieInDebugOr12(&sideeffect), "death"); +// +// #ifdef NDEBUG +// // opt-mode has sideeffect visible. +// EXPECT_EQ(12, sideeffect); +// #else +// // dbg-mode no visible sideeffect. +// EXPECT_EQ(0, sideeffect); +// #endif +// } +// +// This will assert that DieInDebugReturn12InOpt() crashes in debug +// mode, usually due to a DCHECK or LOG(DFATAL), but returns the +// appropriate fallback value (12 in this case) in opt mode. If you +// need to test that a function has appropriate side-effects in opt +// mode, include assertions against the side-effects. A general +// pattern for this is: +// +// EXPECT_DEBUG_DEATH({ +// // Side-effects here will have an effect after this statement in +// // opt mode, but none in debug mode. 
+// EXPECT_EQ(12, DieInDebugOr12(&sideeffect)); +// }, "death"); +// +# ifdef NDEBUG + +# define EXPECT_DEBUG_DEATH(statement, regex) \ + do { statement; } while (::testing::internal::AlwaysFalse()) + +# define ASSERT_DEBUG_DEATH(statement, regex) \ + do { statement; } while (::testing::internal::AlwaysFalse()) + +# else + +# define EXPECT_DEBUG_DEATH(statement, regex) \ + EXPECT_DEATH(statement, regex) + +# define ASSERT_DEBUG_DEATH(statement, regex) \ + ASSERT_DEATH(statement, regex) + +# endif // NDEBUG for EXPECT_DEBUG_DEATH +#endif // GTEST_HAS_DEATH_TEST + +// EXPECT_DEATH_IF_SUPPORTED(statement, regex) and +// ASSERT_DEATH_IF_SUPPORTED(statement, regex) expand to real death tests if +// death tests are supported; otherwise they just issue a warning. This is +// useful when you are combining death test assertions with normal test +// assertions in one test. +#if GTEST_HAS_DEATH_TEST +# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + EXPECT_DEATH(statement, regex) +# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + ASSERT_DEATH(statement, regex) +#else +# define EXPECT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, ) +# define ASSERT_DEATH_IF_SUPPORTED(statement, regex) \ + GTEST_UNSUPPORTED_DEATH_TEST_(statement, regex, return) +#endif + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_DEATH_TEST_H_ +// Copyright 2005, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// The Google C++ Testing Framework (Google Test) +// +// This header file defines the Message class. +// +// IMPORTANT NOTE: Due to limitation of the C++ language, we have to +// leave some internal implementation details in this header file. +// They are clearly marked by comments like this: +// +// // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +// +// Such code is NOT meant to be used by a user directly, and is subject +// to CHANGE WITHOUT NOTICE. Therefore DO NOT DEPEND ON IT in a user +// program! + +#ifndef GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +#define GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ + +#include + + +namespace testing { + +// The Message class works like an ostream repeater. +// +// Typical usage: +// +// 1. You stream a bunch of values to a Message object. +// It will remember the text in a stringstream. +// 2. Then you stream the Message object to an ostream. 
+// This causes the text in the Message to be streamed +// to the ostream. +// +// For example; +// +// testing::Message foo; +// foo << 1 << " != " << 2; +// std::cout << foo; +// +// will print "1 != 2". +// +// Message is not intended to be inherited from. In particular, its +// destructor is not virtual. +// +// Note that stringstream behaves differently in gcc and in MSVC. You +// can stream a NULL char pointer to it in the former, but not in the +// latter (it causes an access violation if you do). The Message +// class hides this difference by treating a NULL char pointer as +// "(null)". +class GTEST_API_ Message { + private: + // The type of basic IO manipulators (endl, ends, and flush) for + // narrow streams. + typedef std::ostream& (*BasicNarrowIoManip)(std::ostream&); + + public: + // Constructs an empty Message. + // We allocate the stringstream separately because otherwise each use of + // ASSERT/EXPECT in a procedure adds over 200 bytes to the procedure's + // stack frame leading to huge stack frames in some cases; gcc does not reuse + // the stack space. + Message() : ss_(new ::std::stringstream) { + // By default, we want there to be enough precision when printing + // a double to a Message. + *ss_ << std::setprecision(std::numeric_limits::digits10 + 2); + } + + // Copy constructor. + Message(const Message& msg) : ss_(new ::std::stringstream) { // NOLINT + *ss_ << msg.GetString(); + } + + // Constructs a Message from a C-string. + explicit Message(const char* str) : ss_(new ::std::stringstream) { + *ss_ << str; + } + +#if GTEST_OS_SYMBIAN + // Streams a value (either a pointer or not) to this object. + template + inline Message& operator <<(const T& value) { + StreamHelper(typename internal::is_pointer::type(), value); + return *this; + } +#else + // Streams a non-pointer value to this object. 
+ template + inline Message& operator <<(const T& val) { + ::GTestStreamToHelper(ss_.get(), val); + return *this; + } + + // Streams a pointer value to this object. + // + // This function is an overload of the previous one. When you + // stream a pointer to a Message, this definition will be used as it + // is more specialized. (The C++ Standard, section + // [temp.func.order].) If you stream a non-pointer, then the + // previous definition will be used. + // + // The reason for this overload is that streaming a NULL pointer to + // ostream is undefined behavior. Depending on the compiler, you + // may get "0", "(nil)", "(null)", or an access violation. To + // ensure consistent result across compilers, we always treat NULL + // as "(null)". + template + inline Message& operator <<(T* const& pointer) { // NOLINT + if (pointer == NULL) { + *ss_ << "(null)"; + } else { + ::GTestStreamToHelper(ss_.get(), pointer); + } + return *this; + } +#endif // GTEST_OS_SYMBIAN + + // Since the basic IO manipulators are overloaded for both narrow + // and wide streams, we have to provide this specialized definition + // of operator <<, even though its body is the same as the + // templatized version above. Without this definition, streaming + // endl or other basic IO manipulators to Message will confuse the + // compiler. + Message& operator <<(BasicNarrowIoManip val) { + *ss_ << val; + return *this; + } + + // Instead of 1/0, we want to see true/false for bool values. + Message& operator <<(bool b) { + return *this << (b ? "true" : "false"); + } + + // These two overloads allow streaming a wide C string to a Message + // using the UTF-8 encoding. 
+ Message& operator <<(const wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); + } + Message& operator <<(wchar_t* wide_c_str) { + return *this << internal::String::ShowWideCString(wide_c_str); + } + +#if GTEST_HAS_STD_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator <<(const ::std::wstring& wstr); +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_GLOBAL_WSTRING + // Converts the given wide string to a narrow string using the UTF-8 + // encoding, and streams the result to this Message object. + Message& operator <<(const ::wstring& wstr); +#endif // GTEST_HAS_GLOBAL_WSTRING + + // Gets the text streamed to this object so far as a String. + // Each '\0' character in the buffer is replaced with "\\0". + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + internal::String GetString() const { + return internal::StringStreamToString(ss_.get()); + } + + private: + +#if GTEST_OS_SYMBIAN + // These are needed as the Nokia Symbian Compiler cannot decide between + // const T& and const T* in a function template. The Nokia compiler _can_ + // decide between class template specializations for T and T*, so a + // tr1::type_traits-like is_pointer works, and we can overload on that. + template + inline void StreamHelper(internal::true_type /*dummy*/, T* pointer) { + if (pointer == NULL) { + *ss_ << "(null)"; + } else { + ::GTestStreamToHelper(ss_.get(), pointer); + } + } + template + inline void StreamHelper(internal::false_type /*dummy*/, const T& value) { + ::GTestStreamToHelper(ss_.get(), value); + } +#endif // GTEST_OS_SYMBIAN + + // We'll hold the text streamed to this object here. + const internal::scoped_ptr< ::std::stringstream> ss_; + + // We declare (but don't implement) this to prevent the compiler + // from implementing the assignment operator. 
+ void operator=(const Message&); +}; + +// Streams a Message to an ostream. +inline std::ostream& operator <<(std::ostream& os, const Message& sb) { + return os << sb.GetString(); +} + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_MESSAGE_H_ +// This file was GENERATED by command: +// pump.py gtest-param-test.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. 
+//
+// Authors: vladl@google.com (Vlad Losev)
+//
+// Macros and functions for implementing parameterized tests
+// in Google C++ Testing Framework (Google Test)
+//
+// This file is generated by a SCRIPT. DO NOT EDIT BY HAND!
+//
+#ifndef GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+#define GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_
+
+
+// Value-parameterized tests allow you to test your code with different
+// parameters without writing multiple copies of the same test.
+//
+// Here is how you use value-parameterized tests:
+
+#if 0
+
+// To write value-parameterized tests, first you should define a fixture
+// class. It is usually derived from testing::TestWithParam<T> (see below for
+// another inheritance scheme that's sometimes useful in more complicated
+// class hierarchies), where the type of your parameter values.
+// TestWithParam<T> is itself derived from testing::Test. T can be any
+// copyable type. If it's a raw pointer, you are responsible for managing the
+// lifespan of the pointed values.
+
+class FooTest : public ::testing::TestWithParam<const char*> {
+ // You can implement all the usual class fixture members here.
+};
+
+// Then, use the TEST_P macro to define as many parameterized tests
+// for this fixture as you want. The _P suffix is for "parameterized"
+// or "pattern", whichever you prefer to think.
+
+TEST_P(FooTest, DoesBlah) {
+ // Inside a test, access the test parameter with the GetParam() method
+ // of the TestWithParam<T> class:
+ EXPECT_TRUE(foo.Blah(GetParam()));
+ ...
+}
+
+TEST_P(FooTest, HasBlahBlah) {
+ ...
+}
+
+// Finally, you can use INSTANTIATE_TEST_CASE_P to instantiate the test
+// case with any set of parameters you want. Google Test defines a number
+// of functions for generating test parameters. They return what we call
+// (surprise!) parameter generators. Here is a summary of them, which
+// are all in the testing namespace:
+//
+//
+// Range(begin, end [, step]) - Yields values {begin, begin+step,
+// begin+step+step, ...}. 
The values do not +// include end. step defaults to 1. +// Values(v1, v2, ..., vN) - Yields values {v1, v2, ..., vN}. +// ValuesIn(container) - Yields values from a C-style array, an STL +// ValuesIn(begin,end) container, or an iterator range [begin, end). +// Bool() - Yields sequence {false, true}. +// Combine(g1, g2, ..., gN) - Yields all combinations (the Cartesian product +// for the math savvy) of the values generated +// by the N generators. +// +// For more details, see comments at the definitions of these functions below +// in this file. +// +// The following statement will instantiate tests from the FooTest test case +// each with parameter values "meeny", "miny", and "moe". + +INSTANTIATE_TEST_CASE_P(InstantiationName, + FooTest, + Values("meeny", "miny", "moe")); + +// To distinguish different instances of the pattern, (yes, you +// can instantiate it more then once) the first argument to the +// INSTANTIATE_TEST_CASE_P macro is a prefix that will be added to the +// actual test case name. Remember to pick unique prefixes for different +// instantiations. The tests from the instantiation above will have +// these names: +// +// * InstantiationName/FooTest.DoesBlah/0 for "meeny" +// * InstantiationName/FooTest.DoesBlah/1 for "miny" +// * InstantiationName/FooTest.DoesBlah/2 for "moe" +// * InstantiationName/FooTest.HasBlahBlah/0 for "meeny" +// * InstantiationName/FooTest.HasBlahBlah/1 for "miny" +// * InstantiationName/FooTest.HasBlahBlah/2 for "moe" +// +// You can use these names in --gtest_filter. 
+// +// This statement will instantiate all tests from FooTest again, each +// with parameter values "cat" and "dog": + +const char* pets[] = {"cat", "dog"}; +INSTANTIATE_TEST_CASE_P(AnotherInstantiationName, FooTest, ValuesIn(pets)); + +// The tests from the instantiation above will have these names: +// +// * AnotherInstantiationName/FooTest.DoesBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.DoesBlah/1 for "dog" +// * AnotherInstantiationName/FooTest.HasBlahBlah/0 for "cat" +// * AnotherInstantiationName/FooTest.HasBlahBlah/1 for "dog" +// +// Please note that INSTANTIATE_TEST_CASE_P will instantiate all tests +// in the given test case, whether their definitions come before or +// AFTER the INSTANTIATE_TEST_CASE_P statement. +// +// Please also note that generator expressions (including parameters to the +// generators) are evaluated in InitGoogleTest(), after main() has started. +// This allows the user on one hand, to adjust generator parameters in order +// to dynamically determine a set of tests to run and on the other hand, +// give the user a chance to inspect the generated tests with Google Test +// reflection API before RUN_ALL_TESTS() is executed. +// +// You can see samples/sample7_unittest.cc and samples/sample8_unittest.cc +// for more examples. +// +// In the future, we plan to publish the API for defining new parameter +// generators. But for now this interface remains part of the internal +// implementation and is subject to change. +// +// +// A parameterized test fixture must be derived from testing::Test and from +// testing::WithParamInterface, where T is the type of the parameter +// values. Inheriting from TestWithParam satisfies that requirement because +// TestWithParam inherits from both Test and WithParamInterface. In more +// complicated hierarchies, however, it is occasionally useful to inherit +// separately from Test and WithParamInterface. 
For example: + +class BaseTest : public ::testing::Test { + // You can inherit all the usual members for a non-parameterized test + // fixture here. +}; + +class DerivedTest : public BaseTest, public ::testing::WithParamInterface { + // The usual test fixture members go here too. +}; + +TEST_F(BaseTest, HasFoo) { + // This is an ordinary non-parameterized test. +} + +TEST_P(DerivedTest, DoesBlah) { + // GetParam works just the same here as if you inherit from TestWithParam. + EXPECT_TRUE(foo.Blah(GetParam())); +} + +#endif // 0 + + +#if !GTEST_OS_SYMBIAN +# include +#endif + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vladl@google.com (Vlad Losev) + +// Type and function utilities for implementing parameterized tests. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ + +#include +#include +#include + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. +// Copyright 2003 Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Authors: Dan Egnor (egnor@google.com) +// +// A "smart" pointer type with reference tracking. Every pointer to a +// particular object is kept on a circular linked list. When the last pointer +// to an object is destroyed or reassigned, the object is deleted. +// +// Used properly, this deletes the object when the last reference goes away. +// There are several caveats: +// - Like all reference counting schemes, cycles lead to leaks. +// - Each smart pointer is actually two pointers (8 bytes instead of 4). +// - Every time a pointer is assigned, the entire list of pointers to that +// object is traversed. This class is therefore NOT SUITABLE when there +// will often be more than two or three pointers to a particular object. +// - References are only tracked as long as linked_ptr<> objects are copied. +// If a linked_ptr<> is converted to a raw pointer and back, BAD THINGS +// will happen (double deletion). +// +// A good use of this class is storing object references in STL containers. +// You can safely put linked_ptr<> in a vector<>. +// Other uses may not be as good. 
+// +// Note: If you use an incomplete type with linked_ptr<>, the class +// *containing* linked_ptr<> must have a constructor and destructor (even +// if they do nothing!). +// +// Bill Gibbons suggested we use something like this. +// +// Thread Safety: +// Unlike other linked_ptr implementations, in this implementation +// a linked_ptr object is thread-safe in the sense that: +// - it's safe to copy linked_ptr objects concurrently, +// - it's safe to copy *from* a linked_ptr and read its underlying +// raw pointer (e.g. via get()) concurrently, and +// - it's safe to write to two linked_ptrs that point to the same +// shared object concurrently. +// TODO(wan@google.com): rename this to safe_linked_ptr to avoid +// confusion with normal linked_ptr. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ + +#include +#include + + +namespace testing { +namespace internal { + +// Protects copying of all linked_ptr objects. +GTEST_API_ GTEST_DECLARE_STATIC_MUTEX_(g_linked_ptr_mutex); + +// This is used internally by all instances of linked_ptr<>. It needs to be +// a non-template class because different types of linked_ptr<> can refer to +// the same object (linked_ptr(obj) vs linked_ptr(obj)). +// So, it needs to be possible for different types of linked_ptr to participate +// in the same circular linked list, so we need a single class type here. +// +// DO NOT USE THIS CLASS DIRECTLY YOURSELF. Use linked_ptr. +class linked_ptr_internal { + public: + // Create a new circle that includes only this instance. + void join_new() { + next_ = this; + } + + // Many linked_ptr operations may change p.link_ for some linked_ptr + // variable p in the same circle as this object. Therefore we need + // to prevent two such operations from occurring concurrently. + // + // Note that different types of linked_ptr objects can coexist in a + // circle (e.g. linked_ptr, linked_ptr, and + // linked_ptr). 
Therefore we must use a single mutex to + // protect all linked_ptr objects. This can create serious + // contention in production code, but is acceptable in a testing + // framework. + + // Join an existing circle. + // L < g_linked_ptr_mutex + void join(linked_ptr_internal const* ptr) { + MutexLock lock(&g_linked_ptr_mutex); + + linked_ptr_internal const* p = ptr; + while (p->next_ != ptr) p = p->next_; + p->next_ = this; + next_ = ptr; + } + + // Leave whatever circle we're part of. Returns true if we were the + // last member of the circle. Once this is done, you can join() another. + // L < g_linked_ptr_mutex + bool depart() { + MutexLock lock(&g_linked_ptr_mutex); + + if (next_ == this) return true; + linked_ptr_internal const* p = next_; + while (p->next_ != this) p = p->next_; + p->next_ = next_; + return false; + } + + private: + mutable linked_ptr_internal const* next_; +}; + +template +class linked_ptr { + public: + typedef T element_type; + + // Take over ownership of a raw pointer. This should happen as soon as + // possible after the object is created. + explicit linked_ptr(T* ptr = NULL) { capture(ptr); } + ~linked_ptr() { depart(); } + + // Copy an existing linked_ptr<>, adding ourselves to the list of references. + template linked_ptr(linked_ptr const& ptr) { copy(&ptr); } + linked_ptr(linked_ptr const& ptr) { // NOLINT + assert(&ptr != this); + copy(&ptr); + } + + // Assignment releases the old value and acquires the new. + template linked_ptr& operator=(linked_ptr const& ptr) { + depart(); + copy(&ptr); + return *this; + } + + linked_ptr& operator=(linked_ptr const& ptr) { + if (&ptr != this) { + depart(); + copy(&ptr); + } + return *this; + } + + // Smart pointer members. 
+ void reset(T* ptr = NULL) { + depart(); + capture(ptr); + } + T* get() const { return value_; } + T* operator->() const { return value_; } + T& operator*() const { return *value_; } + + bool operator==(T* p) const { return value_ == p; } + bool operator!=(T* p) const { return value_ != p; } + template + bool operator==(linked_ptr const& ptr) const { + return value_ == ptr.get(); + } + template + bool operator!=(linked_ptr const& ptr) const { + return value_ != ptr.get(); + } + + private: + template + friend class linked_ptr; + + T* value_; + linked_ptr_internal link_; + + void depart() { + if (link_.depart()) delete value_; + } + + void capture(T* ptr) { + value_ = ptr; + link_.join_new(); + } + + template void copy(linked_ptr const* ptr) { + value_ = ptr->get(); + if (value_) + link_.join(&ptr->link_); + else + link_.join_new(); + } +}; + +template inline +bool operator==(T* ptr, const linked_ptr& x) { + return ptr == x.get(); +} + +template inline +bool operator!=(T* ptr, const linked_ptr& x) { + return ptr != x.get(); +} + +// A function to convert T* into linked_ptr +// Doing e.g. make_linked_ptr(new FooBarBaz(arg)) is a shorter notation +// for linked_ptr >(new FooBarBaz(arg)) +template +linked_ptr make_linked_ptr(T* ptr) { + return linked_ptr(ptr); +} + +} // namespace internal +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_LINKED_PTR_H_ +// Copyright 2007, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. 
+// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +// Google Test - The Google C++ Testing Framework +// +// This file implements a universal value printer that can print a +// value of any type T: +// +// void ::testing::internal::UniversalPrinter::Print(value, ostream_ptr); +// +// A user can teach this function how to print a class type T by +// defining either operator<<() or PrintTo() in the namespace that +// defines T. More specifically, the FIRST defined function in the +// following list will be used (assuming T is defined in namespace +// foo): +// +// 1. foo::PrintTo(const T&, ostream*) +// 2. operator<<(ostream&, const T&) defined in either foo or the +// global namespace. +// +// If none of the above is defined, it will print the debug string of +// the value if it is a protocol buffer, or print the raw bytes in the +// value otherwise. 
+// +// To aid debugging: when T is a reference type, the address of the +// value is also printed; when T is a (const) char pointer, both the +// pointer value and the NUL-terminated string it points to are +// printed. +// +// We also provide some convenient wrappers: +// +// // Prints a value to a string. For a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// std::string ::testing::PrintToString(const T& value); +// +// // Prints a value tersely: for a reference type, the referenced +// // value (but not the address) is printed; for a (const or not) char +// // pointer, the NUL-terminated string (but not the pointer) is +// // printed. +// void ::testing::internal::UniversalTersePrint(const T& value, ostream*); +// +// // Prints value using the type inferred by the compiler. The difference +// // from UniversalTersePrint() is that this function prints both the +// // pointer and the NUL-terminated string for a (const or not) char pointer. +// void ::testing::internal::UniversalPrint(const T& value, ostream*); +// +// // Prints the fields of a tuple tersely to a string vector, one +// // element for each field. Tuple support must be enabled in +// // gtest-port.h. +// std::vector UniversalTersePrintTupleFieldsToStrings( +// const Tuple& value); +// +// Known limitation: +// +// The print primitives print the elements of an STL-style container +// using the compiler-inferred type of *iter where iter is a +// const_iterator of the container. When const_iterator is an input +// iterator but not a forward iterator, this inferred type may not +// match value_type, and the print output may be incorrect. In +// practice, this is rarely a problem as for most containers +// const_iterator is a forward iterator. We'll fix this if there's an +// actual need for it. Note that this fix cannot rely on value_type +// being defined as many user-defined container types don't have +// value_type. 
+ +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ + +#include // NOLINT +#include +#include +#include +#include + +namespace testing { + +// Definitions in the 'internal' and 'internal2' name spaces are +// subject to change without notice. DO NOT USE THEM IN USER CODE! +namespace internal2 { + +// Prints the given number of bytes in the given object to the given +// ostream. +GTEST_API_ void PrintBytesInObjectTo(const unsigned char* obj_bytes, + size_t count, + ::std::ostream* os); + +// For selecting which printer to use when a given type has neither << +// nor PrintTo(). +enum TypeKind { + kProtobuf, // a protobuf type + kConvertibleToInteger, // a type implicitly convertible to BiggestInt + // (e.g. a named or unnamed enum type) + kOtherType // anything else +}; + +// TypeWithoutFormatter::PrintValue(value, os) is called +// by the universal printer to print a value of type T when neither +// operator<< nor PrintTo() is defined for T, where kTypeKind is the +// "kind" of T as defined by enum TypeKind. +template +class TypeWithoutFormatter { + public: + // This default version is called when kTypeKind is kOtherType. + static void PrintValue(const T& value, ::std::ostream* os) { + PrintBytesInObjectTo(reinterpret_cast(&value), + sizeof(value), os); + } +}; + +// We print a protobuf using its ShortDebugString() when the string +// doesn't exceed this many characters; otherwise we print it using +// DebugString() for better readability. +const size_t kProtobufOneLinerMaxLength = 50; + +template +class TypeWithoutFormatter { + public: + static void PrintValue(const T& value, ::std::ostream* os) { + const ::testing::internal::string short_str = value.ShortDebugString(); + const ::testing::internal::string pretty_str = + short_str.length() <= kProtobufOneLinerMaxLength ? 
+ short_str : ("\n" + value.DebugString()); + *os << ("<" + pretty_str + ">"); + } +}; + +template +class TypeWithoutFormatter { + public: + // Since T has no << operator or PrintTo() but can be implicitly + // converted to BiggestInt, we print it as a BiggestInt. + // + // Most likely T is an enum type (either named or unnamed), in which + // case printing it as an integer is the desired behavior. In case + // T is not an enum, printing it as an integer is the best we can do + // given that it has no user-defined printer. + static void PrintValue(const T& value, ::std::ostream* os) { + const internal::BiggestInt kBigInt = value; + *os << kBigInt; + } +}; + +// Prints the given value to the given ostream. If the value is a +// protocol message, its debug string is printed; if it's an enum or +// of a type implicitly convertible to BiggestInt, it's printed as an +// integer; otherwise the bytes in the value are printed. This is +// what UniversalPrinter::Print() does when it knows nothing about +// type T and T has neither << operator nor PrintTo(). +// +// A user can override this behavior for a class type Foo by defining +// a << operator in the namespace where Foo is defined. +// +// We put this operator in namespace 'internal2' instead of 'internal' +// to simplify the implementation, as much code in 'internal' needs to +// use << in STL, which would conflict with our own << were it defined +// in 'internal'. +// +// Note that this operator<< takes a generic std::basic_ostream type instead of the more restricted std::ostream. If +// we define it to take an std::ostream instead, we'll get an +// "ambiguous overloads" compiler error when trying to print a type +// Foo that supports streaming to std::basic_ostream, as the compiler cannot tell whether +// operator<<(std::ostream&, const T&) or +// operator<<(std::basic_stream, const Foo&) is more +// specific. 
+template +::std::basic_ostream& operator<<( + ::std::basic_ostream& os, const T& x) { + TypeWithoutFormatter::value ? kProtobuf : + internal::ImplicitlyConvertible::value ? + kConvertibleToInteger : kOtherType)>::PrintValue(x, &os); + return os; +} + +} // namespace internal2 +} // namespace testing + +// This namespace MUST NOT BE NESTED IN ::testing, or the name look-up +// magic needed for implementing UniversalPrinter won't work. +namespace testing_internal { + +// Used to print a value that is not an STL-style container when the +// user doesn't define PrintTo() for it. +template +void DefaultPrintNonContainerTo(const T& value, ::std::ostream* os) { + // With the following statement, during unqualified name lookup, + // testing::internal2::operator<< appears as if it was declared in + // the nearest enclosing namespace that contains both + // ::testing_internal and ::testing::internal2, i.e. the global + // namespace. For more details, refer to the C++ Standard section + // 7.3.4-1 [namespace.udir]. This allows us to fall back onto + // testing::internal2::operator<< in case T doesn't come with a << + // operator. + // + // We cannot write 'using ::testing::internal2::operator<<;', which + // gcc 3.3 fails to compile due to a compiler bug. + using namespace ::testing::internal2; // NOLINT + + // Assuming T is defined in namespace foo, in the next statement, + // the compiler will consider all of: + // + // 1. foo::operator<< (thanks to Koenig look-up), + // 2. ::operator<< (as the current namespace is enclosed in ::), + // 3. testing::internal2::operator<< (thanks to the using statement above). + // + // The operator<< whose type matches T best will be picked. + // + // We deliberately allow #2 to be a candidate, as sometimes it's + // impossible to define #1 (e.g. when foo is ::std, defining + // anything in it is undefined behavior unless you are a compiler + // vendor.). 
+ *os << value; +} + +} // namespace testing_internal + +namespace testing { +namespace internal { + +// UniversalPrinter::Print(value, ostream_ptr) prints the given +// value to the given ostream. The caller must ensure that +// 'ostream_ptr' is not NULL, or the behavior is undefined. +// +// We define UniversalPrinter as a class template (as opposed to a +// function template), as we need to partially specialize it for +// reference types, which cannot be done with function templates. +template +class UniversalPrinter; + +template +void UniversalPrint(const T& value, ::std::ostream* os); + +// Used to print an STL-style container when the user doesn't define +// a PrintTo() for it. +template +void DefaultPrintTo(IsContainer /* dummy */, + false_type /* is not a pointer */, + const C& container, ::std::ostream* os) { + const size_t kMaxCount = 32; // The maximum number of elements to print. + *os << '{'; + size_t count = 0; + for (typename C::const_iterator it = container.begin(); + it != container.end(); ++it, ++count) { + if (count > 0) { + *os << ','; + if (count == kMaxCount) { // Enough has been printed. + *os << " ..."; + break; + } + } + *os << ' '; + // We cannot call PrintTo(*it, os) here as PrintTo() doesn't + // handle *it being a native array. + internal::UniversalPrint(*it, os); + } + + if (count > 0) { + *os << ' '; + } + *os << '}'; +} + +// Used to print a pointer that is neither a char pointer nor a member +// pointer, when the user doesn't define PrintTo() for it. (A member +// variable pointer or member function pointer doesn't really point to +// a location in the address space. Their representation is +// implementation-defined. Therefore they will be printed as raw +// bytes.) +template +void DefaultPrintTo(IsNotContainer /* dummy */, + true_type /* is a pointer */, + T* p, ::std::ostream* os) { + if (p == NULL) { + *os << "NULL"; + } else { + // C++ doesn't allow casting from a function pointer to any object + // pointer. 
+ // + // IsTrue() silences warnings: "Condition is always true", + // "unreachable code". + if (IsTrue(ImplicitlyConvertible::value)) { + // T is not a function type. We just call << to print p, + // relying on ADL to pick up user-defined << for their pointer + // types, if any. + *os << p; + } else { + // T is a function type, so '*os << p' doesn't do what we want + // (it just prints p as bool). We want to print p as a const + // void*. However, we cannot cast it to const void* directly, + // even using reinterpret_cast, as earlier versions of gcc + // (e.g. 3.4.5) cannot compile the cast when p is a function + // pointer. Casting to UInt64 first solves the problem. + *os << reinterpret_cast( + reinterpret_cast(p)); + } + } +} + +// Used to print a non-container, non-pointer value when the user +// doesn't define PrintTo() for it. +template +void DefaultPrintTo(IsNotContainer /* dummy */, + false_type /* is not a pointer */, + const T& value, ::std::ostream* os) { + ::testing_internal::DefaultPrintNonContainerTo(value, os); +} + +// Prints the given value using the << operator if it has one; +// otherwise prints the bytes in it. This is what +// UniversalPrinter::Print() does when PrintTo() is not specialized +// or overloaded for type T. +// +// A user can override this behavior for a class type Foo by defining +// an overload of PrintTo() in the namespace where Foo is defined. We +// give the user this option as sometimes defining a << operator for +// Foo is not desirable (e.g. the coding style may prevent doing it, +// or there is already a << operator but it doesn't do what the user +// wants). +template +void PrintTo(const T& value, ::std::ostream* os) { + // DefaultPrintTo() is overloaded. The type of its first two + // arguments determine which version will be picked. If T is an + // STL-style container, the version for container will be called; if + // T is a pointer, the pointer version will be called; otherwise the + // generic version will be called. 
+ // + // Note that we check for container types here, prior to we check + // for protocol message types in our operator<<. The rationale is: + // + // For protocol messages, we want to give people a chance to + // override Google Mock's format by defining a PrintTo() or + // operator<<. For STL containers, other formats can be + // incompatible with Google Mock's format for the container + // elements; therefore we check for container types here to ensure + // that our format is used. + // + // The second argument of DefaultPrintTo() is needed to bypass a bug + // in Symbian's C++ compiler that prevents it from picking the right + // overload between: + // + // PrintTo(const T& x, ...); + // PrintTo(T* x, ...); + DefaultPrintTo(IsContainerTest(0), is_pointer(), value, os); +} + +// The following list of PrintTo() overloads tells +// UniversalPrinter::Print() how to print standard types (built-in +// types, strings, plain arrays, and pointers). + +// Overloads for various char types. +GTEST_API_ void PrintTo(unsigned char c, ::std::ostream* os); +GTEST_API_ void PrintTo(signed char c, ::std::ostream* os); +inline void PrintTo(char c, ::std::ostream* os) { + // When printing a plain char, we always treat it as unsigned. This + // way, the output won't be affected by whether the compiler thinks + // char is signed or not. + PrintTo(static_cast(c), os); +} + +// Overloads for other simple built-in types. +inline void PrintTo(bool x, ::std::ostream* os) { + *os << (x ? "true" : "false"); +} + +// Overload for wchar_t type. +// Prints a wchar_t as a symbol if it is printable or as its internal +// code otherwise and also as its decimal code (except for L'\0'). +// The L'\0' char is printed as "L'\\0'". The decimal code is printed +// as signed integer when wchar_t is implemented by the compiler +// as a signed type and is printed as an unsigned integer when wchar_t +// is implemented as an unsigned type. 
+GTEST_API_ void PrintTo(wchar_t wc, ::std::ostream* os); + +// Overloads for C strings. +GTEST_API_ void PrintTo(const char* s, ::std::ostream* os); +inline void PrintTo(char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} + +// signed/unsigned char is often used for representing binary data, so +// we print pointers to it as void* to be safe. +inline void PrintTo(const signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(signed char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(const unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +inline void PrintTo(unsigned char* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} + +// MSVC can be configured to define wchar_t as a typedef of unsigned +// short. It defines _NATIVE_WCHAR_T_DEFINED when wchar_t is a native +// type. When wchar_t is a typedef, defining an overload for const +// wchar_t* would cause unsigned short* be printed as a wide string, +// possibly causing invalid memory accesses. +#if !defined(_MSC_VER) || defined(_NATIVE_WCHAR_T_DEFINED) +// Overloads for wide C strings +GTEST_API_ void PrintTo(const wchar_t* s, ::std::ostream* os); +inline void PrintTo(wchar_t* s, ::std::ostream* os) { + PrintTo(ImplicitCast_(s), os); +} +#endif + +// Overload for C arrays. Multi-dimensional arrays are printed +// properly. + +// Prints the given number of elements in an array, without printing +// the curly braces. +template +void PrintRawArrayTo(const T a[], size_t count, ::std::ostream* os) { + UniversalPrint(a[0], os); + for (size_t i = 1; i != count; i++) { + *os << ", "; + UniversalPrint(a[i], os); + } +} + +// Overloads for ::string and ::std::string. 
+#if GTEST_HAS_GLOBAL_STRING +GTEST_API_ void PrintStringTo(const ::string&s, ::std::ostream* os); +inline void PrintTo(const ::string& s, ::std::ostream* os) { + PrintStringTo(s, os); +} +#endif // GTEST_HAS_GLOBAL_STRING + +GTEST_API_ void PrintStringTo(const ::std::string&s, ::std::ostream* os); +inline void PrintTo(const ::std::string& s, ::std::ostream* os) { + PrintStringTo(s, os); +} + +// Overloads for ::wstring and ::std::wstring. +#if GTEST_HAS_GLOBAL_WSTRING +GTEST_API_ void PrintWideStringTo(const ::wstring&s, ::std::ostream* os); +inline void PrintTo(const ::wstring& s, ::std::ostream* os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_GLOBAL_WSTRING + +#if GTEST_HAS_STD_WSTRING +GTEST_API_ void PrintWideStringTo(const ::std::wstring&s, ::std::ostream* os); +inline void PrintTo(const ::std::wstring& s, ::std::ostream* os) { + PrintWideStringTo(s, os); +} +#endif // GTEST_HAS_STD_WSTRING + +#if GTEST_HAS_TR1_TUPLE +// Overload for ::std::tr1::tuple. Needed for printing function arguments, +// which are packed as tuples. + +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. +template +void PrintTupleTo(const T& t, ::std::ostream* os); + +// Overloaded PrintTo() for tuples of various arities. We support +// tuples of up-to 10 fields. The following implementation works +// regardless of whether tr1::tuple is implemented using the +// non-standard variadic template feature or not. 
+ +inline void PrintTo(const ::std::tr1::tuple<>& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo(const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} + +template +void PrintTo( + const ::std::tr1::tuple& t, + ::std::ostream* os) { + PrintTupleTo(t, os); +} +#endif // GTEST_HAS_TR1_TUPLE + +// Overload for std::pair. +template +void PrintTo(const ::std::pair& value, ::std::ostream* os) { + *os << '('; + // We cannot use UniversalPrint(value.first, os) here, as T1 may be + // a reference type. The same for printing value.second. + UniversalPrinter::Print(value.first, os); + *os << ", "; + UniversalPrinter::Print(value.second, os); + *os << ')'; +} + +// Implements printing a non-reference type T by letting the compiler +// pick the right overload of PrintTo() for T. +template +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. +#ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4180) // Temporarily disables warning 4180. 
+#endif // _MSC_VER + + // Note: we deliberately don't call this PrintTo(), as that name + // conflicts with ::testing::internal::PrintTo in the body of the + // function. + static void Print(const T& value, ::std::ostream* os) { + // By default, ::testing::internal::PrintTo() is used for printing + // the value. + // + // Thanks to Koenig look-up, if T is a class and has its own + // PrintTo() function defined in its namespace, that function will + // be visible here. Since it is more specific than the generic ones + // in ::testing::internal, it will be picked by the compiler in the + // following statement - exactly what we want. + PrintTo(value, os); + } + +#ifdef _MSC_VER +# pragma warning(pop) // Restores the warning state. +#endif // _MSC_VER +}; + +// UniversalPrintArray(begin, len, os) prints an array of 'len' +// elements, starting at address 'begin'. +template +void UniversalPrintArray(const T* begin, size_t len, ::std::ostream* os) { + if (len == 0) { + *os << "{}"; + } else { + *os << "{ "; + const size_t kThreshold = 18; + const size_t kChunkSize = 8; + // If the array has more than kThreshold elements, we'll have to + // omit some details by printing only the first and the last + // kChunkSize elements. + // TODO(wan@google.com): let the user control the threshold using a flag. + if (len <= kThreshold) { + PrintRawArrayTo(begin, len, os); + } else { + PrintRawArrayTo(begin, kChunkSize, os); + *os << ", ..., "; + PrintRawArrayTo(begin + len - kChunkSize, kChunkSize, os); + } + *os << " }"; + } +} +// This overload prints a (const) char array compactly. +GTEST_API_ void UniversalPrintArray(const char* begin, + size_t len, + ::std::ostream* os); + +// Implements printing an array type T[N]. +template +class UniversalPrinter { + public: + // Prints the given array, omitting some elements when there are too + // many. 
+ static void Print(const T (&a)[N], ::std::ostream* os) { + UniversalPrintArray(a, N, os); + } +}; + +// Implements printing a reference type T&. +template +class UniversalPrinter { + public: + // MSVC warns about adding const to a function type, so we want to + // disable the warning. +#ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4180) // Temporarily disables warning 4180. +#endif // _MSC_VER + + static void Print(const T& value, ::std::ostream* os) { + // Prints the address of the value. We use reinterpret_cast here + // as static_cast doesn't compile when T is a function type. + *os << "@" << reinterpret_cast(&value) << " "; + + // Then prints the value itself. + UniversalPrint(value, os); + } + +#ifdef _MSC_VER +# pragma warning(pop) // Restores the warning state. +#endif // _MSC_VER +}; + +// Prints a value tersely: for a reference type, the referenced value +// (but not the address) is printed; for a (const) char pointer, the +// NUL-terminated string (but not the pointer) is printed. +template +void UniversalTersePrint(const T& value, ::std::ostream* os) { + UniversalPrint(value, os); +} +inline void UniversalTersePrint(const char* str, ::std::ostream* os) { + if (str == NULL) { + *os << "NULL"; + } else { + UniversalPrint(string(str), os); + } +} +inline void UniversalTersePrint(char* str, ::std::ostream* os) { + UniversalTersePrint(static_cast(str), os); +} + +// Prints a value using the type inferred by the compiler. The +// difference between this and UniversalTersePrint() is that for a +// (const) char pointer, this prints both the pointer and the +// NUL-terminated string. 
+template +void UniversalPrint(const T& value, ::std::ostream* os) { + UniversalPrinter::Print(value, os); +} + +#if GTEST_HAS_TR1_TUPLE +typedef ::std::vector Strings; + +// This helper template allows PrintTo() for tuples and +// UniversalTersePrintTupleFieldsToStrings() to be defined by +// induction on the number of tuple fields. The idea is that +// TuplePrefixPrinter::PrintPrefixTo(t, os) prints the first N +// fields in tuple t, and can be defined in terms of +// TuplePrefixPrinter. + +// The inductive case. +template +struct TuplePrefixPrinter { + // Prints the first N fields of a tuple. + template + static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { + TuplePrefixPrinter::PrintPrefixTo(t, os); + *os << ", "; + UniversalPrinter::type> + ::Print(::std::tr1::get(t), os); + } + + // Tersely prints the first N fields of a tuple to a string vector, + // one element for each field. + template + static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { + TuplePrefixPrinter::TersePrintPrefixToStrings(t, strings); + ::std::stringstream ss; + UniversalTersePrint(::std::tr1::get(t), &ss); + strings->push_back(ss.str()); + } +}; + +// Base cases. +template <> +struct TuplePrefixPrinter<0> { + template + static void PrintPrefixTo(const Tuple&, ::std::ostream*) {} + + template + static void TersePrintPrefixToStrings(const Tuple&, Strings*) {} +}; +// We have to specialize the entire TuplePrefixPrinter<> class +// template here, even though the definition of +// TersePrintPrefixToStrings() is the same as the generic version, as +// Embarcadero (formerly CodeGear, formerly Borland) C++ doesn't +// support specializing a method template of a class template. 
+template <> +struct TuplePrefixPrinter<1> { + template + static void PrintPrefixTo(const Tuple& t, ::std::ostream* os) { + UniversalPrinter::type>:: + Print(::std::tr1::get<0>(t), os); + } + + template + static void TersePrintPrefixToStrings(const Tuple& t, Strings* strings) { + ::std::stringstream ss; + UniversalTersePrint(::std::tr1::get<0>(t), &ss); + strings->push_back(ss.str()); + } +}; + +// Helper function for printing a tuple. T must be instantiated with +// a tuple type. +template +void PrintTupleTo(const T& t, ::std::ostream* os) { + *os << "("; + TuplePrefixPrinter< ::std::tr1::tuple_size::value>:: + PrintPrefixTo(t, os); + *os << ")"; +} + +// Prints the fields of a tuple tersely to a string vector, one +// element for each field. See the comment before +// UniversalTersePrint() for how we define "tersely". +template +Strings UniversalTersePrintTupleFieldsToStrings(const Tuple& value) { + Strings result; + TuplePrefixPrinter< ::std::tr1::tuple_size::value>:: + TersePrintPrefixToStrings(value, &result); + return result; +} +#endif // GTEST_HAS_TR1_TUPLE + +} // namespace internal + +template +::std::string PrintToString(const T& value) { + ::std::stringstream ss; + internal::UniversalTersePrint(value, &ss); + return ss.str(); +} + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_PRINTERS_H_ + +#if GTEST_HAS_PARAM_TEST + +namespace testing { +namespace internal { + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Outputs a message explaining invalid registration of different +// fixture class for the same test case. This may happen when +// TEST_P macro is used to define two tests with the same name +// but in different namespaces. +GTEST_API_ void ReportInvalidTestCaseType(const char* test_case_name, + const char* file, int line); + +template class ParamGeneratorInterface; +template class ParamGenerator; + +// Interface for iterating over elements provided by an implementation +// of ParamGeneratorInterface. 
+template +class ParamIteratorInterface { + public: + virtual ~ParamIteratorInterface() {} + // A pointer to the base generator instance. + // Used only for the purposes of iterator comparison + // to make sure that two iterators belong to the same generator. + virtual const ParamGeneratorInterface* BaseGenerator() const = 0; + // Advances iterator to point to the next element + // provided by the generator. The caller is responsible + // for not calling Advance() on an iterator equal to + // BaseGenerator()->End(). + virtual void Advance() = 0; + // Clones the iterator object. Used for implementing copy semantics + // of ParamIterator. + virtual ParamIteratorInterface* Clone() const = 0; + // Dereferences the current iterator and provides (read-only) access + // to the pointed value. It is the caller's responsibility not to call + // Current() on an iterator equal to BaseGenerator()->End(). + // Used for implementing ParamGenerator::operator*(). + virtual const T* Current() const = 0; + // Determines whether the given iterator and other point to the same + // element in the sequence generated by the generator. + // Used for implementing ParamGenerator::operator==(). + virtual bool Equals(const ParamIteratorInterface& other) const = 0; +}; + +// Class iterating over elements provided by an implementation of +// ParamGeneratorInterface. It wraps ParamIteratorInterface +// and implements the const forward iterator concept. +template +class ParamIterator { + public: + typedef T value_type; + typedef const T& reference; + typedef ptrdiff_t difference_type; + + // ParamIterator assumes ownership of the impl_ pointer. 
+ ParamIterator(const ParamIterator& other) : impl_(other.impl_->Clone()) {} + ParamIterator& operator=(const ParamIterator& other) { + if (this != &other) + impl_.reset(other.impl_->Clone()); + return *this; + } + + const T& operator*() const { return *impl_->Current(); } + const T* operator->() const { return impl_->Current(); } + // Prefix version of operator++. + ParamIterator& operator++() { + impl_->Advance(); + return *this; + } + // Postfix version of operator++. + ParamIterator operator++(int /*unused*/) { + ParamIteratorInterface* clone = impl_->Clone(); + impl_->Advance(); + return ParamIterator(clone); + } + bool operator==(const ParamIterator& other) const { + return impl_.get() == other.impl_.get() || impl_->Equals(*other.impl_); + } + bool operator!=(const ParamIterator& other) const { + return !(*this == other); + } + + private: + friend class ParamGenerator; + explicit ParamIterator(ParamIteratorInterface* impl) : impl_(impl) {} + scoped_ptr > impl_; +}; + +// ParamGeneratorInterface is the binary interface to access generators +// defined in other translation units. +template +class ParamGeneratorInterface { + public: + typedef T ParamType; + + virtual ~ParamGeneratorInterface() {} + + // Generator interface definition + virtual ParamIteratorInterface* Begin() const = 0; + virtual ParamIteratorInterface* End() const = 0; +}; + +// Wraps ParamGeneratorInterface and provides general generator syntax +// compatible with the STL Container concept. +// This class implements copy initialization semantics and the contained +// ParamGeneratorInterface instance is shared among all copies +// of the original object. This is possible because that instance is immutable. 
+template +class ParamGenerator { + public: + typedef ParamIterator iterator; + + explicit ParamGenerator(ParamGeneratorInterface* impl) : impl_(impl) {} + ParamGenerator(const ParamGenerator& other) : impl_(other.impl_) {} + + ParamGenerator& operator=(const ParamGenerator& other) { + impl_ = other.impl_; + return *this; + } + + iterator begin() const { return iterator(impl_->Begin()); } + iterator end() const { return iterator(impl_->End()); } + + private: + linked_ptr > impl_; +}; + +// Generates values from a range of two comparable values. Can be used to +// generate sequences of user-defined types that implement operator+() and +// operator<(). +// This class is used in the Range() function. +template +class RangeGenerator : public ParamGeneratorInterface { + public: + RangeGenerator(T begin, T end, IncrementT step) + : begin_(begin), end_(end), + step_(step), end_index_(CalculateEndIndex(begin, end, step)) {} + virtual ~RangeGenerator() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, begin_, 0, step_); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, end_, end_index_, step_); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, T value, int index, + IncrementT step) + : base_(base), value_(value), index_(index), step_(step) {} + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + virtual void Advance() { + value_ = value_ + step_; + index_++; + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const T* Current() const { return &value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. 
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const int other_index = + CheckedDowncastToActualType(&other)->index_; + return index_ == other_index; + } + + private: + Iterator(const Iterator& other) + : ParamIteratorInterface(), + base_(other.base_), value_(other.value_), index_(other.index_), + step_(other.step_) {} + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + T value_; + int index_; + const IncrementT step_; + }; // class RangeGenerator::Iterator + + static int CalculateEndIndex(const T& begin, + const T& end, + const IncrementT& step) { + int end_index = 0; + for (T i = begin; i < end; i = i + step) + end_index++; + return end_index; + } + + // No implementation - assignment is unsupported. + void operator=(const RangeGenerator& other); + + const T begin_; + const T end_; + const IncrementT step_; + // The index for the end() iterator. All the elements in the generated + // sequence are indexed (0-based) to aid iterator comparison. + const int end_index_; +}; // class RangeGenerator + + +// Generates values from a pair of STL-style iterators. Used in the +// ValuesIn() function. The elements are copied from the source range +// since the source can be located on the stack, and the generator +// is likely to persist beyond that stack frame. 
+template +class ValuesInIteratorRangeGenerator : public ParamGeneratorInterface { + public: + template + ValuesInIteratorRangeGenerator(ForwardIterator begin, ForwardIterator end) + : container_(begin, end) {} + virtual ~ValuesInIteratorRangeGenerator() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, container_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, container_.end()); + } + + private: + typedef typename ::std::vector ContainerType; + + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + typename ContainerType::const_iterator iterator) + : base_(base), iterator_(iterator) {} + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + virtual void Advance() { + ++iterator_; + value_.reset(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + // We need to use cached value referenced by iterator_ because *iterator_ + // can return a temporary object (and of type other then T), so just + // having "return &*iterator_;" doesn't work. + // value_ is updated here and not in Advance() because Advance() + // can advance iterator_ beyond the end of the range, and we cannot + // detect that fact. The client code, on the other hand, is + // responsible for not calling Current() on an out-of-range iterator. + virtual const T* Current() const { + if (value_.get() == NULL) + value_.reset(new T(*iterator_)); + return value_.get(); + } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." 
<< std::endl; + return iterator_ == + CheckedDowncastToActualType(&other)->iterator_; + } + + private: + Iterator(const Iterator& other) + // The explicit constructor call suppresses a false warning + // emitted by gcc when supplied with the -Wextra option. + : ParamIteratorInterface(), + base_(other.base_), + iterator_(other.iterator_) {} + + const ParamGeneratorInterface* const base_; + typename ContainerType::const_iterator iterator_; + // A cached value of *iterator_. We keep it here to allow access by + // pointer in the wrapping iterator's operator->(). + // value_ needs to be mutable to be accessed in Current(). + // Use of scoped_ptr helps manage cached value's lifetime, + // which is bound by the lifespan of the iterator itself. + mutable scoped_ptr value_; + }; // class ValuesInIteratorRangeGenerator::Iterator + + // No implementation - assignment is unsupported. + void operator=(const ValuesInIteratorRangeGenerator& other); + + const ContainerType container_; +}; // class ValuesInIteratorRangeGenerator + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Stores a parameter value and later creates tests parameterized with that +// value. +template +class ParameterizedTestFactory : public TestFactoryBase { + public: + typedef typename TestClass::ParamType ParamType; + explicit ParameterizedTestFactory(ParamType parameter) : + parameter_(parameter) {} + virtual Test* CreateTest() { + TestClass::SetParam(¶meter_); + return new TestClass(); + } + + private: + const ParamType parameter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestFactory); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactoryBase is a base class for meta-factories that create +// test factories for passing into MakeAndRegisterTestInfo function. 
+template +class TestMetaFactoryBase { + public: + virtual ~TestMetaFactoryBase() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) = 0; +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// TestMetaFactory creates test factories for passing into +// MakeAndRegisterTestInfo function. Since MakeAndRegisterTestInfo receives +// ownership of test factory pointer, same factory object cannot be passed +// into that method twice. But ParameterizedTestCaseInfo is going to call +// it for each Test/Parameter value combination. Thus it needs meta factory +// creator class. +template +class TestMetaFactory + : public TestMetaFactoryBase { + public: + typedef typename TestCase::ParamType ParamType; + + TestMetaFactory() {} + + virtual TestFactoryBase* CreateTestFactory(ParamType parameter) { + return new ParameterizedTestFactory(parameter); + } + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestMetaFactory); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestCaseInfoBase is a generic interface +// to ParameterizedTestCaseInfo classes. ParameterizedTestCaseInfoBase +// accumulates test information provided by TEST_P macro invocations +// and generators provided by INSTANTIATE_TEST_CASE_P macro invocations +// and uses that information to register all resulting test instances +// in RegisterTests method. The ParameterizeTestCaseRegistry class holds +// a collection of pointers to the ParameterizedTestCaseInfo objects +// and calls RegisterTests() on each of them when asked. +class ParameterizedTestCaseInfoBase { + public: + virtual ~ParameterizedTestCaseInfoBase() {} + + // Base part of test case name for display purposes. + virtual const string& GetTestCaseName() const = 0; + // Test case id to verify identity. + virtual TypeId GetTestCaseTypeId() const = 0; + // UnitTest class invokes this method to register tests in this + // test case right before running them in RUN_ALL_TESTS macro. 
+ // This method should not be called more then once on any single + // instance of a ParameterizedTestCaseInfoBase derived class. + virtual void RegisterTests() = 0; + + protected: + ParameterizedTestCaseInfoBase() {} + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfoBase); +}; + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestCaseInfo accumulates tests obtained from TEST_P +// macro invocations for a particular test case and generators +// obtained from INSTANTIATE_TEST_CASE_P macro invocations for that +// test case. It registers tests with all values generated by all +// generators when asked. +template +class ParameterizedTestCaseInfo : public ParameterizedTestCaseInfoBase { + public: + // ParamType and GeneratorCreationFunc are private types but are required + // for declarations of public methods AddTestPattern() and + // AddTestCaseInstantiation(). + typedef typename TestCase::ParamType ParamType; + // A function that returns an instance of appropriate generator type. + typedef ParamGenerator(GeneratorCreationFunc)(); + + explicit ParameterizedTestCaseInfo(const char* name) + : test_case_name_(name) {} + + // Test case base name for display purposes. + virtual const string& GetTestCaseName() const { return test_case_name_; } + // Test case id to verify identity. + virtual TypeId GetTestCaseTypeId() const { return GetTypeId(); } + // TEST_P macro uses AddTestPattern() to record information + // about a single test in a LocalTestInfo structure. + // test_case_name is the base name of the test case (without invocation + // prefix). test_base_name is the name of an individual test without + // parameter index. For the test SequenceA/FooTest.DoBar/1 FooTest is + // test case base name and DoBar is test base name. 
+ void AddTestPattern(const char* test_case_name, + const char* test_base_name, + TestMetaFactoryBase* meta_factory) { + tests_.push_back(linked_ptr(new TestInfo(test_case_name, + test_base_name, + meta_factory))); + } + // INSTANTIATE_TEST_CASE_P macro uses AddGenerator() to record information + // about a generator. + int AddTestCaseInstantiation(const string& instantiation_name, + GeneratorCreationFunc* func, + const char* /* file */, + int /* line */) { + instantiations_.push_back(::std::make_pair(instantiation_name, func)); + return 0; // Return value used only to run this method in namespace scope. + } + // UnitTest class invokes this method to register tests in this test case + // test cases right before running tests in RUN_ALL_TESTS macro. + // This method should not be called more then once on any single + // instance of a ParameterizedTestCaseInfoBase derived class. + // UnitTest has a guard to prevent from calling this method more then once. + virtual void RegisterTests() { + for (typename TestInfoContainer::iterator test_it = tests_.begin(); + test_it != tests_.end(); ++test_it) { + linked_ptr test_info = *test_it; + for (typename InstantiationContainer::iterator gen_it = + instantiations_.begin(); gen_it != instantiations_.end(); + ++gen_it) { + const string& instantiation_name = gen_it->first; + ParamGenerator generator((*gen_it->second)()); + + Message test_case_name_stream; + if ( !instantiation_name.empty() ) + test_case_name_stream << instantiation_name << "/"; + test_case_name_stream << test_info->test_case_base_name; + + int i = 0; + for (typename ParamGenerator::iterator param_it = + generator.begin(); + param_it != generator.end(); ++param_it, ++i) { + Message test_name_stream; + test_name_stream << test_info->test_base_name << "/" << i; + MakeAndRegisterTestInfo( + test_case_name_stream.GetString().c_str(), + test_name_stream.GetString().c_str(), + NULL, // No type parameter. 
+ PrintToString(*param_it).c_str(), + GetTestCaseTypeId(), + TestCase::SetUpTestCase, + TestCase::TearDownTestCase, + test_info->test_meta_factory->CreateTestFactory(*param_it)); + } // for param_it + } // for gen_it + } // for test_it + } // RegisterTests + + private: + // LocalTestInfo structure keeps information about a single test registered + // with TEST_P macro. + struct TestInfo { + TestInfo(const char* a_test_case_base_name, + const char* a_test_base_name, + TestMetaFactoryBase* a_test_meta_factory) : + test_case_base_name(a_test_case_base_name), + test_base_name(a_test_base_name), + test_meta_factory(a_test_meta_factory) {} + + const string test_case_base_name; + const string test_base_name; + const scoped_ptr > test_meta_factory; + }; + typedef ::std::vector > TestInfoContainer; + // Keeps pairs of + // received from INSTANTIATE_TEST_CASE_P macros. + typedef ::std::vector > + InstantiationContainer; + + const string test_case_name_; + TestInfoContainer tests_; + InstantiationContainer instantiations_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseInfo); +}; // class ParameterizedTestCaseInfo + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// ParameterizedTestCaseRegistry contains a map of ParameterizedTestCaseInfoBase +// classes accessed by test case names. TEST_P and INSTANTIATE_TEST_CASE_P +// macros use it to locate their corresponding ParameterizedTestCaseInfo +// descriptors. +class ParameterizedTestCaseRegistry { + public: + ParameterizedTestCaseRegistry() {} + ~ParameterizedTestCaseRegistry() { + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + delete *it; + } + } + + // Looks up or creates and returns a structure containing information about + // tests and instantiations of a particular test case. 
+ template + ParameterizedTestCaseInfo* GetTestCasePatternHolder( + const char* test_case_name, + const char* file, + int line) { + ParameterizedTestCaseInfo* typed_test_info = NULL; + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + if ((*it)->GetTestCaseName() == test_case_name) { + if ((*it)->GetTestCaseTypeId() != GetTypeId()) { + // Complain about incorrect usage of Google Test facilities + // and terminate the program since we cannot guaranty correct + // test case setup and tear-down in this case. + ReportInvalidTestCaseType(test_case_name, file, line); + posix::Abort(); + } else { + // At this point we are sure that the object we found is of the same + // type we are looking for, so we downcast it to that type + // without further checks. + typed_test_info = CheckedDowncastToActualType< + ParameterizedTestCaseInfo >(*it); + } + break; + } + } + if (typed_test_info == NULL) { + typed_test_info = new ParameterizedTestCaseInfo(test_case_name); + test_case_infos_.push_back(typed_test_info); + } + return typed_test_info; + } + void RegisterTests() { + for (TestCaseInfoContainer::iterator it = test_case_infos_.begin(); + it != test_case_infos_.end(); ++it) { + (*it)->RegisterTests(); + } + } + + private: + typedef ::std::vector TestCaseInfoContainer; + + TestCaseInfoContainer test_case_infos_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(ParameterizedTestCaseRegistry); +}; + +} // namespace internal +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_H_ +// This file was GENERATED by command: +// pump.py gtest-param-util-generated.h.pump +// DO NOT EDIT BY HAND!!! + +// Copyright 2008 Google Inc. +// All Rights Reserved. 
+// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: vladl@google.com (Vlad Losev) + +// Type and function utilities for implementing parameterized tests. +// This file is generated by a SCRIPT. DO NOT EDIT BY HAND! +// +// Currently Google Test supports at most 50 arguments in Values, +// and at most 10 arguments in Combine. Please contact +// googletestframework@googlegroups.com if you need more. 
+// Please note that the number of arguments to Combine is limited +// by the maximum arity of the implementation of tr1::tuple which is +// currently set at 10. + +#ifndef GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ +#define GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ + +// scripts/fuse_gtest.py depends on gtest's own header being #included +// *unconditionally*. Therefore these #includes cannot be moved +// inside #if GTEST_HAS_PARAM_TEST. + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Forward declarations of ValuesIn(), which is implemented in +// include/gtest/gtest-param-test.h. +template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end); + +template +internal::ParamGenerator ValuesIn(const T (&array)[N]); + +template +internal::ParamGenerator ValuesIn( + const Container& container); + +namespace internal { + +// Used in the Values() function to provide polymorphic capabilities. +template +class ValueArray1 { + public: + explicit ValueArray1(T1 v1) : v1_(v1) {} + + template + operator ParamGenerator() const { return ValuesIn(&v1_, &v1_ + 1); } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray1& other); + + const T1 v1_; +}; + +template +class ValueArray2 { + public: + ValueArray2(T1 v1, T2 v2) : v1_(v1), v2_(v2) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray2& other); + + const T1 v1_; + const T2 v2_; +}; + +template +class ValueArray3 { + public: + ValueArray3(T1 v1, T2 v2, T3 v3) : v1_(v1), v2_(v2), v3_(v3) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray3& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; +}; + +template +class ValueArray4 { + public: + ValueArray4(T1 v1, T2 v2, T3 v3, T4 v4) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray4& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; +}; + +template +class ValueArray5 { + public: + ValueArray5(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray5& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; +}; + +template +class ValueArray6 { + public: + ValueArray6(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray6& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; +}; + +template +class ValueArray7 { + public: + ValueArray7(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray7& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; +}; + +template +class ValueArray8 { + public: + ValueArray8(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray8& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; +}; + +template +class ValueArray9 { + public: + ValueArray9(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray9& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; +}; + +template +class ValueArray10 { + public: + ValueArray10(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray10& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; +}; + +template +class ValueArray11 { + public: + ValueArray11(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray11& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; +}; + +template +class ValueArray12 { + public: + ValueArray12(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray12& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; +}; + +template +class ValueArray13 { + public: + ValueArray13(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray13& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; +}; + +template +class ValueArray14 { + public: + ValueArray14(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray14& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; +}; + +template +class ValueArray15 { + public: + ValueArray15(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray15& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; +}; + +template +class ValueArray16 { + public: + ValueArray16(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray16& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; +}; + +template +class ValueArray17 { + public: + ValueArray17(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray17& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; +}; + +template +class ValueArray18 { + public: + ValueArray18(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray18& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; +}; + +template +class ValueArray19 { + public: + ValueArray19(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray19& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; +}; + +template +class ValueArray20 { + public: + ValueArray20(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray20& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; +}; + +template +class ValueArray21 { + public: + ValueArray21(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray21& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; +}; + +template +class ValueArray22 { + public: + ValueArray22(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray22& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; +}; + +template +class ValueArray23 { + public: + ValueArray23(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, + v23_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray23& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; +}; + +template +class ValueArray24 { + public: + ValueArray24(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray24& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; +}; + +template +class ValueArray25 { + public: + ValueArray25(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray25& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; +}; + +template +class ValueArray26 { + public: + ValueArray26(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray26& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; +}; + +template +class ValueArray27 { + public: + ValueArray27(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray27& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; +}; + +template +class ValueArray28 { + public: + ValueArray28(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray28& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; +}; + +template +class ValueArray29 { + public: + ValueArray29(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray29& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; +}; + +template +class ValueArray30 { + public: + ValueArray30(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray30& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; +}; + +template +class ValueArray31 { + public: + ValueArray31(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray31& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; +}; + +template +class ValueArray32 { + public: + ValueArray32(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray32& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; +}; + +template +class ValueArray33 { + public: + ValueArray33(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, + T33 v33) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray33& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; +}; + +template +class ValueArray34 { + public: + ValueArray34(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray34& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; +}; + +template +class ValueArray35 { + public: + ValueArray35(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), + v32_(v32), v33_(v33), v34_(v34), v35_(v35) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, + v35_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray35& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; +}; + +template +class ValueArray36 { + public: + ValueArray36(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), + v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray36& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; +}; + +template +class ValueArray37 { + public: + ValueArray37(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), + v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), + v36_(v36), v37_(v37) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray37& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; +}; + +template +class ValueArray38 { + public: + ValueArray38(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray38& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; +}; + +template +class ValueArray39 { + public: + ValueArray39(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray39& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; +}; + +template +class ValueArray40 { + public: + ValueArray40(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), + v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), + v40_(v40) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_, v40_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray40& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; +}; + +template +class ValueArray41 { + public: + ValueArray41(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, + T41 v41) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_, v40_, v41_}; + return ValuesIn(array); + } + + private: + // No 
implementation - assignment is unsupported. + void operator=(const ValueArray41& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; +}; + +template +class ValueArray42 { + public: + ValueArray42(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, 
v39_, v40_, v41_, v42_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray42& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; +}; + +template +class ValueArray43 { + public: + ValueArray43(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), + v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), + v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), + v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), + v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), + v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), + v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, 
v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray43& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; +}; + +template +class ValueArray44 { + public: + ValueArray44(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), + v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), + v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), v18_(v18), + v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), v24_(v24), + v25_(v25), v26_(v26), v27_(v27), v28_(v28), v29_(v29), v30_(v30), + v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), v36_(v36), + v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), v42_(v42), + v43_(v43), v44_(v44) {} + + template + 
operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray44& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; +}; + +template +class ValueArray45 { + public: + ValueArray45(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), + v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), v11_(v11), + v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), v17_(v17), + v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), v23_(v23), + v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), 
v29_(v29), + v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), v35_(v35), + v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), v41_(v41), + v42_(v42), v43_(v43), v44_(v44), v45_(v45) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray45& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; +}; + +template +class ValueArray46 { + public: + ValueArray46(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) : v1_(v1), v2_(v2), v3_(v3), + v4_(v4), v5_(v5), v6_(v6), 
v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), + v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_, v46_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray46& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; +}; + +template +class ValueArray47 { + public: + ValueArray47(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, 
T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) : v1_(v1), v2_(v2), + v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), v10_(v10), + v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), v16_(v16), + v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), v22_(v22), + v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), v28_(v28), + v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), v34_(v34), + v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), v40_(v40), + v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), v46_(v46), + v47_(v47) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_, v46_, + v47_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray47& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; +}; + +template +class ValueArray48 { + public: + ValueArray48(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48) : v1_(v1), + v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), v8_(v8), v9_(v9), + v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), v15_(v15), + v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), v21_(v21), + v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), v27_(v27), + v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), v33_(v33), + v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), v39_(v39), + v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), v45_(v45), + v46_(v46), v47_(v47), v48_(v48) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, 
v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_, v46_, v47_, + v48_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray48& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; +}; + +template +class ValueArray49 { + public: + ValueArray49(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, + T49 v49) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), 
v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), + v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_, v46_, v47_, + v48_, v49_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const ValueArray49& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; + const T49 v49_; +}; + +template +class ValueArray50 { + public: + ValueArray50(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 
v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, T48 v48, T49 v49, + T50 v50) : v1_(v1), v2_(v2), v3_(v3), v4_(v4), v5_(v5), v6_(v6), v7_(v7), + v8_(v8), v9_(v9), v10_(v10), v11_(v11), v12_(v12), v13_(v13), v14_(v14), + v15_(v15), v16_(v16), v17_(v17), v18_(v18), v19_(v19), v20_(v20), + v21_(v21), v22_(v22), v23_(v23), v24_(v24), v25_(v25), v26_(v26), + v27_(v27), v28_(v28), v29_(v29), v30_(v30), v31_(v31), v32_(v32), + v33_(v33), v34_(v34), v35_(v35), v36_(v36), v37_(v37), v38_(v38), + v39_(v39), v40_(v40), v41_(v41), v42_(v42), v43_(v43), v44_(v44), + v45_(v45), v46_(v46), v47_(v47), v48_(v48), v49_(v49), v50_(v50) {} + + template + operator ParamGenerator() const { + const T array[] = {v1_, v2_, v3_, v4_, v5_, v6_, v7_, v8_, v9_, v10_, v11_, + v12_, v13_, v14_, v15_, v16_, v17_, v18_, v19_, v20_, v21_, v22_, v23_, + v24_, v25_, v26_, v27_, v28_, v29_, v30_, v31_, v32_, v33_, v34_, v35_, + v36_, v37_, v38_, v39_, v40_, v41_, v42_, v43_, v44_, v45_, v46_, v47_, + v48_, v49_, v50_}; + return ValuesIn(array); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const ValueArray50& other); + + const T1 v1_; + const T2 v2_; + const T3 v3_; + const T4 v4_; + const T5 v5_; + const T6 v6_; + const T7 v7_; + const T8 v8_; + const T9 v9_; + const T10 v10_; + const T11 v11_; + const T12 v12_; + const T13 v13_; + const T14 v14_; + const T15 v15_; + const T16 v16_; + const T17 v17_; + const T18 v18_; + const T19 v19_; + const T20 v20_; + const T21 v21_; + const T22 v22_; + const T23 v23_; + const T24 v24_; + const T25 v25_; + const T26 v26_; + const T27 v27_; + const T28 v28_; + const T29 v29_; + const T30 v30_; + const T31 v31_; + const T32 v32_; + const T33 v33_; + const T34 v34_; + const T35 v35_; + const T36 v36_; + const T37 v37_; + const T38 v38_; + const T39 v39_; + const T40 v40_; + const T41 v41_; + const T42 v42_; + const T43 v43_; + const T44 v44_; + const T45 v45_; + const T46 v46_; + const T47 v47_; + const T48 v48_; + const T49 v49_; + const T50 v50_; +}; + +# if GTEST_HAS_COMBINE +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Generates values from the Cartesian product of values produced +// by the argument generators. 
+// +template +class CartesianProductGenerator2 + : public ParamGeneratorInterface< ::std::tr1::tuple > { + public: + typedef ::std::tr1::tuple ParamType; + + CartesianProductGenerator2(const ParamGenerator& g1, + const ParamGenerator& g2) + : g1_(g1), g2_(g2) {} + virtual ~CartesianProductGenerator2() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current2_; + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." 
<< std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + ParamType current_value_; + }; // class CartesianProductGenerator2::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator2& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; +}; // class CartesianProductGenerator2 + + +template +class CartesianProductGenerator3 + : public ParamGeneratorInterface< ::std::tr1::tuple > { + public: + typedef ::std::tr1::tuple ParamType; + + CartesianProductGenerator3(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3) + : g1_(g1), g2_(g2), g3_(g3) {} + virtual ~CartesianProductGenerator3() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current3_; + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_; + } + + // No implementation - assignment is unsupported. 
+ void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + ParamType current_value_; + }; // class CartesianProductGenerator3::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator3& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; +}; // class CartesianProductGenerator3 + + +template +class CartesianProductGenerator4 + : public ParamGeneratorInterface< ::std::tr1::tuple > { + public: + typedef ::std::tr1::tuple ParamType; + + CartesianProductGenerator4(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} + virtual ~CartesianProductGenerator4() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename 
ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current4_; + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. 
+ const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + ParamType current_value_; + }; // class CartesianProductGenerator4::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator4& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; +}; // class CartesianProductGenerator4 + + +template +class CartesianProductGenerator5 + : public ParamGeneratorInterface< ::std::tr1::tuple > { + public: + typedef ::std::tr1::tuple ParamType; + + CartesianProductGenerator5(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} + virtual ~CartesianProductGenerator5() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, 
+ const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current5_; + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. 
+ const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + ParamType current_value_; + }; // class CartesianProductGenerator5::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator5& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; +}; // class CartesianProductGenerator5 + + +template +class CartesianProductGenerator6 + : public ParamGeneratorInterface< ::std::tr1::tuple > { + public: + typedef ::std::tr1::tuple ParamType; + + CartesianProductGenerator6(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} + virtual ~CartesianProductGenerator6() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + 
public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current6_; + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. 
+ GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. 
+ // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + ParamType current_value_; + }; // class CartesianProductGenerator6::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator6& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; +}; // class CartesianProductGenerator6 + + +template +class CartesianProductGenerator7 + : public ParamGeneratorInterface< ::std::tr1::tuple > { + public: + typedef ::std::tr1::tuple ParamType; + + CartesianProductGenerator7(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} + virtual ~CartesianProductGenerator7() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + 
begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current7_; + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. 
+ const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + ParamType current_value_; + }; // class CartesianProductGenerator7::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator7& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; +}; // class CartesianProductGenerator7 + + +template +class CartesianProductGenerator8 + : public ParamGeneratorInterface< ::std::tr1::tuple > { + public: + typedef ::std::tr1::tuple ParamType; + + CartesianProductGenerator8(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), + g8_(g8) {} + virtual ~CartesianProductGenerator8() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const 
typename ParamGenerator::iterator& current8) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. + virtual void Advance() { + assert(!AtEnd()); + ++current8_; + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." 
<< std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). + return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_; + } + + // No implementation - assignment is unsupported. 
+ void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + ParamType current_value_; + }; // class CartesianProductGenerator8::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator8& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; +}; // class CartesianProductGenerator8 + + +template +class CartesianProductGenerator9 + : public ParamGeneratorInterface< ::std::tr1::tuple > { + public: + typedef ::std::tr1::tuple ParamType; + + CartesianProductGenerator9(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8, const ParamGenerator& g9) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9) {} + virtual ~CartesianProductGenerator9() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end(), g9_, g9_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& 
g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8, + const ParamGenerator& g9, + const typename ParamGenerator::iterator& current9) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8), + begin9_(g9.begin()), end9_(g9.end()), current9_(current9) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current9_; + if (current9_ == end9_) { + current9_ = begin9_; + ++current8_; + } + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_ && + current9_ == typed_other->current9_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_), + begin9_(other.begin9_), + end9_(other.end9_), + current9_(other.current9_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_, + *current9_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. + return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_ || + current9_ == end9_; + } + + // No implementation - assignment is unsupported. 
+ void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + const typename ParamGenerator::iterator begin9_; + const typename ParamGenerator::iterator end9_; + typename ParamGenerator::iterator current9_; + ParamType current_value_; + }; // class CartesianProductGenerator9::Iterator + + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductGenerator9& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; + const ParamGenerator g9_; +}; // class CartesianProductGenerator9 + + +template +class CartesianProductGenerator10 + : public ParamGeneratorInterface< ::std::tr1::tuple > { + public: + typedef ::std::tr1::tuple ParamType; + + CartesianProductGenerator10(const ParamGenerator& g1, + const ParamGenerator& g2, const ParamGenerator& g3, + const ParamGenerator& g4, const ParamGenerator& g5, + const ParamGenerator& g6, const ParamGenerator& g7, + const ParamGenerator& g8, const ParamGenerator& g9, + const ParamGenerator& g10) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9), g10_(g10) {} + virtual ~CartesianProductGenerator10() {} + + virtual ParamIteratorInterface* Begin() const { + return new Iterator(this, g1_, g1_.begin(), g2_, g2_.begin(), g3_, + g3_.begin(), g4_, g4_.begin(), g5_, g5_.begin(), g6_, g6_.begin(), g7_, + g7_.begin(), g8_, g8_.begin(), g9_, g9_.begin(), g10_, g10_.begin()); + } + virtual ParamIteratorInterface* End() const { + return new Iterator(this, g1_, g1_.end(), g2_, g2_.end(), g3_, g3_.end(), + g4_, g4_.end(), g5_, g5_.end(), g6_, g6_.end(), g7_, g7_.end(), g8_, + g8_.end(), g9_, g9_.end(), g10_, g10_.end()); + } + + private: + class Iterator : public ParamIteratorInterface { + public: + Iterator(const ParamGeneratorInterface* base, + const ParamGenerator& g1, + const typename ParamGenerator::iterator& current1, + const ParamGenerator& g2, + const typename ParamGenerator::iterator& current2, + const ParamGenerator& g3, + const typename ParamGenerator::iterator& current3, + const ParamGenerator& g4, + const typename ParamGenerator::iterator& current4, + const ParamGenerator& g5, + const typename ParamGenerator::iterator& 
current5, + const ParamGenerator& g6, + const typename ParamGenerator::iterator& current6, + const ParamGenerator& g7, + const typename ParamGenerator::iterator& current7, + const ParamGenerator& g8, + const typename ParamGenerator::iterator& current8, + const ParamGenerator& g9, + const typename ParamGenerator::iterator& current9, + const ParamGenerator& g10, + const typename ParamGenerator::iterator& current10) + : base_(base), + begin1_(g1.begin()), end1_(g1.end()), current1_(current1), + begin2_(g2.begin()), end2_(g2.end()), current2_(current2), + begin3_(g3.begin()), end3_(g3.end()), current3_(current3), + begin4_(g4.begin()), end4_(g4.end()), current4_(current4), + begin5_(g5.begin()), end5_(g5.end()), current5_(current5), + begin6_(g6.begin()), end6_(g6.end()), current6_(current6), + begin7_(g7.begin()), end7_(g7.end()), current7_(current7), + begin8_(g8.begin()), end8_(g8.end()), current8_(current8), + begin9_(g9.begin()), end9_(g9.end()), current9_(current9), + begin10_(g10.begin()), end10_(g10.end()), current10_(current10) { + ComputeCurrentValue(); + } + virtual ~Iterator() {} + + virtual const ParamGeneratorInterface* BaseGenerator() const { + return base_; + } + // Advance should not be called on beyond-of-range iterators + // so no component iterators must be beyond end of range, either. 
+ virtual void Advance() { + assert(!AtEnd()); + ++current10_; + if (current10_ == end10_) { + current10_ = begin10_; + ++current9_; + } + if (current9_ == end9_) { + current9_ = begin9_; + ++current8_; + } + if (current8_ == end8_) { + current8_ = begin8_; + ++current7_; + } + if (current7_ == end7_) { + current7_ = begin7_; + ++current6_; + } + if (current6_ == end6_) { + current6_ = begin6_; + ++current5_; + } + if (current5_ == end5_) { + current5_ = begin5_; + ++current4_; + } + if (current4_ == end4_) { + current4_ = begin4_; + ++current3_; + } + if (current3_ == end3_) { + current3_ = begin3_; + ++current2_; + } + if (current2_ == end2_) { + current2_ = begin2_; + ++current1_; + } + ComputeCurrentValue(); + } + virtual ParamIteratorInterface* Clone() const { + return new Iterator(*this); + } + virtual const ParamType* Current() const { return ¤t_value_; } + virtual bool Equals(const ParamIteratorInterface& other) const { + // Having the same base generator guarantees that the other + // iterator is of the same type and we can downcast. + GTEST_CHECK_(BaseGenerator() == other.BaseGenerator()) + << "The program attempted to compare iterators " + << "from different generators." << std::endl; + const Iterator* typed_other = + CheckedDowncastToActualType(&other); + // We must report iterators equal if they both point beyond their + // respective ranges. That can happen in a variety of fashions, + // so we have to consult AtEnd(). 
+ return (AtEnd() && typed_other->AtEnd()) || + ( + current1_ == typed_other->current1_ && + current2_ == typed_other->current2_ && + current3_ == typed_other->current3_ && + current4_ == typed_other->current4_ && + current5_ == typed_other->current5_ && + current6_ == typed_other->current6_ && + current7_ == typed_other->current7_ && + current8_ == typed_other->current8_ && + current9_ == typed_other->current9_ && + current10_ == typed_other->current10_); + } + + private: + Iterator(const Iterator& other) + : base_(other.base_), + begin1_(other.begin1_), + end1_(other.end1_), + current1_(other.current1_), + begin2_(other.begin2_), + end2_(other.end2_), + current2_(other.current2_), + begin3_(other.begin3_), + end3_(other.end3_), + current3_(other.current3_), + begin4_(other.begin4_), + end4_(other.end4_), + current4_(other.current4_), + begin5_(other.begin5_), + end5_(other.end5_), + current5_(other.current5_), + begin6_(other.begin6_), + end6_(other.end6_), + current6_(other.current6_), + begin7_(other.begin7_), + end7_(other.end7_), + current7_(other.current7_), + begin8_(other.begin8_), + end8_(other.end8_), + current8_(other.current8_), + begin9_(other.begin9_), + end9_(other.end9_), + current9_(other.current9_), + begin10_(other.begin10_), + end10_(other.end10_), + current10_(other.current10_) { + ComputeCurrentValue(); + } + + void ComputeCurrentValue() { + if (!AtEnd()) + current_value_ = ParamType(*current1_, *current2_, *current3_, + *current4_, *current5_, *current6_, *current7_, *current8_, + *current9_, *current10_); + } + bool AtEnd() const { + // We must report iterator past the end of the range when either of the + // component iterators has reached the end of its range. 
+ return + current1_ == end1_ || + current2_ == end2_ || + current3_ == end3_ || + current4_ == end4_ || + current5_ == end5_ || + current6_ == end6_ || + current7_ == end7_ || + current8_ == end8_ || + current9_ == end9_ || + current10_ == end10_; + } + + // No implementation - assignment is unsupported. + void operator=(const Iterator& other); + + const ParamGeneratorInterface* const base_; + // begin[i]_ and end[i]_ define the i-th range that Iterator traverses. + // current[i]_ is the actual traversing iterator. + const typename ParamGenerator::iterator begin1_; + const typename ParamGenerator::iterator end1_; + typename ParamGenerator::iterator current1_; + const typename ParamGenerator::iterator begin2_; + const typename ParamGenerator::iterator end2_; + typename ParamGenerator::iterator current2_; + const typename ParamGenerator::iterator begin3_; + const typename ParamGenerator::iterator end3_; + typename ParamGenerator::iterator current3_; + const typename ParamGenerator::iterator begin4_; + const typename ParamGenerator::iterator end4_; + typename ParamGenerator::iterator current4_; + const typename ParamGenerator::iterator begin5_; + const typename ParamGenerator::iterator end5_; + typename ParamGenerator::iterator current5_; + const typename ParamGenerator::iterator begin6_; + const typename ParamGenerator::iterator end6_; + typename ParamGenerator::iterator current6_; + const typename ParamGenerator::iterator begin7_; + const typename ParamGenerator::iterator end7_; + typename ParamGenerator::iterator current7_; + const typename ParamGenerator::iterator begin8_; + const typename ParamGenerator::iterator end8_; + typename ParamGenerator::iterator current8_; + const typename ParamGenerator::iterator begin9_; + const typename ParamGenerator::iterator end9_; + typename ParamGenerator::iterator current9_; + const typename ParamGenerator::iterator begin10_; + const typename ParamGenerator::iterator end10_; + typename ParamGenerator::iterator current10_; + 
ParamType current_value_; + }; // class CartesianProductGenerator10::Iterator + + // No implementation - assignment is unsupported. + void operator=(const CartesianProductGenerator10& other); + + const ParamGenerator g1_; + const ParamGenerator g2_; + const ParamGenerator g3_; + const ParamGenerator g4_; + const ParamGenerator g5_; + const ParamGenerator g6_; + const ParamGenerator g7_; + const ParamGenerator g8_; + const ParamGenerator g9_; + const ParamGenerator g10_; +}; // class CartesianProductGenerator10 + + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Helper classes providing Combine() with polymorphic features. They allow +// casting CartesianProductGeneratorN to ParamGenerator if T is +// convertible to U. +// +template +class CartesianProductHolder2 { + public: +CartesianProductHolder2(const Generator1& g1, const Generator2& g2) + : g1_(g1), g2_(g2) {} + template + operator ParamGenerator< ::std::tr1::tuple >() const { + return ParamGenerator< ::std::tr1::tuple >( + new CartesianProductGenerator2( + static_cast >(g1_), + static_cast >(g2_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder2& other); + + const Generator1 g1_; + const Generator2 g2_; +}; // class CartesianProductHolder2 + +template +class CartesianProductHolder3 { + public: +CartesianProductHolder3(const Generator1& g1, const Generator2& g2, + const Generator3& g3) + : g1_(g1), g2_(g2), g3_(g3) {} + template + operator ParamGenerator< ::std::tr1::tuple >() const { + return ParamGenerator< ::std::tr1::tuple >( + new CartesianProductGenerator3( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder3& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; +}; // class CartesianProductHolder3 + +template +class CartesianProductHolder4 { + public: +CartesianProductHolder4(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4) {} + template + operator ParamGenerator< ::std::tr1::tuple >() const { + return ParamGenerator< ::std::tr1::tuple >( + new CartesianProductGenerator4( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder4& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; +}; // class CartesianProductHolder4 + +template +class CartesianProductHolder5 { + public: +CartesianProductHolder5(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5) {} + template + operator ParamGenerator< ::std::tr1::tuple >() const { + return ParamGenerator< ::std::tr1::tuple >( + new CartesianProductGenerator5( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder5& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; +}; // class CartesianProductHolder5 + +template +class CartesianProductHolder6 { + public: +CartesianProductHolder6(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6) {} + template + operator ParamGenerator< ::std::tr1::tuple >() const { + return ParamGenerator< ::std::tr1::tuple >( + new CartesianProductGenerator6( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder6& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; +}; // class CartesianProductHolder6 + +template +class CartesianProductHolder7 { + public: +CartesianProductHolder7(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7) {} + template + operator ParamGenerator< ::std::tr1::tuple >() const { + return ParamGenerator< ::std::tr1::tuple >( + new CartesianProductGenerator7( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_))); + } + + private: + // No implementation - assignment is unsupported. 
+ void operator=(const CartesianProductHolder7& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; +}; // class CartesianProductHolder7 + +template +class CartesianProductHolder8 { + public: +CartesianProductHolder8(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), + g8_(g8) {} + template + operator ParamGenerator< ::std::tr1::tuple >() const { + return ParamGenerator< ::std::tr1::tuple >( + new CartesianProductGenerator8( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder8& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; +}; // class CartesianProductHolder8 + +template +class CartesianProductHolder9 { + public: +CartesianProductHolder9(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8, + const Generator9& g9) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9) {} + template + operator ParamGenerator< ::std::tr1::tuple >() const { + return ParamGenerator< ::std::tr1::tuple >( + new CartesianProductGenerator9( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_), + static_cast 
>(g9_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder9& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; + const Generator9 g9_; +}; // class CartesianProductHolder9 + +template +class CartesianProductHolder10 { + public: +CartesianProductHolder10(const Generator1& g1, const Generator2& g2, + const Generator3& g3, const Generator4& g4, const Generator5& g5, + const Generator6& g6, const Generator7& g7, const Generator8& g8, + const Generator9& g9, const Generator10& g10) + : g1_(g1), g2_(g2), g3_(g3), g4_(g4), g5_(g5), g6_(g6), g7_(g7), g8_(g8), + g9_(g9), g10_(g10) {} + template + operator ParamGenerator< ::std::tr1::tuple >() const { + return ParamGenerator< ::std::tr1::tuple >( + new CartesianProductGenerator10( + static_cast >(g1_), + static_cast >(g2_), + static_cast >(g3_), + static_cast >(g4_), + static_cast >(g5_), + static_cast >(g6_), + static_cast >(g7_), + static_cast >(g8_), + static_cast >(g9_), + static_cast >(g10_))); + } + + private: + // No implementation - assignment is unsupported. + void operator=(const CartesianProductHolder10& other); + + const Generator1 g1_; + const Generator2 g2_; + const Generator3 g3_; + const Generator4 g4_; + const Generator5 g5_; + const Generator6 g6_; + const Generator7 g7_; + const Generator8 g8_; + const Generator9 g9_; + const Generator10 g10_; +}; // class CartesianProductHolder10 + +# endif // GTEST_HAS_COMBINE + +} // namespace internal +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_INTERNAL_GTEST_PARAM_UTIL_GENERATED_H_ + +#if GTEST_HAS_PARAM_TEST + +namespace testing { + +// Functions producing parameter generators. +// +// Google Test uses these generators to produce parameters for value- +// parameterized tests. 
When a parameterized test case is instantiated +// with a particular generator, Google Test creates and runs tests +// for each element in the sequence produced by the generator. +// +// In the following sample, tests from test case FooTest are instantiated +// each three times with parameter values 3, 5, and 8: +// +// class FooTest : public TestWithParam { ... }; +// +// TEST_P(FooTest, TestThis) { +// } +// TEST_P(FooTest, TestThat) { +// } +// INSTANTIATE_TEST_CASE_P(TestSequence, FooTest, Values(3, 5, 8)); +// + +// Range() returns generators providing sequences of values in a range. +// +// Synopsis: +// Range(start, end) +// - returns a generator producing a sequence of values {start, start+1, +// start+2, ..., }. +// Range(start, end, step) +// - returns a generator producing a sequence of values {start, start+step, +// start+step+step, ..., }. +// Notes: +// * The generated sequences never include end. For example, Range(1, 5) +// returns a generator producing a sequence {1, 2, 3, 4}. Range(1, 9, 2) +// returns a generator producing {1, 3, 5, 7}. +// * start and end must have the same type. That type may be any integral or +// floating-point type or a user defined type satisfying these conditions: +// * It must be assignable (have operator=() defined). +// * It must have operator+() (operator+(int-compatible type) for +// two-operand version). +// * It must have operator<() defined. +// Elements in the resulting sequences will also have that type. +// * Condition start < end must be satisfied in order for resulting sequences +// to contain any elements. +// +template +internal::ParamGenerator Range(T start, T end, IncrementT step) { + return internal::ParamGenerator( + new internal::RangeGenerator(start, end, step)); +} + +template +internal::ParamGenerator Range(T start, T end) { + return Range(start, end, 1); +} + +// ValuesIn() function allows generation of tests with parameters coming from +// a container. 
+// +// Synopsis: +// ValuesIn(const T (&array)[N]) +// - returns a generator producing sequences with elements from +// a C-style array. +// ValuesIn(const Container& container) +// - returns a generator producing sequences with elements from +// an STL-style container. +// ValuesIn(Iterator begin, Iterator end) +// - returns a generator producing sequences with elements from +// a range [begin, end) defined by a pair of STL-style iterators. These +// iterators can also be plain C pointers. +// +// Please note that ValuesIn copies the values from the containers +// passed in and keeps them to generate tests in RUN_ALL_TESTS(). +// +// Examples: +// +// This instantiates tests from test case StringTest +// each with C-string values of "foo", "bar", and "baz": +// +// const char* strings[] = {"foo", "bar", "baz"}; +// INSTANTIATE_TEST_CASE_P(StringSequence, SrtingTest, ValuesIn(strings)); +// +// This instantiates tests from test case StlStringTest +// each with STL strings with values "a" and "b": +// +// ::std::vector< ::std::string> GetParameterStrings() { +// ::std::vector< ::std::string> v; +// v.push_back("a"); +// v.push_back("b"); +// return v; +// } +// +// INSTANTIATE_TEST_CASE_P(CharSequence, +// StlStringTest, +// ValuesIn(GetParameterStrings())); +// +// +// This will also instantiate tests from CharTest +// each with parameter values 'a' and 'b': +// +// ::std::list GetParameterChars() { +// ::std::list list; +// list.push_back('a'); +// list.push_back('b'); +// return list; +// } +// ::std::list l = GetParameterChars(); +// INSTANTIATE_TEST_CASE_P(CharSequence2, +// CharTest, +// ValuesIn(l.begin(), l.end())); +// +template +internal::ParamGenerator< + typename ::testing::internal::IteratorTraits::value_type> +ValuesIn(ForwardIterator begin, ForwardIterator end) { + typedef typename ::testing::internal::IteratorTraits + ::value_type ParamType; + return internal::ParamGenerator( + new internal::ValuesInIteratorRangeGenerator(begin, end)); +} + 
+template +internal::ParamGenerator ValuesIn(const T (&array)[N]) { + return ValuesIn(array, array + N); +} + +template +internal::ParamGenerator ValuesIn( + const Container& container) { + return ValuesIn(container.begin(), container.end()); +} + +// Values() allows generating tests from explicitly specified list of +// parameters. +// +// Synopsis: +// Values(T v1, T v2, ..., T vN) +// - returns a generator producing sequences with elements v1, v2, ..., vN. +// +// For example, this instantiates tests from test case BarTest each +// with values "one", "two", and "three": +// +// INSTANTIATE_TEST_CASE_P(NumSequence, BarTest, Values("one", "two", "three")); +// +// This instantiates tests from test case BazTest each with values 1, 2, 3.5. +// The exact type of values will depend on the type of parameter in BazTest. +// +// INSTANTIATE_TEST_CASE_P(FloatingNumbers, BazTest, Values(1, 2, 3.5)); +// +// Currently, Values() supports from 1 to 50 parameters. +// +template +internal::ValueArray1 Values(T1 v1) { + return internal::ValueArray1(v1); +} + +template +internal::ValueArray2 Values(T1 v1, T2 v2) { + return internal::ValueArray2(v1, v2); +} + +template +internal::ValueArray3 Values(T1 v1, T2 v2, T3 v3) { + return internal::ValueArray3(v1, v2, v3); +} + +template +internal::ValueArray4 Values(T1 v1, T2 v2, T3 v3, T4 v4) { + return internal::ValueArray4(v1, v2, v3, v4); +} + +template +internal::ValueArray5 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5) { + return internal::ValueArray5(v1, v2, v3, v4, v5); +} + +template +internal::ValueArray6 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6) { + return internal::ValueArray6(v1, v2, v3, v4, v5, v6); +} + +template +internal::ValueArray7 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7) { + return internal::ValueArray7(v1, v2, v3, v4, v5, + v6, v7); +} + +template +internal::ValueArray8 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8) { + return internal::ValueArray8(v1, v2, v3, v4, + v5, v6, 
v7, v8); +} + +template +internal::ValueArray9 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9) { + return internal::ValueArray9(v1, v2, v3, + v4, v5, v6, v7, v8, v9); +} + +template +internal::ValueArray10 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10) { + return internal::ValueArray10(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10); +} + +template +internal::ValueArray11 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11) { + return internal::ValueArray11(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11); +} + +template +internal::ValueArray12 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12) { + return internal::ValueArray12(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12); +} + +template +internal::ValueArray13 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13) { + return internal::ValueArray13(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13); +} + +template +internal::ValueArray14 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14) { + return internal::ValueArray14(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14); +} + +template +internal::ValueArray15 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15) { + return internal::ValueArray15(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15); +} + +template +internal::ValueArray16 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16) { + return internal::ValueArray16(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16); +} + +template +internal::ValueArray17 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 
v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17) { + return internal::ValueArray17(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17); +} + +template +internal::ValueArray18 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18) { + return internal::ValueArray18(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18); +} + +template +internal::ValueArray19 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19) { + return internal::ValueArray19(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19); +} + +template +internal::ValueArray20 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20) { + return internal::ValueArray20(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20); +} + +template +internal::ValueArray21 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21) { + return internal::ValueArray21(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21); +} + +template +internal::ValueArray22 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22) { + return internal::ValueArray22(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22); +} + +template +internal::ValueArray23 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, 
T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23) { + return internal::ValueArray23(v1, v2, v3, + v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23); +} + +template +internal::ValueArray24 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24) { + return internal::ValueArray24(v1, v2, + v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24); +} + +template +internal::ValueArray25 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, + T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, + T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25) { + return internal::ValueArray25(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + v18, v19, v20, v21, v22, v23, v24, v25); +} + +template +internal::ValueArray26 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26) { + return internal::ValueArray26(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26); +} + +template +internal::ValueArray27 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27) { + return internal::ValueArray27(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, + v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27); +} + +template +internal::ValueArray28 Values(T1 v1, T2 v2, T3 
v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28) { + return internal::ValueArray28(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, + v28); +} + +template +internal::ValueArray29 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29) { + return internal::ValueArray29(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, + v27, v28, v29); +} + +template +internal::ValueArray30 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30) { + return internal::ValueArray30(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, + v26, v27, v28, v29, v30); +} + +template +internal::ValueArray31 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31) { + return internal::ValueArray31(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, + v25, v26, v27, v28, v29, v30, v31); +} + +template +internal::ValueArray32 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 
v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32) { + return internal::ValueArray32(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32); +} + +template +internal::ValueArray33 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33) { + return internal::ValueArray33(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33); +} + +template +internal::ValueArray34 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, + T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, + T31 v31, T32 v32, T33 v33, T34 v34) { + return internal::ValueArray34(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, + v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34); +} + +template +internal::ValueArray35 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35) { + return internal::ValueArray35(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35); +} + 
+template +internal::ValueArray36 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36) { + return internal::ValueArray36(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36); +} + +template +internal::ValueArray37 Values(T1 v1, T2 v2, T3 v3, + T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37) { + return internal::ValueArray37(v1, v2, v3, + v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, v35, v36, v37); +} + +template +internal::ValueArray38 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37, T38 v38) { + return internal::ValueArray38(v1, v2, + v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, + v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, + v33, v34, v35, v36, v37, v38); +} + +template +internal::ValueArray39 Values(T1 v1, T2 v2, + T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, + T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, + T21 v21, T22 v22, T23 v23, 
T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, + T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, + T37 v37, T38 v38, T39 v39) { + return internal::ValueArray39(v1, + v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, + v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, + v32, v33, v34, v35, v36, v37, v38, v39); +} + +template +internal::ValueArray40 Values(T1 v1, + T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, + T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, + T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, + T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, + T36 v36, T37 v37, T38 v38, T39 v39, T40 v40) { + return internal::ValueArray40(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, + v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, + v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40); +} + +template +internal::ValueArray41 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41) { + return internal::ValueArray41(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, + v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, v28, + v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41); +} + +template +internal::ValueArray42 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 
v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42) { + return internal::ValueArray42(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, v13, + v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, v27, + v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, v41, + v42); +} + +template +internal::ValueArray43 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43) { + return internal::ValueArray43(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, v12, + v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, v26, + v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, v40, + v41, v42, v43); +} + +template +internal::ValueArray44 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, + T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, T17 v17, + T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, T25 v25, + T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, T33 v33, + T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, T41 v41, + T42 v42, T43 v43, T44 v44) { + return internal::ValueArray44(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, v11, + v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, v25, + v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, v39, + v40, v41, v42, v43, v44); +} + +template +internal::ValueArray45 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, T8 v8, + T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, T16 v16, + T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, T24 v24, + T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, T32 v32, + T33 v33, T34 
v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, T40 v40, + T41 v41, T42 v42, T43 v43, T44 v44, T45 v45) { + return internal::ValueArray45(v1, v2, v3, v4, v5, v6, v7, v8, v9, v10, + v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, v24, + v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, v38, + v39, v40, v41, v42, v43, v44, v45); +} + +template +internal::ValueArray46 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46) { + return internal::ValueArray46(v1, v2, v3, v4, v5, v6, v7, v8, v9, + v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, + v38, v39, v40, v41, v42, v43, v44, v45, v46); +} + +template +internal::ValueArray47 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, T7 v7, + T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47) { + return internal::ValueArray47(v1, v2, v3, v4, v5, v6, v7, v8, + v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, v23, + v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, v37, + v38, v39, v40, v41, v42, v43, v44, v45, v46, v47); +} + +template +internal::ValueArray48 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, T6 v6, + T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, T15 v15, + T16 v16, T17 v17, T18 v18, 
T19 v19, T20 v20, T21 v21, T22 v22, T23 v23, + T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, T31 v31, + T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, T39 v39, + T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, T47 v47, + T48 v48) { + return internal::ValueArray48(v1, v2, v3, v4, v5, v6, v7, + v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, v22, + v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, v36, + v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48); +} + +template +internal::ValueArray49 Values(T1 v1, T2 v2, T3 v3, T4 v4, T5 v5, + T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, T14 v14, + T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, T22 v22, + T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, T30 v30, + T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, T38 v38, + T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, T46 v46, + T47 v47, T48 v48, T49 v49) { + return internal::ValueArray49(v1, v2, v3, v4, v5, v6, + v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, v20, v21, + v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, v34, v35, + v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, v48, v49); +} + +template +internal::ValueArray50 Values(T1 v1, T2 v2, T3 v3, T4 v4, + T5 v5, T6 v6, T7 v7, T8 v8, T9 v9, T10 v10, T11 v11, T12 v12, T13 v13, + T14 v14, T15 v15, T16 v16, T17 v17, T18 v18, T19 v19, T20 v20, T21 v21, + T22 v22, T23 v23, T24 v24, T25 v25, T26 v26, T27 v27, T28 v28, T29 v29, + T30 v30, T31 v31, T32 v32, T33 v33, T34 v34, T35 v35, T36 v36, T37 v37, + T38 v38, T39 v39, T40 v40, T41 v41, T42 v42, T43 v43, T44 v44, T45 v45, + T46 v46, T47 v47, T48 v48, T49 v49, T50 v50) { + return internal::ValueArray50(v1, v2, v3, v4, + v5, v6, v7, v8, v9, v10, v11, v12, v13, v14, v15, v16, v17, v18, v19, + v20, v21, v22, v23, v24, v25, v26, v27, v28, v29, v30, v31, v32, v33, + v34, 
v35, v36, v37, v38, v39, v40, v41, v42, v43, v44, v45, v46, v47, + v48, v49, v50); +} + +// Bool() allows generating tests with parameters in a set of (false, true). +// +// Synopsis: +// Bool() +// - returns a generator producing sequences with elements {false, true}. +// +// It is useful when testing code that depends on Boolean flags. Combinations +// of multiple flags can be tested when several Bool()'s are combined using +// Combine() function. +// +// In the following example all tests in the test case FlagDependentTest +// will be instantiated twice with parameters false and true. +// +// class FlagDependentTest : public testing::TestWithParam { +// virtual void SetUp() { +// external_flag = GetParam(); +// } +// } +// INSTANTIATE_TEST_CASE_P(BoolSequence, FlagDependentTest, Bool()); +// +inline internal::ParamGenerator Bool() { + return Values(false, true); +} + +# if GTEST_HAS_COMBINE +// Combine() allows the user to combine two or more sequences to produce +// values of a Cartesian product of those sequences' elements. +// +// Synopsis: +// Combine(gen1, gen2, ..., genN) +// - returns a generator producing sequences with elements coming from +// the Cartesian product of elements from the sequences generated by +// gen1, gen2, ..., genN. The sequence elements will have a type of +// tuple where T1, T2, ..., TN are the types +// of elements from sequences produces by gen1, gen2, ..., genN. +// +// Combine can have up to 10 arguments. This number is currently limited +// by the maximum number of elements in the tuple implementation used by Google +// Test. 
+// +// Example: +// +// This will instantiate tests in test case AnimalTest each one with +// the parameter values tuple("cat", BLACK), tuple("cat", WHITE), +// tuple("dog", BLACK), and tuple("dog", WHITE): +// +// enum Color { BLACK, GRAY, WHITE }; +// class AnimalTest +// : public testing::TestWithParam > {...}; +// +// TEST_P(AnimalTest, AnimalLooksNice) {...} +// +// INSTANTIATE_TEST_CASE_P(AnimalVariations, AnimalTest, +// Combine(Values("cat", "dog"), +// Values(BLACK, WHITE))); +// +// This will instantiate tests in FlagDependentTest with all variations of two +// Boolean flags: +// +// class FlagDependentTest +// : public testing::TestWithParam > { +// virtual void SetUp() { +// // Assigns external_flag_1 and external_flag_2 values from the tuple. +// tie(external_flag_1, external_flag_2) = GetParam(); +// } +// }; +// +// TEST_P(FlagDependentTest, TestFeature1) { +// // Test your code using external_flag_1 and external_flag_2 here. +// } +// INSTANTIATE_TEST_CASE_P(TwoBoolSequence, FlagDependentTest, +// Combine(Bool(), Bool())); +// +template +internal::CartesianProductHolder2 Combine( + const Generator1& g1, const Generator2& g2) { + return internal::CartesianProductHolder2( + g1, g2); +} + +template +internal::CartesianProductHolder3 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3) { + return internal::CartesianProductHolder3( + g1, g2, g3); +} + +template +internal::CartesianProductHolder4 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4) { + return internal::CartesianProductHolder4( + g1, g2, g3, g4); +} + +template +internal::CartesianProductHolder5 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5) { + return internal::CartesianProductHolder5( + g1, g2, g3, g4, g5); +} + +template +internal::CartesianProductHolder6 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const 
Generator4& g4, const Generator5& g5, const Generator6& g6) { + return internal::CartesianProductHolder6( + g1, g2, g3, g4, g5, g6); +} + +template +internal::CartesianProductHolder7 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7) { + return internal::CartesianProductHolder7( + g1, g2, g3, g4, g5, g6, g7); +} + +template +internal::CartesianProductHolder8 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8) { + return internal::CartesianProductHolder8( + g1, g2, g3, g4, g5, g6, g7, g8); +} + +template +internal::CartesianProductHolder9 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8, const Generator9& g9) { + return internal::CartesianProductHolder9( + g1, g2, g3, g4, g5, g6, g7, g8, g9); +} + +template +internal::CartesianProductHolder10 Combine( + const Generator1& g1, const Generator2& g2, const Generator3& g3, + const Generator4& g4, const Generator5& g5, const Generator6& g6, + const Generator7& g7, const Generator8& g8, const Generator9& g9, + const Generator10& g10) { + return internal::CartesianProductHolder10( + g1, g2, g3, g4, g5, g6, g7, g8, g9, g10); +} +# endif // GTEST_HAS_COMBINE + + + +# define TEST_P(test_case_name, test_name) \ + class GTEST_TEST_CLASS_NAME_(test_case_name, test_name) \ + : public test_case_name { \ + public: \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)() {} \ + virtual void TestBody(); \ + private: \ + static int AddToRegistry() { \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). 
\ + GetTestCasePatternHolder(\ + #test_case_name, __FILE__, __LINE__)->AddTestPattern(\ + #test_case_name, \ + #test_name, \ + new ::testing::internal::TestMetaFactory< \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)>()); \ + return 0; \ + } \ + static int gtest_registering_dummy_; \ + GTEST_DISALLOW_COPY_AND_ASSIGN_(\ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)); \ + }; \ + int GTEST_TEST_CLASS_NAME_(test_case_name, \ + test_name)::gtest_registering_dummy_ = \ + GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::AddToRegistry(); \ + void GTEST_TEST_CLASS_NAME_(test_case_name, test_name)::TestBody() + +# define INSTANTIATE_TEST_CASE_P(prefix, test_case_name, generator) \ + ::testing::internal::ParamGenerator \ + gtest_##prefix##test_case_name##_EvalGenerator_() { return generator; } \ + int gtest_##prefix##test_case_name##_dummy_ = \ + ::testing::UnitTest::GetInstance()->parameterized_test_registry(). \ + GetTestCasePatternHolder(\ + #test_case_name, __FILE__, __LINE__)->AddTestCaseInstantiation(\ + #prefix, \ + >est_##prefix##test_case_name##_EvalGenerator_, \ + __FILE__, __LINE__) + +} // namespace testing + +#endif // GTEST_HAS_PARAM_TEST + +#endif // GTEST_INCLUDE_GTEST_GTEST_PARAM_TEST_H_ +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. 
+// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) +// +// Google C++ Testing Framework definitions useful in production code. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PROD_H_ + +// When you need to test the private or protected members of a class, +// use the FRIEND_TEST macro to declare your tests as friends of the +// class. For example: +// +// class MyClass { +// private: +// void MyMethod(); +// FRIEND_TEST(MyClassTest, MyMethod); +// }; +// +// class MyClassTest : public testing::Test { +// // ... +// }; +// +// TEST_F(MyClassTest, MyMethod) { +// // Can call MyClass::MyMethod() here. +// } + +#define FRIEND_TEST(test_case_name, test_name)\ +friend class test_case_name##_##test_name##_Test + +#endif // GTEST_INCLUDE_GTEST_GTEST_PROD_H_ +// Copyright 2008, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: mheule@google.com (Markus Heule) +// + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ + +#include +#include + +namespace testing { + +// A copyable object representing the result of a test part (i.e. an +// assertion or an explicit FAIL(), ADD_FAILURE(), or SUCCESS()). +// +// Don't inherit from TestPartResult as its destructor is not virtual. +class GTEST_API_ TestPartResult { + public: + // The possible outcomes of a test part (i.e. an assertion or an + // explicit SUCCEED(), FAIL(), or ADD_FAILURE()). + enum Type { + kSuccess, // Succeeded. + kNonFatalFailure, // Failed but the test can continue. + kFatalFailure // Failed and the test should be terminated. + }; + + // C'tor. 
TestPartResult does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestPartResult object. + TestPartResult(Type a_type, + const char* a_file_name, + int a_line_number, + const char* a_message) + : type_(a_type), + file_name_(a_file_name), + line_number_(a_line_number), + summary_(ExtractSummary(a_message)), + message_(a_message) { + } + + // Gets the outcome of the test part. + Type type() const { return type_; } + + // Gets the name of the source file where the test part took place, or + // NULL if it's unknown. + const char* file_name() const { return file_name_.c_str(); } + + // Gets the line in the source file where the test part took place, + // or -1 if it's unknown. + int line_number() const { return line_number_; } + + // Gets the summary of the failure message. + const char* summary() const { return summary_.c_str(); } + + // Gets the message associated with the test part. + const char* message() const { return message_.c_str(); } + + // Returns true iff the test part passed. + bool passed() const { return type_ == kSuccess; } + + // Returns true iff the test part failed. + bool failed() const { return type_ != kSuccess; } + + // Returns true iff the test part non-fatally failed. + bool nonfatally_failed() const { return type_ == kNonFatalFailure; } + + // Returns true iff the test part fatally failed. + bool fatally_failed() const { return type_ == kFatalFailure; } + private: + Type type_; + + // Gets the summary of the failure message by omitting the stack + // trace in it. + static internal::String ExtractSummary(const char* message); + + // The name of the source file where the test part took place, or + // NULL if the source file is unknown. + internal::String file_name_; + // The line in the source file where the test part took place, or -1 + // if the line number is unknown. + int line_number_; + internal::String summary_; // The test failure summary. 
+ internal::String message_; // The test failure message. +}; + +// Prints a TestPartResult object. +std::ostream& operator<<(std::ostream& os, const TestPartResult& result); + +// An array of TestPartResult objects. +// +// Don't inherit from TestPartResultArray as its destructor is not +// virtual. +class GTEST_API_ TestPartResultArray { + public: + TestPartResultArray() {} + + // Appends the given TestPartResult to the array. + void Append(const TestPartResult& result); + + // Returns the TestPartResult at the given index (0-based). + const TestPartResult& GetTestPartResult(int index) const; + + // Returns the number of TestPartResult objects in the array. + int size() const; + + private: + std::vector array_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestPartResultArray); +}; + +// This interface knows how to report a test part result. +class TestPartResultReporterInterface { + public: + virtual ~TestPartResultReporterInterface() {} + + virtual void ReportTestPartResult(const TestPartResult& result) = 0; +}; + +namespace internal { + +// This helper class is used by {ASSERT|EXPECT}_NO_FATAL_FAILURE to check if a +// statement generates new fatal failures. To do so it registers itself as the +// current test part result reporter. Besides checking if fatal failures were +// reported, it only delegates the reporting to the former result reporter. +// The original result reporter is restored in the destructor. +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
+class GTEST_API_ HasNewFatalFailureHelper + : public TestPartResultReporterInterface { + public: + HasNewFatalFailureHelper(); + virtual ~HasNewFatalFailureHelper(); + virtual void ReportTestPartResult(const TestPartResult& result); + bool has_new_fatal_failure() const { return has_new_fatal_failure_; } + private: + bool has_new_fatal_failure_; + TestPartResultReporterInterface* original_reporter_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(HasNewFatalFailureHelper); +}; + +} // namespace internal + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_TEST_PART_H_ +// Copyright 2008 Google Inc. +// All Rights Reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. +// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +// +// Author: wan@google.com (Zhanyong Wan) + +#ifndef GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ +#define GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// This header implements typed tests and type-parameterized tests. + +// Typed (aka type-driven) tests repeat the same test for types in a +// list. You must know which types you want to test with when writing +// typed tests. Here's how you do it: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + public: + ... + typedef std::list List; + static T shared_; + T value_; +}; + +// Next, associate a list of types with the test case, which will be +// repeated for each type in the list. The typedef is necessary for +// the macro to parse correctly. +typedef testing::Types MyTypes; +TYPED_TEST_CASE(FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// TYPED_TEST_CASE(FooTest, int); + +// Then, use TYPED_TEST() instead of TEST_F() to define as many typed +// tests for this test case as you want. +TYPED_TEST(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + // Since we are inside a derived class template, C++ requires use to + // visit the members of FooTest via 'this'. 
+ TypeParam n = this->value_; + + // To visit static members of the fixture, add the TestFixture:: + // prefix. + n += TestFixture::shared_; + + // To refer to typedefs in the fixture, add the "typename + // TestFixture::" prefix. + typename TestFixture::List values; + values.push_back(n); + ... +} + +TYPED_TEST(FooTest, HasPropertyA) { ... } + +#endif // 0 + +// Type-parameterized tests are abstract test patterns parameterized +// by a type. Compared with typed tests, type-parameterized tests +// allow you to define the test pattern without knowing what the type +// parameters are. The defined pattern can be instantiated with +// different types any number of times, in any number of translation +// units. +// +// If you are designing an interface or concept, you can define a +// suite of type-parameterized tests to verify properties that any +// valid implementation of the interface/concept should have. Then, +// each implementation can easily instantiate the test suite to verify +// that it conforms to the requirements, without having to write +// similar tests repeatedly. Here's an example: + +#if 0 + +// First, define a fixture class template. It should be parameterized +// by a type. Remember to derive it from testing::Test. +template +class FooTest : public testing::Test { + ... +}; + +// Next, declare that you will define a type-parameterized test case +// (the _P suffix is for "parameterized" or "pattern", whichever you +// prefer): +TYPED_TEST_CASE_P(FooTest); + +// Then, use TYPED_TEST_P() to define as many type-parameterized tests +// for this type-parameterized test case as you want. +TYPED_TEST_P(FooTest, DoesBlah) { + // Inside a test, refer to TypeParam to get the type parameter. + TypeParam n = 0; + ... +} + +TYPED_TEST_P(FooTest, HasPropertyA) { ... } + +// Now the tricky part: you need to register all test patterns before +// you can instantiate them. 
The first argument of the macro is the +// test case name; the rest are the names of the tests in this test +// case. +REGISTER_TYPED_TEST_CASE_P(FooTest, + DoesBlah, HasPropertyA); + +// Finally, you are free to instantiate the pattern with the types you +// want. If you put the above code in a header file, you can #include +// it in multiple C++ source files and instantiate it multiple times. +// +// To distinguish different instances of the pattern, the first +// argument to the INSTANTIATE_* macro is a prefix that will be added +// to the actual test case name. Remember to pick unique prefixes for +// different instances. +typedef testing::Types MyTypes; +INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, MyTypes); + +// If the type list contains only one type, you can write that type +// directly without Types<...>: +// INSTANTIATE_TYPED_TEST_CASE_P(My, FooTest, int); + +#endif // 0 + + +// Implements typed tests. + +#if GTEST_HAS_TYPED_TEST + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the typedef for the type parameters of the +// given test case. +# define GTEST_TYPE_PARAMS_(TestCaseName) gtest_type_params_##TestCaseName##_ + +// The 'Types' template argument below must have spaces around it +// since some compilers may choke on '>>' when passing a template +// instance (e.g. 
Types) +# define TYPED_TEST_CASE(CaseName, Types) \ + typedef ::testing::internal::TypeList< Types >::type \ + GTEST_TYPE_PARAMS_(CaseName) + +# define TYPED_TEST(CaseName, TestName) \ + template \ + class GTEST_TEST_CLASS_NAME_(CaseName, TestName) \ + : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + bool gtest_##CaseName##_##TestName##_registered_ GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTest< \ + CaseName, \ + ::testing::internal::TemplateSel< \ + GTEST_TEST_CLASS_NAME_(CaseName, TestName)>, \ + GTEST_TYPE_PARAMS_(CaseName)>::Register(\ + "", #CaseName, #TestName, 0); \ + template \ + void GTEST_TEST_CLASS_NAME_(CaseName, TestName)::TestBody() + +#endif // GTEST_HAS_TYPED_TEST + +// Implements type-parameterized tests. + +#if GTEST_HAS_TYPED_TEST_P + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the namespace name that the type-parameterized tests for +// the given type-parameterized test case are defined in. The exact +// name of the namespace is subject to change without notice. +# define GTEST_CASE_NAMESPACE_(TestCaseName) \ + gtest_case_##TestCaseName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// +// Expands to the name of the variable used to remember the names of +// the defined tests in the given test case. +# define GTEST_TYPED_TEST_CASE_P_STATE_(TestCaseName) \ + gtest_typed_test_case_p_state_##TestCaseName##_ + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE DIRECTLY. +// +// Expands to the name of the variable used to remember the names of +// the registered tests in the given test case. 
+# define GTEST_REGISTERED_TEST_NAMES_(TestCaseName) \ + gtest_registered_test_names_##TestCaseName##_ + +// The variables defined in the type-parameterized test macros are +// static as typically these macros are used in a .h file that can be +// #included in multiple translation units linked together. +# define TYPED_TEST_CASE_P(CaseName) \ + static ::testing::internal::TypedTestCasePState \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName) + +# define TYPED_TEST_P(CaseName, TestName) \ + namespace GTEST_CASE_NAMESPACE_(CaseName) { \ + template \ + class TestName : public CaseName { \ + private: \ + typedef CaseName TestFixture; \ + typedef gtest_TypeParam_ TypeParam; \ + virtual void TestBody(); \ + }; \ + static bool gtest_##TestName##_defined_ GTEST_ATTRIBUTE_UNUSED_ = \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).AddTestName(\ + __FILE__, __LINE__, #CaseName, #TestName); \ + } \ + template \ + void GTEST_CASE_NAMESPACE_(CaseName)::TestName::TestBody() + +# define REGISTER_TYPED_TEST_CASE_P(CaseName, ...) \ + namespace GTEST_CASE_NAMESPACE_(CaseName) { \ + typedef ::testing::internal::Templates<__VA_ARGS__>::type gtest_AllTests_; \ + } \ + static const char* const GTEST_REGISTERED_TEST_NAMES_(CaseName) = \ + GTEST_TYPED_TEST_CASE_P_STATE_(CaseName).VerifyRegisteredTestNames(\ + __FILE__, __LINE__, #__VA_ARGS__) + +// The 'Types' template argument below must have spaces around it +// since some compilers may choke on '>>' when passing a template +// instance (e.g. Types) +# define INSTANTIATE_TYPED_TEST_CASE_P(Prefix, CaseName, Types) \ + bool gtest_##Prefix##_##CaseName GTEST_ATTRIBUTE_UNUSED_ = \ + ::testing::internal::TypeParameterizedTestCase::type>::Register(\ + #Prefix, #CaseName, GTEST_REGISTERED_TEST_NAMES_(CaseName)) + +#endif // GTEST_HAS_TYPED_TEST_P + +#endif // GTEST_INCLUDE_GTEST_GTEST_TYPED_TEST_H_ + +// Depending on the platform, different string classes are available. 
+// On Linux, in addition to ::std::string, Google also makes use of
+// class ::string, which has the same interface as ::std::string, but
+// has a different implementation.
+//
+// The user can define GTEST_HAS_GLOBAL_STRING to 1 to indicate that
+// ::string is available AND is a distinct type to ::std::string, or
+// define it to 0 to indicate otherwise.
+//
+// If the user's ::std::string and ::string are the same class due to
+// aliasing, he should define GTEST_HAS_GLOBAL_STRING to 0.
+//
+// If the user doesn't define GTEST_HAS_GLOBAL_STRING, it is defined
+// heuristically.
+
+namespace testing {
+
+// Declares the flags.
+
+// This flag temporarily enables the disabled tests.
+GTEST_DECLARE_bool_(also_run_disabled_tests);
+
+// This flag brings the debugger on an assertion failure.
+GTEST_DECLARE_bool_(break_on_failure);
+
+// This flag controls whether Google Test catches all test-thrown exceptions
+// and logs them as failures.
+GTEST_DECLARE_bool_(catch_exceptions);
+
+// This flag enables using colors in terminal output. Available values are
+// "yes" to enable colors, "no" (disable colors), or "auto" (the default)
+// to let Google Test decide.
+GTEST_DECLARE_string_(color);
+
+// This flag sets up the filter to select by name using a glob pattern
+// the tests to run. If the filter is not given all tests are executed.
+GTEST_DECLARE_string_(filter);
+
+// This flag causes the Google Test to list tests. None of the tests listed
+// are actually run if the flag is provided.
+GTEST_DECLARE_bool_(list_tests);
+
+// This flag controls whether Google Test emits a detailed XML report to a file
+// in addition to its normal textual output.
+GTEST_DECLARE_string_(output);
+
+// This flag controls whether Google Test prints the elapsed time for each
+// test.
+GTEST_DECLARE_bool_(print_time);
+
+// This flag specifies the random number seed.
+GTEST_DECLARE_int32_(random_seed);
+
+// This flag sets how many times the tests are repeated. 
The default value +// is 1. If the value is -1 the tests are repeating forever. +GTEST_DECLARE_int32_(repeat); + +// This flag controls whether Google Test includes Google Test internal +// stack frames in failure stack traces. +GTEST_DECLARE_bool_(show_internal_stack_frames); + +// When this flag is specified, tests' order is randomized on every iteration. +GTEST_DECLARE_bool_(shuffle); + +// This flag specifies the maximum number of stack frames to be +// printed in a failure message. +GTEST_DECLARE_int32_(stack_trace_depth); + +// When this flag is specified, a failed assertion will throw an +// exception if exceptions are enabled, or exit the program with a +// non-zero code otherwise. +GTEST_DECLARE_bool_(throw_on_failure); + +// When this flag is set with a "host:port" string, on supported +// platforms test results are streamed to the specified port on +// the specified host machine. +GTEST_DECLARE_string_(stream_result_to); + +// The upper limit for valid stack trace depths. +const int kMaxStackTraceDepth = 100; + +namespace internal { + +class AssertHelper; +class DefaultGlobalTestPartResultReporter; +class ExecDeathTest; +class NoExecDeathTest; +class FinalSuccessChecker; +class GTestFlagSaver; +class TestResultAccessor; +class TestEventListenersAccessor; +class TestEventRepeater; +class WindowsDeathTest; +class UnitTestImpl* GetUnitTestImpl(); +void ReportFailureInUnknownLocation(TestPartResult::Type result_type, + const String& message); + +// Converts a streamable value to a String. A NULL pointer is +// converted to "(null)". When the input value is a ::string, +// ::std::string, ::wstring, or ::std::wstring object, each NUL +// character in it is replaced with "\\0". +// Declared in gtest-internal.h but defined here, so that it has access +// to the definition of the Message class, required by the ARM +// compiler. 
+template +String StreamableToString(const T& streamable) { + return (Message() << streamable).GetString(); +} + +} // namespace internal + +// The friend relationship of some of these classes is cyclic. +// If we don't forward declare them the compiler might confuse the classes +// in friendship clauses with same named classes on the scope. +class Test; +class TestCase; +class TestInfo; +class UnitTest; + +// A class for indicating whether an assertion was successful. When +// the assertion wasn't successful, the AssertionResult object +// remembers a non-empty message that describes how it failed. +// +// To create an instance of this class, use one of the factory functions +// (AssertionSuccess() and AssertionFailure()). +// +// This class is useful for two purposes: +// 1. Defining predicate functions to be used with Boolean test assertions +// EXPECT_TRUE/EXPECT_FALSE and their ASSERT_ counterparts +// 2. Defining predicate-format functions to be +// used with predicate assertions (ASSERT_PRED_FORMAT*, etc). +// +// For example, if you define IsEven predicate: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then the failed expectation EXPECT_TRUE(IsEven(Fib(5))) +// will print the message +// +// Value of: IsEven(Fib(5)) +// Actual: false (5 is odd) +// Expected: true +// +// instead of a more opaque +// +// Value of: IsEven(Fib(5)) +// Actual: false +// Expected: true +// +// in case IsEven is a simple Boolean predicate. 
+// +// If you expect your predicate to be reused and want to support informative +// messages in EXPECT_FALSE and ASSERT_FALSE (negative assertions show up +// about half as often as positive ones in our tests), supply messages for +// both success and failure cases: +// +// testing::AssertionResult IsEven(int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess() << n << " is even"; +// else +// return testing::AssertionFailure() << n << " is odd"; +// } +// +// Then a statement EXPECT_FALSE(IsEven(Fib(6))) will print +// +// Value of: IsEven(Fib(6)) +// Actual: true (8 is even) +// Expected: false +// +// NB: Predicates that support negative Boolean assertions have reduced +// performance in positive ones so be careful not to use them in tests +// that have lots (tens of thousands) of positive Boolean assertions. +// +// To use this class with EXPECT_PRED_FORMAT assertions such as: +// +// // Verifies that Foo() returns an even number. +// EXPECT_PRED_FORMAT1(IsEven, Foo()); +// +// you need to define: +// +// testing::AssertionResult IsEven(const char* expr, int n) { +// if ((n % 2) == 0) +// return testing::AssertionSuccess(); +// else +// return testing::AssertionFailure() +// << "Expected: " << expr << " is even\n Actual: it's " << n; +// } +// +// If Foo() returns 5, you will see the following message: +// +// Expected: Foo() is even +// Actual: it's 5 +// +class GTEST_API_ AssertionResult { + public: + // Copy constructor. + // Used in EXPECT_TRUE/FALSE(assertion_result). + AssertionResult(const AssertionResult& other); + // Used in the EXPECT_TRUE/FALSE(bool_expression). + explicit AssertionResult(bool success) : success_(success) {} + + // Returns true iff the assertion succeeded. + operator bool() const { return success_; } // NOLINT + + // Returns the assertion's negation. Used with EXPECT/ASSERT_FALSE. + AssertionResult operator!() const; + + // Returns the text streamed into this AssertionResult. 
Test assertions + // use it when they fail (i.e., the predicate's outcome doesn't match the + // assertion's expectation). When nothing has been streamed into the + // object, returns an empty string. + const char* message() const { + return message_.get() != NULL ? message_->c_str() : ""; + } + // TODO(vladl@google.com): Remove this after making sure no clients use it. + // Deprecated; please use message() instead. + const char* failure_message() const { return message(); } + + // Streams a custom failure message into this object. + template AssertionResult& operator<<(const T& value) { + AppendMessage(Message() << value); + return *this; + } + + // Allows streaming basic output manipulators such as endl or flush into + // this object. + AssertionResult& operator<<( + ::std::ostream& (*basic_manipulator)(::std::ostream& stream)) { + AppendMessage(Message() << basic_manipulator); + return *this; + } + + private: + // Appends the contents of message to message_. + void AppendMessage(const Message& a_message) { + if (message_.get() == NULL) + message_.reset(new ::std::string); + message_->append(a_message.GetString().c_str()); + } + + // Stores result of the assertion predicate. + bool success_; + // Stores the message describing the condition in case the expectation + // construct is not satisfied with the predicate's outcome. + // Referenced via a pointer to avoid taking too much stack frame space + // with test assertions. + internal::scoped_ptr< ::std::string> message_; + + GTEST_DISALLOW_ASSIGN_(AssertionResult); +}; + +// Makes a successful assertion result. +GTEST_API_ AssertionResult AssertionSuccess(); + +// Makes a failed assertion result. +GTEST_API_ AssertionResult AssertionFailure(); + +// Makes a failed assertion result with the given failure message. +// Deprecated; use AssertionFailure() << msg. +GTEST_API_ AssertionResult AssertionFailure(const Message& msg); + +// The abstract class that all tests inherit from. 
+// +// In Google Test, a unit test program contains one or many TestCases, and +// each TestCase contains one or many Tests. +// +// When you define a test using the TEST macro, you don't need to +// explicitly derive from Test - the TEST macro automatically does +// this for you. +// +// The only time you derive from Test is when defining a test fixture +// to be used a TEST_F. For example: +// +// class FooTest : public testing::Test { +// protected: +// virtual void SetUp() { ... } +// virtual void TearDown() { ... } +// ... +// }; +// +// TEST_F(FooTest, Bar) { ... } +// TEST_F(FooTest, Baz) { ... } +// +// Test is not copyable. +class GTEST_API_ Test { + public: + friend class TestInfo; + + // Defines types for pointers to functions that set up and tear down + // a test case. + typedef internal::SetUpTestCaseFunc SetUpTestCaseFunc; + typedef internal::TearDownTestCaseFunc TearDownTestCaseFunc; + + // The d'tor is virtual as we intend to inherit from Test. + virtual ~Test(); + + // Sets up the stuff shared by all tests in this test case. + // + // Google Test will call Foo::SetUpTestCase() before running the first + // test in test case Foo. Hence a sub-class can define its own + // SetUpTestCase() method to shadow the one defined in the super + // class. + static void SetUpTestCase() {} + + // Tears down the stuff shared by all tests in this test case. + // + // Google Test will call Foo::TearDownTestCase() after running the last + // test in test case Foo. Hence a sub-class can define its own + // TearDownTestCase() method to shadow the one defined in the super + // class. + static void TearDownTestCase() {} + + // Returns true iff the current test has a fatal failure. + static bool HasFatalFailure(); + + // Returns true iff the current test has a non-fatal failure. + static bool HasNonfatalFailure(); + + // Returns true iff the current test has a (either fatal or + // non-fatal) failure. 
+ static bool HasFailure() { return HasFatalFailure() || HasNonfatalFailure(); } + + // Logs a property for the current test. Only the last value for a given + // key is remembered. + // These are public static so they can be called from utility functions + // that are not members of the test fixture. + // The arguments are const char* instead strings, as Google Test is used + // on platforms where string doesn't compile. + // + // Note that a driving consideration for these RecordProperty methods + // was to produce xml output suited to the Greenspan charting utility, + // which at present will only chart values that fit in a 32-bit int. It + // is the user's responsibility to restrict their values to 32-bit ints + // if they intend them to be used with Greenspan. + static void RecordProperty(const char* key, const char* value); + static void RecordProperty(const char* key, int value); + + protected: + // Creates a Test object. + Test(); + + // Sets up the test fixture. + virtual void SetUp(); + + // Tears down the test fixture. + virtual void TearDown(); + + private: + // Returns true iff the current test has the same fixture class as + // the first test in the current test case. + static bool HasSameFixtureClass(); + + // Runs the test after the test fixture has been set up. + // + // A sub-class must implement this to define the test logic. + // + // DO NOT OVERRIDE THIS FUNCTION DIRECTLY IN A USER PROGRAM. + // Instead, use the TEST or TEST_F macro. + virtual void TestBody() = 0; + + // Sets up, executes, and tears down the test. + void Run(); + + // Deletes self. We deliberately pick an unusual name for this + // internal method to avoid clashing with names used in user TESTs. + void DeleteSelf_() { delete this; } + + // Uses a GTestFlagSaver to save and restore all Google Test flags. 
+ const internal::GTestFlagSaver* const gtest_flag_saver_; + + // Often a user mis-spells SetUp() as Setup() and spends a long time + // wondering why it is never called by Google Test. The declaration of + // the following method is solely for catching such an error at + // compile time: + // + // - The return type is deliberately chosen to be not void, so it + // will be a conflict if a user declares void Setup() in his test + // fixture. + // + // - This method is private, so it will be another compiler error + // if a user calls it from his test fixture. + // + // DO NOT OVERRIDE THIS FUNCTION. + // + // If you see an error about overriding the following function or + // about it being private, you have mis-spelled SetUp() as Setup(). + struct Setup_should_be_spelled_SetUp {}; + virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } + + // We disallow copying Tests. + GTEST_DISALLOW_COPY_AND_ASSIGN_(Test); +}; + +typedef internal::TimeInMillis TimeInMillis; + +// A copyable object representing a user specified test property which can be +// output as a key/value string pair. +// +// Don't inherit from TestProperty as its destructor is not virtual. +class TestProperty { + public: + // C'tor. TestProperty does NOT have a default constructor. + // Always use this constructor (with parameters) to create a + // TestProperty object. + TestProperty(const char* a_key, const char* a_value) : + key_(a_key), value_(a_value) { + } + + // Gets the user supplied key. + const char* key() const { + return key_.c_str(); + } + + // Gets the user supplied value. + const char* value() const { + return value_.c_str(); + } + + // Sets a new value, overriding the one supplied in the constructor. + void SetValue(const char* new_value) { + value_ = new_value; + } + + private: + // The key supplied by the user. + internal::String key_; + // The value supplied by the user. + internal::String value_; +}; + +// The result of a single Test. 
This includes a list of +// TestPartResults, a list of TestProperties, a count of how many +// death tests there are in the Test, and how much time it took to run +// the Test. +// +// TestResult is not copyable. +class GTEST_API_ TestResult { + public: + // Creates an empty TestResult. + TestResult(); + + // D'tor. Do not inherit from TestResult. + ~TestResult(); + + // Gets the number of all test parts. This is the sum of the number + // of successful test parts and the number of failed test parts. + int total_part_count() const; + + // Returns the number of the test properties. + int test_property_count() const; + + // Returns true iff the test passed (i.e. no test part failed). + bool Passed() const { return !Failed(); } + + // Returns true iff the test failed. + bool Failed() const; + + // Returns true iff the test fatally failed. + bool HasFatalFailure() const; + + // Returns true iff the test has a non-fatal failure. + bool HasNonfatalFailure() const; + + // Returns the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Returns the i-th test part result among all the results. i can range + // from 0 to test_property_count() - 1. If i is not in that range, aborts + // the program. + const TestPartResult& GetTestPartResult(int i) const; + + // Returns the i-th test property. i can range from 0 to + // test_property_count() - 1. If i is not in that range, aborts the + // program. + const TestProperty& GetTestProperty(int i) const; + + private: + friend class TestInfo; + friend class UnitTest; + friend class internal::DefaultGlobalTestPartResultReporter; + friend class internal::ExecDeathTest; + friend class internal::TestResultAccessor; + friend class internal::UnitTestImpl; + friend class internal::WindowsDeathTest; + + // Gets the vector of TestPartResults. + const std::vector& test_part_results() const { + return test_part_results_; + } + + // Gets the vector of TestProperties. 
+ const std::vector& test_properties() const { + return test_properties_; + } + + // Sets the elapsed time. + void set_elapsed_time(TimeInMillis elapsed) { elapsed_time_ = elapsed; } + + // Adds a test property to the list. The property is validated and may add + // a non-fatal failure if invalid (e.g., if it conflicts with reserved + // key names). If a property is already recorded for the same key, the + // value will be updated, rather than storing multiple values for the same + // key. + void RecordProperty(const TestProperty& test_property); + + // Adds a failure if the key is a reserved attribute of Google Test + // testcase tags. Returns true if the property is valid. + // TODO(russr): Validate attribute names are legal and human readable. + static bool ValidateTestProperty(const TestProperty& test_property); + + // Adds a test part result to the list. + void AddTestPartResult(const TestPartResult& test_part_result); + + // Returns the death test count. + int death_test_count() const { return death_test_count_; } + + // Increments the death test count, returning the new count. + int increment_death_test_count() { return ++death_test_count_; } + + // Clears the test part results. + void ClearTestPartResults(); + + // Clears the object. + void Clear(); + + // Protects mutable state of the property vector and of owned + // properties, whose values may be updated. + internal::Mutex test_properites_mutex_; + + // The vector of TestPartResults + std::vector test_part_results_; + // The vector of TestProperties + std::vector test_properties_; + // Running count of death tests. + int death_test_count_; + // The elapsed time, in milliseconds. + TimeInMillis elapsed_time_; + + // We disallow copying TestResult. 
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(TestResult); +}; // class TestResult + +// A TestInfo object stores the following information about a test: +// +// Test case name +// Test name +// Whether the test should be run +// A function pointer that creates the test object when invoked +// Test result +// +// The constructor of TestInfo registers itself with the UnitTest +// singleton such that the RUN_ALL_TESTS() macro knows which tests to +// run. +class GTEST_API_ TestInfo { + public: + // Destructs a TestInfo object. This function is not virtual, so + // don't inherit from TestInfo. + ~TestInfo(); + + // Returns the test case name. + const char* test_case_name() const { return test_case_name_.c_str(); } + + // Returns the test name. + const char* name() const { return name_.c_str(); } + + // Returns the name of the parameter type, or NULL if this is not a typed + // or a type-parameterized test. + const char* type_param() const { + if (type_param_.get() != NULL) + return type_param_->c_str(); + return NULL; + } + + // Returns the text representation of the value parameter, or NULL if this + // is not a value-parameterized test. + const char* value_param() const { + if (value_param_.get() != NULL) + return value_param_->c_str(); + return NULL; + } + + // Returns true if this test should run, that is if the test is not disabled + // (or it is disabled but the also_run_disabled_tests flag has been specified) + // and its full name matches the user-specified filter. + // + // Google Test allows the user to filter the tests by their full names. + // The full name of a test Bar in test case Foo is defined as + // "Foo.Bar". Only the tests that match the filter will run. + // + // A filter is a colon-separated list of glob (not regex) patterns, + // optionally followed by a '-' and a colon-separated list of + // negative patterns (tests to exclude). A test is run if it + // matches one of the positive patterns and does not match any of + // the negative patterns. 
+ // + // For example, *A*:Foo.* is a filter that matches any string that + // contains the character 'A' or starts with "Foo.". + bool should_run() const { return should_run_; } + + // Returns the result of the test. + const TestResult* result() const { return &result_; } + + private: + +#if GTEST_HAS_DEATH_TEST + friend class internal::DefaultDeathTestFactory; +#endif // GTEST_HAS_DEATH_TEST + friend class Test; + friend class TestCase; + friend class internal::UnitTestImpl; + friend TestInfo* internal::MakeAndRegisterTestInfo( + const char* test_case_name, const char* name, + const char* type_param, + const char* value_param, + internal::TypeId fixture_class_id, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc, + internal::TestFactoryBase* factory); + + // Constructs a TestInfo object. The newly constructed instance assumes + // ownership of the factory object. + TestInfo(const char* test_case_name, const char* name, + const char* a_type_param, + const char* a_value_param, + internal::TypeId fixture_class_id, + internal::TestFactoryBase* factory); + + // Increments the number of death tests encountered in this test so + // far. + int increment_death_test_count() { + return result_.increment_death_test_count(); + } + + // Creates the test object, runs it, records its result, and then + // deletes it. + void Run(); + + static void ClearTestResult(TestInfo* test_info) { + test_info->result_.Clear(); + } + + // These fields are immutable properties of the test. + const std::string test_case_name_; // Test case name + const std::string name_; // Test name + // Name of the parameter type, or NULL if this is not a typed or a + // type-parameterized test. + const internal::scoped_ptr type_param_; + // Text representation of the value parameter, or NULL if this is not a + // value-parameterized test. 
+ const internal::scoped_ptr value_param_; + const internal::TypeId fixture_class_id_; // ID of the test fixture class + bool should_run_; // True iff this test should run + bool is_disabled_; // True iff this test is disabled + bool matches_filter_; // True if this test matches the + // user-specified filter. + internal::TestFactoryBase* const factory_; // The factory that creates + // the test object + + // This field is mutable and needs to be reset before running the + // test for the second time. + TestResult result_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestInfo); +}; + +// A test case, which consists of a vector of TestInfos. +// +// TestCase is not copyable. +class GTEST_API_ TestCase { + public: + // Creates a TestCase with the given name. + // + // TestCase does NOT have a default constructor. Always use this + // constructor to create a TestCase object. + // + // Arguments: + // + // name: name of the test case + // a_type_param: the name of the test's type parameter, or NULL if + // this is not a type-parameterized test. + // set_up_tc: pointer to the function that sets up the test case + // tear_down_tc: pointer to the function that tears down the test case + TestCase(const char* name, const char* a_type_param, + Test::SetUpTestCaseFunc set_up_tc, + Test::TearDownTestCaseFunc tear_down_tc); + + // Destructor of TestCase. + virtual ~TestCase(); + + // Gets the name of the TestCase. + const char* name() const { return name_.c_str(); } + + // Returns the name of the parameter type, or NULL if this is not a + // type-parameterized test case. + const char* type_param() const { + if (type_param_.get() != NULL) + return type_param_->c_str(); + return NULL; + } + + // Returns true if any test in this test case should run. + bool should_run() const { return should_run_; } + + // Gets the number of successful tests in this test case. + int successful_test_count() const; + + // Gets the number of failed tests in this test case. 
+ int failed_test_count() const; + + // Gets the number of disabled tests in this test case. + int disabled_test_count() const; + + // Get the number of tests in this test case that should run. + int test_to_run_count() const; + + // Gets the number of all tests in this test case. + int total_test_count() const; + + // Returns true iff the test case passed. + bool Passed() const { return !Failed(); } + + // Returns true iff the test case failed. + bool Failed() const { return failed_test_count() > 0; } + + // Returns the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const { return elapsed_time_; } + + // Returns the i-th test among all the tests. i can range from 0 to + // total_test_count() - 1. If i is not in that range, returns NULL. + const TestInfo* GetTestInfo(int i) const; + + private: + friend class Test; + friend class internal::UnitTestImpl; + + // Gets the (mutable) vector of TestInfos in this TestCase. + std::vector& test_info_list() { return test_info_list_; } + + // Gets the (immutable) vector of TestInfos in this TestCase. + const std::vector& test_info_list() const { + return test_info_list_; + } + + // Returns the i-th test among all the tests. i can range from 0 to + // total_test_count() - 1. If i is not in that range, returns NULL. + TestInfo* GetMutableTestInfo(int i); + + // Sets the should_run member. + void set_should_run(bool should) { should_run_ = should; } + + // Adds a TestInfo to this test case. Will delete the TestInfo upon + // destruction of the TestCase object. + void AddTestInfo(TestInfo * test_info); + + // Clears the results of all tests in this test case. + void ClearResult(); + + // Clears the results of all tests in the given test case. + static void ClearTestCaseResult(TestCase* test_case) { + test_case->ClearResult(); + } + + // Runs every test in this TestCase. + void Run(); + + // Runs SetUpTestCase() for this TestCase. This wrapper is needed + // for catching exceptions thrown from SetUpTestCase(). 
+ void RunSetUpTestCase() { (*set_up_tc_)(); } + + // Runs TearDownTestCase() for this TestCase. This wrapper is + // needed for catching exceptions thrown from TearDownTestCase(). + void RunTearDownTestCase() { (*tear_down_tc_)(); } + + // Returns true iff test passed. + static bool TestPassed(const TestInfo* test_info) { + return test_info->should_run() && test_info->result()->Passed(); + } + + // Returns true iff test failed. + static bool TestFailed(const TestInfo* test_info) { + return test_info->should_run() && test_info->result()->Failed(); + } + + // Returns true iff test is disabled. + static bool TestDisabled(const TestInfo* test_info) { + return test_info->is_disabled_; + } + + // Returns true if the given test should run. + static bool ShouldRunTest(const TestInfo* test_info) { + return test_info->should_run(); + } + + // Shuffles the tests in this test case. + void ShuffleTests(internal::Random* random); + + // Restores the test order to before the first shuffle. + void UnshuffleTests(); + + // Name of the test case. + internal::String name_; + // Name of the parameter type, or NULL if this is not a typed or a + // type-parameterized test. + const internal::scoped_ptr type_param_; + // The vector of TestInfos in their original order. It owns the + // elements in the vector. + std::vector test_info_list_; + // Provides a level of indirection for the test list to allow easy + // shuffling and restoring the test order. The i-th element in this + // vector is the index of the i-th test in the shuffled test list. + std::vector test_indices_; + // Pointer to the function that sets up the test case. + Test::SetUpTestCaseFunc set_up_tc_; + // Pointer to the function that tears down the test case. + Test::TearDownTestCaseFunc tear_down_tc_; + // True iff any test in this test case should run. + bool should_run_; + // Elapsed time, in milliseconds. + TimeInMillis elapsed_time_; + + // We disallow copying TestCases. 
+ GTEST_DISALLOW_COPY_AND_ASSIGN_(TestCase); +}; + +// An Environment object is capable of setting up and tearing down an +// environment. The user should subclass this to define his own +// environment(s). +// +// An Environment object does the set-up and tear-down in virtual +// methods SetUp() and TearDown() instead of the constructor and the +// destructor, as: +// +// 1. You cannot safely throw from a destructor. This is a problem +// as in some cases Google Test is used where exceptions are enabled, and +// we may want to implement ASSERT_* using exceptions where they are +// available. +// 2. You cannot use ASSERT_* directly in a constructor or +// destructor. +class Environment { + public: + // The d'tor is virtual as we need to subclass Environment. + virtual ~Environment() {} + + // Override this to define how to set up the environment. + virtual void SetUp() {} + + // Override this to define how to tear down the environment. + virtual void TearDown() {} + private: + // If you see an error about overriding the following function or + // about it being private, you have mis-spelled SetUp() as Setup(). + struct Setup_should_be_spelled_SetUp {}; + virtual Setup_should_be_spelled_SetUp* Setup() { return NULL; } +}; + +// The interface for tracing execution of tests. The methods are organized in +// the order the corresponding events are fired. +class TestEventListener { + public: + virtual ~TestEventListener() {} + + // Fired before any test activity starts. + virtual void OnTestProgramStart(const UnitTest& unit_test) = 0; + + // Fired before each iteration of tests starts. There may be more than + // one iteration if GTEST_FLAG(repeat) is set. iteration is the iteration + // index, starting from 0. + virtual void OnTestIterationStart(const UnitTest& unit_test, + int iteration) = 0; + + // Fired before environment set-up for each iteration of tests starts. 
+ virtual void OnEnvironmentsSetUpStart(const UnitTest& unit_test) = 0; + + // Fired after environment set-up for each iteration of tests ends. + virtual void OnEnvironmentsSetUpEnd(const UnitTest& unit_test) = 0; + + // Fired before the test case starts. + virtual void OnTestCaseStart(const TestCase& test_case) = 0; + + // Fired before the test starts. + virtual void OnTestStart(const TestInfo& test_info) = 0; + + // Fired after a failed assertion or a SUCCEED() invocation. + virtual void OnTestPartResult(const TestPartResult& test_part_result) = 0; + + // Fired after the test ends. + virtual void OnTestEnd(const TestInfo& test_info) = 0; + + // Fired after the test case ends. + virtual void OnTestCaseEnd(const TestCase& test_case) = 0; + + // Fired before environment tear-down for each iteration of tests starts. + virtual void OnEnvironmentsTearDownStart(const UnitTest& unit_test) = 0; + + // Fired after environment tear-down for each iteration of tests ends. + virtual void OnEnvironmentsTearDownEnd(const UnitTest& unit_test) = 0; + + // Fired after each iteration of tests finishes. + virtual void OnTestIterationEnd(const UnitTest& unit_test, + int iteration) = 0; + + // Fired after all test activities have ended. + virtual void OnTestProgramEnd(const UnitTest& unit_test) = 0; +}; + +// The convenience class for users who need to override just one or two +// methods and are not concerned that a possible change to a signature of +// the methods they override will not be caught during the build. For +// comments about each method please see the definition of TestEventListener +// above. 
+class EmptyTestEventListener : public TestEventListener { + public: + virtual void OnTestProgramStart(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationStart(const UnitTest& /*unit_test*/, + int /*iteration*/) {} + virtual void OnEnvironmentsSetUpStart(const UnitTest& /*unit_test*/) {} + virtual void OnEnvironmentsSetUpEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestCaseStart(const TestCase& /*test_case*/) {} + virtual void OnTestStart(const TestInfo& /*test_info*/) {} + virtual void OnTestPartResult(const TestPartResult& /*test_part_result*/) {} + virtual void OnTestEnd(const TestInfo& /*test_info*/) {} + virtual void OnTestCaseEnd(const TestCase& /*test_case*/) {} + virtual void OnEnvironmentsTearDownStart(const UnitTest& /*unit_test*/) {} + virtual void OnEnvironmentsTearDownEnd(const UnitTest& /*unit_test*/) {} + virtual void OnTestIterationEnd(const UnitTest& /*unit_test*/, + int /*iteration*/) {} + virtual void OnTestProgramEnd(const UnitTest& /*unit_test*/) {} +}; + +// TestEventListeners lets users add listeners to track events in Google Test. +class GTEST_API_ TestEventListeners { + public: + TestEventListeners(); + ~TestEventListeners(); + + // Appends an event listener to the end of the list. Google Test assumes + // the ownership of the listener (i.e. it will delete the listener when + // the test program finishes). + void Append(TestEventListener* listener); + + // Removes the given event listener from the list and returns it. It then + // becomes the caller's responsibility to delete the listener. Returns + // NULL if the listener is not found in the list. + TestEventListener* Release(TestEventListener* listener); + + // Returns the standard listener responsible for the default console + // output. Can be removed from the listeners list to shut down default + // console output. 
Note that removing this object from the listener list + // with Release transfers its ownership to the caller and makes this + // function return NULL the next time. + TestEventListener* default_result_printer() const { + return default_result_printer_; + } + + // Returns the standard listener responsible for the default XML output + // controlled by the --gtest_output=xml flag. Can be removed from the + // listeners list by users who want to shut down the default XML output + // controlled by this flag and substitute it with custom one. Note that + // removing this object from the listener list with Release transfers its + // ownership to the caller and makes this function return NULL the next + // time. + TestEventListener* default_xml_generator() const { + return default_xml_generator_; + } + + private: + friend class TestCase; + friend class TestInfo; + friend class internal::DefaultGlobalTestPartResultReporter; + friend class internal::NoExecDeathTest; + friend class internal::TestEventListenersAccessor; + friend class internal::UnitTestImpl; + + // Returns repeater that broadcasts the TestEventListener events to all + // subscribers. + TestEventListener* repeater(); + + // Sets the default_result_printer attribute to the provided listener. + // The listener is also added to the listener list and previous + // default_result_printer is removed from it and deleted. The listener can + // also be NULL in which case it will not be added to the list. Does + // nothing if the previous and the current listener objects are the same. + void SetDefaultResultPrinter(TestEventListener* listener); + + // Sets the default_xml_generator attribute to the provided listener. The + // listener is also added to the listener list and previous + // default_xml_generator is removed from it and deleted. The listener can + // also be NULL in which case it will not be added to the list. Does + // nothing if the previous and the current listener objects are the same. 
+ void SetDefaultXmlGenerator(TestEventListener* listener); + + // Controls whether events will be forwarded by the repeater to the + // listeners in the list. + bool EventForwardingEnabled() const; + void SuppressEventForwarding(); + + // The actual list of listeners. + internal::TestEventRepeater* repeater_; + // Listener responsible for the standard result output. + TestEventListener* default_result_printer_; + // Listener responsible for the creation of the XML output file. + TestEventListener* default_xml_generator_; + + // We disallow copying TestEventListeners. + GTEST_DISALLOW_COPY_AND_ASSIGN_(TestEventListeners); +}; + +// A UnitTest consists of a vector of TestCases. +// +// This is a singleton class. The only instance of UnitTest is +// created when UnitTest::GetInstance() is first called. This +// instance is never deleted. +// +// UnitTest is not copyable. +// +// This class is thread-safe as long as the methods are called +// according to their specification. +class GTEST_API_ UnitTest { + public: + // Gets the singleton UnitTest object. The first time this method + // is called, a UnitTest object is constructed and returned. + // Consecutive calls will return the same object. + static UnitTest* GetInstance(); + + // Runs all tests in this UnitTest object and prints the result. + // Returns 0 if successful, or 1 otherwise. + // + // This method can only be called from the main thread. + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + int Run() GTEST_MUST_USE_RESULT_; + + // Returns the working directory when the first TEST() or TEST_F() + // was executed. The UnitTest object owns the string. + const char* original_working_dir() const; + + // Returns the TestCase object for the test that's currently running, + // or NULL if no test is running. + const TestCase* current_test_case() const; + + // Returns the TestInfo object for the test that's currently running, + // or NULL if no test is running. 
+ const TestInfo* current_test_info() const; + + // Returns the random seed used at the start of the current test run. + int random_seed() const; + +#if GTEST_HAS_PARAM_TEST + // Returns the ParameterizedTestCaseRegistry object used to keep track of + // value-parameterized tests and instantiate and register them. + // + // INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + internal::ParameterizedTestCaseRegistry& parameterized_test_registry(); +#endif // GTEST_HAS_PARAM_TEST + + // Gets the number of successful test cases. + int successful_test_case_count() const; + + // Gets the number of failed test cases. + int failed_test_case_count() const; + + // Gets the number of all test cases. + int total_test_case_count() const; + + // Gets the number of all test cases that contain at least one test + // that should run. + int test_case_to_run_count() const; + + // Gets the number of successful tests. + int successful_test_count() const; + + // Gets the number of failed tests. + int failed_test_count() const; + + // Gets the number of disabled tests. + int disabled_test_count() const; + + // Gets the number of all tests. + int total_test_count() const; + + // Gets the number of tests that should run. + int test_to_run_count() const; + + // Gets the elapsed time, in milliseconds. + TimeInMillis elapsed_time() const; + + // Returns true iff the unit test passed (i.e. all test cases passed). + bool Passed() const; + + // Returns true iff the unit test failed (i.e. some test case failed + // or something outside of all tests failed). + bool Failed() const; + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + const TestCase* GetTestCase(int i) const; + + // Returns the list of event listeners that can be used to track events + // inside Google Test. + TestEventListeners& listeners(); + + private: + // Registers and returns a global test environment. 
When a test + // program is run, all global test environments will be set-up in + // the order they were registered. After all tests in the program + // have finished, all global test environments will be torn-down in + // the *reverse* order they were registered. + // + // The UnitTest object takes ownership of the given environment. + // + // This method can only be called from the main thread. + Environment* AddEnvironment(Environment* env); + + // Adds a TestPartResult to the current TestResult object. All + // Google Test assertion macros (e.g. ASSERT_TRUE, EXPECT_EQ, etc) + // eventually call this to report their results. The user code + // should use the assertion macros instead of calling this directly. + void AddTestPartResult(TestPartResult::Type result_type, + const char* file_name, + int line_number, + const internal::String& message, + const internal::String& os_stack_trace); + + // Adds a TestProperty to the current TestResult object. If the result already + // contains a property with the same key, the value will be updated. + void RecordPropertyForCurrentTest(const char* key, const char* value); + + // Gets the i-th test case among all the test cases. i can range from 0 to + // total_test_case_count() - 1. If i is not in that range, returns NULL. + TestCase* GetMutableTestCase(int i); + + // Accessors for the implementation object. + internal::UnitTestImpl* impl() { return impl_; } + const internal::UnitTestImpl* impl() const { return impl_; } + + // These classes and funcions are friends as they need to access private + // members of UnitTest. + friend class Test; + friend class internal::AssertHelper; + friend class internal::ScopedTrace; + friend Environment* AddGlobalTestEnvironment(Environment* env); + friend internal::UnitTestImpl* internal::GetUnitTestImpl(); + friend void internal::ReportFailureInUnknownLocation( + TestPartResult::Type result_type, + const internal::String& message); + + // Creates an empty UnitTest. 
+ UnitTest(); + + // D'tor + virtual ~UnitTest(); + + // Pushes a trace defined by SCOPED_TRACE() on to the per-thread + // Google Test trace stack. + void PushGTestTrace(const internal::TraceInfo& trace); + + // Pops a trace from the per-thread Google Test trace stack. + void PopGTestTrace(); + + // Protects mutable state in *impl_. This is mutable as some const + // methods need to lock it too. + mutable internal::Mutex mutex_; + + // Opaque implementation object. This field is never changed once + // the object is constructed. We don't mark it as const here, as + // doing so will cause a warning in the constructor of UnitTest. + // Mutable state in *impl_ is protected by mutex_. + internal::UnitTestImpl* impl_; + + // We disallow copying UnitTest. + GTEST_DISALLOW_COPY_AND_ASSIGN_(UnitTest); +}; + +// A convenient wrapper for adding an environment for the test +// program. +// +// You should call this before RUN_ALL_TESTS() is called, probably in +// main(). If you use gtest_main, you need to call this before main() +// starts for it to take effect. For example, you can define a global +// variable like this: +// +// testing::Environment* const foo_env = +// testing::AddGlobalTestEnvironment(new FooEnvironment); +// +// However, we strongly recommend you to write your own main() and +// call AddGlobalTestEnvironment() there, as relying on initialization +// of global variables makes the code harder to read and may cause +// problems when you register multiple environments from different +// translation units and the environments have dependencies among them +// (remember that the compiler doesn't guarantee the order in which +// global variables from different translation units are initialized). +inline Environment* AddGlobalTestEnvironment(Environment* env) { + return UnitTest::GetInstance()->AddEnvironment(env); +} + +// Initializes Google Test. This must be called before calling +// RUN_ALL_TESTS(). 
In particular, it parses a command line for the +// flags that Google Test recognizes. Whenever a Google Test flag is +// seen, it is removed from argv, and *argc is decremented. +// +// No value is returned. Instead, the Google Test flag variables are +// updated. +// +// Calling the function for the second time has no user-visible effect. +GTEST_API_ void InitGoogleTest(int* argc, char** argv); + +// This overloaded version can be used in Windows programs compiled in +// UNICODE mode. +GTEST_API_ void InitGoogleTest(int* argc, wchar_t** argv); + +namespace internal { + +// Formats a comparison assertion (e.g. ASSERT_EQ, EXPECT_LT, and etc) +// operand to be used in a failure message. The type (but not value) +// of the other operand may affect the format. This allows us to +// print a char* as a raw pointer when it is compared against another +// char*, and print it as a C string when it is compared against an +// std::string object, for example. +// +// The default implementation ignores the type of the other operand. +// Some specialized versions are used to handle formatting wide or +// narrow C strings. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +template +String FormatForComparisonFailureMessage(const T1& value, + const T2& /* other_operand */) { + // C++Builder compiles this incorrectly if the namespace isn't explicitly + // given. + return ::testing::PrintToString(value); +} + +// The helper function for {ASSERT|EXPECT}_EQ. +template +AssertionResult CmpHelperEQ(const char* expected_expression, + const char* actual_expression, + const T1& expected, + const T2& actual) { +#ifdef _MSC_VER +# pragma warning(push) // Saves the current warning state. +# pragma warning(disable:4389) // Temporarily disables warning on + // signed/unsigned mismatch. +#endif + + if (expected == actual) { + return AssertionSuccess(); + } + +#ifdef _MSC_VER +# pragma warning(pop) // Restores the warning state. 
+#endif + + return EqFailure(expected_expression, + actual_expression, + FormatForComparisonFailureMessage(expected, actual), + FormatForComparisonFailureMessage(actual, expected), + false); +} + +// With this overloaded version, we allow anonymous enums to be used +// in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous enums +// can be implicitly cast to BiggestInt. +GTEST_API_ AssertionResult CmpHelperEQ(const char* expected_expression, + const char* actual_expression, + BiggestInt expected, + BiggestInt actual); + +// The helper class for {ASSERT|EXPECT}_EQ. The template argument +// lhs_is_null_literal is true iff the first argument to ASSERT_EQ() +// is a null pointer literal. The following default implementation is +// for lhs_is_null_literal being false. +template +class EqHelper { + public: + // This templatized version is for the general case. + template + static AssertionResult Compare(const char* expected_expression, + const char* actual_expression, + const T1& expected, + const T2& actual) { + return CmpHelperEQ(expected_expression, actual_expression, expected, + actual); + } + + // With this overloaded version, we allow anonymous enums to be used + // in {ASSERT|EXPECT}_EQ when compiled with gcc 4, as anonymous + // enums can be implicitly cast to BiggestInt. + // + // Even though its body looks the same as the above version, we + // cannot merge the two, as it will make anonymous enums unhappy. + static AssertionResult Compare(const char* expected_expression, + const char* actual_expression, + BiggestInt expected, + BiggestInt actual) { + return CmpHelperEQ(expected_expression, actual_expression, expected, + actual); + } +}; + +// This specialization is used when the first argument to ASSERT_EQ() +// is a null pointer literal, like NULL, false, or 0. +template <> +class EqHelper { + public: + // We define two overloaded versions of Compare(). 
The first + // version will be picked when the second argument to ASSERT_EQ() is + // NOT a pointer, e.g. ASSERT_EQ(0, AnIntFunction()) or + // EXPECT_EQ(false, a_bool). + template + static AssertionResult Compare( + const char* expected_expression, + const char* actual_expression, + const T1& expected, + const T2& actual, + // The following line prevents this overload from being considered if T2 + // is not a pointer type. We need this because ASSERT_EQ(NULL, my_ptr) + // expands to Compare("", "", NULL, my_ptr), which requires a conversion + // to match the Secret* in the other overload, which would otherwise make + // this template match better. + typename EnableIf::value>::type* = 0) { + return CmpHelperEQ(expected_expression, actual_expression, expected, + actual); + } + + // This version will be picked when the second argument to ASSERT_EQ() is a + // pointer, e.g. ASSERT_EQ(NULL, a_pointer). + template + static AssertionResult Compare( + const char* expected_expression, + const char* actual_expression, + // We used to have a second template parameter instead of Secret*. That + // template parameter would deduce to 'long', making this a better match + // than the first overload even without the first overload's EnableIf. + // Unfortunately, gcc with -Wconversion-null warns when "passing NULL to + // non-pointer argument" (even a deduced integral argument), so the old + // implementation caused warnings in user code. + Secret* /* expected (NULL) */, + T* actual) { + // We already know that 'expected' is a null pointer. + return CmpHelperEQ(expected_expression, actual_expression, + static_cast(NULL), actual); + } +}; + +// A macro for implementing the helper functions needed to implement +// ASSERT_?? and EXPECT_??. It is here just to avoid copy-and-paste +// of similar code. 
+// +// For each templatized helper function, we also define an overloaded +// version for BiggestInt in order to reduce code bloat and allow +// anonymous enums to be used with {ASSERT|EXPECT}_?? when compiled +// with gcc 4. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +#define GTEST_IMPL_CMP_HELPER_(op_name, op)\ +template \ +AssertionResult CmpHelper##op_name(const char* expr1, const char* expr2, \ + const T1& val1, const T2& val2) {\ + if (val1 op val2) {\ + return AssertionSuccess();\ + } else {\ + return AssertionFailure() \ + << "Expected: (" << expr1 << ") " #op " (" << expr2\ + << "), actual: " << FormatForComparisonFailureMessage(val1, val2)\ + << " vs " << FormatForComparisonFailureMessage(val2, val1);\ + }\ +}\ +GTEST_API_ AssertionResult CmpHelper##op_name(\ + const char* expr1, const char* expr2, BiggestInt val1, BiggestInt val2) + +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. + +// Implements the helper function for {ASSERT|EXPECT}_NE +GTEST_IMPL_CMP_HELPER_(NE, !=); +// Implements the helper function for {ASSERT|EXPECT}_LE +GTEST_IMPL_CMP_HELPER_(LE, <=); +// Implements the helper function for {ASSERT|EXPECT}_LT +GTEST_IMPL_CMP_HELPER_(LT, < ); +// Implements the helper function for {ASSERT|EXPECT}_GE +GTEST_IMPL_CMP_HELPER_(GE, >=); +// Implements the helper function for {ASSERT|EXPECT}_GT +GTEST_IMPL_CMP_HELPER_(GT, > ); + +#undef GTEST_IMPL_CMP_HELPER_ + +// The helper function for {ASSERT|EXPECT}_STREQ. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, + const char* actual_expression, + const char* expected, + const char* actual); + +// The helper function for {ASSERT|EXPECT}_STRCASEEQ. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
+GTEST_API_ AssertionResult CmpHelperSTRCASEEQ(const char* expected_expression, + const char* actual_expression, + const char* expected, + const char* actual); + +// The helper function for {ASSERT|EXPECT}_STRNE. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); + +// The helper function for {ASSERT|EXPECT}_STRCASENE. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRCASENE(const char* s1_expression, + const char* s2_expression, + const char* s1, + const char* s2); + + +// Helper function for *_STREQ on wide strings. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTREQ(const char* expected_expression, + const char* actual_expression, + const wchar_t* expected, + const wchar_t* actual); + +// Helper function for *_STRNE on wide strings. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult CmpHelperSTRNE(const char* s1_expression, + const char* s2_expression, + const wchar_t* s1, + const wchar_t* s2); + +} // namespace internal + +// IsSubstring() and IsNotSubstring() are intended to be used as the +// first argument to {EXPECT,ASSERT}_PRED_FORMAT2(), not by +// themselves. They check whether needle is a substring of haystack +// (NULL is considered a substring of itself only), and return an +// appropriate error message when they fail. +// +// The {needle,haystack}_expr arguments are the stringified +// expressions that generated the two real arguments. 
+GTEST_API_ AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack); +GTEST_API_ AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack); +GTEST_API_ AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const char* needle, const char* haystack); +GTEST_API_ AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const wchar_t* needle, const wchar_t* haystack); +GTEST_API_ AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack); +GTEST_API_ AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::string& needle, const ::std::string& haystack); + +#if GTEST_HAS_STD_WSTRING +GTEST_API_ AssertionResult IsSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack); +GTEST_API_ AssertionResult IsNotSubstring( + const char* needle_expr, const char* haystack_expr, + const ::std::wstring& needle, const ::std::wstring& haystack); +#endif // GTEST_HAS_STD_WSTRING + +namespace internal { + +// Helper template function for comparing floating-points. +// +// Template parameter: +// +// RawType: the raw floating-point type (either float or double) +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. 
+template +AssertionResult CmpHelperFloatingPointEQ(const char* expected_expression, + const char* actual_expression, + RawType expected, + RawType actual) { + const FloatingPoint lhs(expected), rhs(actual); + + if (lhs.AlmostEquals(rhs)) { + return AssertionSuccess(); + } + + ::std::stringstream expected_ss; + expected_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << expected; + + ::std::stringstream actual_ss; + actual_ss << std::setprecision(std::numeric_limits::digits10 + 2) + << actual; + + return EqFailure(expected_expression, + actual_expression, + StringStreamToString(&expected_ss), + StringStreamToString(&actual_ss), + false); +} + +// Helper function for implementing ASSERT_NEAR. +// +// INTERNAL IMPLEMENTATION - DO NOT USE IN A USER PROGRAM. +GTEST_API_ AssertionResult DoubleNearPredFormat(const char* expr1, + const char* expr2, + const char* abs_error_expr, + double val1, + double val2, + double abs_error); + +// INTERNAL IMPLEMENTATION - DO NOT USE IN USER CODE. +// A class that enables one to stream messages to assertion macros +class GTEST_API_ AssertHelper { + public: + // Constructor. + AssertHelper(TestPartResult::Type type, + const char* file, + int line, + const char* message); + ~AssertHelper(); + + // Message assignment is a semantic trick to enable assertion + // streaming; see the GTEST_MESSAGE_ macro below. + void operator=(const Message& message) const; + + private: + // We put our data in a struct so that the size of the AssertHelper class can + // be as small as possible. This is important because gcc is incapable of + // re-using stack space even for temporary variables, so every EXPECT_EQ + // reserves stack space for another AssertHelper. 
+ struct AssertHelperData { + AssertHelperData(TestPartResult::Type t, + const char* srcfile, + int line_num, + const char* msg) + : type(t), file(srcfile), line(line_num), message(msg) { } + + TestPartResult::Type const type; + const char* const file; + int const line; + String const message; + + private: + GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelperData); + }; + + AssertHelperData* const data_; + + GTEST_DISALLOW_COPY_AND_ASSIGN_(AssertHelper); +}; + +} // namespace internal + +#if GTEST_HAS_PARAM_TEST +// The pure interface class that all value-parameterized tests inherit from. +// A value-parameterized class must inherit from both ::testing::Test and +// ::testing::WithParamInterface. In most cases that just means inheriting +// from ::testing::TestWithParam, but more complicated test hierarchies +// may need to inherit from Test and WithParamInterface at different levels. +// +// This interface has support for accessing the test parameter value via +// the GetParam() method. +// +// Use it with one of the parameter generator defining functions, like Range(), +// Values(), ValuesIn(), Bool(), and Combine(). +// +// class FooTest : public ::testing::TestWithParam { +// protected: +// FooTest() { +// // Can use GetParam() here. +// } +// virtual ~FooTest() { +// // Can use GetParam() here. +// } +// virtual void SetUp() { +// // Can use GetParam() here. +// } +// virtual void TearDown { +// // Can use GetParam() here. +// } +// }; +// TEST_P(FooTest, DoesBar) { +// // Can use GetParam() method here. +// Foo foo; +// ASSERT_TRUE(foo.DoesBar(GetParam())); +// } +// INSTANTIATE_TEST_CASE_P(OneToTenRange, FooTest, ::testing::Range(1, 10)); + +template +class WithParamInterface { + public: + typedef T ParamType; + virtual ~WithParamInterface() {} + + // The current parameter value. Is also available in the test fixture's + // constructor. 
This member function is non-static, even though it only + // references static data, to reduce the opportunity for incorrect uses + // like writing 'WithParamInterface::GetParam()' for a test that + // uses a fixture whose parameter type is int. + const ParamType& GetParam() const { return *parameter_; } + + private: + // Sets parameter value. The caller is responsible for making sure the value + // remains alive and unchanged throughout the current test. + static void SetParam(const ParamType* parameter) { + parameter_ = parameter; + } + + // Static value used for accessing parameter during a test lifetime. + static const ParamType* parameter_; + + // TestClass must be a subclass of WithParamInterface and Test. + template friend class internal::ParameterizedTestFactory; +}; + +template +const T* WithParamInterface::parameter_ = NULL; + +// Most value-parameterized classes can ignore the existence of +// WithParamInterface, and can just inherit from ::testing::TestWithParam. + +template +class TestWithParam : public Test, public WithParamInterface { +}; + +#endif // GTEST_HAS_PARAM_TEST + +// Macros for indicating success/failure in test code. + +// ADD_FAILURE unconditionally adds a failure to the current test. +// SUCCEED generates a success - it doesn't automatically make the +// current test successful, as a test is only successful when it has +// no failure. +// +// EXPECT_* verifies that a certain condition is satisfied. If not, +// it behaves like ADD_FAILURE. In particular: +// +// EXPECT_TRUE verifies that a Boolean condition is true. +// EXPECT_FALSE verifies that a Boolean condition is false. +// +// FAIL and ASSERT_* are similar to ADD_FAILURE and EXPECT_*, except +// that they will also abort the current function on failure. People +// usually want the fail-fast behavior of FAIL and ASSERT_*, but those +// writing data-driven tests often find themselves using ADD_FAILURE +// and EXPECT_* more. 
+// +// Examples: +// +// EXPECT_TRUE(server.StatusIsOK()); +// ASSERT_FALSE(server.HasPendingRequest(port)) +// << "There are still pending requests " << "on port " << port; + +// Generates a nonfatal failure with a generic message. +#define ADD_FAILURE() GTEST_NONFATAL_FAILURE_("Failed") + +// Generates a nonfatal failure at the given source file location with +// a generic message. +#define ADD_FAILURE_AT(file, line) \ + GTEST_MESSAGE_AT_(file, line, "Failed", \ + ::testing::TestPartResult::kNonFatalFailure) + +// Generates a fatal failure with a generic message. +#define GTEST_FAIL() GTEST_FATAL_FAILURE_("Failed") + +// Define this macro to 1 to omit the definition of FAIL(), which is a +// generic name and clashes with some other libraries. +#if !GTEST_DONT_DEFINE_FAIL +# define FAIL() GTEST_FAIL() +#endif + +// Generates a success with a generic message. +#define GTEST_SUCCEED() GTEST_SUCCESS_("Succeeded") + +// Define this macro to 1 to omit the definition of SUCCEED(), which +// is a generic name and clashes with some other libraries. +#if !GTEST_DONT_DEFINE_SUCCEED +# define SUCCEED() GTEST_SUCCEED() +#endif + +// Macros for testing exceptions. +// +// * {ASSERT|EXPECT}_THROW(statement, expected_exception): +// Tests that the statement throws the expected exception. +// * {ASSERT|EXPECT}_NO_THROW(statement): +// Tests that the statement doesn't throw any exception. +// * {ASSERT|EXPECT}_ANY_THROW(statement): +// Tests that the statement throws an exception. 
+ +#define EXPECT_THROW(statement, expected_exception) \ + GTEST_TEST_THROW_(statement, expected_exception, GTEST_NONFATAL_FAILURE_) +#define EXPECT_NO_THROW(statement) \ + GTEST_TEST_NO_THROW_(statement, GTEST_NONFATAL_FAILURE_) +#define EXPECT_ANY_THROW(statement) \ + GTEST_TEST_ANY_THROW_(statement, GTEST_NONFATAL_FAILURE_) +#define ASSERT_THROW(statement, expected_exception) \ + GTEST_TEST_THROW_(statement, expected_exception, GTEST_FATAL_FAILURE_) +#define ASSERT_NO_THROW(statement) \ + GTEST_TEST_NO_THROW_(statement, GTEST_FATAL_FAILURE_) +#define ASSERT_ANY_THROW(statement) \ + GTEST_TEST_ANY_THROW_(statement, GTEST_FATAL_FAILURE_) + +// Boolean assertions. Condition can be either a Boolean expression or an +// AssertionResult. For more information on how to use AssertionResult with +// these macros see comments on that class. +#define EXPECT_TRUE(condition) \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ + GTEST_NONFATAL_FAILURE_) +#define EXPECT_FALSE(condition) \ + GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ + GTEST_NONFATAL_FAILURE_) +#define ASSERT_TRUE(condition) \ + GTEST_TEST_BOOLEAN_(condition, #condition, false, true, \ + GTEST_FATAL_FAILURE_) +#define ASSERT_FALSE(condition) \ + GTEST_TEST_BOOLEAN_(!(condition), #condition, true, false, \ + GTEST_FATAL_FAILURE_) + +// Includes the auto-generated header that implements a family of +// generic predicate assertion macros. +// Copyright 2006, Google Inc. +// All rights reserved. +// +// Redistribution and use in source and binary forms, with or without +// modification, are permitted provided that the following conditions are +// met: +// +// * Redistributions of source code must retain the above copyright +// notice, this list of conditions and the following disclaimer. 
+// * Redistributions in binary form must reproduce the above +// copyright notice, this list of conditions and the following disclaimer +// in the documentation and/or other materials provided with the +// distribution. +// * Neither the name of Google Inc. nor the names of its +// contributors may be used to endorse or promote products derived from +// this software without specific prior written permission. +// +// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS +// "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +// LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR +// A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT +// OWNER OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, +// SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT +// LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, +// DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY +// THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT +// (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +// OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + +// This file is AUTOMATICALLY GENERATED on 09/24/2010 by command +// 'gen_gtest_pred_impl.py 5'. DO NOT EDIT BY HAND! +// +// Implements a family of generic predicate assertion macros. + +#ifndef GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ +#define GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ + +// Makes sure this header is not included before gtest.h. +#ifndef GTEST_INCLUDE_GTEST_GTEST_H_ +# error Do not include gtest_pred_impl.h directly. Include gtest.h instead. +#endif // GTEST_INCLUDE_GTEST_GTEST_H_ + +// This header implements a family of generic predicate assertion +// macros: +// +// ASSERT_PRED_FORMAT1(pred_format, v1) +// ASSERT_PRED_FORMAT2(pred_format, v1, v2) +// ... 
+// +// where pred_format is a function or functor that takes n (in the +// case of ASSERT_PRED_FORMATn) values and their source expression +// text, and returns a testing::AssertionResult. See the definition +// of ASSERT_EQ in gtest.h for an example. +// +// If you don't care about formatting, you can use the more +// restrictive version: +// +// ASSERT_PRED1(pred, v1) +// ASSERT_PRED2(pred, v1, v2) +// ... +// +// where pred is an n-ary function or functor that returns bool, +// and the values v1, v2, ..., must support the << operator for +// streaming to std::ostream. +// +// We also define the EXPECT_* variations. +// +// For now we only support predicates whose arity is at most 5. +// Please email googletestframework@googlegroups.com if you need +// support for higher arities. + +// GTEST_ASSERT_ is the basic statement to which all of the assertions +// in this file reduce. Don't use this in your code. + +#define GTEST_ASSERT_(expression, on_failure) \ + GTEST_AMBIGUOUS_ELSE_BLOCKER_ \ + if (const ::testing::AssertionResult gtest_ar = (expression)) \ + ; \ + else \ + on_failure(gtest_ar.failure_message()) + + +// Helper function for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. +template +AssertionResult AssertPred1Helper(const char* pred_text, + const char* e1, + Pred pred, + const T1& v1) { + if (pred(v1)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT1. +// Don't use this in your code. +#define GTEST_PRED_FORMAT1_(pred_format, v1, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, v1),\ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED1. Don't use +// this in your code. 
+#define GTEST_PRED1_(pred, v1, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred1Helper(#pred, \ + #v1, \ + pred, \ + v1), on_failure) + +// Unary predicate assertion macros. +#define EXPECT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED1(pred, v1) \ + GTEST_PRED1_(pred, v1, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT1(pred_format, v1) \ + GTEST_PRED_FORMAT1_(pred_format, v1, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED1(pred, v1) \ + GTEST_PRED1_(pred, v1, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +template +AssertionResult AssertPred2Helper(const char* pred_text, + const char* e1, + const char* e2, + Pred pred, + const T1& v1, + const T2& v2) { + if (pred(v1, v2)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT2. +// Don't use this in your code. +#define GTEST_PRED_FORMAT2_(pred_format, v1, v2, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, v1, v2),\ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED2. Don't use +// this in your code. +#define GTEST_PRED2_(pred, v1, v2, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred2Helper(#pred, \ + #v1, \ + #v2, \ + pred, \ + v1, \ + v2), on_failure) + +// Binary predicate assertion macros. 
+#define EXPECT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT2(pred_format, v1, v2) \ + GTEST_PRED_FORMAT2_(pred_format, v1, v2, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED2(pred, v1, v2) \ + GTEST_PRED2_(pred, v1, v2, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. +template +AssertionResult AssertPred3Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3) { + if (pred(v1, v2, v3)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT3. +// Don't use this in your code. +#define GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, v1, v2, v3),\ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED3. Don't use +// this in your code. +#define GTEST_PRED3_(pred, v1, v2, v3, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred3Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + pred, \ + v1, \ + v2, \ + v3), on_failure) + +// Ternary predicate assertion macros. 
+#define EXPECT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT3(pred_format, v1, v2, v3) \ + GTEST_PRED_FORMAT3_(pred_format, v1, v2, v3, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED3(pred, v1, v2, v3) \ + GTEST_PRED3_(pred, v1, v2, v3, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +template +AssertionResult AssertPred4Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + const char* e4, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3, + const T4& v4) { + if (pred(v1, v2, v3, v4)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ", " + << e4 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3 + << "\n" << e4 << " evaluates to " << v4; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT4. +// Don't use this in your code. +#define GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, v1, v2, v3, v4),\ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED4. Don't use +// this in your code. +#define GTEST_PRED4_(pred, v1, v2, v3, v4, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred4Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + #v4, \ + pred, \ + v1, \ + v2, \ + v3, \ + v4), on_failure) + +// 4-ary predicate assertion macros. 
+#define EXPECT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT4(pred_format, v1, v2, v3, v4) \ + GTEST_PRED_FORMAT4_(pred_format, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED4(pred, v1, v2, v3, v4) \ + GTEST_PRED4_(pred, v1, v2, v3, v4, GTEST_FATAL_FAILURE_) + + + +// Helper function for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. +template +AssertionResult AssertPred5Helper(const char* pred_text, + const char* e1, + const char* e2, + const char* e3, + const char* e4, + const char* e5, + Pred pred, + const T1& v1, + const T2& v2, + const T3& v3, + const T4& v4, + const T5& v5) { + if (pred(v1, v2, v3, v4, v5)) return AssertionSuccess(); + + return AssertionFailure() << pred_text << "(" + << e1 << ", " + << e2 << ", " + << e3 << ", " + << e4 << ", " + << e5 << ") evaluates to false, where" + << "\n" << e1 << " evaluates to " << v1 + << "\n" << e2 << " evaluates to " << v2 + << "\n" << e3 << " evaluates to " << v3 + << "\n" << e4 << " evaluates to " << v4 + << "\n" << e5 << " evaluates to " << v5; +} + +// Internal macro for implementing {EXPECT|ASSERT}_PRED_FORMAT5. +// Don't use this in your code. +#define GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, on_failure)\ + GTEST_ASSERT_(pred_format(#v1, #v2, #v3, #v4, #v5, v1, v2, v3, v4, v5),\ + on_failure) + +// Internal macro for implementing {EXPECT|ASSERT}_PRED5. Don't use +// this in your code. +#define GTEST_PRED5_(pred, v1, v2, v3, v4, v5, on_failure)\ + GTEST_ASSERT_(::testing::AssertPred5Helper(#pred, \ + #v1, \ + #v2, \ + #v3, \ + #v4, \ + #v5, \ + pred, \ + v1, \ + v2, \ + v3, \ + v4, \ + v5), on_failure) + +// 5-ary predicate assertion macros. 
+#define EXPECT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define EXPECT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_NONFATAL_FAILURE_) +#define ASSERT_PRED_FORMAT5(pred_format, v1, v2, v3, v4, v5) \ + GTEST_PRED_FORMAT5_(pred_format, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) +#define ASSERT_PRED5(pred, v1, v2, v3, v4, v5) \ + GTEST_PRED5_(pred, v1, v2, v3, v4, v5, GTEST_FATAL_FAILURE_) + + + +#endif // GTEST_INCLUDE_GTEST_GTEST_PRED_IMPL_H_ + +// Macros for testing equalities and inequalities. +// +// * {ASSERT|EXPECT}_EQ(expected, actual): Tests that expected == actual +// * {ASSERT|EXPECT}_NE(v1, v2): Tests that v1 != v2 +// * {ASSERT|EXPECT}_LT(v1, v2): Tests that v1 < v2 +// * {ASSERT|EXPECT}_LE(v1, v2): Tests that v1 <= v2 +// * {ASSERT|EXPECT}_GT(v1, v2): Tests that v1 > v2 +// * {ASSERT|EXPECT}_GE(v1, v2): Tests that v1 >= v2 +// +// When they are not, Google Test prints both the tested expressions and +// their actual values. The values must be compatible built-in types, +// or you will get a compiler error. By "compatible" we mean that the +// values can be compared by the respective operator. +// +// Note: +// +// 1. It is possible to make a user-defined type work with +// {ASSERT|EXPECT}_??(), but that requires overloading the +// comparison operators and is thus discouraged by the Google C++ +// Usage Guide. Therefore, you are advised to use the +// {ASSERT|EXPECT}_TRUE() macro to assert that two objects are +// equal. +// +// 2. The {ASSERT|EXPECT}_??() macros do pointer comparisons on +// pointers (in particular, C strings). Therefore, if you use it +// with two C strings, you are testing how their locations in memory +// are related, not how their content is related. To compare two C +// strings by content, use {ASSERT|EXPECT}_STR*(). +// +// 3. 
{ASSERT|EXPECT}_EQ(expected, actual) is preferred to +// {ASSERT|EXPECT}_TRUE(expected == actual), as the former tells you +// what the actual value is when it fails, and similarly for the +// other comparisons. +// +// 4. Do not depend on the order in which {ASSERT|EXPECT}_??() +// evaluate their arguments, which is undefined. +// +// 5. These macros evaluate their arguments exactly once. +// +// Examples: +// +// EXPECT_NE(5, Foo()); +// EXPECT_EQ(NULL, a_pointer); +// ASSERT_LT(i, array_size); +// ASSERT_GT(records.size(), 0) << "There is no record left."; + +#define EXPECT_EQ(expected, actual) \ + EXPECT_PRED_FORMAT2(::testing::internal:: \ + EqHelper::Compare, \ + expected, actual) +#define EXPECT_NE(expected, actual) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperNE, expected, actual) +#define EXPECT_LE(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) +#define EXPECT_LT(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2) +#define EXPECT_GE(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2) +#define EXPECT_GT(val1, val2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) + +#define GTEST_ASSERT_EQ(expected, actual) \ + ASSERT_PRED_FORMAT2(::testing::internal:: \ + EqHelper::Compare, \ + expected, actual) +#define GTEST_ASSERT_NE(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperNE, val1, val2) +#define GTEST_ASSERT_LE(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLE, val1, val2) +#define GTEST_ASSERT_LT(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperLT, val1, val2) +#define GTEST_ASSERT_GE(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGE, val1, val2) +#define GTEST_ASSERT_GT(val1, val2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperGT, val1, val2) + +// Define macro GTEST_DONT_DEFINE_ASSERT_XY to 1 to omit the definition of +// ASSERT_XY(), which clashes with 
some users' own code. + +#if !GTEST_DONT_DEFINE_ASSERT_EQ +# define ASSERT_EQ(val1, val2) GTEST_ASSERT_EQ(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_NE +# define ASSERT_NE(val1, val2) GTEST_ASSERT_NE(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_LE +# define ASSERT_LE(val1, val2) GTEST_ASSERT_LE(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_LT +# define ASSERT_LT(val1, val2) GTEST_ASSERT_LT(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_GE +# define ASSERT_GE(val1, val2) GTEST_ASSERT_GE(val1, val2) +#endif + +#if !GTEST_DONT_DEFINE_ASSERT_GT +# define ASSERT_GT(val1, val2) GTEST_ASSERT_GT(val1, val2) +#endif + +// C String Comparisons. All tests treat NULL and any non-NULL string +// as different. Two NULLs are equal. +// +// * {ASSERT|EXPECT}_STREQ(s1, s2): Tests that s1 == s2 +// * {ASSERT|EXPECT}_STRNE(s1, s2): Tests that s1 != s2 +// * {ASSERT|EXPECT}_STRCASEEQ(s1, s2): Tests that s1 == s2, ignoring case +// * {ASSERT|EXPECT}_STRCASENE(s1, s2): Tests that s1 != s2, ignoring case +// +// For wide or narrow string objects, you can use the +// {ASSERT|EXPECT}_??() macros. +// +// Don't depend on the order in which the arguments are evaluated, +// which is undefined. +// +// These macros evaluate their arguments exactly once. 
+ +#define EXPECT_STREQ(expected, actual) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) +#define EXPECT_STRNE(s1, s2) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) +#define EXPECT_STRCASEEQ(expected, actual) \ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) +#define EXPECT_STRCASENE(s1, s2)\ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) + +#define ASSERT_STREQ(expected, actual) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTREQ, expected, actual) +#define ASSERT_STRNE(s1, s2) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRNE, s1, s2) +#define ASSERT_STRCASEEQ(expected, actual) \ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASEEQ, expected, actual) +#define ASSERT_STRCASENE(s1, s2)\ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperSTRCASENE, s1, s2) + +// Macros for comparing floating-point numbers. +// +// * {ASSERT|EXPECT}_FLOAT_EQ(expected, actual): +// Tests that two float values are almost equal. +// * {ASSERT|EXPECT}_DOUBLE_EQ(expected, actual): +// Tests that two double values are almost equal. +// * {ASSERT|EXPECT}_NEAR(v1, v2, abs_error): +// Tests that v1 and v2 are within the given distance to each other. +// +// Google Test uses ULP-based comparison to automatically pick a default +// error bound that is appropriate for the operands. See the +// FloatingPoint template class in gtest-internal.h if you are +// interested in the implementation details. 
+ +#define EXPECT_FLOAT_EQ(expected, actual)\ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ + expected, actual) + +#define EXPECT_DOUBLE_EQ(expected, actual)\ + EXPECT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ + expected, actual) + +#define ASSERT_FLOAT_EQ(expected, actual)\ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ + expected, actual) + +#define ASSERT_DOUBLE_EQ(expected, actual)\ + ASSERT_PRED_FORMAT2(::testing::internal::CmpHelperFloatingPointEQ, \ + expected, actual) + +#define EXPECT_NEAR(val1, val2, abs_error)\ + EXPECT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ + val1, val2, abs_error) + +#define ASSERT_NEAR(val1, val2, abs_error)\ + ASSERT_PRED_FORMAT3(::testing::internal::DoubleNearPredFormat, \ + val1, val2, abs_error) + +// These predicate format functions work on floating-point values, and +// can be used in {ASSERT|EXPECT}_PRED_FORMAT2*(), e.g. +// +// EXPECT_PRED_FORMAT2(testing::DoubleLE, Foo(), 5.0); + +// Asserts that val1 is less than, or almost equal to, val2. Fails +// otherwise. In particular, it fails if either val1 or val2 is NaN. +GTEST_API_ AssertionResult FloatLE(const char* expr1, const char* expr2, + float val1, float val2); +GTEST_API_ AssertionResult DoubleLE(const char* expr1, const char* expr2, + double val1, double val2); + + +#if GTEST_OS_WINDOWS + +// Macros that test for HRESULT failure and success, these are only useful +// on Windows, and rely on Windows SDK macros and APIs to compile. +// +// * {ASSERT|EXPECT}_HRESULT_{SUCCEEDED|FAILED}(expr) +// +// When expr unexpectedly fails or succeeds, Google Test prints the +// expected result and the actual result with both a human-readable +// string representation of the error, if available, as well as the +// hex result code. 
+# define EXPECT_HRESULT_SUCCEEDED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) + +# define ASSERT_HRESULT_SUCCEEDED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTSuccess, (expr)) + +# define EXPECT_HRESULT_FAILED(expr) \ + EXPECT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) + +# define ASSERT_HRESULT_FAILED(expr) \ + ASSERT_PRED_FORMAT1(::testing::internal::IsHRESULTFailure, (expr)) + +#endif // GTEST_OS_WINDOWS + +// Macros that execute statement and check that it doesn't generate new fatal +// failures in the current thread. +// +// * {ASSERT|EXPECT}_NO_FATAL_FAILURE(statement); +// +// Examples: +// +// EXPECT_NO_FATAL_FAILURE(Process()); +// ASSERT_NO_FATAL_FAILURE(Process()) << "Process() failed"; +// +#define ASSERT_NO_FATAL_FAILURE(statement) \ + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_FATAL_FAILURE_) +#define EXPECT_NO_FATAL_FAILURE(statement) \ + GTEST_TEST_NO_FATAL_FAILURE_(statement, GTEST_NONFATAL_FAILURE_) + +// Causes a trace (including the source file path, the current line +// number, and the given message) to be included in every test failure +// message generated by code in the current scope. The effect is +// undone when the control leaves the current scope. +// +// The message argument can be anything streamable to std::ostream. +// +// In the implementation, we include the current line number as part +// of the dummy variable name, thus allowing multiple SCOPED_TRACE()s +// to appear in the same block - as long as they are on different +// lines. +#define SCOPED_TRACE(message) \ + ::testing::internal::ScopedTrace GTEST_CONCAT_TOKEN_(gtest_trace_, __LINE__)(\ + __FILE__, __LINE__, ::testing::Message() << (message)) + +// Compile-time assertion for type equality. +// StaticAssertTypeEq() compiles iff type1 and type2 are +// the same type. The value it returns is not interesting. 
+// +// Instead of making StaticAssertTypeEq a class template, we make it a +// function template that invokes a helper class template. This +// prevents a user from misusing StaticAssertTypeEq by +// defining objects of that type. +// +// CAVEAT: +// +// When used inside a method of a class template, +// StaticAssertTypeEq() is effective ONLY IF the method is +// instantiated. For example, given: +// +// template class Foo { +// public: +// void Bar() { testing::StaticAssertTypeEq(); } +// }; +// +// the code: +// +// void Test1() { Foo foo; } +// +// will NOT generate a compiler error, as Foo::Bar() is never +// actually instantiated. Instead, you need: +// +// void Test2() { Foo foo; foo.Bar(); } +// +// to cause a compiler error. +template +bool StaticAssertTypeEq() { + (void)internal::StaticAssertTypeEqHelper(); + return true; +} + +// Defines a test. +// +// The first parameter is the name of the test case, and the second +// parameter is the name of the test within the test case. +// +// The convention is to end the test case name with "Test". For +// example, a test case for the Foo class can be named FooTest. +// +// The user should put his test code between braces after using this +// macro. Example: +// +// TEST(FooTest, InitializesCorrectly) { +// Foo foo; +// EXPECT_TRUE(foo.StatusIsOK()); +// } + +// Note that we call GetTestTypeId() instead of GetTypeId< +// ::testing::Test>() here to get the type ID of testing::Test. This +// is to work around a suspected linker bug when using Google Test as +// a framework on Mac OS X. The bug causes GetTypeId< +// ::testing::Test>() to return different values depending on whether +// the call is from the Google Test framework itself or from user test +// code. GetTestTypeId() is guaranteed to always return the same +// value, as it always calls GetTypeId<>() from the Google Test +// framework. 
+#define GTEST_TEST(test_case_name, test_name)\ + GTEST_TEST_(test_case_name, test_name, \ + ::testing::Test, ::testing::internal::GetTestTypeId()) + +// Define this macro to 1 to omit the definition of TEST(), which +// is a generic name and clashes with some other libraries. +#if !GTEST_DONT_DEFINE_TEST +# define TEST(test_case_name, test_name) GTEST_TEST(test_case_name, test_name) +#endif + +// Defines a test that uses a test fixture. +// +// The first parameter is the name of the test fixture class, which +// also doubles as the test case name. The second parameter is the +// name of the test within the test case. +// +// A test fixture class must be declared earlier. The user should put +// his test code between braces after using this macro. Example: +// +// class FooTest : public testing::Test { +// protected: +// virtual void SetUp() { b_.AddElement(3); } +// +// Foo a_; +// Foo b_; +// }; +// +// TEST_F(FooTest, InitializesCorrectly) { +// EXPECT_TRUE(a_.StatusIsOK()); +// } +// +// TEST_F(FooTest, ReturnsElementCountCorrectly) { +// EXPECT_EQ(0, a_.size()); +// EXPECT_EQ(1, b_.size()); +// } + +#define TEST_F(test_fixture, test_name)\ + GTEST_TEST_(test_fixture, test_name, test_fixture, \ + ::testing::internal::GetTypeId()) + +// Use this macro in main() to run all tests. It returns 0 if all +// tests are successful, or 1 otherwise. +// +// RUN_ALL_TESTS() should be invoked after the command line has been +// parsed by InitGoogleTest(). + +#define RUN_ALL_TESTS()\ + (::testing::UnitTest::GetInstance()->Run()) + +} // namespace testing + +#endif // GTEST_INCLUDE_GTEST_GTEST_H_ diff --git a/tests/gtest/common/log.h b/tests/gtest/common/log.h new file mode 100644 index 0000000..5384620 --- /dev/null +++ b/tests/gtest/common/log.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TESTS_GTEST_COMMON_LOG_H_ +#define TESTS_GTEST_COMMON_LOG_H_ + +extern struct gtest_configure_t gtest_conf; + +#define log_fatal(fmt, ...) \ + do { \ + if (gtest_conf.log_level > 0) \ + fprintf(stderr, "[ FATAL ] " fmt, ##__VA_ARGS__); \ + exit(1); \ + } while (0) + +#define log_error(fmt, ...) \ + do { \ + if (gtest_conf.log_level > 1) \ + fprintf(stderr, "[ ERROR ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#define log_warn(fmt, ...) \ + do { \ + if (gtest_conf.log_level > 2) \ + fprintf(stderr, "[ WARN ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#define log_info(fmt, ...) \ + do { \ + if (gtest_conf.log_level > 3) \ + printf("\033[0;3%sm" "[ INFO ] " fmt "\033[m", "4", ##__VA_ARGS__); \ + } while (0) + +#define log_trace(fmt, ...) 
\ + do { \ + if (gtest_conf.log_level > 4) \ + printf("\033[0;3%sm" "[ TRACE ] " fmt "\033[m", "7", ##__VA_ARGS__); \ + } while (0) + +#endif /* TESTS_GTEST_COMMON_LOG_H_ */ diff --git a/tests/gtest/common/sys.cc b/tests/gtest/common/sys.cc new file mode 100644 index 0000000..70cf3db --- /dev/null +++ b/tests/gtest/common/sys.cc @@ -0,0 +1,234 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "sys.h" + +#include + + +void sys_hexdump(const char *tag, void *ptr, int buflen) +{ + unsigned char *buf = (unsigned char *)ptr; + char out_buf[256]; + int ret = 0; + int out_pos = 0; + int i, j; + + if (tag) { + log_trace("%s\n", tag); + } + if (ptr) { + return ; + } + log_trace("dump data at %p\n", ptr); + for (i = 0; i < buflen; i += 16) { + out_pos = 0; + ret = sprintf(out_buf + out_pos, "%06x: ", i); + if (ret < 0) + return; + out_pos += ret; + for (j = 0; j < 16; j++) { + if (i + j < buflen) + ret = sprintf(out_buf + out_pos, "%02x ", + buf[i + j]); + else + ret = sprintf(out_buf + out_pos, " "); + if (ret < 0) + return; + out_pos += ret; + } + ret = sprintf(out_buf + out_pos, " "); + if (ret < 0) + return ; + out_pos += ret; + for (j = 0; j < 16; j++) + if (i + j < buflen) { + ret = sprintf(out_buf + out_pos, "%c", + isprint(buf[i+j]) ? + buf[i + j] : + '.'); + if (ret < 0) + return; + out_pos += ret; + } + ret = sprintf(out_buf + out_pos, "\n"); + if (ret < 0) + return ; + log_trace("%s", out_buf); + } +} + +int sys_get_addr(char *dst, struct sockaddr_in *addr) +{ + int rc = 0; + struct addrinfo *res; + + rc = getaddrinfo(dst, NULL, NULL, &res); + if (rc) { + log_error("getaddrinfo failed - invalid hostname or IP address\n"); + return rc; + } + + if (res->ai_family != PF_INET) { + rc = -1; + goto out; + } + + *addr = *(struct sockaddr_in *)res->ai_addr; +out: + freeaddrinfo(res); + return rc; +} + +char *sys_addr2dev(struct sockaddr_in *addr, char *buf, size_t size) +{ + struct ifaddrs *interfaces; + struct ifaddrs *ifa; + + if (buf && size && !getifaddrs(&interfaces)) { + buf[0] = '\0'; + for (ifa = interfaces; ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_addr) + { + if (AF_INET == ifa->ifa_addr->sa_family) + { + struct sockaddr_in* inaddr = (struct sockaddr_in*)ifa->ifa_addr; + + if (inaddr->sin_addr.s_addr == addr->sin_addr.s_addr) + { + if (ifa->ifa_name) + { + size_t n = 
sys_min(strlen(ifa->ifa_name), size - 1); + memcpy(buf, ifa->ifa_name, n); + buf[n] = '\0'; + return buf; + } + } + } + } + } + freeifaddrs(interfaces); + } + + return NULL; +} + +int sys_dev2addr(char *dev, struct sockaddr_in *addr) +{ + int rc = 0; + int fd; + struct ifreq ifr; + + fd = socket(AF_INET, SOCK_STREAM, 0); + if (fd < 0) { + rc = -1; + goto out; + } + + ifr.ifr_addr.sa_family = AF_INET; + + strncpy(ifr.ifr_name , dev , strlen(dev)); + + rc = ioctl(fd, SIOCGIFADDR, &ifr); + if (rc >= 0 && addr) { + memcpy(addr, &ifr.ifr_addr, sizeof(*addr)); + } + + close(fd); + +out: + return rc; +} + +int sys_gateway(struct sockaddr_in *addr) +{ + char* gateway = NULL; + char line[256]; + char cmd[] = "route -n | grep 'UG[ \t]' | awk '{print $2}'"; + FILE* file = NULL; + + file = popen(cmd, "r"); + + if(fgets(line, sizeof(line), file) != NULL) { + gateway = line; + addr->sin_addr.s_addr = inet_addr(gateway); + } + + pclose(file); + + return (gateway ? 0 : -1); +} + +pid_t sys_procpid(const char* name) +{ + DIR* dir; + struct dirent* ent; + char buf[512]; + long pid; + char pname[100] = {0}; + char state; + FILE *fp=NULL; + + if (!(dir = opendir("/proc"))) { + perror("can't open /proc"); + return -1; + } + + while((ent = readdir(dir)) != NULL) { + long lpid = atol(ent->d_name); + if(lpid < 0) { + continue; + } + snprintf(buf, sizeof(buf), "/proc/%ld/stat", lpid); + fp = fopen(buf, "r"); + + if (fp) { + if ( (fscanf(fp, "%ld (%[^)]) %c", &pid, pname, &state)) != 3 ){ + printf("fscanf failed \n"); + fclose(fp); + closedir(dir); + return -1; + } + if (!strcmp(pname, name)) { + fclose(fp); + closedir(dir); + return (pid_t)lpid; + } + fclose(fp); + } + } + + closedir(dir); + return -1; +} diff --git a/tests/gtest/common/sys.h b/tests/gtest/common/sys.h new file mode 100644 index 0000000..4bf079d --- /dev/null +++ b/tests/gtest/common/sys.h @@ -0,0 +1,112 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TESTS_GTEST_COMMON_SYS_H_ +#define TESTS_GTEST_COMMON_SYS_H_ + +/* Minimum and maximum macros */ +#define sys_max(a, b) (((a) > (b)) ? (a) : (b)) +#define sys_min(a, b) (((a) < (b)) ? 
(a) : (b)) + +static INLINE int sys_is_big_endian(void) +{ + return( htonl(1) == 1 ); +} + +static INLINE double sys_gettime(void) +{ + struct timeval tv; + gettimeofday(&tv, 0); + return (double)(tv.tv_sec * 1000000 + tv.tv_usec); +} + +static INLINE uint64_t sys_rdtsc(void) +{ + unsigned long long int result=0; + +#if defined(__i386__) + __asm volatile(".byte 0x0f, 0x31" : "=A" (result) : ); + +#elif defined(__x86_64__) + unsigned hi, lo; + __asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); + result = hi; + result = result<<32; + result = result|lo; + +#elif defined(__powerpc__) + unsigned long int hi, lo, tmp; + __asm volatile( + "0: \n\t" + "mftbu %0 \n\t" + "mftb %1 \n\t" + "mftbu %2 \n\t" + "cmpw %2,%0 \n\t" + "bne 0b \n" + : "=r"(hi),"=r"(lo),"=r"(tmp) + ); + result = hi; + result = result<<32; + result = result|lo; + +#endif + + return (result); +} + +void sys_hexdump(const char *tag, void *ptr, int buflen); + +int sys_get_addr(char *dst, struct sockaddr_in *addr); + +char *sys_addr2dev(struct sockaddr_in *addr, char *buf, size_t size); + +int sys_dev2addr(char *dev, struct sockaddr_in *addr); + +int sys_gateway(struct sockaddr_in *addr); + +pid_t sys_procpid(const char* name); + +static INLINE char *sys_addr2str(struct sockaddr_in *addr) +{ + static __thread char addrbuf[100]; + inet_ntop(AF_INET, &addr->sin_addr, addrbuf, sizeof(addrbuf)); + sprintf(addrbuf,"%s:%d", addrbuf, ntohs(addr->sin_port)); + + return addrbuf; +} + +static INLINE int sys_rootuser(void) +{ + return (geteuid() == 0); +} + +#endif /* TESTS_GTEST_COMMON_SYS_H_ */ diff --git a/tests/gtest/common/tap.h b/tests/gtest/common/tap.h new file mode 100644 index 0000000..bd045fa --- /dev/null +++ b/tests/gtest/common/tap.h @@ -0,0 +1,253 @@ +/* + * The MIT License + * + * Copyright (c) 2011 Bruno P. 
Kinoshita + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to deal + * in the Software without restriction, including without limitation the rights + * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell + * copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in + * all copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN + * THE SOFTWARE. + * + * @author Bruno P. 
Kinoshita + * @since 0.1 + */ + +#ifndef TAP_H_ +#define TAP_H_ + +#include +#include +#include +#include +#include +#include + +namespace tap { + +#ifdef GTEST_TAP_13_DIAGNOSTIC +// based on http://stackoverflow.com/a/7724536/831180 +static std::string replace_all_copy( + std::string const& original, + std::string const& before, + std::string const& after +) { + using namespace std; + + if (before == after) return string(original); + + string retval; + if (before.length() == after.length()) retval.reserve(original.size()); + + basic_string ::const_iterator end = original.end(); + basic_string ::const_iterator current = original.begin(); + basic_string ::const_iterator next = + search(current, end, before.begin(), before.end()); + + while ( next != end ) { + retval.append( current, next ); + retval.append( after ); + current = next + before.size(); + next = search(current, end, before.begin(), before.end()); + } + retval.append( current, next ); + return retval; +} +#endif + +class TestResult { + + private: + int number; + std::string status; + std::string name; + std::string comment; + bool skip; + + public: + std::string getComment() const { + std::stringstream ss; + if (this->skip) { + ss << "# SKIP " << this->comment; + } else if (!this->comment.empty()) { + ss << "# " << this->comment; + } + return ss.str(); + } + + const std::string& getName() const { + return name; + } + + int getNumber() const { + return number; + } + + const std::string& getStatus() const { + return status; + } + + bool getSkip() const { + return skip; + } + + void setComment(const std::string& value) { + this->comment = value; + } + + void setName(const std::string& value) { + this->name = value; + } + + void setNumber(int value) { + this->number = value; + } + + void setStatus(const std::string& value) { + this->status = value; + } + + void setSkip(bool value) { + this->skip = value; + } + + std::string toString() const { + std::stringstream ss; + ss << this->status << " " << 
this->number << " " << this->name; +#ifdef GTEST_TAP_13_DIAGNOSTIC + std::string comment_text = this->getComment(); + if (!comment_text.empty()) { + ss << std::endl + << "# Diagnostic" << std::endl + << " ---" << std::endl + << " " << replace_all_copy(this->getComment(), "\n", "\n "); + } +#endif + return ss.str(); + } +}; + +class TestSet { + + private: + std::list testResults; + + public: + const std::list& getTestResults() const { + return testResults; + } + + void addTestResult(TestResult& testResult) { + testResult.setNumber((this->getNumberOfTests() + 1)); + this->testResults.push_back(testResult); + } + + int getNumberOfTests() const { + return this->testResults.size(); + } + + std::string toString() const { + std::stringstream ss; + ss << "1.." << this->getNumberOfTests() << std::endl; + for (std::list::const_iterator ci = this->testResults.begin(); + ci != this->testResults.end(); ++ci) { + TestResult testResult = *ci; + ss << testResult.toString() << std::endl; + } + return ss.str(); + } +}; + +class TapListener: public ::testing::EmptyTestEventListener { + + private: + std::map testCaseTestResultMap; + + void addTapTestResult(const testing::TestInfo& testInfo) { + tap::TestResult tapResult; + tapResult.setName(testInfo.name()); + tapResult.setSkip(!testInfo.should_run()); + + const testing::TestResult *testResult = testInfo.result(); + int number = testResult->total_part_count(); + tapResult.setNumber(number-1); + if (testResult->HasFatalFailure()) { + tapResult.setStatus("Bail out!"); + } else if (testResult->Failed()) { + tapResult.setStatus("not ok"); + tapResult.setComment(testResult->GetTestPartResult(number-1).summary()); + } else { + tapResult.setStatus("ok"); + } + + this->addNewOrUpdate(testInfo.test_case_name(), tapResult); + } + + std::string getCommentOrDirective(const std::string& comment, bool skip) { + std::stringstream commentText; + + if (skip) { + commentText << " # SKIP " << comment; + } else if (!comment.empty()) { + commentText << " 
# " << comment; + } + + return commentText.str(); + } + + void addNewOrUpdate(const std::string& testCaseName, tap::TestResult testResult) { + std::map::const_iterator ci = + this->testCaseTestResultMap.find(testCaseName); + if (ci != this->testCaseTestResultMap.end()) { + tap::TestSet testSet = ci->second; + testSet.addTestResult(testResult); + this->testCaseTestResultMap[testCaseName] = testSet; + } else { + tap::TestSet testSet; + testSet.addTestResult(testResult); + this->testCaseTestResultMap[testCaseName] = testSet; + } + } + +public: + virtual void OnTestEnd(const testing::TestInfo& testInfo) { + //printf("%s %d - %s\n", testInfo.result()->Passed() ? "ok" : "not ok", this->testNumber, testInfo.name()); + this->addTapTestResult(testInfo); + } + + virtual void OnTestProgramEnd(const testing::UnitTest& unit_test) { + //--- Write the count and the word. + (void)unit_test; + std::map::const_iterator ci; + for (ci = this->testCaseTestResultMap.begin(); + ci != this->testCaseTestResultMap.end(); ++ci) { + const tap::TestSet& testSet = ci->second; +#ifdef GTEST_TAP_PRINT_TO_STDOUT + std::cout << "TAP version 13" << std::endl; + std::cout << testSet.toString(); +#else + std::string ext = ".tap"; + std::ofstream tapFile; + tapFile.open((ci->first + ext).c_str()); + tapFile << testSet.toString(); + tapFile.close(); +#endif + } + } +}; + +} // namespace tap + +#endif // TAP_H_ diff --git a/tests/gtest/main.cc b/tests/gtest/main.cc new file mode 100644 index 0000000..ce2b3cf --- /dev/null +++ b/tests/gtest/main.cc @@ -0,0 +1,238 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include + +#include "common/gtest.h" +#include "common/tap.h" +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" + + +static int _set_config(int argc, char **argv); +static int _def_config(void); +static void _usage(void); + +struct gtest_configure_t gtest_conf; + + +int main(int argc, char **argv) { + // coverity[fun_call_w_exception]: uncaught exceptions cause nonzero exit anyway, so don't warn. 
+ ::testing::InitGoogleTest(&argc, argv); + + char *str = getenv("GTEST_TAP"); + /* Append TAP Listener */ + if (str) { + if (0 < strtol(str, NULL, 0)) { + testing::TestEventListeners& listeners = testing::UnitTest::GetInstance()->listeners(); + if (1 == strtol(str, NULL, 0)) { + delete listeners.Release(listeners.default_result_printer()); + } + listeners.Append(new tap::TapListener()); + } + } + + _def_config(); + _set_config(argc, argv); + + return RUN_ALL_TESTS(); +} + +static int _def_config(void) +{ + int rc = 0; + + memset(>est_conf, 0, sizeof(gtest_conf)); + gtest_conf.log_level = 4; + gtest_conf.random_seed = time(NULL) % 32768; + gtest_conf.client_addr.sin_family = PF_INET; + gtest_conf.client_addr.sin_addr.s_addr = INADDR_ANY; + gtest_conf.client_addr.sin_port = 0; + gtest_conf.server_addr.sin_family = PF_INET; + gtest_conf.server_addr.sin_addr.s_addr = INADDR_ANY; + gtest_conf.server_addr.sin_port = 0; + gtest_conf.remote_addr.sin_family = PF_INET; + gtest_conf.remote_addr.sin_addr.s_addr = INADDR_ANY; + gtest_conf.remote_addr.sin_port = 0; + sys_gateway(>est_conf.remote_addr); + gtest_conf.port = 55555; + + return rc; +} + +static int _set_config(int argc, char **argv) +{ + int rc = 0; + static struct option long_options[] = { + {"addr", required_argument, 0, 'a'}, + {"if", required_argument, 0, 'i'}, + {"port", required_argument, 0, 'p'}, + {"random", required_argument, 0, 's'}, + {"debug", required_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + }; + int op; + int option_index; + + while ((op = getopt_long(argc, argv, "a:i:p:d:h", long_options, &option_index)) != -1) { + switch (op) { + case 'a': + { + char *token1 = NULL; + char *token2 = NULL; + const char s[2] = ":"; + if (optarg) { + if (optarg[0] != ':') { + token1 = strtok(optarg, s); + token2 = strtok(NULL, s); + } else { + token1 = NULL; + token2 = strtok(optarg, s); + } + } + + if (token1) { + rc = sys_get_addr(token1, >est_conf.client_addr); + if (rc < 0) { + rc = -EINVAL; + 
log_fatal("Failed to resolve ip address %s\n", token1); + } + } + if (token2) { + rc = sys_get_addr(token2, >est_conf.server_addr); + if (rc < 0) { + rc = -EINVAL; + log_fatal("Failed to resolve ip address %s\n", token2); + } + } + } + break; + case 'i': + { + char *token1 = NULL; + char *token2 = NULL; + const char s[2] = ":"; + if (optarg) { + if (optarg[0] != ':') { + token1 = strtok(optarg, s); + token2 = strtok(NULL, s); + } else { + token1 = NULL; + token2 = strtok(optarg, s); + } + } + + if (token1) { + rc = sys_dev2addr(token1, >est_conf.client_addr); + if (rc < 0) { + rc = -EINVAL; + log_fatal("Failed to resolve ip address %s\n", token1); + } + } + if (token2) { + rc = sys_dev2addr(token2, >est_conf.server_addr); + if (rc < 0) { + rc = -EINVAL; + log_fatal("Failed to resolve ip address %s\n", token2); + } + } + } + break; + case 'p': + errno = 0; + gtest_conf.port = strtol(optarg, NULL, 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", optarg); + } + break; + case 's': + errno = 0; + gtest_conf.random_seed = strtol(optarg, NULL, 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", optarg); + } + break; + case 'd': + errno = 0; + gtest_conf.log_level = strtol(optarg, NULL, 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", optarg); + } + break; + case 'h': + _usage(); + break; + default: + rc = -EINVAL; + log_error("Unknown option <%c>\n", op); + break; + } + } + + if (0 != rc) { + _usage(); + } else { + srand(gtest_conf.random_seed); + gtest_conf.client_addr.sin_port = htons(gtest_conf.port); + gtest_conf.server_addr.sin_port = htons(gtest_conf.port); + gtest_conf.remote_addr.sin_port = htons(gtest_conf.port); + log_info("CONFIGURATION:\n"); + log_info("log level: %d\n", gtest_conf.log_level); + log_info("seed: %d\n", gtest_conf.random_seed); + log_info("client ip: %s\n", sys_addr2str(>est_conf.client_addr)); + log_info("server ip: %s\n", 
sys_addr2str(>est_conf.server_addr)); + log_info("remote ip: %s\n", sys_addr2str(>est_conf.remote_addr)); + log_info("port: %d\n", gtest_conf.port); + } + + return rc; +} + +static void _usage(void) +{ + printf("Usage: gtest [options]\n" + "\t--addr,-a IP address client:server\n" + "\t--if,-i Interface client:server\n" + "\t--port,-p Listen/connect to port (default %d).\n" + "\t--random,-s Seed (default %d).\n" + "\t--debug,-d Output verbose level (default: %d).\n" + "\t--help,-h Print help and exit\n", + + gtest_conf.port, + gtest_conf.random_seed, + gtest_conf.log_level); + exit(0); +} diff --git a/tests/gtest/mix/mix_base.cc b/tests/gtest/mix/mix_base.cc new file mode 100644 index 0000000..70e581e --- /dev/null +++ b/tests/gtest/mix/mix_base.cc @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "common/def.h" + +#include "mix_base.h" + +void mix_base::SetUp() +{ + errno = EOK; +} + +void mix_base::TearDown() +{ + +} + diff --git a/tests/gtest/mix/mix_base.h b/tests/gtest/mix/mix_base.h new file mode 100644 index 0000000..70d4e9a --- /dev/null +++ b/tests/gtest/mix/mix_base.h @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef TESTS_GTEST_MIX_BASE_H_ +#define TESTS_GTEST_MIX_BASE_H_ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + + +class mix_base : public testing::Test, public test_base +{ +protected: + virtual void SetUp(); + virtual void TearDown(); +}; + +#endif //TESTS_GTEST_MIX_VASE_H_ + diff --git a/tests/gtest/mix/mix_list.cc b/tests/gtest/mix/mix_list.cc new file mode 100644 index 0000000..9caa959 --- /dev/null +++ b/tests/gtest/mix/mix_list.cc @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" +#include "common/cmn.h" + +#include "mix_base.h" + +#include "src/vma/util/list.h" + +struct element { + struct list_head item; + int value; +}; + +class mix_list : public mix_base {}; + +TEST_F(mix_list, ti_1) { + struct list_head head; + + INIT_LIST_HEAD(&head); + ASSERT_TRUE(head.next == &head); + ASSERT_TRUE(head.prev == &head); + ASSERT_TRUE(list_empty(&head)); +} + +TEST_F(mix_list, ti_2) { + struct element element; + struct element *cur_element = NULL; + + element.value = 12345; + + cur_element = list_entry(&element.item, struct element, item); + + ASSERT_EQ(12345, cur_element->value); +} + +TEST_F(mix_list, ti_3) { + struct list_head head; + struct element element; + + INIT_LIST_HEAD(&head); + list_add(&element.item, &head); + + ASSERT_TRUE(head.next == &element.item); + ASSERT_TRUE(head.prev == &element.item); + ASSERT_TRUE(element.item.next == &head); + ASSERT_TRUE(element.item.prev == &head); + ASSERT_FALSE(list_empty(&head)); +} + +TEST_F(mix_list, ti_4) { + struct list_head head; + struct element element; + struct element *cur_element = NULL; + + INIT_LIST_HEAD(&head); + element.value = 12345; + list_add(&element.item, &head); + + ASSERT_FALSE(list_empty(&head)); + + cur_element = list_first_entry(&head, struct element, item); + + ASSERT_EQ(12345, cur_element->value); +} + +TEST_F(mix_list, ti_5) { + struct list_head head; + struct element element[2]; + struct list_head *cur_entry = NULL; + struct element *cur_element = NULL; + int reference[] = {-12345, 12345, 0}; + int i = 0; + + INIT_LIST_HEAD(&head); + i = 0; + element[i].value = reference[i]; + list_add(&element[i].item, &head); + i++; + element[i].value = reference[i]; + list_add(&element[i].item, &head); + i++; + element[i].value = reference[i]; + list_add(&element[i].item, &head); + + ASSERT_FALSE(list_empty(&head)); + + i = 1; + list_for_each(cur_entry, &head) { + cur_element = 
list_entry(cur_entry, struct element, item); + ASSERT_EQ(reference[ARRAY_SIZE(reference) - i], cur_element->value); + i++; + } +} + +TEST_F(mix_list, ti_6) { + struct list_head head; + struct element element; + + INIT_LIST_HEAD(&head); + list_add_tail(&element.item, &head); + + ASSERT_TRUE(head.prev == &element.item); + ASSERT_TRUE(head.next == &element.item); + ASSERT_TRUE(element.item.prev == &head); + ASSERT_TRUE(element.item.next == &head); + ASSERT_FALSE(list_empty(&head)); +} + +TEST_F(mix_list, ti_7) { + struct list_head head; + struct element element; + struct element *cur_element = NULL; + + INIT_LIST_HEAD(&head); + element.value = 12345; + list_add_tail(&element.item, &head); + + ASSERT_FALSE(list_empty(&head)); + ASSERT_TRUE(list_is_last(&element.item, &head)); +} + +TEST_F(mix_list, ti_8) { + struct list_head head; + struct element element[2]; + struct list_head *cur_entry = NULL; + struct element *cur_element = NULL; + int reference[] = {-12345, 12345, 0}; + int i = 0; + + INIT_LIST_HEAD(&head); + i = 0; + element[i].value = reference[i]; + list_add_tail(&element[i].item, &head); + i++; + element[i].value = reference[i]; + list_add_tail(&element[i].item, &head); + i++; + element[i].value = reference[i]; + list_add_tail(&element[i].item, &head); + + ASSERT_FALSE(list_empty(&head)); + + i = 0; + list_for_each(cur_entry, &head) { + cur_element = list_entry(cur_entry, struct element, item); + ASSERT_EQ(reference[i], cur_element->value); + i++; + } +} + +TEST_F(mix_list, ti_9) { + struct list_head head; + struct element element; + + INIT_LIST_HEAD(&head); + list_add(&element.item, &head); + + ASSERT_FALSE(list_empty(&head)); + + list_del(&element.item); + + ASSERT_TRUE(list_empty(&head)); + ASSERT_FALSE(list_empty(&element.item)); +} + +TEST_F(mix_list, ti_10) { + struct list_head head; + struct element element; + + INIT_LIST_HEAD(&head); + list_add(&element.item, &head); + + ASSERT_FALSE(list_empty(&head)); + + list_del_init(&element.item); + + 
ASSERT_TRUE(list_empty(&head)); + ASSERT_TRUE(list_empty(&element.item)); +} diff --git a/tests/gtest/mix/sg_array.cc b/tests/gtest/mix/sg_array.cc new file mode 100644 index 0000000..ee82921 --- /dev/null +++ b/tests/gtest/mix/sg_array.cc @@ -0,0 +1,178 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "common/def.h" + +#include "mix_base.h" + +#include "src/vma/ib/base/verbs_extra.h" +#include "src/vma/util/sg_array.h" + +class sg_array_test : public mix_base { +public: + struct ibv_sge* sge0; + struct ibv_sge sge1; + struct ibv_sge sge2[2]; + struct ibv_sge sge3[3]; + sg_array_test() + { + sge0 = NULL; + + sge1.addr = (uint64_t)"0123456789"; + sge1.length = 10; + + sge2[0].addr = (uint64_t)"0123456789"; + sge2[0].length = 10; + sge2[1].addr = (uint64_t)"0123456789"; + sge2[1].length = 10; + + sge3[0].addr = (uint64_t)"0123456789"; + sge3[0].length = 10; + sge3[1].addr = (uint64_t)"0123456789"; + sge3[1].length = 10; + sge3[2].addr = (uint64_t)"0123456789"; + sge3[2].length = 10; + } + +}; +//! Tests for constructor +TEST_F(sg_array_test, sga_ctr) +{ + sg_array sa0(sge0,0); + EXPECT_EQ(-1, sa0.get_num_sge()); + EXPECT_EQ(0, sa0.length()); + + sg_array sa1(&sge1,1); + EXPECT_EQ(1, sa1.get_num_sge()); + EXPECT_EQ(10, sa1.length()); + + sg_array sa2(sge2,2); + EXPECT_EQ(2, sa2.get_num_sge()); + EXPECT_EQ(20, sa2.length()); + + sg_array sa3(sge3,3); + EXPECT_EQ(3, sa3.get_num_sge()); + EXPECT_EQ(30, sa3.length()); + +} + +//! Tests for relative index +// +TEST_F(sg_array_test, sga_index_0) +{ + sg_array sa0(sge0, 0); + EXPECT_EQ(NULL, sa0.get_data(0)); + + sg_array sa1(&sge1, 1); + EXPECT_EQ(NULL, sa0.get_data(0)); +} + +//! Tests for minimum bound +// +TEST_F(sg_array_test, sga_min_bound) +{ + sg_array sa0(sge0, 0); + int len=-1; + EXPECT_EQ(NULL, sa0.get_data(&len)); + + sg_array sa1(&sge1, 1); + EXPECT_EQ(NULL, sa1.get_data(&len)); +} + +//! Test for maximum bound +// +TEST_F(sg_array_test, sga_max_bound) +{ + sg_array sa0(sge0, 0); + int len = 1; + EXPECT_EQ(NULL, sa0.get_data(&len)); + + sg_array sa1(&sge1, 1); + len = 11; + uint8_t *p = sa1.get_data(&len); + EXPECT_EQ(len, 10); + EXPECT_EQ((uint64_t)p, sge1.addr); + + p = sa1.get_data(&len); + EXPECT_EQ((uint64_t)p, NULL); +} + +//! 
Tests for in_bound +// +TEST_F(sg_array_test, sga_in_bound) +{ + sg_array sa1(&sge1, 1); + + int len = 5; + uint8_t *p = sa1.get_data(&len); + + EXPECT_EQ(len, 5); + EXPECT_EQ((uint64_t)p, sge1.addr); + + len = 10; + p = sa1.get_data(&len); + + EXPECT_EQ(len, 5); + EXPECT_EQ(*p, '5'); +} + +//! Tests for in_bound +// +TEST_F(sg_array_test, sga_in_bound_multi_sge) +{ + sg_array sa3(sge3, 3); + + int len = 5; + uint8_t *p = sa3.get_data(&len); + + EXPECT_EQ(len, 5); + EXPECT_EQ((uint64_t)p, sge3[0].addr); + + len = 10; + p = sa3.get_data(&len); + + EXPECT_EQ(len, 5); + EXPECT_EQ(*p, '5'); + + len = 15; + p = sa3.get_data(&len); + + EXPECT_EQ(len, 10); + EXPECT_EQ((uint64_t)p, sge3[1].addr); + + len = 10; + p = sa3.get_data(&len); + + EXPECT_EQ(len, 10); + EXPECT_EQ((uint64_t)p, sge3[2].addr); +} + + diff --git a/tests/gtest/sock/sock_base.cc b/tests/gtest/sock/sock_base.cc new file mode 100644 index 0000000..8a8ab02 --- /dev/null +++ b/tests/gtest/sock/sock_base.cc @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "sock_base.h" + +void sock_base::SetUp() +{ + errno = EOK; +} + +void sock_base::TearDown() +{ +} diff --git a/tests/gtest/sock/sock_base.h b/tests/gtest/sock/sock_base.h new file mode 100644 index 0000000..e925201 --- /dev/null +++ b/tests/gtest/sock/sock_base.h @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TESTS_GTEST_SOCK_BASE_H_ +#define TESTS_GTEST_SOCK_BASE_H_ + + +/** + * SOCK Base class for tests + */ +class sock_base : public testing::Test, public test_base { +protected: + virtual void SetUp(); + virtual void TearDown(); +}; + +#endif /* TESTS_GTEST_SOCK_BASE_H_ */ diff --git a/tests/gtest/sock/sock_socket.cc b/tests/gtest/sock/sock_socket.cc new file mode 100644 index 0000000..923ec43 --- /dev/null +++ b/tests/gtest/sock/sock_socket.cc @@ -0,0 +1,181 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "sock_base.h" + +class sock_socket : public sock_base {}; + +/** + * @test sock_socket.ti_1 + * @brief + * Create UDP socket + * @details + */ +TEST_F(sock_socket, ti_1) { + int fd = UNDEFINED_VALUE; + + errno = EOK; + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + EXPECT_LE(0, fd); + EXPECT_EQ(EOK, errno); + + close(fd); +} + +/** + * @test sock_socket.ti_2 + * @brief + * Create TCP socket + * @details + */ +TEST_F(sock_socket, ti_2) { + int fd = UNDEFINED_VALUE; + + errno = EOK; + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + EXPECT_LE(0, fd); + EXPECT_EQ(EOK, errno); + + close(fd); +} + +/** + * @test sock_socket.ti_3 + * @brief + * Create UNIX socket + * @details + */ +TEST_F(sock_socket, ti_3) { + int fd = UNDEFINED_VALUE; + + errno = EOK; + fd = socket(PF_UNIX, SOCK_DGRAM, IPPROTO_IP); + EXPECT_LE(0, fd); + EXPECT_EQ(EOK, errno); + + close(fd); +} + +/** + * @test sock_socket.ti_4 + * @brief + * Create RAW socket + * @details + */ +TEST_F(sock_socket, ti_4) { + int fd = UNDEFINED_VALUE; + + errno = EOK; + fd = socket(PF_INET, SOCK_RAW, IPPROTO_TCP); + EXPECT_EQ((sys_rootuser() ? 0 : -1), fd); + EXPECT_EQ((sys_rootuser() ? 
EOK : EPERM), errno); + + close(fd); +} + +/** + * @test sock_socket.ti_5 + * @brief + * Check domain argument + * @details + */ +TEST_F(sock_socket, ti_5) { + int fd = UNDEFINED_VALUE; + + errno = EOK; + fd = socket(PF_UNSPEC, SOCK_DGRAM, IPPROTO_IP); + EXPECT_EQ(-1, fd); + EXPECT_EQ(EAFNOSUPPORT, errno); + + errno = EOK; + fd = socket(PF_MAX + 1, SOCK_STREAM, IPPROTO_IP); + EXPECT_EQ(-1, fd); + EXPECT_EQ(EAFNOSUPPORT, errno); +} + +/** + * @test sock_socket.ti_6 + * @brief + * Check type argument + * @details + */ +TEST_F(sock_socket, ti_6) { + int fd = UNDEFINED_VALUE; + + errno = EOK; + fd = socket(PF_INET, 0x10, IPPROTO_IP); + EXPECT_EQ(-1, fd); + EXPECT_EQ(EINVAL, errno); +} + +/** + * @test sock_socket.ti_7 + * @brief + * Check proto argument + * @details + */ +TEST_F(sock_socket, ti_7) { + int fd = UNDEFINED_VALUE; + + errno = EOK; + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP); + EXPECT_LE(0, fd); + EXPECT_EQ(EOK, errno); + + close(fd); + + errno = EOK; + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_UDP); + EXPECT_EQ(-1, fd); + EXPECT_EQ(EPROTONOSUPPORT, errno); + + close(fd); + + errno = EOK; + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_TCP); + EXPECT_LE(0, fd); + EXPECT_EQ(EOK, errno); + + close(fd); + + errno = EOK; + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_TCP); + EXPECT_EQ(-1, fd); + EXPECT_EQ(EPROTONOSUPPORT, errno); + + close(fd); +} diff --git a/tests/gtest/tcp/tcp_base.cc b/tests/gtest/tcp/tcp_base.cc new file mode 100644 index 0000000..970ec6e --- /dev/null +++ b/tests/gtest/tcp/tcp_base.cc @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "tcp_base.h" + +void tcp_base::SetUp() +{ + errno = EOK; +} + +void tcp_base::TearDown() +{ +} + +int tcp_base::sock_create(void) +{ + int rc; + int fd; + int opt_val = 0; + socklen_t opt_len; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + if (fd < 0) { + log_error("failed socket() %s\n", strerror(errno)); + goto err; + } + + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt_val, sizeof(opt_len)); + if (rc < 0) { + log_error("failed setsockopt(SO_REUSEADDR) %s\n", strerror(errno)); + goto err; + } + + return fd; + +err: + close(fd); + + return (-1); +} + +int tcp_base::sock_create_nb(void) +{ + int rc; + int fd; + int opt_val = 0; + socklen_t opt_len; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + if (fd < 0) { + log_error("failed socket() %s\n", strerror(errno)); + goto err; + } + + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt_val, sizeof(opt_len)); + if (rc < 0) { + log_error("failed setsockopt(SO_REUSEADDR) %s\n", strerror(errno)); + goto err; + } + + rc = test_base::sock_noblock(fd); + if (rc < 0) { + log_error("failed sock_noblock() %s\n", strerror(errno)); + goto err; + } + + return fd; + +err: + close(fd); + + return (-1); +} diff --git a/tests/gtest/tcp/tcp_base.h b/tests/gtest/tcp/tcp_base.h new file mode 100644 index 0000000..0cce24d --- /dev/null +++ b/tests/gtest/tcp/tcp_base.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TESTS_GTEST_TCP_BASE_H_ +#define TESTS_GTEST_TCP_BASE_H_ + + +/** + * TCP Base class for tests + */ +class tcp_base : public testing::Test, public test_base { +public: + static int sock_create(void); + static int sock_create_nb(void); + +protected: + virtual void SetUp(); + virtual void TearDown(); +}; + +#endif /* TESTS_GTEST_TCP_BASE_H_ */ diff --git a/tests/gtest/tcp/tcp_bind.cc b/tests/gtest/tcp/tcp_bind.cc new file mode 100644 index 0000000..9067f2c --- /dev/null +++ b/tests/gtest/tcp/tcp_bind.cc @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "tcp_base.h" + + +class tcp_bind : public tcp_base {}; + +/** + * @test tcp_bind.ti_1 + * @brief + * bind(SOCK_STREAM) socket to local ip + * @details + */ +TEST_F(tcp_bind, ti_1) { + int rc = EOK; + int fd; + + fd = tcp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + close(fd); +} + +/** + * @test tcp_bind.ti_2 + * @brief + * bind(SOCK_STREAM) socket to remote ip + * @details + */ +TEST_F(tcp_bind, ti_2) { + int rc = EOK; + int fd; + + fd = tcp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&remote_addr, sizeof(remote_addr)); + EXPECT_EQ(EADDRNOTAVAIL, errno); + EXPECT_GT(0, rc); + + close(fd); +} + +/** + * @test tcp_bind.ti_3 + * @brief + * bind(SOCK_STREAM) socket twice + * @details + */ +TEST_F(tcp_bind, ti_3) { + int rc = EOK; + int fd; + struct sockaddr_in addr; + + fd = tcp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + memcpy(&addr, &client_addr, sizeof(addr)); + addr.sin_port = htons(bogus_port); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&addr, sizeof(addr)); + EXPECT_EQ(EINVAL, errno); + EXPECT_GT(0, rc); + + close(fd); +} + +/** + * @test tcp_bind.ti_4 + * @brief + * bind(SOCK_STREAM) two sockets on the same ip + * @details + */ +TEST_F(tcp_bind, ti_4) { + int rc = EOK; + int fd; + int fd2; + + fd = tcp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + fd2 = tcp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd2, (struct sockaddr *)&client_addr, sizeof(client_addr)); + 
EXPECT_EQ(EADDRINUSE, errno); + EXPECT_GT(0, rc); + + close(fd); + close(fd2); +} diff --git a/tests/gtest/tcp/tcp_connect.cc b/tests/gtest/tcp/tcp_connect.cc new file mode 100644 index 0000000..8ba2638 --- /dev/null +++ b/tests/gtest/tcp/tcp_connect.cc @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "tcp_base.h" + +class tcp_connect : public tcp_base {}; + +/** + * @test tcp_connect.ti_1 + * @brief + * Loop of blocking connect() to ip on the same node + * @details + */ +TEST_F(tcp_connect, DISABLED_ti_1) { + int rc = EOK; + int fd; + int i; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + for (i = 0; i < 10; i++) { + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + ASSERT_EQ(ECONNREFUSED, errno) << + "connect() attempt = " << i; + ASSERT_EQ((-1), rc) << + "connect() attempt = " << i; + usleep(500); + } + + close(fd); +} + +/** + * @test tcp_connect.ti_2 + * @brief + * Loop of blocking connect() to remote ip + * @details + */ +TEST_F(tcp_connect, DISABLED_ti_2) { + int rc = EOK; + int fd; + int i; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + for (i = 0; i < 10; i++) { + rc = connect(fd, (struct sockaddr *)&remote_addr, sizeof(remote_addr)); + ASSERT_TRUE(ECONNREFUSED == errno || ETIMEDOUT == errno) << + "connect() attempt = " << i; + ASSERT_EQ((-1), rc) << + "connect() attempt = " << i; + usleep(500); + if (ETIMEDOUT == errno) { + log_warn("Routing issue, consider another remote address instead of %s\n", + sys_addr2str(&remote_addr)); + break; + } + } + + close(fd); +} + +/** + * @test tcp_connect.ti_3 + * @brief + * Loop of blocking connect() to unreachable ip + * @details + */ +TEST_F(tcp_connect, DISABLED_ti_3) { + int rc = EOK; + int fd; + int i; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, 
rc); + + for (i = 0; i < 10; i++) { + rc = connect(fd, (struct sockaddr *)&bogus_addr, sizeof(bogus_addr)); + ASSERT_EQ(EHOSTUNREACH, errno) << + "connect() attempt = " << i; + ASSERT_EQ((-1), rc) << + "connect() attempt = " << i; + usleep(500); + } + + close(fd); +} diff --git a/tests/gtest/tcp/tcp_connect_nb.cc b/tests/gtest/tcp/tcp_connect_nb.cc new file mode 100644 index 0000000..5460769 --- /dev/null +++ b/tests/gtest/tcp/tcp_connect_nb.cc @@ -0,0 +1,49 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "tcp_base.h" + +class tcp_connect_nb : public tcp_base {}; + +/** + * @test tcp_connect_nb.ti_1 + * @brief + * Loop of blocking connect() to ip on the same node + * @details + */ +TEST_F(tcp_connect_nb, ti_1) { +} diff --git a/tests/gtest/tcp/tcp_event.cc b/tests/gtest/tcp/tcp_event.cc new file mode 100644 index 0000000..7f5a643 --- /dev/null +++ b/tests/gtest/tcp/tcp_event.cc @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "tcp_base.h" + +static void _proc_server(void *ptr); +static void _proc_client(void *ptr); + +class tcp_event : public tcp_base {}; + +TEST_F(tcp_event, DISABLED_ti_1) { + int rc = EOK; + int fd; + struct epoll_event event; + + fd = tcp_base::sock_create_nb(); + ASSERT_LE(0, fd); + + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + ASSERT_EQ(EINPROGRESS, errno); + ASSERT_EQ((-1), rc); + + event.events = 0; + event.data.fd = fd; + rc = test_base::event_wait(&event); + EXPECT_LT(0, rc); +/* EXPECT_EQ(EPOLLHUP, event.events); TODO: UNDER VMA */ + EXPECT_EQ((uint32_t)(EPOLLERR | EPOLLHUP), event.events); + + close(fd); +} + +TEST_F(tcp_event, ti_2) { + int rc = EOK; + int fd; + struct epoll_event event; + + fd = tcp_base::sock_create_nb(); + ASSERT_LE(0, fd); + + rc = connect(fd, (struct sockaddr *)&remote_addr, sizeof(remote_addr)); + ASSERT_EQ(EINPROGRESS, errno); + ASSERT_EQ((-1), rc); + + event.events = 0; + event.data.fd = fd; + rc = test_base::event_wait(&event); + EXPECT_LT(0, rc); + EXPECT_EQ((uint32_t)(EPOLLERR | EPOLLHUP), event.events); + + close(fd); +} + +TEST_F(tcp_event, DISABLED_ti_3) { + int rc = EOK; + int fd; + struct epoll_event event; + + fd = tcp_base::sock_create_nb(); + ASSERT_LE(0, fd); + + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + ASSERT_EQ(EINPROGRESS, errno); + ASSERT_EQ((-1), rc); + + event.events = EPOLLOUT | EPOLLIN; + event.data.fd = fd; + rc = test_base::event_wait(&event); + EXPECT_LT(0, rc); + /* EXPECT_EQ((EPOLLHUP | EPOLLIN), event.events); TODO: UNDER VMA */ + EXPECT_EQ((uint32_t)(EPOLLERR | EPOLLHUP | EPOLLOUT | EPOLLIN), event.events); + + close(fd); +} + +TEST_F(tcp_event, DISABLED_ti_4) { + int rc = EOK; + int fd; + struct epoll_event event; + + fd = tcp_base::sock_create_nb(); + ASSERT_LE(0, fd); + + rc = connect(fd, (struct sockaddr *)&remote_addr, 
sizeof(remote_addr)); + ASSERT_EQ(EINPROGRESS, errno); + ASSERT_EQ((-1), rc); + + event.events = EPOLLOUT | EPOLLIN; + event.data.fd = fd; + rc = test_base::event_wait(&event); + EXPECT_LT(0, rc); + /* EXPECT_EQ((EPOLLERR | EPOLLHUP | EPOLLIN), event.events); TODO: UNDER VMA */ + EXPECT_EQ((uint32_t)(EPOLLERR | EPOLLHUP | EPOLLOUT | EPOLLIN), event.events); + + close(fd); +} + +static void _proc_server(void *ptr) +{ + int rc = EOK; + int fd; + int fd_peer; + struct sockaddr peer_addr; + socklen_t socklen; + + UNREFERENCED_PARAMETER(ptr); + + fd = tcp_base::sock_create(); + ASSERT_LE(0, fd); + + rc = bind(fd, (struct sockaddr *)>est_conf.server_addr, sizeof(gtest_conf.server_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = listen(fd, 5); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + socklen = sizeof(peer_addr); + fd_peer = accept(fd, &peer_addr, &socklen); + EXPECT_EQ(EOK, errno); + EXPECT_LE(0, fd_peer); + EXPECT_EQ(sizeof(peer_addr), socklen); + + log_trace("Accepted connection: fd=%d from %s\n", + fd_peer, sys_addr2str((struct sockaddr_in *) &peer_addr)); + + close(fd_peer); + close(fd); +} + +static void _proc_client(void *ptr) +{ + int rc = EOK; + int fd; + struct epoll_event event; + + UNREFERENCED_PARAMETER(ptr); + + fd = tcp_base::sock_create_nb(); + ASSERT_LE(0, fd); + + rc = bind(fd, (struct sockaddr *)>est_conf.client_addr, sizeof(gtest_conf.client_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = connect(fd, (struct sockaddr *)>est_conf.server_addr, sizeof(gtest_conf.server_addr)); + ASSERT_EQ(EINPROGRESS, errno); + ASSERT_EQ((-1), rc); + + event.events = EPOLLOUT | EPOLLIN; + event.data.fd = fd; + rc = test_base::event_wait(&event); + EXPECT_LT(0, rc); + EXPECT_EQ((uint32_t)(EPOLLOUT), event.events); + + log_trace("Established connection: fd=%d to %s\n", + fd, sys_addr2str((struct sockaddr_in *) >est_conf.server_addr)); + + close(fd); +} + +TEST_F(tcp_event, DISABLED_ti_5) { + pthread_t server_thread = 0; + pthread_t 
client_thread = 0; + + pthread_create(&server_thread, NULL, (void* (*)(void*))_proc_server, NULL); + sleep(1); + pthread_create(&client_thread, NULL, (void* (*)(void*))_proc_client, NULL); + + pthread_join(server_thread, NULL); + pthread_join(client_thread, NULL); +} diff --git a/tests/gtest/tcp/tcp_send.cc b/tests/gtest/tcp/tcp_send.cc new file mode 100644 index 0000000..3c7683f --- /dev/null +++ b/tests/gtest/tcp/tcp_send.cc @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "tcp_base.h" + + +class tcp_send : public tcp_base {}; + +/** + * @test tcp_send.ti_1 + * @brief + * send() invalid socket fd + * @details + */ +TEST_F(tcp_send, ti_1) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = tcp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = send(0xFF, (void *)buf, sizeof(buf), 0); + EXPECT_EQ(EBADF, errno); + EXPECT_EQ(-1, rc); + + close(fd); +} + +/** + * @test tcp_send.ti_2 + * @brief + * send() no connection + * @details + */ +TEST_F(tcp_send, ti_2) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = tcp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + (void)signal(SIGPIPE, SIG_IGN); + rc = send(fd, (void *)buf, sizeof(buf), 0); + EXPECT_EQ(EPIPE, errno); + EXPECT_EQ(-1, rc); + (void)signal(SIGPIPE, SIG_DFL); + + close(fd); +} diff --git a/tests/gtest/tcp/tcp_sendto.cc b/tests/gtest/tcp/tcp_sendto.cc new file mode 100644 index 0000000..ccc80a4 --- /dev/null +++ b/tests/gtest/tcp/tcp_sendto.cc @@ -0,0 +1,99 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "tcp_base.h" + + +class tcp_sendto : public tcp_base {}; + +/** + * @test tcp_sendto.ti_1 + * @brief + * send() invalid socket fd + * @details + */ +TEST_F(tcp_sendto, ti_1) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = tcp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = sendto(0xFF, (void *)buf, sizeof(buf), 0, + (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EBADF, errno); + EXPECT_EQ(-1, rc); + + close(fd); +} + +/** + * @test tcp_sendto.ti_2 + * @brief + * send() no connection + * @details + */ +TEST_F(tcp_sendto, ti_2) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = tcp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + (void)signal(SIGPIPE, SIG_IGN); + rc = sendto(fd, (void *)buf, sizeof(buf), 0, + (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EPIPE, errno); + EXPECT_EQ(-1, rc); + (void)signal(SIGPIPE, SIG_DFL); + + close(fd); +} diff --git a/tests/gtest/tcp/tcp_socket.cc b/tests/gtest/tcp/tcp_socket.cc new file mode 100644 index 0000000..e0cff3e --- /dev/null +++ b/tests/gtest/tcp/tcp_socket.cc @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "tcp_base.h" + +class tcp_socket : public tcp_base {}; + +/** + * @test tcp_socket.ti_1 + * @brief + * Create TCP socket + * @details + */ +TEST_F(tcp_socket, ti_1) { + int fd; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + EXPECT_LE(0, fd); + EXPECT_EQ(errno, EOK); + + close(fd); +} diff --git a/tests/gtest/udp/udp_base.cc b/tests/gtest/udp/udp_base.cc new file mode 100644 index 0000000..304dcbd --- /dev/null +++ b/tests/gtest/udp/udp_base.cc @@ -0,0 +1,107 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "udp_base.h" + +void udp_base::SetUp() +{ + errno = EOK; +} + +void udp_base::TearDown() +{ +} + +int udp_base::sock_create(void) +{ + int rc; + int fd; + int opt_val = 0; + socklen_t opt_len; + + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + if (fd < 0) { + log_error("failed socket() %s\n", strerror(errno)); + goto err; + } + + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt_val, sizeof(opt_len)); + if (rc < 0) { + log_error("failed setsockopt(SO_REUSEADDR) %s\n", strerror(errno)); + goto err; + } + + return fd; + +err: + close(fd); + + return (-1); +} + +int udp_base::sock_create_nb(void) +{ + int rc; + int fd; + int opt_val = 0; + socklen_t opt_len; + + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + if (fd < 0) { + log_error("failed socket() %s\n", strerror(errno)); + goto err; + } + + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &opt_val, sizeof(opt_len)); + if (rc < 0) { + log_error("failed setsockopt(SO_REUSEADDR) %s\n", strerror(errno)); + goto err; + } + + rc = test_base::sock_noblock(fd); + if (rc < 0) { + log_error("failed sock_noblock() %s\n", strerror(errno)); + goto err; + } + + return fd; + +err: + close(fd); + + return (-1); +} diff --git a/tests/gtest/udp/udp_base.h b/tests/gtest/udp/udp_base.h new file mode 100644 index 0000000..a7854d5 --- /dev/null +++ b/tests/gtest/udp/udp_base.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TESTS_GTEST_UDP_BASE_H_ +#define TESTS_GTEST_UDP_BASE_H_ + + +/** + * UDP Base class for tests + */ +class udp_base : public testing::Test, public test_base { +public: + static int sock_create(void); + static int sock_create_nb(void); + +protected: + virtual void SetUp(); + virtual void TearDown(); +}; + +#endif /* TESTS_GTEST_UDP_BASE_H_ */ diff --git a/tests/gtest/udp/udp_bind.cc b/tests/gtest/udp/udp_bind.cc new file mode 100644 index 0000000..160ce6e --- /dev/null +++ b/tests/gtest/udp/udp_bind.cc @@ -0,0 +1,144 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "udp_base.h" + + +class udp_bind : public udp_base {}; + +/** + * @test udp_bind.ti_1 + * @brief + * bind(SOCK_DGRAM) socket to local ip + * @details + */ +TEST_F(udp_bind, ti_1) { + int rc = EOK; + int fd; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + close(fd); +} + +/** + * @test udp_bind.ti_2 + * @brief + * bind(SOCK_DGRAM) socket to remote ip + * @details + */ +TEST_F(udp_bind, ti_2) { + int rc = EOK; + int fd; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&remote_addr, sizeof(remote_addr)); + EXPECT_EQ(EADDRNOTAVAIL, errno); + EXPECT_GT(0, rc); + + close(fd); +} + +/** + * @test udp_bind.ti_3 + * @brief + * bind(SOCK_DGRAM) socket twice + * @details + */ +TEST_F(udp_bind, ti_3) { + int rc = EOK; + int fd; + struct sockaddr_in addr; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + memcpy(&addr, &client_addr, sizeof(addr)); + addr.sin_port = htons(bogus_port); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&addr, sizeof(addr)); + EXPECT_EQ(EINVAL, errno); + EXPECT_GT(0, rc); + + close(fd); +} + +/** + * @test udp_bind.ti_4 + * @brief + * bind(SOCK_DGRAM) two sockets on the same ip + * @details + */ +TEST_F(udp_bind, ti_4) { + int rc = EOK; + int fd; + int fd2; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + fd2 = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd2, (struct sockaddr *)&client_addr, sizeof(client_addr)); + 
EXPECT_EQ(EADDRINUSE, errno); + EXPECT_GT(0, rc); + + close(fd); + close(fd2); +} diff --git a/tests/gtest/udp/udp_send.cc b/tests/gtest/udp/udp_send.cc new file mode 100644 index 0000000..ef7fc9f --- /dev/null +++ b/tests/gtest/udp/udp_send.cc @@ -0,0 +1,201 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "udp_base.h" + + +class udp_send : public udp_base {}; + +/** + * @test udp_send.ti_1 + * @brief + * send() successful call + * @details + */ +TEST_F(udp_send, ti_1) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = send(fd, (void *)buf, sizeof(buf), 0); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(sizeof(buf), rc); + + close(fd); +} + +/** + * @test udp_send.ti_2 + * @brief + * send() invalid socket fd + * @details + */ +TEST_F(udp_send, ti_2) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = send(0xFF, (void *)buf, sizeof(buf), 0); + EXPECT_EQ(EBADF, errno); + EXPECT_EQ(-1, rc); + + close(fd); +} + +/** + * @test udp_send.ti_3 + * @brief + * send() invalid buffer length (>65,507 bytes) + * @details + */ +TEST_F(udp_send, ti_3) { + int rc = EOK; + int fd; + char buf[65508] = "hello"; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = send(fd, (void *)buf, 65507, 0); + 
EXPECT_EQ(EOK, errno); + EXPECT_EQ(65507, rc); + + errno = EOK; + rc = send(fd, (void *)buf, sizeof(buf), 0); + EXPECT_EQ(EMSGSIZE, errno); + EXPECT_EQ(-1, rc); + + close(fd); +} + +/** + * @test udp_send.ti_4 + * @brief + * send() invalid address length + * @details + */ +TEST_F(udp_send, ti_4) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr) - 1); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ(-1, rc); + + close(fd); +} + +/** + * @test udp_send.ti_5 + * @brief + * send() invalid flag set + * @details + */ +TEST_F(udp_send, ti_5) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = send(fd, (void *)buf, sizeof(buf), 0x000000FF); + EXPECT_EQ(EOPNOTSUPP, errno); + EXPECT_EQ(-1, rc); + + close(fd); +} diff --git a/tests/gtest/udp/udp_sendto.cc b/tests/gtest/udp/udp_sendto.cc new file mode 100644 index 0000000..56c535d --- /dev/null +++ b/tests/gtest/udp/udp_sendto.cc @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "udp_base.h" + + +class udp_sendto : public udp_base {}; + +/** + * @test udp_sendto.ti_1 + * @brief + * sendto() successful call + * @details + */ +TEST_F(udp_sendto, ti_1) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = sendto(fd, (void *)buf, sizeof(buf), 0, + (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(sizeof(buf), rc); + + close(fd); +} + +/** + * @test udp_sendto.ti_2 + * @brief + * sendto() invalid socket fd + * @details + */ +TEST_F(udp_sendto, ti_2) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = sendto(0xFF, (void *)buf, sizeof(buf), 0, + (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EBADF, errno); + EXPECT_EQ(-1, rc); + + close(fd); +} + +/** + * @test udp_sendto.ti_3 + * @brief + * sendto() invalid buffer length (>65,507 bytes) + * @details + */ +TEST_F(udp_sendto, ti_3) { + int rc = EOK; + int fd; + char buf[65508] = "hello"; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = sendto(fd, (void *)buf, 65507, 0, + (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(65507, rc); + + errno = EOK; + rc = sendto(fd, (void *)buf, sizeof(buf), 0, + (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EMSGSIZE, errno); + EXPECT_EQ(-1, rc); + + 
close(fd); +} + +/** + * @test udp_sendto.ti_4 + * @brief + * sendto() invalid address length + * @details + */ +TEST_F(udp_sendto, ti_4) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = sendto(fd, (void *)buf, sizeof(buf), 0, + (struct sockaddr *)&server_addr, sizeof(server_addr) - 1); + EXPECT_EQ(EINVAL, errno); + EXPECT_EQ(-1, rc); + + close(fd); +} + +/** + * @test udp_sendto.ti_5 + * @brief + * sendto() invalid flag set + * @details + */ +TEST_F(udp_sendto, ti_5) { + int rc = EOK; + int fd; + char buf[] = "hello"; + + fd = udp_base::sock_create(); + ASSERT_LE(0, fd); + + errno = EOK; + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + EXPECT_EQ(EOK, errno); + EXPECT_EQ(0, rc); + + errno = EOK; + rc = sendto(fd, (void *)buf, sizeof(buf), 0x000000FF, + (struct sockaddr *)&server_addr, sizeof(server_addr)); + EXPECT_EQ(EOPNOTSUPP, errno); + EXPECT_EQ(-1, rc); + + close(fd); +} diff --git a/tests/gtest/vma/vma_base.cc b/tests/gtest/vma/vma_base.cc new file mode 100644 index 0000000..88ce0a6 --- /dev/null +++ b/tests/gtest/vma/vma_base.cc @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "vma_base.h" + +void vma_base::SetUp() +{ + errno = EOK; + vma_api = vma_get_api(); + ASSERT_TRUE(vma_api) << + "vma test suite should be launched under libvma.so"; +} + +void vma_base::TearDown() +{ +} diff --git a/tests/gtest/vma/vma_base.h b/tests/gtest/vma/vma_base.h new file mode 100644 index 0000000..da0c1d2 --- /dev/null +++ b/tests/gtest/vma/vma_base.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TESTS_GTEST_VMA_BASE_H_ +#define TESTS_GTEST_VMA_BASE_H_ + +#include + +/** + * To enable vma tests you need to set below VMA_EXTRA_API_ENABLED to 1 + * or you can add the following CPPFLAG during compilation 'make CPPFLAGS="-DVMA_EXTRA_API_ENABLED=1"' + */ +#ifndef VMA_EXTRA_API_ENABLED +#define VMA_EXTRA_API_ENABLED 0 +#endif + +/** + * VMA Base class for tests + */ +class vma_base : public testing::Test, public test_base { +protected: + virtual void SetUp(); + virtual void TearDown(); + +protected: + struct vma_api_t *vma_api; +}; + +#endif /* TESTS_GTEST_VMA_BASE_H_ */ diff --git a/tests/gtest/vma/vma_poll.cc b/tests/gtest/vma/vma_poll.cc new file mode 100644 index 0000000..86c4eeb --- /dev/null +++ b/tests/gtest/vma/vma_poll.cc @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" + +#include "vma_base.h" + +#if defined(VMA_EXTRA_API_ENABLED) && (VMA_EXTRA_API_ENABLED == 1) + +class vma_poll : public vma_base {}; + +TEST_F(vma_poll, ti_1) { +} + +#endif /* VMA_EXTRA_API_ENABLED */ diff --git a/tests/gtest/vma/vma_ring.cc b/tests/gtest/vma/vma_ring.cc new file mode 100644 index 0000000..8a9232b --- /dev/null +++ b/tests/gtest/vma/vma_ring.cc @@ -0,0 +1,336 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" +#include "common/cmn.h" + +#include "vma_base.h" + +#if defined(VMA_EXTRA_API_ENABLED) && (VMA_EXTRA_API_ENABLED == 1) + +class vma_ring : public vma_base {}; + +TEST_F(vma_ring, ti_1) { + int rc = EOK; + int ring_fd = UNDEFINED_VALUE; + + rc = vma_api->get_socket_rings_fds(0, &ring_fd, 1); + EXPECT_GE(0, rc); + EXPECT_EQ(UNDEFINED_VALUE, ring_fd); +} + +TEST_F(vma_ring, ti_2) { + int rc = EOK; + int ring_fd = UNDEFINED_VALUE; + int fd; + + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd, 1); + EXPECT_GE(0, rc); + EXPECT_EQ(UNDEFINED_VALUE, ring_fd); + + close(fd); +} + +TEST_F(vma_ring, ti_3) { + int rc = EOK; + int ring_fd = UNDEFINED_VALUE; + int fd; + + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + rc = bind(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd, 1); + EXPECT_EQ(1, rc); + EXPECT_LE(0, ring_fd); + + close(fd); +} + +TEST_F(vma_ring, ti_4) { + int rc = EOK; + int ring_fd = UNDEFINED_VALUE; + int fd; + + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd, 1); + EXPECT_EQ(1, rc); + EXPECT_LE(0, ring_fd); + + close(fd); +} + +TEST_F(vma_ring, ti_5) { + int rc = EOK; + int ring_fd = UNDEFINED_VALUE; + int fd; + + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + rc = test_base::sock_noblock(fd); + ASSERT_EQ(0, rc); + + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd, 1); + EXPECT_EQ(1, rc); + EXPECT_LE(0, ring_fd); + + close(fd); 
+} + +TEST_F(vma_ring, ti_6) { + int rc = EOK; + int ring_fd = UNDEFINED_VALUE; + int fd; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd, 1); + EXPECT_GE(0, rc); + EXPECT_EQ(UNDEFINED_VALUE, ring_fd); + + close(fd); +} + +TEST_F(vma_ring, ti_7) { + int rc = EOK; + int ring_fd = UNDEFINED_VALUE; + int fd; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + rc = bind(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd, 1); + EXPECT_EQ(1, rc); + EXPECT_LE(0, ring_fd); + + close(fd); +} + +TEST_F(vma_ring, ti_8) { + int rc = EOK; + int ring_fd = UNDEFINED_VALUE; + int fd; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + rc = test_base::sock_noblock(fd); + ASSERT_EQ(0, rc); + + rc = connect(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + ASSERT_EQ(EINPROGRESS, errno); + ASSERT_EQ((-1), rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd, 1); + EXPECT_EQ(1, rc); + EXPECT_LE(0, ring_fd); + + close(fd); +} + +TEST_F(vma_ring, ti_9) { + int rc = EOK; + int ring_fd = UNDEFINED_VALUE; + int fd; + char opt_val[100]; + socklen_t opt_len; + + SKIP_TRUE(sys_rootuser(), "This test requires root permission"); + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + opt_val[0] = '\0'; + opt_len = sizeof(opt_val); + ASSERT_TRUE(sys_addr2dev(&server_addr, opt_val, opt_len)); + log_trace("SO_BINDTODEVICE: fd=%d as %s on %s\n", + fd, sys_addr2str((struct sockaddr_in *) &server_addr), opt_val); + + rc = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, (void *)opt_val, opt_len); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd, 1); + EXPECT_GE(0, rc); + EXPECT_EQ(UNDEFINED_VALUE, ring_fd); + + close(fd); +} + +TEST_F(vma_ring, ti_10) { + int rc = EOK; + int ring_fd_bind = 
UNDEFINED_VALUE; + int ring_fd_bind_opt = UNDEFINED_VALUE; + int ring_fd_connect = UNDEFINED_VALUE; + int fd; + char opt_val[100]; + socklen_t opt_len; + + SKIP_TRUE(sys_rootuser(), "This test requires root permission"); + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + ASSERT_LE(0, fd); + + opt_val[0] = '\0'; + opt_len = sizeof(opt_val); + ASSERT_TRUE(sys_addr2dev(&server_addr, opt_val, opt_len)); + + log_trace("bind(): fd=%d as %s on %s\n", + fd, sys_addr2str((struct sockaddr_in *) &server_addr), opt_val); + + rc = bind(fd, (struct sockaddr *)&server_addr, sizeof(server_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd_bind, 1); + EXPECT_GE(1, rc); + EXPECT_LE(0, ring_fd_bind); + + opt_val[0] = '\0'; + opt_len = sizeof(opt_val); + ASSERT_TRUE(sys_addr2dev(&client_addr, opt_val, opt_len)); + + log_trace("SO_BINDTODEVICE: fd=%d as %s on %s\n", + fd, sys_addr2str((struct sockaddr_in *) &client_addr), opt_val); + + rc = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, (void *)opt_val, opt_len); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd_bind_opt, 1); + EXPECT_GE(1, rc); + EXPECT_LE(0, ring_fd_bind_opt); + + rc = test_base::sock_noblock(fd); + ASSERT_EQ(0, rc); + + rc = connect(fd, (struct sockaddr *)&remote_addr, sizeof(remote_addr)); + ASSERT_EQ(EINPROGRESS, errno); + ASSERT_EQ((-1), rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd_connect, 1); + EXPECT_EQ(1, rc); + EXPECT_LE(0, ring_fd_connect); + + EXPECT_TRUE(ring_fd_bind == ring_fd_bind_opt); + EXPECT_TRUE(ring_fd_bind == ring_fd_connect); + + close(fd); +} + +TEST_F(vma_ring, ti_11) { + int rc = EOK; + int ring_fd_bind = UNDEFINED_VALUE; + int ring_fd_bind_opt = UNDEFINED_VALUE; + int ring_fd_connect = UNDEFINED_VALUE; + int fd; + char opt_val[100]; + socklen_t opt_len; + + SKIP_TRUE(sys_rootuser(), "This test requires root permission"); + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + 
ASSERT_LE(0, fd); + + opt_val[0] = '\0'; + opt_len = sizeof(opt_val); + ASSERT_TRUE(sys_addr2dev(&server_addr, opt_val, opt_len)); + + log_trace("SO_BINDTODEVICE: fd=%d as %s on %s\n", + fd, sys_addr2str((struct sockaddr_in *) &server_addr), opt_val); + + rc = setsockopt(fd, SOL_SOCKET, SO_BINDTODEVICE, (void *)opt_val, opt_len); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd_bind_opt, 1); + EXPECT_GE(0, rc); + EXPECT_EQ(UNDEFINED_VALUE, ring_fd_bind_opt); + + opt_val[0] = '\0'; + opt_len = sizeof(opt_val); + ASSERT_TRUE(sys_addr2dev(&client_addr, opt_val, opt_len)); + + log_trace("bind(): fd=%d as %s on %s\n", + fd, sys_addr2str((struct sockaddr_in *) &client_addr), opt_val); + + rc = bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)); + ASSERT_EQ(EOK, errno); + ASSERT_EQ(0, rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd_bind, 1); + EXPECT_EQ(1, rc); + EXPECT_LE(0, ring_fd_bind); + + rc = test_base::sock_noblock(fd); + ASSERT_EQ(0, rc); + + rc = connect(fd, (struct sockaddr *)&remote_addr, sizeof(remote_addr)); + ASSERT_EQ(EINPROGRESS, errno); + ASSERT_EQ((-1), rc); + + rc = vma_api->get_socket_rings_fds(fd, &ring_fd_connect, 1); + EXPECT_EQ(1, rc); + EXPECT_LE(0, ring_fd_connect); + + EXPECT_TRUE(ring_fd_bind != ring_fd_bind_opt); + EXPECT_TRUE(ring_fd_bind == ring_fd_connect); + + close(fd); +} + +#endif /* VMA_EXTRA_API_ENABLED */ diff --git a/tests/gtest/vma/vma_sockopt.cc b/tests/gtest/vma/vma_sockopt.cc new file mode 100644 index 0000000..73b31f6 --- /dev/null +++ b/tests/gtest/vma/vma_sockopt.cc @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" +#include "common/cmn.h" + +#include "vma_base.h" + +#if defined(VMA_EXTRA_API_ENABLED) && (VMA_EXTRA_API_ENABLED == 1) + +class vma_sockopt : public vma_base {}; + +/** + * @test vma_sockopt.ti_1 + * @brief + * UDP RING_USER_ID good flow + * @details + */ +TEST_F(vma_sockopt, ti_1) { + int rc = EOK; + int fd = UNDEFINED_VALUE; + struct vma_ring_alloc_logic_attr profile; + int user_id = 100; + + memset(&profile, 0, sizeof(struct vma_ring_alloc_logic_attr)); + + profile.user_id = user_id; + profile.ring_alloc_logic = RING_LOGIC_PER_USER_ID; + profile.engress = 1; + profile.comp_mask = VMA_RING_ALLOC_MASK_RING_USER_ID | VMA_RING_ALLOC_MASK_RING_ENGRESS; + + errno = EOK; + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + EXPECT_LE(0, fd); + EXPECT_EQ(EOK, errno); + + errno = EOK; + rc = setsockopt(fd, SOL_SOCKET, SO_VMA_RING_ALLOC_LOGIC, &profile, sizeof(profile)); + EXPECT_EQ(0, rc); + EXPECT_EQ(EOK, errno); + + close(fd); +} + +/** + * @test vma_sockopt.ti_2 + * @brief + * UDP RING_USER_ID bad flow + * @details + */ +TEST_F(vma_sockopt, ti_2) { + int rc = EOK; + int fd = UNDEFINED_VALUE; + struct vma_ring_alloc_logic_attr profile; + int user_id = 100; + int unsupported_mask = (1<<4); + + memset(&profile, 0, sizeof(struct vma_ring_alloc_logic_attr)); + + profile.user_id = user_id; + profile.ring_alloc_logic = RING_LOGIC_PER_USER_ID; + profile.engress = 1; + profile.comp_mask = unsupported_mask; + + errno = EOK; + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + EXPECT_LE(0, fd); + EXPECT_EQ(EOK, errno); + + /* Wrong passed value */ + errno = EOK; + rc = setsockopt(fd, SOL_SOCKET, SO_VMA_RING_ALLOC_LOGIC, &profile, sizeof(profile)); + EXPECT_GT(0, rc); + EXPECT_EQ(EINVAL, errno); + + /* Wrong data size */ + errno = EOK; + rc = setsockopt(fd, SOL_SOCKET, SO_VMA_RING_ALLOC_LOGIC, &profile, sizeof(profile) - 1); + EXPECT_GT(0, rc); + EXPECT_EQ(EINVAL, errno); + + 
close(fd); +} + +#endif /* VMA_EXTRA_API_ENABLED */ diff --git a/tests/gtest/vmad/vmad_base.cc b/tests/gtest/vmad/vmad_base.cc new file mode 100644 index 0000000..36bb061 --- /dev/null +++ b/tests/gtest/vmad/vmad_base.cc @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" +#include "common/cmn.h" + +#include "vmad_base.h" + +#include "src/vma/util/agent_def.h" +#include "config.h" + +void vmad_base::SetUp() +{ + int rc = 0 ; + int optval = 1; + struct timeval opttv; + struct sockaddr_un sock_addr; + + errno = EOK; + m_self_pid = getpid(); + m_vmad_pid = sys_procpid("vmad"); + m_base_name = "vma_gtest"; + SKIP_TRUE((m_vmad_pid > 0), "This test requires VMA daemon running"); + + ASSERT_FALSE((mkdir(VMA_AGENT_PATH, 0777) != 0) && (errno != EEXIST)); + + rc = snprintf(m_sock_file, sizeof(m_sock_file) - 1, + "%s/%s.%d.sock", VMA_AGENT_PATH, m_base_name, m_self_pid); + ASSERT_FALSE((rc < 0 ) || (rc == (sizeof(m_sock_file) - 1))); + + rc = snprintf(m_pid_file, sizeof(m_pid_file) - 1, + "%s/%s.%d.pid", VMA_AGENT_PATH, m_base_name, m_self_pid); + ASSERT_FALSE((rc < 0 ) || (rc == (sizeof(m_pid_file) - 1) )); + + m_pid_fd = open(m_pid_file, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP); + ASSERT_FALSE(m_pid_fd < 0); + + /* Create UNIX UDP socket to receive data from VMA processes */ + memset(&sock_addr, 0, sizeof(sock_addr)); + sock_addr.sun_family = AF_UNIX; + strncpy(sock_addr.sun_path, m_sock_file, sizeof(sock_addr.sun_path) - 1); + /* remove possible old socket */ + unlink(m_sock_file); + + m_sock_fd = socket(AF_UNIX, SOCK_DGRAM, 0); + ASSERT_FALSE(m_sock_fd < 0); + + optval = 1; + rc = setsockopt(m_sock_fd, SOL_SOCKET, SO_REUSEADDR, + (const void *)&optval, sizeof(optval)); + ASSERT_FALSE(rc < 0); + + /* Sets the timeout value as 1 sec that specifies the maximum amount of time + * an input function waits until it completes. 
+ */ + opttv.tv_sec = 1; + opttv.tv_usec = 0; + rc = setsockopt(m_sock_fd, SOL_SOCKET, SO_RCVTIMEO, + (const void *)&opttv, sizeof(opttv)); + ASSERT_FALSE(rc < 0); + + /* bind created socket */ + rc = bind(m_sock_fd, (struct sockaddr *)&sock_addr, + sizeof(sock_addr)); + ASSERT_FALSE(rc < 0); + + /* Set server address */ + memset(&m_server_addr, 0, sizeof(m_server_addr)); + m_server_addr.sun_family = AF_UNIX; + strncpy(m_server_addr.sun_path, VMA_AGENT_ADDR, sizeof(m_server_addr.sun_path) - 1); + + rc = connect(m_sock_fd, (struct sockaddr *)&m_server_addr, + sizeof(struct sockaddr_un)); + ASSERT_FALSE(rc < 0); +} + +void vmad_base::TearDown() +{ + close(m_sock_fd); + unlink(m_sock_file); + + close(m_pid_fd); + unlink(m_pid_file); +} + +int vmad_base::msg_init(pid_t pid) +{ + int rc = 0; + struct vma_msg_init data; + uint8_t *version; + + memset(&data, 0, sizeof(data)); + data.hdr.code = VMA_MSG_INIT; + data.hdr.ver = VMA_AGENT_VER; + data.hdr.pid = pid; + version = (uint8_t *)&data.ver; + version[0] = VMA_LIBRARY_MAJOR; + version[1] = VMA_LIBRARY_MINOR; + version[2] = VMA_LIBRARY_RELEASE; + version[3] = VMA_LIBRARY_REVISION; + + errno = 0; + rc = send(m_sock_fd, &data, sizeof(data), 0); + if (rc != sizeof(data)) { + rc = -ECONNREFUSED; + goto err; + } + + memset(&data, 0, sizeof(data)); + rc = recv(m_sock_fd, &data, sizeof(data), 0); + if (rc != sizeof(data)) { + rc = -ECONNREFUSED; + goto err; + } + + if (data.hdr.code != (VMA_MSG_INIT | VMA_MSG_ACK) || + data.hdr.ver < VMA_AGENT_VER || + data.hdr.pid != pid) { + log_error("Protocol version mismatch: code = 0x%X ver = 0x%X pid = %d\n", + data.hdr.code, data.hdr.ver, data.hdr.pid); + rc = -EPROTO; + goto err; + } + +err: + return rc; +} + +int vmad_base::msg_exit(pid_t pid) +{ + int rc = 0; + struct vma_msg_exit data; + + memset(&data, 0, sizeof(data)); + data.hdr.code = VMA_MSG_EXIT; + data.hdr.ver = VMA_AGENT_VER; + data.hdr.pid = pid; + + errno = 0; + rc = send(m_sock_fd, &data, sizeof(data), 0); + if (rc != 
sizeof(data)) { + rc = -ECONNREFUSED; + goto err; + } + +err: + return rc; +} diff --git a/tests/gtest/vmad/vmad_base.h b/tests/gtest/vmad/vmad_base.h new file mode 100644 index 0000000..dc41842 --- /dev/null +++ b/tests/gtest/vmad/vmad_base.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef TESTS_GTEST_VMAD_BASE_H_ +#define TESTS_GTEST_VMAD_BASE_H_ + + +/** + * VMAD Base class for tests + */ +class vmad_base : public testing::Test, public test_base { +protected: + virtual void SetUp(); + virtual void TearDown(); + + int msg_init(pid_t pid); + int msg_exit(pid_t pid); + +protected: + pid_t m_self_pid; + pid_t m_vmad_pid; + + const char *m_base_name; + + /* socket used for communication with daemon */ + int m_sock_fd; + + /* file descriptor that is tracked by daemon */ + int m_pid_fd; + + /* unix socket name + * size should be less than sockaddr_un.sun_path + */ + char m_sock_file[100]; + + /* name of pid file */ + char m_pid_file[100]; + + /* server address */ + struct sockaddr_un m_server_addr; +}; + +#endif /* TESTS_GTEST_VMAD_BASE_H_ */ diff --git a/tests/gtest/vmad/vmad_bitmap.cc b/tests/gtest/vmad/vmad_bitmap.cc new file mode 100644 index 0000000..1f975b1 --- /dev/null +++ b/tests/gtest/vmad/vmad_bitmap.cc @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" +#include "common/cmn.h" + +#include "vmad_base.h" + +#include "tools/daemon/bitmap.h" + +class vmad_bitmap : public ::testing::Test {}; + +TEST_F(vmad_bitmap, ti_1) { + ASSERT_EQ(4, sizeof(bitmap_item_t)); +} + +TEST_F(vmad_bitmap, ti_2) { + bitmap_t *bm = NULL; + + bitmap_create(&bm, 10); + ASSERT_TRUE(bm); + ASSERT_EQ(10, bitmap_size(bm)); + + bitmap_destroy(bm); +} + +TEST_F(vmad_bitmap, ti_3) { + bitmap_t *bm = NULL; + + bitmap_create(&bm, 0x7ff); + ASSERT_TRUE(bm); + + ASSERT_EQ(0x7ff, bitmap_size(bm)); + + EXPECT_EQ(0, elem_idx(0)); + EXPECT_EQ(0, elem_idx(31)); + EXPECT_EQ(1, elem_idx(32)); + EXPECT_EQ(2, elem_idx(64)); + EXPECT_EQ(32, elem_idx(0x400)); + EXPECT_EQ(63, elem_idx(0x7ff)); + + bitmap_destroy(bm); +} + +TEST_F(vmad_bitmap, ti_4) { + bitmap_t *bm = NULL; + int bits[] = {0, 7, 31, 32, 64}; + int i; + + bitmap_create(&bm, 64); + ASSERT_TRUE(bm); + + for (i = 0; i < ARRAY_SIZE(bits); i++) { + EXPECT_EQ(0, bitmap_test(bm, i)); + bitmap_set(bm, i); + EXPECT_EQ(1, bitmap_test(bm, i)); + } + + bitmap_destroy(bm); +} + +TEST_F(vmad_bitmap, ti_5) { + bitmap_t *bm = NULL; + int bits[] = {0, 7, 31, 32, 64}; + int i; + + bitmap_create(&bm, 64); + ASSERT_TRUE(bm); + + for (i = 0; i < ARRAY_SIZE(bits); i++) { + EXPECT_EQ(0, bitmap_test(bm, i)); + bitmap_set(bm, i); + EXPECT_EQ(1, bitmap_test(bm, i)); + bitmap_clear(bm, i); + EXPECT_EQ(0, bitmap_test(bm, i)); + } + + 
bitmap_destroy(bm); +} + +TEST_F(vmad_bitmap, ti_6) { + bitmap_t *bm = NULL; + int bits[] = {0, 7, 31, 32, 64}; + int i; + + bitmap_create(&bm, 64); + ASSERT_TRUE(bm); + + for (i = 0; i < ARRAY_SIZE(bits); i++) { + EXPECT_EQ(0, bitmap_test(bm, i)); + bitmap_flip(bm, i); + EXPECT_EQ(1, bitmap_test(bm, i)); + } + + for (i = 0; i < ARRAY_SIZE(bits); i++) { + EXPECT_EQ(1, bitmap_test(bm, i)); + bitmap_flip(bm, i); + EXPECT_EQ(0, bitmap_test(bm, i)); + } + + bitmap_destroy(bm); +} + +TEST_F(vmad_bitmap, ti_7) { + bitmap_t *bm = NULL; + + bitmap_create(&bm, 64); + ASSERT_TRUE(bm); + + ASSERT_EQ(64, bitmap_size(bm)); + + EXPECT_EQ(0, bitmap_test_group(bm, 0, 7)); + EXPECT_EQ(0, bitmap_test_group(bm, 0, 64)); + + bitmap_set(bm, 7); + bitmap_set(bm, 8); + EXPECT_EQ(1, bitmap_test_group(bm, 7, 2)); + + EXPECT_EQ(-1, bitmap_test_group(bm, 6, 3)); + EXPECT_EQ(-1, bitmap_test_group(bm, 0, 64)); + + bitmap_destroy(bm); +} + +TEST_F(vmad_bitmap, ti_8) { + bitmap_t *bm = NULL; + + bitmap_create(&bm, 64); + ASSERT_TRUE(bm); + + ASSERT_EQ(64, bitmap_size(bm)); + + EXPECT_EQ(0, bitmap_find_group(bm, 0, 2, 0)); + EXPECT_EQ(32, bitmap_find_group(bm, 32, 7, 0)); + + EXPECT_EQ(-1, bitmap_find_group(bm, 0, 7, 1)); + EXPECT_EQ(-1, bitmap_find_group(bm, 32, 7, 1)); + + bitmap_set(bm, 7); + bitmap_set(bm, 8); + EXPECT_EQ(7, bitmap_find_group(bm, 0, 2, 1)); + + bitmap_destroy(bm); +} + +TEST_F(vmad_bitmap, ti_9) { + bitmap_t *bm = NULL; + int i; + + bitmap_create(&bm, 64); + ASSERT_TRUE(bm); + + ASSERT_EQ(64, bitmap_size(bm)); + + EXPECT_EQ(0, bitmap_find_first_zero(bm)); + + bitmap_set(bm, 0); + bitmap_set(bm, 1); + bitmap_set(bm, 2); + EXPECT_EQ(3, bitmap_find_first_zero(bm)); + + bitmap_set(bm, 4); + EXPECT_EQ(3, bitmap_find_first_zero(bm)); + + bitmap_set(bm, 3); + EXPECT_EQ(5, bitmap_find_first_zero(bm)); + + for (i = 0; i < 33; i++) { + bitmap_set(bm, i); + } + EXPECT_EQ(33, bitmap_find_first_zero(bm)); + + for (i = 0; i < 64; i++) { + bitmap_set(bm, i); + } + EXPECT_EQ(-1, 
bitmap_find_first_zero(bm)); + + bitmap_destroy(bm); +} diff --git a/tests/gtest/vmad/vmad_flow.cc b/tests/gtest/vmad/vmad_flow.cc new file mode 100644 index 0000000..158a132 --- /dev/null +++ b/tests/gtest/vmad/vmad_flow.cc @@ -0,0 +1,334 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" +#include "common/cmn.h" + +#include "vmad_base.h" + +#include "src/vma/util/agent_def.h" + +class vmad_flow : public vmad_base { +protected: + struct vma_msg_flow m_data; + pid_t m_pid; + int m_if; + int m_tap; + vmad_flow() + { + + char opt_val[IF_NAMESIZE]; + socklen_t opt_len; + + m_pid = 0x464C4F57; + memset(&m_data, 0, sizeof(m_data)); + m_data.hdr.code = VMA_MSG_FLOW; + m_data.hdr.ver = VMA_AGENT_VER; + m_data.hdr.pid = m_pid; + + opt_val[0] = '\0'; + opt_len = sizeof(opt_val); + sys_addr2dev(&server_addr, opt_val, opt_len); + m_if = if_nametoindex(opt_val); + sys_addr2dev(&client_addr, opt_val, opt_len); + m_tap = if_nametoindex(opt_val); + m_data.if_id = m_if; + m_data.tap_id = m_tap; + } + +}; + +/** + * @test vmad_flow.ti_1 + * @brief + * Send valid TCP 3tuple VMA_MSG_FLOW(ADD) + * @details + */ +TEST_F(vmad_flow, ti_1) { + int rc = 0; + struct vma_hdr answer; + + rc = vmad_base::msg_init(m_pid); + ASSERT_LT(0, rc); + + m_data.hdr.status = 1; + m_data.action = VMA_MSG_FLOW_ADD; + m_data.type = VMA_MSG_FLOW_TCP_3T; + m_data.flow.dst_ip = server_addr.sin_addr.s_addr; + m_data.flow.dst_port = server_addr.sin_port; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); + EXPECT_EQ(0, errno); + EXPECT_EQ((int)sizeof(m_data), rc); + + memset(&answer, 0, sizeof(answer)); + rc = recv(m_sock_fd, &answer, sizeof(answer), 0); + EXPECT_EQ((int)sizeof(answer), rc); + + EXPECT_EQ((VMA_MSG_FLOW | VMA_MSG_ACK), answer.code); + EXPECT_LE(VMA_AGENT_VER, answer.ver); + EXPECT_EQ(m_pid, answer.pid); + EXPECT_EQ(0, answer.status); + + rc = vmad_base::msg_exit(m_pid); + ASSERT_LT(0, rc); +} + +/** + * @test vmad_flow.ti_2 + * @brief + * Send valid TCP 5tuple VMA_MSG_FLOW(ADD) + * @details + */ +TEST_F(vmad_flow, ti_2) { + int rc = 0; + struct vma_hdr answer; + + rc = vmad_base::msg_init(m_pid); + ASSERT_LT(0, rc); + + m_data.hdr.status = 1; + m_data.action = 
VMA_MSG_FLOW_ADD; + m_data.type = VMA_MSG_FLOW_TCP_5T; + m_data.flow.dst_ip = server_addr.sin_addr.s_addr; + m_data.flow.dst_port = server_addr.sin_port; + m_data.flow.t5.src_ip = client_addr.sin_addr.s_addr; + m_data.flow.t5.src_port = client_addr.sin_port; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); + EXPECT_EQ(0, errno); + EXPECT_EQ((int)sizeof(m_data), rc); + + memset(&answer, 0, sizeof(answer)); + rc = recv(m_sock_fd, &answer, sizeof(answer), 0); + EXPECT_EQ((int)sizeof(answer), rc); + + EXPECT_EQ((VMA_MSG_FLOW | VMA_MSG_ACK), answer.code); + EXPECT_LE(VMA_AGENT_VER, answer.ver); + EXPECT_EQ(m_pid, answer.pid); + EXPECT_EQ(0, answer.status); + + rc = vmad_base::msg_exit(m_pid); + ASSERT_LT(0, rc); +} + +/** + * @test vmad_flow.ti_3 + * @brief + * Send valid 3tuple VMA_MSG_FLOW(ADD) and VMA_MSG_FLOW(DEL) + * @details + */ +TEST_F(vmad_flow, ti_3) { + int rc = 0; + struct vma_hdr answer; + + rc = vmad_base::msg_init(m_pid); + ASSERT_LT(0, rc); + + m_data.hdr.status = 1; + m_data.action = VMA_MSG_FLOW_ADD; + m_data.type = VMA_MSG_FLOW_TCP_3T; + m_data.flow.dst_ip = server_addr.sin_addr.s_addr; + m_data.flow.dst_port = server_addr.sin_port; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); + EXPECT_EQ(0, errno); + EXPECT_EQ((int)sizeof(m_data), rc); + + memset(&answer, 0, sizeof(answer)); + rc = recv(m_sock_fd, &answer, sizeof(answer), 0); + EXPECT_EQ((int)sizeof(answer), rc); + + EXPECT_EQ((VMA_MSG_FLOW | VMA_MSG_ACK), answer.code); + EXPECT_LE(VMA_AGENT_VER, answer.ver); + EXPECT_EQ(m_pid, answer.pid); + EXPECT_EQ(0, answer.status); + + m_data.hdr.status = 1; + m_data.action = VMA_MSG_FLOW_DEL; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); + EXPECT_EQ(0, errno); + EXPECT_EQ((int)sizeof(m_data), rc); + + memset(&answer, 0, sizeof(answer)); + rc = recv(m_sock_fd, &answer, sizeof(answer), 0); + EXPECT_EQ((int)sizeof(answer), rc); + + EXPECT_EQ((VMA_MSG_FLOW | VMA_MSG_ACK), answer.code); + 
EXPECT_LE(VMA_AGENT_VER, answer.ver); + EXPECT_EQ(m_pid, answer.pid); + EXPECT_EQ(0, answer.status); + + rc = vmad_base::msg_exit(m_pid); + ASSERT_LT(0, rc); +} + +/** + * @test vmad_flow.ti_4 + * @brief + * Send valid 5tuple VMA_MSG_FLOW(ADD) and VMA_MSG_FLOW(DEL) + * @details + */ +TEST_F(vmad_flow, ti_4) { + int rc = 0; + struct vma_hdr answer; + + rc = vmad_base::msg_init(m_pid); + ASSERT_LT(0, rc); + + m_data.hdr.status = 1; + m_data.action = VMA_MSG_FLOW_ADD; + m_data.type = VMA_MSG_FLOW_TCP_5T; + m_data.flow.dst_ip = server_addr.sin_addr.s_addr; + m_data.flow.dst_port = server_addr.sin_port; + m_data.flow.t5.src_ip = client_addr.sin_addr.s_addr; + m_data.flow.t5.src_port = client_addr.sin_port; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); + EXPECT_EQ(0, errno); + EXPECT_EQ((int)sizeof(m_data), rc); + + memset(&answer, 0, sizeof(answer)); + rc = recv(m_sock_fd, &answer, sizeof(answer), 0); + EXPECT_EQ((int)sizeof(answer), rc); + + EXPECT_EQ((VMA_MSG_FLOW | VMA_MSG_ACK), answer.code); + EXPECT_LE(VMA_AGENT_VER, answer.ver); + EXPECT_EQ(m_pid, answer.pid); + EXPECT_EQ(0, answer.status); + + m_data.hdr.status = 1; + m_data.action = VMA_MSG_FLOW_DEL; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); + EXPECT_EQ(0, errno); + EXPECT_EQ((int)sizeof(m_data), rc); + + memset(&answer, 0, sizeof(answer)); + rc = recv(m_sock_fd, &answer, sizeof(answer), 0); + EXPECT_EQ((int)sizeof(answer), rc); + + EXPECT_EQ((VMA_MSG_FLOW | VMA_MSG_ACK), answer.code); + EXPECT_LE(VMA_AGENT_VER, answer.ver); + EXPECT_EQ(m_pid, answer.pid); + EXPECT_EQ(0, answer.status); + + rc = vmad_base::msg_exit(m_pid); + ASSERT_LT(0, rc); +} + +/** + * @test vmad_flow.ti_5 + * @brief + * Send valid UDP 3tuple VMA_MSG_FLOW(ADD) + * @details + */ +TEST_F(vmad_flow, ti_5) { + int rc = 0; + struct vma_hdr answer; + + rc = vmad_base::msg_init(m_pid); + ASSERT_LT(0, rc); + + m_data.hdr.status = 1; + m_data.action = VMA_MSG_FLOW_ADD; + m_data.type = 
VMA_MSG_FLOW_UDP_3T; + m_data.flow.dst_ip = server_addr.sin_addr.s_addr; + m_data.flow.dst_port = server_addr.sin_port; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); + EXPECT_EQ(0, errno); + EXPECT_EQ((int)sizeof(m_data), rc); + + memset(&answer, 0, sizeof(answer)); + rc = recv(m_sock_fd, &answer, sizeof(answer), 0); + EXPECT_EQ((int)sizeof(answer), rc); + + EXPECT_EQ((VMA_MSG_FLOW | VMA_MSG_ACK), answer.code); + EXPECT_LE(VMA_AGENT_VER, answer.ver); + EXPECT_EQ(m_pid, answer.pid); + EXPECT_EQ(0, answer.status); + + rc = vmad_base::msg_exit(m_pid); + ASSERT_LT(0, rc); +} + +/** + * @test vmad_flow.ti_6 + * @brief + * Send valid UDP 5tuple VMA_MSG_FLOW(ADD) + * @details + */ +TEST_F(vmad_flow, ti_6) { + int rc = 0; + struct vma_hdr answer; + + rc = vmad_base::msg_init(m_pid); + ASSERT_LT(0, rc); + + m_data.hdr.status = 1; + m_data.action = VMA_MSG_FLOW_ADD; + m_data.type = VMA_MSG_FLOW_UDP_5T; + m_data.flow.dst_ip = server_addr.sin_addr.s_addr; + m_data.flow.dst_port = server_addr.sin_port; + m_data.flow.t5.src_ip = client_addr.sin_addr.s_addr; + m_data.flow.t5.src_port = client_addr.sin_port; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); + EXPECT_EQ(0, errno); + EXPECT_EQ((int)sizeof(m_data), rc); + + memset(&answer, 0, sizeof(answer)); + rc = recv(m_sock_fd, &answer, sizeof(answer), 0); + EXPECT_EQ((int)sizeof(answer), rc); + + EXPECT_EQ((VMA_MSG_FLOW | VMA_MSG_ACK), answer.code); + EXPECT_LE(VMA_AGENT_VER, answer.ver); + EXPECT_EQ(m_pid, answer.pid); + EXPECT_EQ(0, answer.status); + + rc = vmad_base::msg_exit(m_pid); + ASSERT_LT(0, rc); +} + diff --git a/tests/gtest/vmad/vmad_hash.cc b/tests/gtest/vmad/vmad_hash.cc new file mode 100644 index 0000000..7c05580 --- /dev/null +++ b/tests/gtest/vmad/vmad_hash.cc @@ -0,0 +1,235 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" +#include "common/cmn.h" + +#include "vmad_base.h" + +#include "tools/daemon/hash.h" + +struct element { + hash_key_t key; + int value; +}; + +class vmad_hash : public ::testing::Test {}; + +TEST_F(vmad_hash, ti_1) { + hash_t ht; + int reference[] = {3, 5, 107, 199}; + int i = 0; + + for (i = 0; i < ARRAY_SIZE(reference); i++) { + ht = hash_create(NULL, reference[i]); + ASSERT_TRUE(ht); + EXPECT_EQ(reference[i], hash_size(ht)); + EXPECT_EQ(0, hash_count(ht)); + hash_destroy(ht); + } +} + +TEST_F(vmad_hash, ti_2) { + hash_t ht; + int reference[] = {4, 12, 100, 200}; + int i = 0; + + for (i = 0; i < ARRAY_SIZE(reference); i++) { + ht = hash_create(NULL, reference[i]); + ASSERT_FALSE(ht); + } +} + +TEST_F(vmad_hash, ti_3) { + hash_t ht; + struct element element[] = {{12345, 1}, {-12345, 2}, {0, 3}}; + int i; + + ht = hash_create(NULL, 5); + ASSERT_TRUE(ht); + ASSERT_EQ(5, hash_size(ht)); + + for (i = 0; i < ARRAY_SIZE(element); i++) { + EXPECT_TRUE(hash_put(ht, element[i].key, &element[i])); + } + EXPECT_EQ(3, hash_count(ht)); + + hash_destroy(ht); +} + +TEST_F(vmad_hash, ti_4) { + hash_t ht; + struct element element[] = {{12345, 1}, {123, 2}, {12, 3}}; + int i; + + ht = hash_create(NULL, 5); + ASSERT_TRUE(ht); + ASSERT_EQ(5, hash_size(ht)); + + for (i = 0; i < ARRAY_SIZE(element); i++) { + EXPECT_TRUE(hash_put(ht, element[i].key, &element[i])); + } + EXPECT_EQ(3, hash_count(ht)); + + for (i = 0; i < ARRAY_SIZE(element); i++) { + EXPECT_EQ(((uintptr_t)&element[i]), ((uintptr_t)hash_get(ht, element[i].key))); + } + + hash_destroy(ht); +} + +TEST_F(vmad_hash, ti_5) { + hash_t ht; + struct element element[] = {{12345, 1}, {0, 2}, {12, 3}, {77, 4}}; + int i; + + ht = hash_create(NULL, 3); + ASSERT_TRUE(ht); + ASSERT_EQ(3, hash_size(ht)); + + for (i = 0; i < ARRAY_SIZE(element) - 1; i++) { + EXPECT_TRUE(hash_put(ht, element[i].key, &element[i])); + } + EXPECT_EQ(3, 
hash_count(ht)); + + EXPECT_FALSE(hash_put(ht, element[3].key, &element[3])); + EXPECT_EQ(3, hash_count(ht)); + + hash_destroy(ht); +} + +TEST_F(vmad_hash, ti_6) { + hash_t ht; + struct element element[] = {{12345, 1}, {0, 2}, {12, 3}}; + struct element *e; + int i; + + ht = hash_create(NULL, 5); + ASSERT_TRUE(ht); + ASSERT_EQ(5, hash_size(ht)); + + for (i = 0; i < ARRAY_SIZE(element); i++) { + EXPECT_TRUE(hash_put(ht, element[i].key, &element[i])); + } + EXPECT_EQ(3, hash_count(ht)); + + element[1].value = 555; + e = (struct element *)hash_get(ht, element[1].key); + EXPECT_EQ(((uintptr_t)&element[1]), ((uintptr_t)e)); + EXPECT_EQ(3, hash_count(ht)); + e = (struct element *)hash_get(ht, element[1].key); + ASSERT_TRUE(e); + EXPECT_EQ(((uintptr_t)&element[1]), ((uintptr_t)e)); + EXPECT_EQ(555, e->value); + + hash_destroy(ht); +} + +TEST_F(vmad_hash, ti_7) { + hash_t ht; + struct element element[] = {{12345, 1}, {123, 2}, {1234, 3}}; + int i; + + ht = hash_create(NULL, 5); + ASSERT_TRUE(ht); + ASSERT_EQ(5, hash_size(ht)); + + for (i = 0; i < ARRAY_SIZE(element); i++) { + EXPECT_TRUE(hash_put(ht, element[i].key, &element[i])); + } + EXPECT_EQ(3, hash_count(ht)); + + hash_del(ht, element[1].key); + EXPECT_EQ(2, hash_count(ht)); + EXPECT_FALSE(hash_get(ht, element[1].key)); + + hash_destroy(ht); +} + +TEST_F(vmad_hash, ti_8) { + hash_t ht; + struct element element[] = {{12345, 1}, {-12345, 2}, {0, 3}}; + int i; + + ht = hash_create(NULL, 5); + ASSERT_TRUE(ht); + ASSERT_EQ(5, hash_size(ht)); + + for (i = 0; i < ARRAY_SIZE(element); i++) { + EXPECT_TRUE(hash_put(ht, element[i].key, &element[i])); + } + EXPECT_EQ(3, hash_count(ht)); + + for (i = 0; i < ARRAY_SIZE(element); i++) { + hash_del(ht, element[i].key); + } + EXPECT_EQ(0, hash_count(ht)); + + hash_destroy(ht); +} + +TEST_F(vmad_hash, ti_9) { + hash_t ht; + struct element element[] = {{12345, 1}, {1234, 2}, {12, 3}}; + struct element *e; + int i; + + ht = hash_create(NULL, 3); + ASSERT_TRUE(ht); + ASSERT_EQ(3, 
hash_size(ht)); + + for (i = 0; i < ARRAY_SIZE(element); i++) { + EXPECT_TRUE(hash_put(ht, element[i].key, &element[i])); + } + EXPECT_EQ(3, hash_count(ht)); + + for (i = 0; i < 256; i++) { + hash_del(ht, element[1].key); + ASSERT_EQ(2, hash_count(ht)); + + element[1].value = i; + e = (struct element *)hash_put(ht, element[1].key, &element[1]); + ASSERT_TRUE(e); + ASSERT_EQ(3, hash_count(ht)); + ASSERT_EQ(((uintptr_t)&element[1]), ((uintptr_t)e)); + + e = (struct element *)hash_get(ht, element[1].key); + ASSERT_TRUE(e); + ASSERT_EQ(((uintptr_t)&element[1]), ((uintptr_t)e)); + ASSERT_EQ(i, e->value); + } + + hash_destroy(ht); +} diff --git a/tests/gtest/vmad/vmad_init.cc b/tests/gtest/vmad/vmad_init.cc new file mode 100644 index 0000000..f08a0c2 --- /dev/null +++ b/tests/gtest/vmad/vmad_init.cc @@ -0,0 +1,173 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" +#include "common/cmn.h" + +#include "vmad_base.h" + +#include "src/vma/util/agent_def.h" +#include "config.h" + +class vmad_init : public vmad_base { +protected: + struct vma_msg_init m_data; + pid_t m_pid; + vmad_init() + { + uint8_t *version; + + m_pid = 0x494E4954; + memset(&m_data, 0, sizeof(m_data)); + m_data.hdr.code = VMA_MSG_INIT; + m_data.hdr.ver = VMA_AGENT_VER; + m_data.hdr.pid = m_pid; + version = (uint8_t *)&m_data.ver; + version[0] = VMA_LIBRARY_MAJOR; + version[1] = VMA_LIBRARY_MINOR; + version[2] = VMA_LIBRARY_RELEASE; + version[3] = VMA_LIBRARY_REVISION; + } + +}; + +/** + * @test vmad_init.ti_1 + * @brief + * Send data less than (struct vma_hdr) + * @details + */ +TEST_F(vmad_init, ti_1) { + int rc = 0; + struct vma_msg_init data; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data.hdr) - 1, 0); + EXPECT_EQ(0, errno); + ASSERT_EQ((int)sizeof(m_data.hdr) - 1, rc); + + memset(&data, 0, sizeof(data)); + rc = recv(m_sock_fd, &data, sizeof(data), 0); + EXPECT_EQ(EAGAIN, errno); + EXPECT_EQ((-1), rc); +} + +/** + * @test vmad_init.ti_2 + * @brief + * Send data less than (struct vma_msg_init) + * @details + */ +TEST_F(vmad_init, ti_2) { + int rc = 0; + struct vma_msg_init data; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data) - 1, 0); + EXPECT_EQ(0, errno); + ASSERT_EQ((int)sizeof(m_data) - 1, rc); + + memset(&data, 0, sizeof(data)); + rc = recv(m_sock_fd, &data, sizeof(data), 0); + EXPECT_EQ(EAGAIN, errno); + EXPECT_EQ((-1), rc); +} + +/** + * @test vmad_init.ti_3 + * @brief + * Send data with invalid header version + * @details + */ +TEST_F(vmad_init, 
ti_3) { + int rc = 0; + struct vma_msg_init data; + + errno = 0; + m_data.hdr.ver = 0xFF; + rc = send(m_sock_fd, &m_data, sizeof(m_data) - 1, 0); + EXPECT_EQ(0, errno); + ASSERT_EQ((int)sizeof(m_data) - 1, rc); + + memset(&data, 0, sizeof(data)); + rc = recv(m_sock_fd, &data, sizeof(data), 0); + EXPECT_EQ(EAGAIN, errno); + EXPECT_EQ((-1), rc); +} + +/** + * @test vmad_init.ti_4 + * @brief + * Send valid VMA_MSG_INIT + * @details + */ +TEST_F(vmad_init, ti_4) { + int rc = 0; + struct vma_msg_init data; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); + EXPECT_EQ(0, errno); + ASSERT_EQ((int)sizeof(m_data), rc); + + memset(&data, 0, sizeof(data)); + rc = recv(m_sock_fd, &data, sizeof(data), 0); + EXPECT_EQ((int)sizeof(data), rc); + + EXPECT_EQ((VMA_MSG_INIT | VMA_MSG_ACK), data.hdr.code); + EXPECT_LE(VMA_AGENT_VER, data.hdr.ver); + EXPECT_EQ(m_pid, data.hdr.pid); +} + +/** + * @test vmad_init.ti_5 + * @brief + * Send valid VMA_MSG_EXIT + * @details + */ +TEST_F(vmad_init, ti_5) { + int rc = 0; + struct vma_msg_exit data; + + memset(&data, 0, sizeof(data)); + data.hdr.code = VMA_MSG_EXIT; + data.hdr.ver = VMA_AGENT_VER; + data.hdr.pid = m_pid; + + errno = 0; + rc = send(m_sock_fd, &data, sizeof(data), 0); + EXPECT_EQ(0, errno); + ASSERT_EQ((int)sizeof(data), rc); +} diff --git a/tests/gtest/vmad/vmad_state.cc b/tests/gtest/vmad/vmad_state.cc new file mode 100644 index 0000000..f358959 --- /dev/null +++ b/tests/gtest/vmad/vmad_state.cc @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "common/def.h" +#include "common/log.h" +#include "common/sys.h" +#include "common/base.h" +#include "common/cmn.h" + +#include "vmad_base.h" + +#include "src/vma/util/agent_def.h" +#include "src/vma/lwip/tcp.h" + +class vmad_state : public vmad_base { +protected: + struct vma_msg_state m_data; + pid_t m_pid; + vmad_state() + { + m_pid = 0x53544154; + memset(&m_data, 0, sizeof(m_data)); + m_data.hdr.code = VMA_MSG_STATE; + m_data.hdr.ver = VMA_AGENT_VER; + m_data.hdr.pid = m_pid; + } + +}; + +/** + * @test vmad_state.ti_1 + * @brief + * Send valid VMA_MSG_STATE + * @details + */ +TEST_F(vmad_state, ti_1) { + int rc = 0; + + rc = vmad_base::msg_init(m_pid); + ASSERT_LT(0, rc); + + m_data.fid = 0; + m_data.state = ESTABLISHED; + m_data.type = SOCK_STREAM; + m_data.src_ip = client_addr.sin_addr.s_addr; + m_data.src_port = client_addr.sin_port; + m_data.dst_ip = server_addr.sin_addr.s_addr; + m_data.dst_port = server_addr.sin_port; + + errno = 0; + rc = send(m_sock_fd, &m_data, sizeof(m_data), 0); + EXPECT_EQ(0, errno); + EXPECT_EQ((int)sizeof(m_data), rc); + + rc = vmad_base::msg_exit(m_pid); + ASSERT_LT(0, rc); +} diff --git a/tests/latency_test/Makefile.am b/tests/latency_test/Makefile.am new file mode 100644 index 0000000..b8c2a8b --- /dev/null +++ b/tests/latency_test/Makefile.am @@ -0,0 +1,17 @@ +noinst_PROGRAMS = udp_lat tcp_lat + +AM_CPPFLAGS := \ + -I$(top_builddir)/. -I$(top_srcdir)/. \ + -I$(top_builddir)/src -I$(top_srcdir)/src + +udp_lat_SOURCES = udp_lat.c +udp_lat_LDADD = -lrt + +tcp_lat_SOURCES = tcp_lat.cpp +tcp_lat_LDADD = \ + $(top_builddir)/src/utils/libutils.la + +udp_lat_DEPENDENCIES = Makefile.am Makefile.in Makefile +tcp_lat_DEPENDENCIES = Makefile.am Makefile.in Makefile + + diff --git a/tests/latency_test/tcp_lat.cpp b/tests/latency_test/tcp_lat.cpp new file mode 100644 index 0000000..29ecd13 --- /dev/null +++ b/tests/latency_test/tcp_lat.cpp @@ -0,0 +1,861 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. 
All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/* + * How to Build: 'gcc -lrt -o tcp_lat tcp_lat.c' +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "src/utils/rdtsc.h" + +#define log_dbg(fmt, args...) \ +do { \ + if (debug) \ + printf(fmt, ##args); \ +} while (0) + +#define TCP_LAT_PORT 1212 + +#undef log_dbg +#define log_dbg(fmt, args...) 
+ +enum { + OPT_REQS_PER_RESP = 1, + OPT_USE_PERFECT_BATCH, + OPT_TIME_RR, + OPT_DELAY_TIME, + OPT_NOBLOCK, + OPT_SELECT_ON_ACCEPT, + OPT_TCP_PORT +}; + + +static struct option long_options[] = { + {"server", 0, 0, 's'}, + {"client", 1, 0, 'c'}, + {"test", 1, 0, 't'}, + {"msglen", 1, 0, 'l'}, + {"msgnum", 1, 0, 'n'}, + {"reqs-per-resp", 1, 0, OPT_REQS_PER_RESP}, + {"use-perfect-batch", 0, 0, OPT_USE_PERFECT_BATCH}, + {"time-rr", 0, 0, OPT_TIME_RR}, + {"use-alert-poll", 0, 0, 'p'}, + {"delay", 1, 0, OPT_DELAY_TIME}, + {"port", 1, 0, OPT_TCP_PORT}, + {"noblock", 0, 0, OPT_NOBLOCK}, + {"select-on-accept", 0, 0, OPT_SELECT_ON_ACCEPT}, + {"debug", 0, 0, 'd'}, + {"help", 0, 0, 'h'}, +}; + +static int debug = 0; +static int tcp_lat_pkt_size = 200; +static int max_n_msgs = 1000000; +//static int max_n_msgs = 1000; +static int reqs_per_resp = 1; +static int use_perfect_batch = 0; +static int time_rr = 0; +static int delay_time = 0; +static int noblock = 0; +static int select_on_accept = 0; +static int tcp_lat_port = TCP_LAT_PORT; +bool g_b_exit = false; +struct sigaction sigact; + +//#define N_MSGS 1000000 //10000000 +struct timestamp { + uint32_t secs; + uint32_t nsecs; +}; + +enum tcp_lat_msg_types { + TCP_LAT_MSG_TS = 0xAC +}; +struct tcp_lat_msg { + uint8_t msg_type; + union { + struct timestamp ts; + }; +} __attribute__((packed)); + +enum test_modes { + TST_BLOCKING_PING_PONG = 1, + TST_SELECT_PING_PONG, + TST_CL_THREADED_PING_PONG, + TST_MAX_TEST + +}; + +void sig_handler(int signum) +{ + if (g_b_exit) { + printf("Test end (interrupted by signal %d)", signum); + return; + } + + switch (signum) { + case SIGINT: + printf("Test end (interrupted by user)"); + break; + default: + printf("Test end (interrupted by signal %d)", signum); + break; + } + g_b_exit = true; +} + +/* set the action taken when signal received */ +void set_signal_action() +{ + sigact.sa_handler = sig_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = 0; + + sigaction(SIGINT, &sigact, NULL); 
+} + +static void usage() +{ + printf("Usage: tcp_lat [options]\n" + "\t--test,-t Test to run. Default is 1\n" + "\t--server,-s Server mode\n" + "\t--client,-c Client mode. Connect to server at \n" + "\t--msglen,-l Message size in bytes. Default %d\n" + "\t--msgnum,-n Total number of messages to send. Default %d\n" + "\t--reqs-per-resp Send a responce on every nth request. Default %d\n" + "\t--use-perfect-batch Send one transaction as a one HUGE message (TCP only)\n" + "\t--time-rr Time every req/responce cycle with gettimeofday()\n" + "\t--delay Sleep between transactions\n" + "\t--noblock Use non blocking sockets\n" + "\t--select-on-accept Use select to check if socket is ready to accept()\n" + "\t--port Listen/connect to port . Default %d\n" + "\t--debug,-d Print extra debug info\n" + "\t--help,-h Print help and exit\n", + + tcp_lat_pkt_size, + max_n_msgs, + reqs_per_resp, + tcp_lat_port + + ); + printf("Test types:\n" + " 1 - blocking ping pong\n" + " 2 - select() with non blocking ping pong\n" + "\n" + ); + + exit(1); +} + +static void set_noblock(int ns) +{ + int ret; + int flag; + // set it to non blocking mode + if (noblock) { + flag = fcntl(ns, F_GETFL); + if (flag < 0) { + printf("failed to get socket flags %m\n"); + } + flag |= O_NONBLOCK; + ret = fcntl(ns, F_SETFL, flag); + if (ret < 0) { + printf("failed to set socket flags %m\n"); + } + printf("set socket to nb mode\n"); + } +} + +static int do_select_on_accept(int s) +{ + fd_set rfds; + int ret; + + while(!g_b_exit) { + FD_ZERO(&rfds); + FD_SET(s, &rfds); + ret = select(s+1, &rfds, 0, 0, 0); + if (ret < 0 && errno == EINTR) { + printf("select interrupted\n"); + continue; + } + if (ret < 0) + return -1; + if (FD_ISSET(s, &rfds)) + return s; + } + return -1; +} + +static int get_addr(char *dst, struct sockaddr_in *addr) +{ + struct addrinfo *res; + int ret; + + ret = getaddrinfo(dst, NULL, NULL, &res); + if (ret) { + printf + ("getaddrinfo failed - invalid hostname or IP address\n"); + return ret; + } 
+ + if (res->ai_family != PF_INET) { + ret = -1; + goto out; + } + + *addr = *(struct sockaddr_in *)res->ai_addr; + out: + freeaddrinfo(res); + return ret; +} + +static int tcp_read(int s, char *b, int count) +{ + int n; + int nb; + + nb = 0; + do { + n = read(s, b, count); + if (n == 0) { + printf("EOF?\n"); + return nb; + } + if (n < 0) { + if (errno == EAGAIN) { + log_dbg("blocking read ret=%d read %d of %d = %m\n", n, nb, count); + continue; + } + printf("bad read ret=%d read %d of %d = %m(%d)\n", n, nb, count, errno); + return nb; + } + count -= n; + b += n; + nb += n; + } while (count > 0); + return nb; +} + +static int tcp_write(int s, char *b, int count) +{ + int n, nb; + + nb = 0; + do { + n = write(s, b, count); + if (n <= 0) { + if (errno == EAGAIN) { + log_dbg("blocking write ret=%d written %d of %d = %m\n", n, nb, count); + continue; + } + printf("bad write ret=%d written %d of %d = %m(%d)\n", n, nb, count, errno); + return nb; + } + count -= n; + b += n; + nb += n; + } while (count > 0); + return nb; +} + +void run_select_server() +{ + int s, ns; + struct sockaddr_in addr; + int ret; + unsigned len; + char buf[tcp_lat_pkt_size]; + //char batch_buf[tcp_lat_pkt_size*reqs_per_resp]; + int flag; + fd_set rfds, wfds; + + signal(SIGPIPE, SIG_IGN); + printf("starting TCP select() server\n"); + s = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + if (s < 0) { + printf("Failed to create socket\n"); + exit(1); + } + + /* listen on any port */ + memset(&addr, sizeof(addr), 0); + addr.sin_family = PF_INET; + addr.sin_addr.s_addr = INADDR_ANY; + addr.sin_port = htons(tcp_lat_port); + flag = 1; + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *) &flag, sizeof(int)); + + ret = bind(s, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + printf("failed to bind = %m\n"); + exit(1); + } + listen(s, 5); + while(!g_b_exit) { + //int flag; + printf("Waiting for connection\n"); + len = sizeof(addr); + ns = accept(s, (struct sockaddr *)&addr, &len); + if (ns < 0) { + 
printf("accept failed = %m\n"); + exit(1); + } +#if 1 + flag = 1; + ret = setsockopt(ns, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); + if (ret < 0) { + printf("Failed to disable NAGLE\n"); + } +#endif + //set_noblock(ns); + if (noblock) set_noblock(ns); + printf("connected\n"); + while(!g_b_exit) { + FD_ZERO(&rfds); + FD_ZERO(&wfds); + // select() + FD_SET(ns, &rfds); + ret = select(ns+1, &rfds, 0, 0, 0); + if (ret <= 0) { + if (errno != EINTR) { + printf("select erroro %m\n"); + break; + } + else { + printf("interrupted select!\n"); + continue; + } + } + ret = tcp_read(ns, buf, tcp_lat_pkt_size); + if (ret < 0) { + printf("bad read? = %m (%d/%d)\n", ret, tcp_lat_pkt_size); + exit(1); + } + if (ret == 0) { + printf("EOF detected - going back to accept\n"); + break; + } + + // get requests till we block... + // send reply + log_dbg("Read request, sending responce\n"); + ret = tcp_write(ns, buf, tcp_lat_pkt_size); + if (ret != tcp_lat_pkt_size) { + printf("partial packet write (%d != %d)\n", ret, tcp_lat_pkt_size); + exit(1); + } + log_dbg("==ack sent\n"); + } + + close(ns); + printf("all done\n"); + } + +} + + +static void run_tcp_server() +{ + int s, ns; + struct sockaddr_in addr; + int ret, i; + unsigned len; + char buf[tcp_lat_pkt_size]; + char batch_buf[tcp_lat_pkt_size*reqs_per_resp]; + int flag; + + signal(SIGPIPE, SIG_IGN); + printf("starting TCP server\n"); + s = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + if (s < 0) { + printf("Failed to create socket\n"); + exit(1); + } + + /* listen on any port */ + memset(&addr, sizeof(addr), 0); + addr.sin_family = PF_INET; + addr.sin_addr.s_addr = INADDR_ANY; + addr.sin_port = htons(tcp_lat_port); + flag = 1; + setsockopt(s, SOL_SOCKET, SO_REUSEADDR, (char *) &flag, sizeof(int)); + + ret = bind(s, (struct sockaddr *)&addr, sizeof(addr)); + if (ret < 0) { + printf("failed to bind = %m\n"); + exit(1); + } + listen(s, 5); + while(!g_b_exit) { + //int flag; + printf("Waiting for connection\n"); + len = 
sizeof(addr); + if (select_on_accept) { + log_dbg("select() to check for new connection\n"); + if (do_select_on_accept(s) < 0) { + printf("can not select on accept\n"); + exit(1); + } + } + ns = accept(s, (struct sockaddr *)&addr, &len); + if (ns < 0) { + printf("accept failed = %m\n"); + exit(1); + } +#if 1 + flag = 1; + ret = setsockopt(ns, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); + if (ret < 0) { + printf("Failed to disable NAGLE\n"); + } +#endif + if (noblock) set_noblock(ns); + printf("connected\n"); + for (i = 0; i < max_n_msgs; i+=reqs_per_resp) { + int k; + uint64_t sum = 0; + if (use_perfect_batch) { + ret = tcp_read(ns, batch_buf, tcp_lat_pkt_size*reqs_per_resp); + if (ret < 0) { + printf("bad read? = %m (%d/%d)\n", ret, tcp_lat_pkt_size); + exit(1); + } + if (ret == 0) { + printf("EOF detected - going back to accept\n"); + break; + } + } + for (k = 0; k < reqs_per_resp; k++) { + if (use_perfect_batch) { + memcpy(buf, batch_buf + k*tcp_lat_pkt_size, tcp_lat_pkt_size); + sum += buf[11]; + } + else { + ret = tcp_read(ns, buf, tcp_lat_pkt_size); + if (ret < 0) { + printf("bad read? 
= %m (%d/%d)\n", ret, tcp_lat_pkt_size); + exit(1); + } + if (ret == 0) { + printf("EOF detected - going back to accept\n"); + goto done; + } + log_dbg("==> trans req: %d\n", i); + } + } + //printf("Read request, sending responce\n"); + ret = tcp_write(ns, buf, tcp_lat_pkt_size); + if (ret != tcp_lat_pkt_size) { + printf("partial packet write (%d != %d)\n", ret, tcp_lat_pkt_size); + exit(1); + } + log_dbg("==ack %d sent\n", i); + } + done: + close(ns); + printf("all done\n"); + } + +} + +static int tcp_client_init(struct sockaddr_in *addr) +{ + int s; + int ret; + int flag; + + s = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + if (!s) { + printf("Failed to create socket\n"); + exit(1); + } + addr->sin_port = htons(tcp_lat_port); + +#if 1 + flag = 1; + ret = setsockopt(s, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); + if (ret < 0) { + printf("Failed to disable NAGLE\n"); + } +#endif + ret = connect(s, (struct sockaddr *)addr, sizeof(*addr)); + if (ret < 0) { + printf("connect failed\n"); + exit(1); + } + + if (noblock) set_noblock(s); + + return s; +} + +static struct timeval _st, _et; +pthread_spinlock_t lck; +static unsigned tx_cnt; +static uint64_t _total_usec, _n_rr; + +static void take_ts(struct tcp_lat_msg *m) +{ + struct timeval dt; + m->msg_type = TCP_LAT_MSG_TS; + + gettimeofday(&dt, 0); + m->ts.secs = dt.tv_sec; + m->ts.nsecs = dt.tv_usec * 1000; +// printf("tx start: sec: %u usec: %u\n", m->ts.secs, m->ts.nsecs/1000); +} + +void *tcp_rep_handler(void *arg) +{ + unsigned long s = (unsigned long)arg; + int ret, i; + char buf[tcp_lat_pkt_size]; + unsigned rx_cnt = 0; + + for (i = 0; i < max_n_msgs; i++) { + log_dbg("==> waiting for resp: %d\n", i); +//pthread_spin_lock(&lck); +// log_dbg("==> waiting for resp1: %d\n", i); +// while(tx_cnt <= rx_cnt); +// log_dbg("==> waiting for resp2: %d\n", i); + ret = tcp_read(s, buf, tcp_lat_pkt_size); + log_dbg("==> resp1: %d\n", i); +//pthread_spin_unlock(&lck); + if (ret != tcp_lat_pkt_size) { + printf("resp: 
%d partial packet read (%d != %d)\n", i, ret, tcp_lat_pkt_size); + //exit(1); + break; + } + if (time_rr) { + struct timeval st_rr, et_rr, dt_rr; + struct tcp_lat_msg *m = (struct tcp_lat_msg *)buf; + if (m->msg_type != TCP_LAT_MSG_TS) { + printf("expect timestamped packet\n"); + exit(1); + } + gettimeofday(&et_rr, 0); + st_rr.tv_sec = m->ts.secs; + st_rr.tv_usec = m->ts.nsecs/1000; + //printf("rx start: sec: %u usec: %u\n", m->ts.secs, m->ts.nsecs/1000); + //printf("RX NOW: sec: %ld usec: %ld\n", et_rr.tv_sec, et_rr.tv_usec); + timersub(&et_rr, &st_rr, &dt_rr); + _total_usec += dt_rr.tv_sec * 1000000 + dt_rr.tv_usec; + //printf("DELTA: %ld\n", dt_rr.tv_sec * 1000000 + dt_rr.tv_usec); + _n_rr++; + } + log_dbg("==> resp: %d\n", i); + rx_cnt++; + } + gettimeofday(&_et, 0); + return 0; +} + + +static void run_tcp_threaded_client(struct sockaddr_in *addr) +{ + int s; + char buf[tcp_lat_pkt_size]; + pthread_t tid; + struct timeval dt; + int i, ret; + struct tcp_lat_msg *msg; + + if ((unsigned)tcp_lat_pkt_size < sizeof(*msg)) { + printf("message size is too small\n"); + exit(1); + } + +pthread_spin_init(&lck, 0); + printf("running client in thread per read/thread per write mode\n"); + s = tcp_client_init(addr); + if (!s) { + printf("Failed to create socket\n"); + exit(1); + } + // spawn reader thread + pthread_create(&tid, 0, tcp_rep_handler, (void *)(unsigned long)s); + gettimeofday(&_st, 0); + for (i = 0; i < max_n_msgs; i++) { + log_dbg("==> write req: %d\n", i); +//pthread_spin_lock(&lck); + log_dbg("==> write req1: %d\n", i); + msg = (struct tcp_lat_msg *)buf; + if (time_rr) + take_ts(msg); + ret = tcp_write(s, buf, tcp_lat_pkt_size); + log_dbg("==> done write req1: %d\n", i); +//pthread_spin_unlock(&lck); + if (ret < 0) { + printf("partial packet write (%d != %d)\n", ret, tcp_lat_pkt_size); + exit(1); + } + log_dbg("==> done write req: %d\n", i); + tx_cnt++; + } + pthread_join(tid, 0); + timersub(&_et, &_st, &dt); + printf("%d message processed in %u s %u usec\n", 
max_n_msgs, (unsigned)dt.tv_sec, (unsigned)dt.tv_usec); + printf("Average latency is: %1.2lf usec\n", (double)(dt.tv_sec * 1000000 + dt.tv_usec)/(max_n_msgs+max_n_msgs)); + printf("Speed is: %1.2lf msg/sec\n", 1000000*(double)(max_n_msgs + max_n_msgs)/(dt.tv_sec * 1000000 + dt.tv_usec)); + if (time_rr) { + printf("Average latency: %1.2f usec\n", (double)_total_usec/(2*_n_rr)); + } +} + +static void run_tcp_client(struct sockaddr_in *addr) +{ + int s; + int ret, i; + char buf[tcp_lat_pkt_size]; + char batch_buf[tcp_lat_pkt_size*reqs_per_resp]; + struct timeval st, et, dt; + //struct timeval st_rr, et_rr, dt_rr; + struct timespec st_rr, et_rr, dt_rr; + uint64_t total_usec, n_rr; + + printf("starting TCP client\n"); + s = tcp_client_init(addr); + if (!s) { + printf("Failed to create socket\n"); + exit(1); + } + gettimeofday(&st, 0); + total_usec = n_rr = 0; + ts_clear(&st_rr); + ts_clear(&et_rr); + ts_clear(&dt_rr); + //printf("Starting run\n"); + for (i = 0; i < max_n_msgs && !g_b_exit; i+=reqs_per_resp) { + log_dbg("==> write req\n"); + int k; + if (time_rr) { + gettimefromtsc(&st_rr); + //gettimeofday(&st_rr, 0); + } + for (k = 0; k < reqs_per_resp; k++) { + if (!use_perfect_batch) { + ret = tcp_write(s, buf, tcp_lat_pkt_size); + if (ret < 0) { + printf("partial packet write (%d != %d)\n", ret, tcp_lat_pkt_size); + exit(1); + } + } + else + memcpy(batch_buf + k * tcp_lat_pkt_size, buf, tcp_lat_pkt_size); + } + if (use_perfect_batch) { + ret = tcp_write(s, batch_buf, tcp_lat_pkt_size*reqs_per_resp); + if (ret < 0) { + printf("partial packet write (%d != %d)\n", ret, tcp_lat_pkt_size); + exit(1); + } + } + log_dbg("==> write req done - waiting for resp resp\n"); + ret = tcp_read(s, buf, tcp_lat_pkt_size); + if (ret != tcp_lat_pkt_size) { + printf("partial packet read (%d != %d)\n", ret, tcp_lat_pkt_size); + exit(1); + } + if (time_rr) { + gettimefromtsc(&et_rr); + //gettimeofday(&et_rr, 0); + //timersub(&et_rr, &st_rr, &dt_rr); + ts_sub(&et_rr, &st_rr, &dt_rr); + 
total_usec += dt_rr.tv_sec * 1000000000L + dt_rr.tv_nsec; + n_rr++; + } + log_dbg("all rcvd\n"); + if (delay_time) + sleep(delay_time); + + } + gettimeofday(&et, 0); + timersub(&et, &st, &dt); + printf("%d message processed in %u s %u usec\n", max_n_msgs, (unsigned)dt.tv_sec, (unsigned)dt.tv_usec); + //printf("Average latency is: %1.2lf usec\n", (double)(dt.tv_sec * 1000000 + dt.tv_usec)/(max_n_msgs+max_n_msgs/reqs_per_resp)); + printf("Speed is: %1.2lf msg/sec\n", 1000000*(double)(max_n_msgs + max_n_msgs/reqs_per_resp)/(dt.tv_sec * 1000000 + dt.tv_usec)); + if (time_rr) { + printf("Average ***latency: %1.3f usec\n", (double)total_usec/(2*n_rr*1000)); + } + + close(s); + printf("client done\n"); +} + + + +int main(int argc, char *argv[]) +{ + int op; + int option_index; + int server_mode = -1; + struct sockaddr_in server_addr; + int ret; + int poll_mode = 0; + int testn = 1; + + (void)poll_mode; + while ((op = getopt_long(argc, argv, "psc:dhl:n:t:", long_options, &option_index)) != -1) { + switch (op) { + case 'c': + if (server_mode == 1) { + printf("can not run both in server and client mode\n"); + exit(1); + } + ret = get_addr(optarg, &server_addr); + if (ret < 0) { + printf("Failed to resolve server address\n"); + exit(1); + } + server_mode = 0; + break; + case 's': + if (server_mode == 0) { + printf("can not run both in server and client mode\n"); + exit(1); + } + server_mode = 1; + break; + case 'l': + tcp_lat_pkt_size = atoi(optarg); + if (tcp_lat_pkt_size <= 0) { + printf("Invalid packed size value\n"); + exit(1); + } + break; + case 'n': + max_n_msgs = atoi(optarg); + if (max_n_msgs <= 0) { + printf("Invalind number of messages\n"); + exit(1); + } + break; + case OPT_REQS_PER_RESP: + reqs_per_resp = atoi(optarg); + break; + case OPT_USE_PERFECT_BATCH: + use_perfect_batch = 1; + break; + case OPT_TIME_RR: + time_rr = 1; + break; + case OPT_DELAY_TIME: + delay_time = atoi(optarg); + break; + case OPT_NOBLOCK: + noblock = 1; + printf("using non blocking 
sockets"); + break; + case OPT_SELECT_ON_ACCEPT: + select_on_accept = 1; + printf("Use select to check for new connections\n"); + break; + case OPT_TCP_PORT: + tcp_lat_port = atoi(optarg); + printf("Use port %d\n", tcp_lat_port); + break; + case 't': + testn = atoi(optarg); + if (testn <= 0 || testn >= TST_MAX_TEST) { + printf("uknown test number: %d\n", testn); + exit(1); + } + printf("Test number %d\n", atoi(optarg)); + break; + case 'd': + debug = 1; + break; + case 'p': + poll_mode = 1; + break; + case 'h': + default: + usage(); + } + } + if (server_mode == -1) { + printf("Must choose either client (-c) or server (-s) mode\n"); + exit(1); + } + set_signal_action(); + + // force tsc init + struct timespec ts; + gettimefromtsc(&ts); + switch (testn) { + case TST_BLOCKING_PING_PONG: + if (server_mode) { + run_tcp_server(); + } + else { + run_tcp_client(&server_addr); + } + return 0; + case TST_SELECT_PING_PONG: + if (server_mode) { + run_select_server(); + } + else { + run_tcp_client(&server_addr); + } + return 0; + case TST_CL_THREADED_PING_PONG: + if (server_mode) { + printf("only works in client mode\n"); + exit(1); + } + run_tcp_threaded_client(&server_addr); + return 0; + default: + printf("bad test number %d\n", testn); + } + return 0; +} diff --git a/tests/latency_test/udp_lat.c b/tests/latency_test/udp_lat.c new file mode 100644 index 0000000..ffedcca --- /dev/null +++ b/tests/latency_test/udp_lat.c @@ -0,0 +1,2440 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +/* + * How to Build: 'gcc -lpthread -lrt -o udp_lat udp_lat.c' + */ + +#include +#include +#include /* random()*/ +#include +#include +#include +#include /* clock_gettime()*/ +#include /* getopt() and sleep()*/ +#include /* getopt()*/ +#include /* isprint()*/ +#include +#include +#include +#include +#include +#include /* sockets*/ +#include /* timers*/ +#include /* sockets*/ +#include /* select() According to POSIX 1003.1-2001 */ +#include +#include +#include /* internet address manipulation*/ +#include /* internet address manipulation*/ + + +#define USING_VMA_EXTRA_API +#ifdef USING_VMA_EXTRA_API +#include +#endif + +int prepare_socket(struct sockaddr_in* p_addr); + +#define MIN_PAYLOAD_SIZE 2 +#define MAX_PAYLOAD_SIZE (65506) +#define MAX_STREAM_SIZE (50*1024*1024) + +#define DEFAULT_TEST_DURATION 1 /* [sec] */ +#define DEFAULT_MC_ADDR "0.0.0.0" +#define DEFAULT_PORT 11111 +#define DEFAULT_IP_MTU 1500 +#define DEFAULT_IP_PAYLOAD_SZ (DEFAULT_IP_MTU-28) +#define DUMMY_PORT 57341 + +#ifndef MAX_PATH_LENGTH +#define MAX_PATH_LENGTH 1024 +#endif +#define MAX_MCFILE_LINE_LENGTH 23 /* sizeof("255.255.255.255:11111\0") */ +#define IP_PORT_FORMAT_REG_EXP "^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}"\ + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?):"\ + "(6553[0-5]|655[0-2][0-9]|65[0-4][0-9]{2}|6[0-4][0-9]{3}|[0-5]?[0-9]{1,4})\n" + +#define CLIENT_MASK 0x55 +#define SERVER_MASK 0xAA +#define MAX_ARGV_SIZE 256 +#define RECIEVE_AGAIN_S_SELECT 2 +#define MAX_DURATION 36000000 +#define MAX_FDS_NUM 1024 +#define UDP_BUFF_DEFAULT_SIZE 0 +#define DEFAULT_SELECT_TIMEOUT_MSEC 10 +#define DEFAULT_DEBUG_LEVEL 0 + +enum { + OPT_RX_MC_IF = 1, + OPT_TX_MC_IF, // 2 + OPT_SELECT_TIMEOUT, // 3 + OPT_MULTI_THREADED_SERVER, // 4 + OPT_CLIENT_CYCLE_DURATION, // 5 + OPT_UDP_BUFFER_SIZE, // 6 + OPT_DATA_INTEGRITY, // 7 + OPT_DAEMONIZE, // 8 + OPT_NONBLOCKED, // 9 + OPT_DONTWARMUP, //10 + OPT_PREWARMUPWAIT, //11 + OPT_VMARXFILTERCB, //12 + OPT_VMAZCOPYREAD, //13 + 
OPT_MC_LOOPBACK_DISABLE, //14 + OPT_CLIENT_WORK_WITH_SRV_NUM, //15 + OPT_FORCE_UC_REPLY, //16 + OPT_TTL //17 +}; + +#define SCALE_UP_1000(_n_) (((_n_) + 500) / 1000) +#define SCALE_DOWN_1000(_n_) ((_n_) * 1000) + + +#define NANO_TO_MICRO(n) SCALE_UP_1000(n) +#define MICRO_TO_NANO(n) SCALE_DOWN_1000(n) +#define MICRO_TO_SEC(n) SCALE_UP_1000( SCALE_UP_1000(n) ) +#define SEC_TO_MICRO(n) SCALE_DOWN_1000( SCALE_DOWN_1000(n) ) +#define SEC_TO_NANO(n) SCALE_DOWN_1000( SCALE_DOWN_1000( SCALE_DOWN_1000(n) ) ) + +#define TS_TO_NANO(x) (SEC_TO_NANO((long long)((x)->tv_sec)) + (long long)((x)->tv_nsec)) + +#define TIME_DIFF_in_NANO(start,end) (SEC_TO_NANO((end).tv_sec-(start).tv_sec) + \ + ((end).tv_nsec-(start).tv_nsec)) +#define TIME_DIFF_in_MICRO(start,end) (SEC_TO_MICRO((end).tv_sec-(start).tv_sec) + \ + (NANO_TO_MICRO((end).tv_nsec-(start).tv_nsec))) + + +#define MODULE_NAME "udp_lat: " +#define log_msg(log_fmt, log_args...) printf(MODULE_NAME log_fmt "\n", ##log_args) +#define log_err(log_fmt, log_args...) printf(MODULE_NAME "%d:ERROR: " log_fmt " (errno=%d %s)\n", __LINE__, ##log_args, errno, strerror(errno)) +#define log_dbg(log_fmt, log_args...) 
if (debug_level >= LOG_LVL_DEBUG) { printf(MODULE_NAME log_fmt "\n", ##log_args); } + +typedef enum { + MODE_CLIENT = 0, + MODE_SERVER, + MODE_BRIDGE +} work_mode_t; + +typedef enum { + RECVFROM = 0, + SELECT, + POLL, + EPOLL, + FD_HANDLE_MAX +} fd_block_handler_t; + +typedef enum { + LOG_LVL_INFO = 0, + LOG_LVL_DEBUG +} debug_level_t; + +int epfd; +bool b_exit = false; +struct sigaction sigact; +unsigned long long packet_counter = 0; + +unsigned long long cycle_counter = 0; +unsigned long long cycle_wait_loop_counter = 0; +unsigned long long cycle_start_time_nsec; + +double latency_usec_max = 0.0; +unsigned int packet_counter_at_max_latency = 0; + +struct { + unsigned int min_usec, count; +} latency_hist[] = { {0,0}, {3,0}, {5,0}, {7,0}, {10,0}, {15,0}, {20,0}, {50,0}, {100,0}, {200,0}, {500,0}, {1000,0}, {2000,0}, {5000,0}, {-1,0}}; +int latency_hist_size = (int)(sizeof(latency_hist)/sizeof(latency_hist[0])); + +struct timespec start_time, end_time; +struct timespec start_round_time, end_round_time; + +debug_level_t debug_level = LOG_LVL_INFO; +int fd_max = 0; +int fd_min = 0; /* used as THE fd when single mc group is given (RECVFROM blocked mode) */ +int fd_num = 0; +int *pid_arr = NULL; +fd_set readfds; +unsigned char *msgbuf = NULL; + +#ifdef USING_VMA_EXTRA_API +unsigned char* pkt_buf = NULL; +struct vma_packets_t* pkts = NULL; +#endif + +int max_buff_size = 0; +int vma_dgram_desc_size = 0; +unsigned char *pattern = NULL; +unsigned int data_integrity_failed = 0; +unsigned int duplicate_packets_counter = 0; +int sockets_num = 0; +int read_from_file = 0; +regex_t regexpr; +struct pollfd *poll_fd_arr = NULL; +struct epoll_event *epoll_events = NULL; +struct timeval curr_tv, last_tv; +unsigned long long last_packet_counter = 0; + +const char* fds_handle_desc[FD_HANDLE_MAX] = +{ + "recvfrom", + "select", + "poll", + "epoll" +}; + + +struct user_params_t { + work_mode_t mode; + struct in_addr rx_mc_if_addr; + struct in_addr tx_mc_if_addr; + int msg_size; + int 
msg_size_range; + int sec_test_duration; + bool data_integrity; + fd_block_handler_t fd_handler_type; + unsigned int packetrate_stats_print_ratio; + unsigned int burst_size; + bool packetrate_stats_print_details; + bool b_client_calc_details; + bool stream_mode; + int mthread_server; + struct timeval* select_timeout; + int udp_buff_size; + int threads_num; + bool is_blocked; + bool do_warmup; + unsigned int pre_warmup_wait; + bool is_vmarxfiltercb; + bool is_vmazcopyread; + unsigned long long cycle_duration_nsec; + bool mc_loop_disable; + int client_work_with_srv_num; + bool b_server_reply_via_uc; + int mc_ttl; + int enable_hw_time; +} user_params; + +typedef struct spike{ + double usec; + unsigned long long packet_counter_at_spike; + int next; + }spike; + +typedef struct static_lst{ + int head, tail; +}static_lst; + +typedef struct fds_data { + struct sockaddr_in addr; + int is_multicast; + int next_fd; +} fds_data; + +typedef union packet_rtt_data { + struct timespec start_round_time; + double rtt; +}packet_rtt_data; + +typedef struct sub_fds_arr_info { + int fd_min; + int fd_max; + int fd_num; +}sub_fds_arr_info; + +fds_data* fds_array[MAX_FDS_NUM]; +static_lst spikes_lst; +spike *spikes = NULL; +int max_spikes_num = 1; +int spikes_num = 0; +packet_rtt_data *rtt_data = NULL; +unsigned long long * packet_counter_arr = NULL; +int min_msg_size = MIN_PAYLOAD_SIZE; +int max_msg_size = MIN_PAYLOAD_SIZE; + +#ifdef USING_VMA_EXTRA_API +struct vma_api_t *vma_api; +#endif + +#define max(x,y) ({typeof(x) _x = (x); typeof(y) _y = (y); (void)(&_x == &_y); _x > _y ? _x : _y; }) +#define min(x,y) ({typeof(x) _x = (x); typeof(y) _y = (y); (void)(&_x == &_y); _x < _y ? 
_x : _y; }) + +static void usage(const char *argv0) +{ + printf("\nUdp Latency Test\n"); + printf("Usage:\n"); + printf("\t%s [OPTIONS]\n", argv0); + printf("\t%s -s\n", argv0); + printf("\t%s -s [-i ip] [-p port] [-m message_size] [--rx_mc_if ip] [--tx_mc_if ip]\n", argv0); + printf("\t%s -s -f file [-F s/p/e] [-m message_size] [--rx_mc_if ip] [--tx_mc_if ip]\n", argv0); + printf("\t%s -c -i ip [-p port] [-m message_size] [-t time] [--data_integrity] [-I 5]\n", argv0); + printf("\t%s -c -f file [-F s/p/e] [-m message_size] [-r msg_size_range] [-t time]\n", argv0); + printf("\t%s -B -i ip [-p port] [--rx_mc_if ip] [--tx_mc_if ip] [-A 10000]\n", argv0); + printf("\t%s -B -f file [-F s/p/e] [--rx_mc_if ip] [--tx_mc_if ip] [-a 10000]\n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -i, --ip=\t\t\tlisten on/send to ip \n"); + printf(" -p, --port=\t\tlisten on/connect to port (default %d)\n", DEFAULT_PORT); + printf(" -m, --msg_size=\t\tuse messages of size bytes (minimum default %d)\n", MIN_PAYLOAD_SIZE); + printf(" -f, --file=\t\tread multiple ip+port combinations from file (server uses select)\n"); + printf(" -F, --io_hanlder_type\t\ttype of multiple file descriptors handle [s|select|p|poll|e|epoll](default select)\n"); + printf(" -a, --activity=\t\tmeasure activity by printing a '.' 
for the last packets processed\n"); + printf(" -A, --Activity=\t\tmeasure activity by printing the duration for last packets processed\n"); + printf(" --rx_mc_if=\t\t address of interface on which to receive mulitcast packets (can be other then route table)\n"); + printf(" --tx_mc_if=\t\t address of interface on which to transmit mulitcast packets (can be other then route table)\n"); + printf(" --timeout=\t\tset select/poll/epoll timeout to , -1 for infinite (default is 10 msec)\n"); + printf(" --mc_loopback_disable\t\tdisables mc loopback (default enables).\n"); + printf(" --udp-buffer-size=\tset udp buffer size to bytes\n"); + printf(" --vmazcopyread\t\tIf possible use VMA's zero copy reads API (See VMA's readme)\n"); + printf(" --daemonize\t\t\trun as daemon\n"); + printf(" --nonblocked\t\t\topen non-blocked sockets\n"); + printf(" --dontwarmup\t\t\tdon't send warm up packets on start\n"); + printf(" --pre_warmup_wait\t\ttime to wait before sending warm up packets (seconds)\n"); + printf(" --mc-ttl\t\t\tlimit the lifetime of the packet (default 2)\n"); + printf(" -d, --debug\t\t\tprint extra debug information\n"); + printf(" -v, --version\t\t\tprint version\n"); + printf(" -h, --help\t\t\tprint this help message\n"); + printf("Server:\n"); + printf(" -T, --hw_timestamp\t\tenable hw_timestamp (default - no)\n"); + printf(" -s, --server\t\t\trun server (default - unicast)\n"); + printf(" -B, --Bridge\t\t\trun in Bridge mode\n"); + printf(" --threads-num=\t\trun threads on server side (requires '-f' option)\n"); + printf(" --vmarxfiltercb\t\tIf possible use VMA's receive path packet filter callback API (See VMA's readme)\n"); + printf(" --force_unicast_reply\t\tforce server to reply via unicast\n"); + printf("Client:\n"); + printf(" -c, --client\t\t\trun client\n"); + printf(" -t, --time=\t\trun for seconds (default %d, max = %d)\n", DEFAULT_TEST_DURATION, MAX_DURATION); + printf(" -b, --burst=\t\tcontrol the client's number of a packets sent in every burst\n"); + 
printf(" -r, --range=\t\tcomes with -m , randomly change the messages size in range: +- \n"); + printf(" -I, --information=\t\tcollect and print client side additional latency information including details about highest spikes\n"); + printf(" --data_integrity\t\tperform data integrity test\n"); + printf(" --cycle_duration=\tsets the client's send+receive cycle duration to at least \n"); + printf(" --srv_num=\t\t\tset num of servers the client works with to N\n"); + printf("\n"); +} + +void print_version() +{ +#ifdef VMA_LIBRARY_MAJOR + log_msg("Linked with VMA version: %d.%d.%d.%d", VMA_LIBRARY_MAJOR, VMA_LIBRARY_MINOR, VMA_LIBRARY_REVISION, VMA_LIBRARY_RELEASE); +#else + log_msg("No version info"); +#endif +#ifdef VMA_DATE_TIME + log_msg("Build Date: %s", VMA_DATE_TIME); +#endif +} + +void cleanup() +{ + if (user_params.fd_handler_type == RECVFROM) { + close(fd_min); + } + else { + int ifd; + for (ifd = 0; ifd <= fd_max; ifd++) { + if (fds_array[ifd]) { + close(ifd); + free(fds_array[ifd]); + } + } + + } + if(user_params.b_client_calc_details == true){ + free(packet_counter_arr); + free(rtt_data); + free(spikes); + } + if(user_params.select_timeout) { + free(user_params.select_timeout); + user_params.select_timeout = NULL; + } + if (msgbuf) { + free(msgbuf); + msgbuf = NULL; + } +#ifdef USING_VMA_EXTRA_API + if (pkt_buf) { + free(pkt_buf); + pkt_buf = NULL; + } +#endif + if (pattern) { + free(pattern); + pattern = NULL; + } + if (pid_arr) { + free(pid_arr); + } +} + +pid_t gettid(void) +{ + return syscall(__NR_gettid); +} + +void sig_handler(int signum) +{ + if (b_exit) { + log_msg("Test end (interrupted by signal %d)", signum); + return; + } + + // Just in case not Activity updates where logged add a '\n' + if (user_params.packetrate_stats_print_ratio && !user_params.packetrate_stats_print_details && + (user_params.packetrate_stats_print_ratio < packet_counter)) + printf("\n"); + + if (user_params.mthread_server) { + if (gettid() == pid_arr[0]) { //main thread + 
if (debug_level >= LOG_LVL_DEBUG) { + log_dbg("Main thread %d got signal %d - exiting",gettid(),signum); + } + else { + log_msg("Got signal %d - exiting", signum); + } + } + else { + log_dbg("Secondary thread %d got signal %d - exiting", gettid(),signum); + } + } + else { + switch (signum) { + case SIGINT: + log_msg("Test end (interrupted by user)"); + break; + default: + log_msg("Test end (interrupted by signal %d)", signum); + break; + } + } + + if (!packet_counter) { + log_msg("No messages were received on the server."); + } + else { + if (user_params.stream_mode) { + // Send only mode! + log_msg("Total of %lld messages received", packet_counter); + } + else { + // Default latency test mode + log_msg("Total %lld messages received and echoed back", packet_counter); + } + } + + b_exit = true; +} + +static inline void print_activity_info(unsigned long long counter) +{ + static int print_activity_info_header = 0; + + if (user_params.packetrate_stats_print_details) { + gettimeofday(&curr_tv, NULL); + if ((curr_tv.tv_sec - last_tv.tv_sec) < 3600) { + unsigned long long interval_usec = (curr_tv.tv_sec - last_tv.tv_sec)*1000000 + (curr_tv.tv_usec - last_tv.tv_usec); + if (interval_usec) { + unsigned long long interval_packet_rate = (1000000 * (unsigned long long)user_params.packetrate_stats_print_ratio) / interval_usec; + if (print_activity_info_header <= 0) { + print_activity_info_header = 20; + printf(" -- Interval -- -- Message Rate -- -- Total Message Count --\n"); + } + printf(" %10llu [usec] %10llu [msg/s] %13llu [msg]\n", + interval_usec, interval_packet_rate, counter); + print_activity_info_header--; + } + else { + printf("Interval: %8lld [usec]\n", interval_usec); + } + } + last_tv = curr_tv; + } + else { + printf("."); + } + fflush(stdout); +} +void print_average_latency(double usecAvarageLatency) +{ + if (user_params.burst_size == 1) { + log_msg("Summary: Latency is %.3lf usec", usecAvarageLatency); + } + else { + log_msg("Summary: Latency of burst of %d 
packets is %.3lf usec", user_params.burst_size, usecAvarageLatency); + } +} + +void print_histogram() +{ + int pos_usec = 0; + bool found_first_non_zero_latency_value = false; + + printf("Latency histogram [usec]: \n"); + for (pos_usec = latency_hist_size-1; pos_usec >= 0; pos_usec--) { + if (found_first_non_zero_latency_value == false && pos_usec > 0 && latency_hist[pos_usec-1].count > 0) + found_first_non_zero_latency_value = true; + if (found_first_non_zero_latency_value == true && pos_usec < latency_hist_size - 1) + printf ("\tmin_usec: %5d count: %d\n", latency_hist[pos_usec].min_usec, latency_hist[pos_usec].count); + } +} + +void print_spike_info(spike* spike) +{ + printf("\tspike: %6.3lf at packet counter: %lld\n", spike->usec, spike->packet_counter_at_spike); +} + +void print_spikes_list() +{ + int count = 1; + int curr = spikes_lst.head; + + printf("Spikes details [usec]: \n"); + while(curr != -1){ + //printf("%d ",count); + print_spike_info(&spikes[curr]); + curr = spikes[curr].next; + count++; + } +} + +void client_sig_handler(int signum) +{ + if (b_exit) { + log_msg("Test end (interrupted by signal %d)", signum); + return; + } + + // Just in case not Activity updates where logged add a '\n' + if (user_params.packetrate_stats_print_ratio && !user_params.packetrate_stats_print_details && + (user_params.packetrate_stats_print_ratio < packet_counter)) + printf("\n"); + + switch (signum) { + case SIGALRM: + log_msg("Test end (interrupted by timer)"); + break; + case SIGINT: + log_msg("Test end (interrupted by user)"); + break; + default: + log_msg("Test end (interrupted by signal %d)", signum); + break; + } + + if (clock_gettime(CLOCK_MONOTONIC, &end_time)) { + log_err("clock_gettime()"); + exit(1); + } + if (!packet_counter) { + if (user_params.stream_mode) { + log_msg("No messages were sent"); + } + else { + log_msg("No messages were received from the server. 
Is the server down?"); + } + } + else { + double usecTotalRunTime = TIME_DIFF_in_MICRO(start_time, end_time); + + if (user_params.stream_mode) { + // Send only mode! + printf(MODULE_NAME "Total of %lld messages sent in %.3lf sec", packet_counter, usecTotalRunTime/1000000); + if (cycle_counter != packet_counter) { + printf(", cycles counter = %lld\n", cycle_counter); + } + else { + printf("\n"); + } + if (usecTotalRunTime) { + int ip_frags_per_msg = (user_params.msg_size + DEFAULT_IP_PAYLOAD_SZ - 1) / DEFAULT_IP_PAYLOAD_SZ; + int mps = packet_counter / (unsigned long long)MICRO_TO_SEC(usecTotalRunTime); + int pps = mps * ip_frags_per_msg; + int total_line_ip_data = user_params.msg_size + ip_frags_per_msg*28; + double MBps = ((double)mps * total_line_ip_data)/1024/1024; /* No including IP + UDP Headers per fragment */ + if (ip_frags_per_msg == 1) + log_msg("Summary: Message Rate is %d [msg/sec]", mps); + else + log_msg("Summary: Message Rate is %d [msg/sec], Packet Rate is %d [pkt/sec] (%d ip frags / msg)", mps, pps, ip_frags_per_msg); + log_msg("Summary: BandWidth is %.3f MBps (%.3f Mbps)", MBps, MBps*8); + } + } + else { + if (duplicate_packets_counter) + log_msg("Warning: Mismatched packets counter = %d (Drops, Duplicates or Out of order)", duplicate_packets_counter); + + if (user_params.data_integrity) { + if (data_integrity_failed) + log_msg("Data integrity test failed!"); + else + log_msg("Data integrity test succeeded"); + } + + // Default latency test mode + double usecAvarageLatency = (usecTotalRunTime / (packet_counter * 2)) * user_params.burst_size; + log_msg("Total %lld messages sent in %.3lf sec", packet_counter, usecTotalRunTime/1000000); + print_average_latency(usecAvarageLatency); + if (user_params.b_client_calc_details) { + print_spikes_list(); + print_histogram(); + } + } + + if (user_params.cycle_duration_nsec > 0 && !cycle_wait_loop_counter) + log_msg("Warning: the value of the clients cycle duration might be too small (--cycle_duration=%lld 
usec)", + NANO_TO_MICRO(user_params.cycle_duration_nsec)); + } + + b_exit = true; +} + +/* set the timer on client to the [-t sec] parameter given by user */ +void set_client_timer(struct itimerval *timer) +{ + timer->it_value.tv_sec = user_params.sec_test_duration; + timer->it_value.tv_usec = 0; + timer->it_interval.tv_sec = 0; + timer->it_interval.tv_usec = 0; +} + +/* set the action taken when signal received */ +void set_signal_action() +{ + sigact.sa_handler = user_params.mode ? sig_handler : client_sig_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = 0; + + sigaction(SIGINT, &sigact, NULL); + + if (user_params.mode == MODE_CLIENT) + sigaction(SIGALRM, &sigact, NULL); +} + +/* set the timeout of select*/ +void set_select_timeout(int time_out_msec) +{ + if (!user_params.select_timeout) { + user_params.select_timeout = (struct timeval*)malloc(sizeof(struct timeval)); + if (!user_params.select_timeout) { + log_err("Failed to allocate memory for pointer select timeout structure"); + exit(1); + } + } + if (time_out_msec >= 0) { + // Update timeout + user_params.select_timeout->tv_sec = time_out_msec/1000; + user_params.select_timeout->tv_usec = 1000 * (time_out_msec - user_params.select_timeout->tv_sec*1000); + } + else { + // Clear timeout + free(user_params.select_timeout); + user_params.select_timeout = NULL; + } +} + +void set_defaults() +{ + memset(&user_params, 0, sizeof(struct user_params_t)); + memset(fds_array, 0, sizeof(fds_data*)*MAX_FDS_NUM); + user_params.rx_mc_if_addr.s_addr = htonl(INADDR_ANY); + user_params.tx_mc_if_addr.s_addr = htonl(INADDR_ANY); + user_params.sec_test_duration = DEFAULT_TEST_DURATION; + user_params.msg_size = MIN_PAYLOAD_SIZE; + user_params.mode = MODE_SERVER; + user_params.packetrate_stats_print_ratio = 0; + user_params.packetrate_stats_print_details = false; + user_params.burst_size = 1; + user_params.data_integrity = false; + user_params.b_client_calc_details = false; + user_params.fd_handler_type = RECVFROM; + 
user_params.stream_mode = false; + user_params.mthread_server = 0; + user_params.msg_size_range = 0; + user_params.udp_buff_size = UDP_BUFF_DEFAULT_SIZE; + set_select_timeout(DEFAULT_SELECT_TIMEOUT_MSEC); + last_tv.tv_sec = 0; last_tv.tv_usec = 0; + curr_tv.tv_sec = 0; curr_tv.tv_usec = 0; + user_params.threads_num = 1; + user_params.is_blocked = true; + user_params.do_warmup = true; + user_params.pre_warmup_wait = 0; + user_params.is_vmarxfiltercb = false; + user_params.is_vmazcopyread = false; + user_params.cycle_duration_nsec = 0; + debug_level = LOG_LVL_INFO; + user_params.mc_loop_disable = false; + user_params.client_work_with_srv_num = 1; + user_params.b_server_reply_via_uc = false; + user_params.mc_ttl = 2; +} + +/* write a pattern to buffer */ +void write_pattern(unsigned char *buf, int buf_size) +{ + int len = 0; + srand((unsigned)time(NULL)); + for (len = 0; len < buf_size; len++) + buf[len] = (char)(rand() % 128); +} + +/* returns 1 if buffers are identical */ +static inline int check_data_integrity(unsigned char *pattern_buf, size_t buf_size) +{ + /*static int to_print = 1; + if (to_print == 1) { + printf("%s\n", rcvd_buf); + to_print = 0; + }*/ +#ifdef USING_VMA_EXTRA_API + if (pkts && pkts->n_packet_num > 0) { + size_t i, pos, len; + struct vma_packet_t *pkt; + + pkt = &pkts->pkts[0]; + ((char*)pkt->iov[0].iov_base)[1] = CLIENT_MASK; /*match to client so data_integrity will pass*/ + + pos = 0; + for (i = 0; i < pkt->sz_iov; ++i) { + len = pkt->iov[i].iov_len; + + if (buf_size < pos + len || + memcmp((char*)pkt->iov[i].iov_base, + (char*)pattern_buf + pos, len)) { + return 0; + } + pos += len; + } + return pos == buf_size; + } else { + printf("pkts is NULL\n"); + } +#endif + msgbuf[1] = CLIENT_MASK; /*match to client so data_integrity will pass*/ + return !memcmp((char*)msgbuf,(char*)pattern_buf, buf_size); + +} +/* get IP:port pairs from the file and initialize the list */ +int set_mcgroups_fromfile(char *mcg_filename) +{ + FILE *file_fd = NULL; + 
char line[MAX_MCFILE_LINE_LENGTH]; + char *res; + char *ip; + char *port; + fds_data *tmp; + int curr_fd = 0, last_fd = 0; + int regexpres; + + if ((file_fd = fopen(mcg_filename, "r")) == NULL) { + printf("No such file: %s \n", mcg_filename); + exit(4); + } + + while ((res = fgets(line, MAX_MCFILE_LINE_LENGTH, file_fd))) { + if (!res) { + if (ferror(file_fd)) { + log_err("fread()"); + return -1; + } + else + return 0; /* encountered EOF */ + } + sockets_num++; + + regexpres = regcomp(®expr, IP_PORT_FORMAT_REG_EXP, REG_EXTENDED|REG_NOSUB); + if (regexpres) { + log_err("Failed to compile regexp"); + exit(1); + } + regexpres = regexec(®expr, line, (size_t)0, NULL, 0); + regfree(®expr); + if (regexpres) { + log_msg("Invalid input in line %d: " + "each line must have the following format: ip:port", + sockets_num); + exit(1); + } + + ip = strtok(line, ":"); + port = strtok(NULL, ":\n"); + if (!ip || !port) { + log_msg("Invalid input in line %d: " + "each line must have the following format: ip:port", + sockets_num); + exit(8); + } + tmp = (struct fds_data *)malloc(sizeof(struct fds_data)); + if (!tmp) { + log_err("Failed to allocate memory with malloc()"); + exit(1); + } + memset(tmp,0,sizeof(struct fds_data)); + tmp->addr.sin_family = AF_INET; + tmp->addr.sin_port = htons(atoi(port)); + if (!inet_aton(ip, &tmp->addr.sin_addr)) { + log_msg("Invalid input in line %d: '%s:%s'", sockets_num, ip, port); + exit(8); + } + tmp->is_multicast = IN_MULTICAST(ntohl(tmp->addr.sin_addr.s_addr)); + curr_fd = prepare_socket(&tmp->addr); + if (sockets_num != 1) { /*it is not the first fd*/ + fds_array[last_fd]->next_fd = curr_fd; + } + else { + fd_min = curr_fd; + } + last_fd = curr_fd; + fds_array[curr_fd] = tmp; + fd_max = max(fd_max, curr_fd); + fd_min = min(fd_min, curr_fd); + fd_num++; + } + + fds_array[fd_max]->next_fd = fd_min; /* close loop for fast wrap around in client */ + + fclose(file_fd); + return 0; +} + +#ifdef USING_VMA_EXTRA_API +extern vma_recv_callback_retval_t 
myapp_vma_recv_pkt_filter_callback(int fd, size_t iov_sz, struct iovec iov[], struct vma_info_t* vma_info, void *context); +#endif + +/* returns the new socket fd + or exit with error code */ +int prepare_socket(struct sockaddr_in* p_addr) +{ + int fd = -1; + int is_mulicast = 0; + u_int reuseaddr_true = 1; + struct sockaddr_in bind_addr; + int rcv_buff_size = 0; + int snd_buff_size = 0; + int size = sizeof(int); + int flags, ret; + int ttl; + is_mulicast = IN_MULTICAST(ntohl(p_addr->sin_addr.s_addr)); + + /* create a UDP socket */ + if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) { + log_err("socket(AF_INET, SOCK_DGRAM)"); + exit(1); + } + + if (!user_params.is_blocked) { + /*Uncomment to test FIONBIO command of ioctl + * int opt = 1; + * ioctl(fd, FIONBIO, &opt); + */ + + /* change socket to non-blocking */ + flags = fcntl(fd, F_GETFL); + if (flags < 0) { + log_err("fcntl(F_GETFL)"); + } + flags |= O_NONBLOCK; + ret = fcntl(fd, F_SETFL, flags); + if (ret < 0) { + log_err("fcntl(F_SETFL)"); + } + //log_msg("fd %d is non-blocked now", fd); + } + + /* allow multiple sockets to use the same PORT number */ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &reuseaddr_true, sizeof(reuseaddr_true)) < 0) { + log_err("setsockopt(SO_REUSEADDR) failed"); + exit(1); + } + /* set timestamping */ + if (user_params.enable_hw_time) { + int opt = (1<<2); + if (setsockopt(fd, SOL_SOCKET, SO_TIMESTAMPING, &opt, + sizeof(opt)) < 0) { + log_err("setsockopt(SO_TIMESTAMPING) failed"); + exit(1); + } + } + /* Set TTL */ + ttl = user_params.mc_ttl; + if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_TTL, (char*)&ttl, sizeof(ttl)) < 0) { + log_err("setsockopt(IP_MULTICAST_TTL) failed"); + exit(1); + } + + if (user_params.udp_buff_size > 0) { + /* enlarge socket's buffer depth */ + + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &(user_params.udp_buff_size), sizeof(user_params.udp_buff_size)) < 0) { + log_err("setsockopt(SO_RCVBUF) failed"); + exit(1); + } + if (getsockopt(fd, SOL_SOCKET, SO_RCVBUF, 
&rcv_buff_size,(socklen_t *)&size) < 0) { + log_err("getsockopt(SO_RCVBUF) failed"); + exit(1); + } + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &(user_params.udp_buff_size), sizeof(user_params.udp_buff_size)) < 0) { + log_err("setsockopt(SO_SNDBUF) failed"); + exit(1); + } + if (getsockopt(fd, SOL_SOCKET, SO_SNDBUF, &snd_buff_size, (socklen_t *)&size) < 0) { + log_err("getsockopt(SO_SNDBUF) failed"); + exit(1); + } + + log_msg("UDP buffers sizes of fd %d: RX: %d Byte, TX: %d Byte", fd, rcv_buff_size, snd_buff_size); + if (rcv_buff_size < user_params.udp_buff_size*2 || + snd_buff_size < user_params.udp_buff_size*2 ) { + log_msg("WARNING: Failed setting receive or send udp socket buffer size to %d bytes (check 'sysctl net.core.rmem_max' value)", user_params.udp_buff_size); + } + } + + memset(&bind_addr, 0, sizeof(struct sockaddr_in)); + bind_addr.sin_family = AF_INET; + bind_addr.sin_port = p_addr->sin_port; + bind_addr.sin_addr.s_addr = htonl(INADDR_ANY); + + /*log_dbg ("IP to bind: %s",inet_ntoa(client_addr.sin_addr));*/ + if (bind(fd, (struct sockaddr*)&bind_addr, sizeof(struct sockaddr)) < 0) { + log_err("bind()"); + exit(1); + } + + if (is_mulicast) { + struct ip_mreq mreq; + memset(&mreq,0,sizeof(struct ip_mreq)); + + /* use setsockopt() to request that the kernel join a multicast group */ + /* and specify a specific interface address on which to receive the packets of this socket */ + /* NOTE: we don't do this if case of client (sender) in stream mode */ + if (!user_params.stream_mode || user_params.mode != MODE_CLIENT) { + mreq.imr_multiaddr = p_addr->sin_addr; + mreq.imr_interface.s_addr = user_params.rx_mc_if_addr.s_addr; + if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { + log_err("setsockopt(IP_ADD_MEMBERSHIP)"); + exit(1); + } + } + + /* specify a specific interface address on which to transmitted the multicast packets of this socket */ + if (user_params.tx_mc_if_addr.s_addr != INADDR_ANY) { + if (setsockopt(fd, IPPROTO_IP, 
IP_MULTICAST_IF, &user_params.tx_mc_if_addr, sizeof(user_params.tx_mc_if_addr)) < 0) { + log_err("setsockopt(IP_MULTICAST_IF)"); + exit(1); + } + } + + if (user_params.mc_loop_disable) { + /* disable multicast loop of all transmitted packets */ + u_char loop_disabled = 0; + if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, &loop_disabled, sizeof(loop_disabled)) < 0) { + log_err("setsockopt(IP_MULTICAST_LOOP)"); + exit(1); + } + } + } + +#ifdef USING_VMA_EXTRA_API + if (user_params.is_vmarxfiltercb && vma_api) { + // Try to register application with VMA's special receive notification callback logic + if (vma_api->register_recv_callback(fd, myapp_vma_recv_pkt_filter_callback, &fd) < 0) { + log_err("vma_api->register_recv_callback failed. Try running without option 'vmarxfiltercb'"); + } + else { + log_dbg("vma_api->register_recv_callback successful registered"); + } + } +#endif + + return fd; +} + +void prepare_network(int fd_min, int fd_max, int fd_num,fd_set *p_s_readfds,struct pollfd **poll_fd_arr,struct epoll_event **epoll_events, int *epfd) +{ + int ifd, fd_count = 0; + struct epoll_event ev; + + if (read_from_file == 0) { + printf(" IP = %s PORT = %d\n", inet_ntoa(fds_array[fd_min]->addr.sin_addr), ntohs(fds_array[fd_min]->addr.sin_port)); + } + else { + int list_count = 0; + switch (user_params.fd_handler_type) { + case SELECT: + FD_ZERO(p_s_readfds); + break; + case POLL: + *poll_fd_arr = (struct pollfd *)malloc(fd_num * sizeof(struct pollfd)); + if (!*poll_fd_arr) { + log_err("Failed to allocate memory for poll fd array"); + exit(1); + } + break; + case EPOLL: + *epoll_events = (struct epoll_event *)malloc(sizeof(struct epoll_event)*fd_num); + if (!*epoll_events) { + log_err("Failed to allocate memory for epoll event array"); + exit(1); + } + break; + default: + break; + } + + printf("\n"); + for (ifd = fd_min; ifd <= fd_max; ifd++) { + if (fds_array[ifd]) { + printf("[%2d] IP = %-15s PORT = %5d\n", list_count++, inet_ntoa(fds_array[ifd]->addr.sin_addr), 
ntohs(fds_array[ifd]->addr.sin_port)); + + switch (user_params.fd_handler_type) { + case SELECT: + FD_SET(ifd, p_s_readfds); + break; + case POLL: + (*poll_fd_arr)[fd_count].fd = ifd; + (*poll_fd_arr)[fd_count].events = POLLIN | POLLPRI; + break; + case EPOLL: + ev.events = EPOLLIN | EPOLLPRI; + ev.data.fd = ifd; + epoll_ctl(*epfd, EPOLL_CTL_ADD, ev.data.fd, &ev); + break; + default: + break; + } + fd_count++; + } + } + } +} + +static inline int msg_recvfrom(int fd, struct sockaddr_in *recvfrom_addr) +{ + int ret = 0; + socklen_t size = sizeof(struct sockaddr); + + //log_msg("Calling recvfrom with FD %d", fd); + +#ifdef USING_VMA_EXTRA_API + + if (user_params.is_vmazcopyread && vma_api) { + int flags = 0; + + // Free VMA's previously received zero copied datagram + if (pkts) { + vma_api->free_packets(fd, pkts->pkts, pkts->n_packet_num); + pkts = NULL; + } + + // Receive the next datagram with zero copy API + ret = vma_api->recvfrom_zcopy(fd, pkt_buf, max_buff_size, + &flags, (struct sockaddr*)recvfrom_addr, &size); + if (ret >= 2) { + if (flags & MSG_VMA_ZCOPY) { + // zcopy + struct vma_packet_t *pkt; + + pkts = (struct vma_packets_t*)pkt_buf; + pkt = &pkts->pkts[0]; + // copy signature + msgbuf[0] = ((uint8_t*)pkt->iov[0].iov_base)[0]; + msgbuf[1] = ((uint8_t*)pkt->iov[0].iov_base)[1]; + } + else { + // copy signature + msgbuf[0] = pkt_buf[0]; + msgbuf[1] = pkt_buf[1]; + } + + } + } + else +#endif + { + ret = recvfrom(fd, msgbuf, user_params.msg_size, 0, (struct sockaddr*)recvfrom_addr, &size); + } + + if (ret < 2 && errno != EAGAIN && errno != EINTR) { + log_err("recvfrom() Failed receiving on fd[%d]", fd); + exit(1); + } + + //log_msg("Data received from fd=%d, ret=%d", fd, ret); + return ret; +} + +static inline int msg_sendto(int fd, uint8_t* buf, int nbytes, struct sockaddr_in *sendto_addr) +{ + //log_msg("Sending on fd[%d] to %s:%d msg size of %d bytes", fd, inet_ntoa(sendto_addr->sin_addr), ntohs(sendto_addr->sin_port), nbytes); + //int flags = 0x10; + int 
ret = sendto(fd, buf, nbytes, 0, (struct sockaddr*)sendto_addr, sizeof(struct sockaddr)); + if (ret < 0 && errno && errno != EINTR) { + log_err("sendto() Failed sending on fd[%d] to %s:%d msg size of %d bytes", fd, inet_ntoa(sendto_addr->sin_addr), ntohs(sendto_addr->sin_port), nbytes); + exit(1); + } + //log_msg("Done sending"); + return ret; +} + +void warmup() +{ + if (!user_params.do_warmup) + return; + + log_msg("Warmup stage (sending a few dummy packets)..."); + int ifd, count; + for (ifd = fd_min; ifd <= fd_max; ifd++) { + if (fds_array[ifd] && fds_array[ifd]->is_multicast) { + struct sockaddr_in sendto_addr = fds_array[ifd]->addr; + sendto_addr.sin_port = htons(DUMMY_PORT); + + for (count=0; count<2; count++) { + msg_sendto(ifd, pattern, user_params.msg_size, &sendto_addr); + } + } + } +} + +#ifdef USING_VMA_EXTRA_API +vma_recv_callback_retval_t myapp_vma_recv_pkt_filter_callback( + int fd, size_t iov_sz, struct iovec iov[], struct vma_info_t* vma_info, void *context) +{ + if (iov_sz) {}; + if (context) {}; + + // Check info structure version + if (vma_info->struct_sz < sizeof(struct vma_info_t)) { + log_msg("VMA's info struct is not something we can handle so un-register the application's callback function"); + vma_api->register_recv_callback(fd, NULL, &fd); + return VMA_PACKET_RECV; + } + + int recvsize = iov[0].iov_len; + uint8_t* recvbuf = iov[0].iov_base; + if (user_params.enable_hw_time && !vma_info->hw_timestamp.tv_sec) { + log_err("hw_timstamp is enables but hw not found"); + } + if (!user_params.enable_hw_time && vma_info->hw_timestamp.tv_sec) { + log_err("hw_timstamp is disabled but hw found"); + } +/* + if ("rule to check if packet should be dropped") + return VMA_PACKET_DROP; +*/ + +/* + if ("Do we support zero copy logic?") { + // Application must duplicate the iov' & 'vma_info' parameters for later usage + struct iovec* my_iov = calloc(iov_sz, sizeof(struct iovec)); + memcpy(my_iov, iov, sizeof(struct iovec)*iov_sz); + 
myapp_queue_task_new_rcv_pkt(my_iov, iov_sz, vma_info->pkt_desc_id); + return VMA_PACKET_HOLD; + } +*/ + /* This does the server_recev_then_send all in the VMA's callback */ + if (user_params.mode != MODE_BRIDGE) { + if (recvbuf[1] != CLIENT_MASK) { + if (user_params.mc_loop_disable) + log_err("got != CLIENT_MASK"); + return VMA_PACKET_DROP; + } + recvbuf[1] = SERVER_MASK; + } + if (!user_params.stream_mode) { + /* get source addr to reply to */ + struct sockaddr_in sendto_addr = fds_array[fd]->addr; + if (!fds_array[fd]->is_multicast) /* In unicast case reply to sender*/ + sendto_addr.sin_addr = vma_info->src->sin_addr; + msg_sendto(fd, recvbuf, recvsize, &sendto_addr); + } + + packet_counter++; + if ((user_params.packetrate_stats_print_ratio > 0) && ((packet_counter % user_params.packetrate_stats_print_ratio) == 0)) { + print_activity_info(packet_counter); + } + + return VMA_PACKET_DROP; +} +#endif + +/* +** Check that msg arrived from CLIENT and not a loop from a server +** Return 0 for successful +** -1 for failure +*/ +static inline int server_prepare_msg_reply() +{ + uint8_t* msg_mode_mask = NULL; + if (user_params.mode != MODE_BRIDGE) { + msg_mode_mask = &msgbuf[1]; + + if (*msg_mode_mask != CLIENT_MASK) + return -1; + + *msg_mode_mask = SERVER_MASK; + } + return 0; +} + +/* +** receive from and send to selected socket +*/ +static inline void server_receive_then_send(int ifd) +{ + int nbytes; + struct sockaddr_in recvfrom_addr; + struct sockaddr_in sendto_addr; + + nbytes = msg_recvfrom(ifd, &recvfrom_addr); + if (b_exit) return; + if (nbytes < 0) return; + + if (server_prepare_msg_reply()) return; + + if (!user_params.stream_mode) { + /* get source addr to reply to */ + sendto_addr = fds_array[ifd]->addr; + if (!fds_array[ifd]->is_multicast || user_params.b_server_reply_via_uc) {/* In unicast case reply to sender*/ + sendto_addr.sin_addr = recvfrom_addr.sin_addr; + } + msg_sendto(ifd, msgbuf, nbytes, &sendto_addr); + } + + packet_counter++; + if 
((user_params.packetrate_stats_print_ratio > 0) && ((packet_counter % user_params.packetrate_stats_print_ratio) == 0)) { + print_activity_info(packet_counter); + } + + return; +} + +void devide_fds_arr_between_threads(int *p_num_of_remainded_fds, int *p_fds_arr_len) { + + *p_num_of_remainded_fds = sockets_num%user_params.threads_num; + *p_fds_arr_len = sockets_num/user_params.threads_num; +} + +void find_min_max_fds(int start_look_from, int len, int* p_fd_min, int* p_fd_max) { + int num_of_detected_fds; + int i; + + for(num_of_detected_fds = 0, i = start_look_from; num_of_detected_fds < len;i++) { + if (fds_array[i]) { + if (!num_of_detected_fds) { + *p_fd_min = i; + } + num_of_detected_fds++; + } + } + *p_fd_max = i - 1; +} + +void server_handler(int fd_min, int fd_max, int fd_num) +{ + int epfd; + int save_timeout_sec = 0; + int save_timeout_usec = 0; + int timeout_msec = -1; + fd_set s_readfds, save_fds; + int res = 0; + int ifd, look_start = 0, look_end = 0; + int fd_handler_type = user_params.fd_handler_type; + char *to_array=(char *)malloc(20*sizeof(char)); + struct pollfd *poll_fd_arr = NULL; + struct epoll_event *epoll_events = NULL; + + log_dbg("thread %d: fd_min: %d, fd_max : %d, fd_num: %d", gettid(), fd_min, fd_max,fd_num); + + if (user_params.mode == MODE_BRIDGE) { + sprintf(to_array, "%s", inet_ntoa(user_params.tx_mc_if_addr)); + printf(MODULE_NAME "[BRIDGE] transferring packets from %s to %s on:", inet_ntoa(user_params.rx_mc_if_addr), to_array); + } + else { + printf(MODULE_NAME "[SERVER] listen on:"); + } + + if (fd_handler_type == EPOLL) { + epfd = epoll_create(fd_num); + } + prepare_network(fd_min, fd_max, fd_num, &s_readfds,&poll_fd_arr,&epoll_events,&epfd); + memcpy(&save_fds, &s_readfds, sizeof(fd_set)); + + sleep(user_params.pre_warmup_wait); + + warmup(); + + log_msg("[tid %d] using %s() to block on socket(s)", gettid(), fds_handle_desc[user_params.fd_handler_type]); + + switch (fd_handler_type) { + case RECVFROM: + res = 1; + look_start = 
		fd_min;
		look_end = fd_min+1;
		break;
	case SELECT:
		/* select() needs the original timeout saved - Linux mutates it in place. */
		if (user_params.select_timeout) {
			save_timeout_sec = user_params.select_timeout->tv_sec;
			save_timeout_usec = user_params.select_timeout->tv_usec;
		}
		look_start = fd_min;
		look_end = fd_max+1;
		break;
	case POLL:
		if (user_params.select_timeout) {
			timeout_msec = user_params.select_timeout->tv_sec * 1000 + user_params.select_timeout->tv_usec / 1000;
		}
		look_end = fd_num;
		break;
	case EPOLL:
		if (user_params.select_timeout) {
			timeout_msec = user_params.select_timeout->tv_sec * 1000 + user_params.select_timeout->tv_usec / 1000;
		}
		look_end = 0;	/* set per-iteration from epoll_wait()'s return value */
		break;
	}

	/*
	** SERVER LOOP
	*/
	while (!b_exit) {

		switch (fd_handler_type) {
		case RECVFROM:
			break;
		case SELECT:
			/* Restore both the timeout and the fd set - select() clobbers them. */
			if (user_params.select_timeout) {
				user_params.select_timeout->tv_sec = save_timeout_sec;
				user_params.select_timeout->tv_usec = save_timeout_usec;
			}
			memcpy(&s_readfds, &save_fds, sizeof(fd_set));
			res = select(fd_max+1, &s_readfds, NULL, NULL, user_params.select_timeout);
			break;
		case POLL:
			res = poll(poll_fd_arr, fd_num, timeout_msec);
			break;
		case EPOLL:
			res = epoll_wait(epfd, epoll_events, fd_num, timeout_msec);
			look_end = res;	/* epoll reports only ready fds; iterate exactly 'res' events */
			break;
		}

		if (b_exit) continue;
		if (res < 0) {
			log_err("%s()", fds_handle_desc[user_params.fd_handler_type]);
			exit(1);
		}
		if (res == 0) {
			/* Zero readiness without a timeout configured is unexpected. */
			if (!user_params.select_timeout)
				log_msg("Error: %s() returned without fd ready", fds_handle_desc[user_params.fd_handler_type]);
			continue;
		}

		/* Dispatch every ready socket to the receive-then-send path. */
		for (ifd = look_start; ifd < look_end; ifd++) {
			switch (fd_handler_type) {
			case RECVFROM:
				server_receive_then_send(ifd);
				break;
			case SELECT:
				if (FD_ISSET(ifd, &s_readfds)) {
					server_receive_then_send(ifd);
				}
				break;
			case POLL:
				if ((poll_fd_arr[ifd].revents & POLLIN) || (poll_fd_arr[ifd].revents & POLLPRI)) {
					server_receive_then_send(poll_fd_arr[ifd].fd);
				}
				break;
			case EPOLL:
				server_receive_then_send(epoll_events[ifd].data.fd);
				break;
			}
		}
	}
	/* NOTE(review): free(NULL) is a no-op, so this guard is redundant. */
	if (to_array != NULL) {
		free(to_array);
	}
	switch (user_params.fd_handler_type) {
	case POLL:
		if (poll_fd_arr) {
			free(poll_fd_arr);
		}
		break;
	case EPOLL:
		close(epfd);
		free(epoll_events);
		break;
	default:
		break;
	}

	log_dbg("thread %d released allocations",gettid());

	if (!user_params.mthread_server) {
		log_msg("%s() exit", __func__);
		cleanup();
	}
}

/*
** pthread entry point: unpacks the per-thread fd range and runs the
** regular server_handler() over it, then frees the heap-allocated argument.
*/
void *server_handler_for_multi_threaded(void *arg)
{
	int fd_min;
	int fd_max;
	int fd_num;
	sub_fds_arr_info *p_sub_fds_arr_info = (sub_fds_arr_info*)arg;

	fd_min = p_sub_fds_arr_info->fd_min;
	fd_max = p_sub_fds_arr_info->fd_max;
	fd_num = p_sub_fds_arr_info->fd_num;
	server_handler(fd_min, fd_max, fd_num);
	if (p_sub_fds_arr_info != NULL){
		free(p_sub_fds_arr_info);
	}
	return 0;
}

/*
** Multi-threaded server: partitions fds_array between threads_num worker
** threads, waits for shutdown (b_exit), then signals and joins each worker.
*/
void server_select_per_thread()
{
	int i;
	pthread_t tid;
	int fd_num;
	int num_of_remainded_fds;
	int last_fds = 0;

	pid_arr[0] = gettid();
	devide_fds_arr_between_threads(&num_of_remainded_fds, &fd_num);

	for (i = 0; i < user_params.threads_num; i++) {
		sub_fds_arr_info *thread_fds_arr_info = (sub_fds_arr_info*)malloc(sizeof(sub_fds_arr_info));
		if (!thread_fds_arr_info) {
			log_err("Failed to allocate memory for sub_fds_arr_info");
			exit(1);
		}
		thread_fds_arr_info->fd_num = fd_num;
		/* The first 'remainder' threads take one extra socket each. */
		if (num_of_remainded_fds) {
			thread_fds_arr_info->fd_num++;
			num_of_remainded_fds--;
		}
		find_min_max_fds(last_fds, thread_fds_arr_info->fd_num, &(thread_fds_arr_info->fd_min), &(thread_fds_arr_info->fd_max));
		/* NOTE(review): pthread_create()'s return value is unchecked, and the
		 * pthread_t handle is stored into pid_arr (declared as an int array
		 * elsewhere) - this may truncate on platforms where pthread_t is wider
		 * than int.  Confirm pid_arr's element type. */
		pthread_create(&tid, 0, server_handler_for_multi_threaded, (void *)thread_fds_arr_info);
		pid_arr[i + 1] = tid;
		last_fds = thread_fds_arr_info->fd_max + 1;
	}
	while (!b_exit) {
		sleep(1);
	}
	for (i = 1; i <= user_params.threads_num; i++) {
		pthread_kill(pid_arr[i], SIGINT);
		pthread_join(pid_arr[i], 0);
	}
	log_msg("%s() exit", __func__);
	cleanup();
}

/* Initialize the empty top-spikes list; head/tail of -1 mark "empty". */
void make_empty_spikes_list()
{
	spikes_lst.head = -1;
	spikes_lst.tail = -1;
	/* Backing array for the top-'max_spikes_num' latency spikes; nodes are
	 * chained through their integer 'next' index (see locate_node_in_spikes_list). */
	spikes=(spike *)malloc(max_spikes_num*sizeof(spike));
	if (!spikes) {
		log_err("Failed to allocate memory for list of highest spikes");
		exit(1);
	}
}

/* Link 'node_index' in front of the current head (new smallest spike). */
static inline void insert_node_to_begining_of_spikes_lst(unsigned int node_index)
{
	spikes[node_index].next = spikes_lst.head;
	spikes_lst.head = node_index;
}

/* Append 'node_index' after the current tail (new largest spike). */
static inline void insert_node_to_end_of_spikes_lst(unsigned int node_index)
{
	spikes[spikes_lst.tail].next = node_index;
	spikes[node_index].next = -1;
	spikes_lst.tail = node_index;
}

/* Splice 'node_index' between 'prev_node' and its successor. */
static inline void insert_node_to_middle_of_spikes_lst(unsigned int prev_node, unsigned int node_index)
{
	int temp = spikes[prev_node].next;
	spikes[prev_node].next = node_index;
	spikes[node_index].next = temp;
}

/* The list is empty iff both head and tail hold the -1 sentinel. */
static inline int is_spikes_list_empty()
{
	return (spikes_lst.head == -1 && spikes_lst.tail == -1);
}

/* Unlink the head node (the smallest tracked spike); no-op on an empty list. */
static inline void delete_node_from_spikes_lst_head()
{
	int temp;
	if(!is_spikes_list_empty()){
		if(spikes_lst.head == spikes_lst.tail){ //list of one node
			spikes_lst.head = -1;
			spikes_lst.tail = -1;
		}
		else{
			temp = spikes_lst.head;
			spikes_lst.head = spikes[spikes_lst.head].next;
			spikes[temp].next = -1;
		}
	}
}

/* First insertion into an empty list: node becomes both head and tail. */
static inline void insert_node_2_empty_spikes_list(unsigned int node_index)
{
	spikes_lst.head = node_index;
	spikes_lst.tail = node_index;
}

/* Overwrite slot 'location' with a fresh spike sample (not yet linked). */
static inline void insert_new_spike_vals_2_list_node(unsigned int location, unsigned int usec, unsigned long long packet_counter)
{
	spikes[location].packet_counter_at_spike = packet_counter;
	spikes[location].usec = usec;
	spikes[location].next = -1;
}

/*
** Insert slot 'node_index' into the list, which is kept sorted by 'usec'
** in ascending order (head = smallest spike, tail = largest).
*/
static inline void locate_node_in_spikes_list(unsigned int node_index)
{
	int usec_val = spikes[node_index].usec;
	int prev_node = spikes_lst.head;
	int curr_node = spikes_lst.head;

	if (is_spikes_list_empty()){
		insert_node_2_empty_spikes_list(node_index);
	}
	else{
		/* Walk to the first node whose usec is >= the new value. */
		while(curr_node != -1 ){
			if(usec_val > spikes[curr_node].usec){
				prev_node = curr_node;
				curr_node = spikes[curr_node].next;
			}
			else{
				break;
			}
		}

		if(curr_node == spikes_lst.head){
			insert_node_to_begining_of_spikes_lst(node_index);
		}
		else if(curr_node == -1){
			insert_node_to_end_of_spikes_lst(node_index);
		}
		else{
			insert_node_to_middle_of_spikes_lst(prev_node, node_index);
		}
	}
}

/*
** Track the top-'max_spikes_num' spikes: fill unused slots first; once full,
** recycle the head (smallest) slot whenever the new sample beats it.
*/
static inline void update_spikes_list(unsigned int usec, unsigned long long packet_counter)
{
	unsigned int location;

	if(spikes_num < max_spikes_num){
		location = spikes_num;
		spikes_num++;
	}
	else{
		/* List full: ignore the sample unless it exceeds the current minimum. */
		if(usec <= spikes[spikes_lst.head].usec)
			return;
		else{
			location = spikes_lst.head;
			delete_node_from_spikes_lst_head();
		}
	}
	insert_new_spike_vals_2_list_node(location,usec, packet_counter);
	locate_node_in_spikes_list(location);
}

/*
** Record one round-trip sample: half the RTT (the one-way latency, truncated
** to whole microseconds by the unsigned parameter) feeds the spikes list and
** the latency histogram.
** NOTE(review): a sample >= the last bucket's min_usec falls out of the loop
** without being counted anywhere - confirm the last histogram bucket is an
** open-ended sentinel, otherwise the largest latencies are silently dropped.
*/
static inline void calc_round_trip_times_details(double i_rtt, unsigned long long i_packet_counter)
{
	int pos_usec = 0;

	double latency_usec = i_rtt / 2;
	update_spikes_list(latency_usec, i_packet_counter);
	for (pos_usec = 1; pos_usec < latency_hist_size; pos_usec++) {
		if (latency_usec < latency_hist[pos_usec].min_usec) {
			latency_hist[pos_usec-1].count++;
			break;
		}
	}
}

/*
** Drain 'client_work_with_srv_num' valid replies from socket 'ifd',
** discarding loopbacks, duplicates and wrong-size datagrams; when detailed
** stats are enabled, timestamps the round trip into rtt_data[time_stamp_index].
*/
static inline void client_receive_from_selected(int ifd, int time_stamp_index)
{
	int i;

	for(i = 0; i < user_params.client_work_with_srv_num; i++) {
		int nbytes = 0;
		int recived_legal_message = 0;
		struct sockaddr_in recvfrom_addr;

		do {
			if (b_exit) return;

			nbytes = msg_recvfrom(ifd, &recvfrom_addr);
			if (b_exit) return;
			if (nbytes < 0) continue;

			/* log_dbg("Received from: FD = %d; IP = %s; PORT = %d", ifd, inet_ntoa(recvfrom_addr.sin_addr), ntohs(recvfrom_addr.sin_port));*/
			if (nbytes != user_params.msg_size) {
				log_msg("received message size test failed (sent:%d received:%d)", user_params.msg_size, nbytes);
				exit(16);
			}

			/* Not stamped by a server - this is our own transmit looped back. */
			if (msgbuf[1] != SERVER_MASK) {
				if (user_params.mc_loop_disable)
					log_err("got != SERVER_MASK");
				continue;
			}

			/* Sequence byte mismatch - stale or duplicate reply. */
			if (pattern[0] != msgbuf[0]){
//log_dbg("duplicate message recieved expected=%d, recieved=%d",pattern[0], msgbuf[0] ); + duplicate_packets_counter++; + continue; + } + + recived_legal_message = 1; + + } while (recived_legal_message == 0); + + if (user_params.data_integrity && !check_data_integrity(pattern, user_params.msg_size)) { + data_integrity_failed = 1; + log_msg("data integrity test failed"); + exit(16); + } + } + if (user_params.b_client_calc_details) { + if (clock_gettime(CLOCK_MONOTONIC, &end_round_time)) { + log_err("clock_gettime()"); + exit(1); + } + rtt_data[time_stamp_index].rtt = TIME_DIFF_in_MICRO(rtt_data[time_stamp_index].start_round_time, end_round_time); + } +} + +static inline void client_inc_sequnce_counter() +{ + if (pattern[0] == 0xff) + pattern[0] = 0; + pattern[0]++; +} + +static inline void client_update_counters(unsigned int* p_recived_packets_num, int packet_cnt_index) +{ + int recived_packets_num = *p_recived_packets_num; + recived_packets_num++; + packet_counter++; + if (packet_counter >= (UINT64_MAX-1000000)) { + log_err("Error: counter overflow"); + } + client_inc_sequnce_counter(); + *p_recived_packets_num = recived_packets_num; + if (user_params.b_client_calc_details) { + packet_counter_arr[packet_cnt_index] = packet_counter; + } +} + +static inline void client_send_packet(int ifd, int i) +{ + struct timespec start_round_time; + + if (user_params.b_client_calc_details){ + if (clock_gettime(CLOCK_MONOTONIC, &start_round_time)) { + log_err("clock_gettime()"); + exit(1); + } + rtt_data[i].start_round_time = start_round_time; + } + msg_sendto(ifd, pattern, user_params.msg_size, &(fds_array[ifd]->addr)); +} + +static inline unsigned int client_recieve(int ifd, int time_stamp_index) +{ + fd_set s_readfds; + int res = 0; + struct timeval timeout_timeval = {0, 0}; + struct timeval* p_timeout_timeval = NULL; + int timeout_msec = -1; + int look_start = 0, look_end = 0; + int fd_handler_type = user_params.fd_handler_type; + int ready_fds_found = 0; + int fd = 0; + 
unsigned int recived_packets_num = 0; + + switch (fd_handler_type) { + case RECVFROM: /*recive only from the file descriptor which function got as input parameter : ifd */ + res = 1; + look_start = ifd; + look_end = ifd+1; + break; + case SELECT: + if (user_params.select_timeout) { + p_timeout_timeval = &timeout_timeval; + } + look_start = fd_min; + look_end = fd_max+1; + break; + case POLL: + if (user_params.select_timeout) { + timeout_msec = user_params.select_timeout->tv_sec * 1000 + user_params.select_timeout->tv_usec / 1000; + } + look_end = fd_num; + break; + case EPOLL: + if (user_params.select_timeout) { + timeout_msec = user_params.select_timeout->tv_sec * 1000 + user_params.select_timeout->tv_usec / 1000; + } + look_end = 0; + break; + } + + if ((user_params.packetrate_stats_print_ratio > 0) && ((packet_counter % user_params.packetrate_stats_print_ratio) == 0)) { + print_activity_info(packet_counter); + } + + do { + switch (fd_handler_type) { + case RECVFROM: + ready_fds_found = 1; + continue; + break; + case SELECT: + if (user_params.select_timeout) { + timeout_timeval = *user_params.select_timeout; + } + memcpy(&s_readfds, &readfds, sizeof(fd_set)); + res = select(fd_max+1, &s_readfds, NULL, NULL, p_timeout_timeval); + break; + case POLL: + res = poll(poll_fd_arr, fd_num, timeout_msec); + break; + case EPOLL: + res = epoll_wait(epfd, epoll_events, fd_num, timeout_msec); + look_end = res; + break; + } + + if (b_exit) return recived_packets_num; + if (res < 0) { + log_err("%s()", fds_handle_desc[user_params.fd_handler_type]); + exit(1); + } + if (res == 0) { + if (!user_params.select_timeout) + log_msg("Error: %s() returned without fd ready", fds_handle_desc[user_params.fd_handler_type]); + continue; + } + ready_fds_found = 1; + + } while (ready_fds_found == 0); + + /* ready fds were found so receive from the relevant sockets*/ + + for (fd = look_start; fd < look_end; fd++) { + switch (fd_handler_type) { + case RECVFROM: + client_receive_from_selected(fd, 
time_stamp_index); + client_update_counters(&recived_packets_num,time_stamp_index); + break; + case SELECT: + if (FD_ISSET(fd, &s_readfds)) { + client_receive_from_selected(fd, time_stamp_index); + client_update_counters(&recived_packets_num,time_stamp_index); + } + break; + case POLL: + if ((poll_fd_arr[fd].revents & POLLIN) || (poll_fd_arr[fd].revents & POLLPRI)) { + client_receive_from_selected(poll_fd_arr[fd].fd, time_stamp_index); + client_update_counters(&recived_packets_num,time_stamp_index); + } + break; + case EPOLL: + client_receive_from_selected(epoll_events[fd].data.fd, time_stamp_index); + client_update_counters(&recived_packets_num,time_stamp_index); + break; + } + } + return recived_packets_num; +} + +static inline void client_update_msg_size() +{ + if (user_params.msg_size_range > 0) { + user_params.msg_size = min(MAX_PAYLOAD_SIZE, (min_msg_size + (int)(rand() % (user_params.msg_size_range)))); + //log_dbg("sending message size: %d",user_params.msg_size); + } +} + +static inline void calc_round_trip_times_details_of_burst() +{ + unsigned int i; + for (i = 0; i < user_params.burst_size; i++) { + calc_round_trip_times_details(rtt_data[i].rtt, packet_counter_arr[i]); + } +} + + +/* +** busy wait between two cycles starting point and take starting point of next cycle +*/ +static inline void cycle_duration_wait() +{ + long long delta; + struct timespec et; + + while (!b_exit) { + if (clock_gettime(CLOCK_MONOTONIC, &et) < 0) + log_err("Error: clock_gettime failed"); + + //delta = (long long)(et.tv_sec) * 1000000000 + (long long)(et.tv_nsec) - cycle_start_time_nsec - user_params.cycle_duration_nsec; + delta = TS_TO_NANO(&et) - cycle_start_time_nsec - user_params.cycle_duration_nsec; + if (delta >= 0) { + /*long long int end_of_cycle = TS_TO_NANO(&et); + log_msg("end of cycle #%lld %llu\n",cycle_counter,end_of_cycle); */ + break; + } + cycle_wait_loop_counter++; + if (!cycle_wait_loop_counter) + log_err("Error: cycle_wait_loop_counter overflow"); + } + 
cycle_start_time_nsec += user_params.cycle_duration_nsec; + //log_msg("start of cycle #%lld %llu\n",cycle_counter,cycle_start_time_nsec); +} + +/* +** send to and recive from selected socket +*/ +static inline void client_send_then_receive(int ifd) +{ + int starting_pattern_val = 0; + unsigned int i; + + client_update_msg_size(); + client_inc_sequnce_counter(); + starting_pattern_val = pattern[0]; + + if (user_params.cycle_duration_nsec) + cycle_duration_wait(); + + if (user_params.stream_mode) + ++cycle_counter; + + for (i = 0; i < user_params.burst_size && !b_exit; i++) { + if (user_params.stream_mode) + ++packet_counter; + + //log_msg("%s() sending to FD %d", __func__, ifd); + client_send_packet(ifd, i); + client_inc_sequnce_counter(); + } + if (user_params.stream_mode) { + if ((user_params.packetrate_stats_print_ratio > 0) && ((packet_counter % user_params.packetrate_stats_print_ratio) == 0)) { + print_activity_info(packet_counter); + } + return; + } + + pattern[0] = starting_pattern_val; + + for (i = 0; i < user_params.burst_size && !b_exit; ) { + //log_msg("%s() Done sending, Waiting to receive from FD %d", __func__, ifd); + i += client_recieve(ifd, i); + } + if (user_params.b_client_calc_details) { + calc_round_trip_times_details_of_burst(); + } + + cycle_counter++; + return; +} + +void client_handler() +{ + int ret; + struct itimerval timer; + int curr_fds; + fds_data *tmp; + + + printf(MODULE_NAME "[CLIENT] send on:"); + prepare_network(fd_min, fd_max, fd_num, &readfds,&poll_fd_arr,&epoll_events,&epfd); + if (b_exit) return; + + sleep(user_params.pre_warmup_wait); + warmup(); + + if (!user_params.stream_mode) { + log_msg("using %s() to block on socket(s)", fds_handle_desc[user_params.fd_handler_type]); + } + + sleep(2); + if (b_exit) return; + + log_msg("Starting test..."); + + gettimeofday(&last_tv, NULL); + set_client_timer(&timer); + ret = setitimer(ITIMER_REAL, &timer, NULL); + if (ret) { + log_err("setitimer()"); + exit(1); + } + + if 
(clock_gettime(CLOCK_MONOTONIC, &start_time)) { + log_err("clock_gettime()"); + exit(1); + } + start_round_time = start_time; + + struct timespec et; + if (clock_gettime(CLOCK_MONOTONIC, &et) < 0) + log_err("Error: clock_gettime failed"); + cycle_start_time_nsec = TS_TO_NANO(&et) - user_params.cycle_duration_nsec; + /* log_msg("%s() using %s", __func__, user_params.fds_handle_desc);*/ + + curr_fds = fd_min; + while (!b_exit) { + tmp = fds_array[curr_fds]; + client_send_then_receive(curr_fds); + curr_fds = tmp->next_fd; /* cycle through all set fds in the array (with wrap around to beginning)*/ + } + cleanup(); + return; +} + +void update_min_max_msg_sizes() +{ + min_msg_size = max(MIN_PAYLOAD_SIZE, user_params.msg_size - user_params.msg_size_range); + max_msg_size = min(MAX_PAYLOAD_SIZE, user_params.msg_size + user_params.msg_size_range); + user_params.msg_size_range = max_msg_size - min_msg_size + 1; + log_msg("Message size range: [%d - %d]", min_msg_size, max_msg_size); +} + +void prepare_to_info_mode() +{ + if (rtt_data) { + free(rtt_data); + rtt_data = NULL; + } + if (packet_counter_arr) { + free(packet_counter_arr); + packet_counter_arr = NULL; + } + + rtt_data = (packet_rtt_data*)malloc(sizeof(packet_rtt_data)*user_params.burst_size); + if(!rtt_data) { + log_err("Failed to allocate memory for timestamps array"); + exit(1); + } + packet_counter_arr = (unsigned long long *)malloc(sizeof(double)*user_params.burst_size); + if(!packet_counter_arr) { + log_err("Failed to allocate memory for timestamps array"); + exit(1); + } +} + +int main(int argc, char *argv[]) +{ + int daemonize = false; + struct sockaddr_in addr; + char mcg_filename[MAX_PATH_LENGTH]; + char fd_handle_type[MAX_ARGV_SIZE]; + if (argc == 1){ + usage(argv[0]); + return 1; + } + /* set default values */ + set_defaults(); + mcg_filename[0] = '\0'; + addr.sin_family = AF_INET; + addr.sin_port = htons(DEFAULT_PORT); + inet_aton(DEFAULT_MC_ADDR, &addr.sin_addr); + + /* Parse the parameters */ + while 
(1) { + int c = 0; + static struct option long_options[] = { + {.name = "client", .has_arg = 0, .val = 'c'}, + {.name = "server", .has_arg = 0, .val = 's'}, + {.name = "bridge", .has_arg = 0, .val = 'B'}, + {.name = "ip", .has_arg = 1, .val = 'i'}, + {.name = "port", .has_arg = 1, .val = 'p'}, + {.name = "msg_size", .has_arg = 1, .val = 'm'}, + {.name = "range", .has_arg = 1, .val = 'r'}, + {.name = "burst", .has_arg = 1, .val = 'b'}, + {.name = "time", .has_arg = 1, .val = 't'}, + {.name = "file", .has_arg = 1, .val = 'f'}, + {.name = "fd_hanlder_type", .has_arg = 1, .val = 'F'}, + {.name = "information", .has_arg = 1, .val = 'I'}, + {.name = "streammode", .has_arg = 0, .val = 'k'}, + {.name = "activity", .has_arg = 0, .val = 'a'}, + {.name = "Activity", .has_arg = 0, .val = 'A'}, + {.name = "rx_mc_if", .has_arg = 1, .val = OPT_RX_MC_IF }, + {.name = "tx_mc_if", .has_arg = 1, .val = OPT_TX_MC_IF }, + {.name = "timeout", .has_arg = 1, .val = OPT_SELECT_TIMEOUT }, + {.name = "threads-num", .has_arg = 1, .val = OPT_MULTI_THREADED_SERVER }, + {.name = "cycle_duration", .has_arg = 1, .val = OPT_CLIENT_CYCLE_DURATION }, + {.name = "udp-buffer-size", .has_arg = 1, .val = OPT_UDP_BUFFER_SIZE }, + {.name = "data_integrity", .has_arg = 0, .val = OPT_DATA_INTEGRITY }, + {.name = "daemonize", .has_arg = 0, .val = OPT_DAEMONIZE }, + {.name = "nonblocked", .has_arg = 0, .val = OPT_NONBLOCKED }, + {.name = "dontwarmup", .has_arg = 0, .val = OPT_DONTWARMUP }, + {.name = "pre_warmup_wait", .has_arg = 1, .val = OPT_PREWARMUPWAIT }, + {.name = "vmarxfiltercb", .has_arg = 0, .val = OPT_VMARXFILTERCB }, + {.name = "vmazcopyread", .has_arg = 0, .val = OPT_VMAZCOPYREAD }, + {.name = "mc_loopback_disable", .has_arg = 0, .val = OPT_MC_LOOPBACK_DISABLE }, + {.name = "srv_num", .has_arg = 1, .val = OPT_CLIENT_WORK_WITH_SRV_NUM }, + {.name = "force_unicast_reply", .has_arg = 0, .val = OPT_FORCE_UC_REPLY}, + {.name = "mc-ttl", .has_arg = 1, .val = OPT_TTL}, + {.name = "hw_timestamp", .has_arg 
= 0, .val = 'T'}, + {.name = "version", .has_arg = 0, .val = 'v'}, + {.name = "debug", .has_arg = 0, .val = 'd'}, + {.name = "help", .has_arg = 0, .val = 'h'}, + {0,0,0,0} + }; + + if ((c = getopt_long(argc, argv, "csBdi:p:t:f:F:m:r:b:x:g:a:A:kTI:vh?", long_options, NULL)) == -1) + break; + + switch (c) { + case 'c': + user_params.mode = MODE_CLIENT; + break; + case 's': + user_params.mode = MODE_SERVER; + user_params.msg_size = MAX_PAYLOAD_SIZE; + break; + case 'B': + log_msg("update to bridge mode"); + user_params.mode = MODE_BRIDGE; + user_params.msg_size = MAX_PAYLOAD_SIZE; + addr.sin_port = htons(5001); /*iperf's default port*/ + break; + case 'T': + log_msg("enabling hw_timstamp"); + user_params.enable_hw_time = 1; + break; + case 'i': + if (!inet_aton(optarg, &addr.sin_addr)) { /* already in network byte order*/ + log_msg("'-%c' Invalid address: %s", c, optarg); + usage(argv[0]); + return 1; + } + user_params.fd_handler_type = RECVFROM; + break; + case 'p': + { + errno = 0; + long mc_dest_port = strtol(optarg, NULL, 0); + /* strtol() returns 0 if there were no digits at all */ + if (errno != 0) { + log_msg("'-%c' Invalid port: %d", c, (int)mc_dest_port); + usage(argv[0]); + return 1; + } + addr.sin_port = htons((uint16_t)mc_dest_port); + user_params.fd_handler_type = RECVFROM; + } + break; + case 'm': + user_params.msg_size = strtol(optarg, NULL, 0); + if (user_params.msg_size < MIN_PAYLOAD_SIZE) { + log_msg("'-%c' Invalid message size: %d (min: %d)", c, user_params.msg_size, MIN_PAYLOAD_SIZE); + usage(argv[0]); + return 1; + } + break; + case 'r': + errno = 0; + int range = strtol(optarg, NULL, 0); + if (errno != 0 || range < 0) { + log_msg("'-%c' Invalid message range: %s", c,optarg); + usage(argv[0]); + return 1; + } + user_params.msg_size_range = range; + break; + case 'b': + errno = 0; + int burst_size = strtol(optarg, NULL, 0); + if (errno != 0 || burst_size < 1) { + log_msg("'-%c' Invalid burst size: %s", c, optarg); + usage(argv[0]); + return 1; + } 
+ user_params.burst_size = burst_size; + break; + case 't': + user_params.sec_test_duration = strtol(optarg, NULL, 0); + if (user_params.sec_test_duration <= 0 || user_params.sec_test_duration > MAX_DURATION) { + log_msg("'-%c' Invalid duration: %d", c, user_params.sec_test_duration); + usage(argv[0]); + return 1; + } + break; + case 'f': + strncpy(mcg_filename, optarg, MAX_ARGV_SIZE); + mcg_filename[MAX_PATH_LENGTH - 1] = '\0'; + read_from_file = 1; + break; + case 'F': + strncpy(fd_handle_type, optarg, MAX_ARGV_SIZE); + fd_handle_type[MAX_ARGV_SIZE - 1] = '\0'; + if (!strcmp( fd_handle_type, "epoll" ) || !strcmp( fd_handle_type, "e")) { + user_params.fd_handler_type = EPOLL; + } + else if (!strcmp( fd_handle_type, "poll" )|| !strcmp( fd_handle_type, "p")) { + user_params.fd_handler_type = POLL; + } + else if (!strcmp( fd_handle_type, "select" ) || !strcmp( fd_handle_type, "s")) { + user_params.fd_handler_type = SELECT; + } + else { + log_msg("'-%c' Invalid muliply io hanlde type: %s", c, optarg); + usage(argv[0]); + return 1; + } + break; + case 'd': + debug_level = LOG_LVL_DEBUG; + break; + case OPT_RX_MC_IF: + if ((user_params.rx_mc_if_addr.s_addr = inet_addr(optarg)) == INADDR_NONE) { /* already in network byte order*/ + log_msg("'-%c' Invalid address: %s", c, optarg); + usage(argv[0]); + return 1; + } + break; + case OPT_TX_MC_IF: + if ((user_params.tx_mc_if_addr.s_addr = inet_addr(optarg)) == INADDR_NONE) { /* already in network byte order*/ + log_msg("'-%c' Invalid address: %s", c, optarg); + usage(argv[0]); + return 1; + } + break; + case OPT_SELECT_TIMEOUT: + { + errno = 0; + int timeout = strtol(optarg, NULL, 0); + if (errno != 0 || timeout < -1) { + log_msg("'-%c' Invalid select/poll/epoll timeout val: %s", c,optarg); + usage(argv[0]); + return 1; + } + set_select_timeout(timeout); + } + break; + case OPT_MULTI_THREADED_SERVER: + { + user_params.mthread_server = 1; + errno = 0; + int threads_num = strtol(optarg, NULL, 0); + if (errno != 0 || threads_num 
< 0) { + log_msg("-%c' Invalid threads number: %s", c,optarg); + usage(argv[0]); + return 1; + } + user_params.threads_num = threads_num; + } + break; + case OPT_CLIENT_CYCLE_DURATION: + errno = 0; + long long time_interval = strtol(optarg, NULL, 0); + if (errno != 0 || time_interval < -1) { + log_msg("'-%c' Invalid duration val: %s", c, optarg); + usage(argv[0]); + return 1; + } + user_params.cycle_duration_nsec = MICRO_TO_NANO(time_interval); + break; + case OPT_UDP_BUFFER_SIZE: + { + errno = 0; + int udp_buff_size = strtol(optarg, NULL, 0); + if (errno != 0 || udp_buff_size <= 0) { + log_msg("'-%c' Invalid udp buffer size: %s", c,optarg); + usage(argv[0]); + return 1; + } + user_params.udp_buff_size = udp_buff_size; + } + break; + case OPT_DATA_INTEGRITY: + user_params.data_integrity = true; + break; + case OPT_DAEMONIZE: + daemonize = true; + break; + case OPT_NONBLOCKED: + user_params.is_blocked = false; + break; + case OPT_DONTWARMUP: + user_params.do_warmup = false; + break; + case OPT_PREWARMUPWAIT: + errno = 0; + int pre_warmup_wait = strtol(optarg, NULL, 0); + if (errno != 0 || pre_warmup_wait <= 0) { + log_msg("'-%c' Invalid pre warmup wait: %s", c,optarg); + usage(argv[0]); + return 1; + } + user_params.pre_warmup_wait = pre_warmup_wait; + break; + case OPT_VMARXFILTERCB: + user_params.is_vmarxfiltercb = true; + break; + case OPT_VMAZCOPYREAD: + user_params.is_vmazcopyread = true; + break; + case OPT_MC_LOOPBACK_DISABLE: + user_params.mc_loop_disable = true; + break; + case OPT_CLIENT_WORK_WITH_SRV_NUM: + { + errno = 0; + int srv_num = strtol(optarg, NULL, 0); + if (errno != 0 || srv_num < 1) { + log_msg("'-%c' Invalid server num val: %s", c, optarg); + usage(argv[0]); + return 1; + } + user_params.client_work_with_srv_num = srv_num; + } + break; + case OPT_FORCE_UC_REPLY: + user_params.b_server_reply_via_uc = true; + break; + case OPT_TTL: + { + int mc_ttl = strtol(optarg, NULL, 0); + if (errno != 0 || mc_ttl < 0 || mc_ttl > 255) { + 
log_msg("'--mc_ttl' Invalid ttl value: %s", optarg); + usage(argv[0]); + return 1; + } + else + user_params.mc_ttl = mc_ttl; + } + break; + + case 'k': + user_params.stream_mode = true; + break; + case 'I': + errno = 0; + max_spikes_num = strtol(optarg, NULL, 0); + if (errno != 0 || max_spikes_num < 1) { + log_msg("'-%c' Invalid spikes quantity: %s", c,optarg); + usage(argv[0]); + return 1; + } + user_params.b_client_calc_details = true; + break; + case 'a': + case 'A': + errno = 0; + user_params.packetrate_stats_print_ratio = strtol(optarg, NULL, 0); + user_params.packetrate_stats_print_details = (c == 'A')?true:false; + if (errno != 0) { + log_msg("'-%c' Invalid packet rate stats print value: %d", c, user_params.packetrate_stats_print_ratio); + usage(argv[0]); + return 1; + } + break; + case 'v': + print_version(); + return 0; + case '?': + case 'h': + usage(argv[0]); + return 1; + break; + default: + usage(argv[0]); + return 1; + } + } + if (optind < argc) { + printf(MODULE_NAME "non-option ARGV-elements: "); + while (optind < argc) + printf("%s ", argv[optind++]); + printf("\n"); + usage(argv[0]); + return 1; + } + + if (user_params.mode != MODE_SERVER && user_params.mthread_server) { + log_msg("--threads-num can only work on server side"); + return 1; + } + + if (strlen(mcg_filename) == 0 && (user_params.mthread_server || user_params.threads_num > 1)) { + log_msg("--threads-num must be used with feed file (option '-f')"); + return 1; + } + + if (user_params.mode != MODE_CLIENT && user_params.msg_size_range > 0) { + log_msg("dynamic message size mode can be used on client side only"); + return 1; + } + + if (user_params.mode == MODE_CLIENT && user_params.is_vmarxfiltercb) { + log_msg("--vmarxfiltercb can only work on server side"); + return 1; + } + + if ((user_params.fd_handler_type != RECVFROM) && (strlen(mcg_filename) <= 0)) { + log_msg("[-F | fd_hanlder_type] has to come with option: [-f | --file]"); + usage(argv[0]); + return 1; + } + +#if 0 + // AlexR: 
for testing daemonize with allready opened UDP socket + int fd = socket(AF_INET, SOCK_DGRAM, 0); + if (fd) { + log_msg("new socket - dummy"); + } +#endif + + if (daemonize) { + if (daemon(1, 1)) { + log_err("Failed to daemonize"); + } + log_msg("Running as daemon"); + } + + if (user_params.is_vmarxfiltercb || user_params.is_vmazcopyread) { +#ifdef USING_VMA_EXTRA_API + // Get VMA extended API + vma_api = vma_get_api(); + if (vma_api == NULL) + log_err("VMA Extra API not found - working with default socket APIs"); + else + log_msg("VMA Extra API found - using VMA's receive zero copy and packet filter APIs"); + + vma_dgram_desc_size = sizeof(struct vma_packets_t) + sizeof(struct vma_packet_t) + sizeof(struct iovec) * 16; +#else + log_msg("This udp_lat version is not compiled with VMA extra API"); +#endif + } + + if (user_params.b_client_calc_details == true) { + make_empty_spikes_list(); + prepare_to_info_mode(); + } + + if (user_params.fd_handler_type == RECVFROM && read_from_file == 1) { + if (user_params.mode == MODE_SERVER) { /* if mode equal to MODE_CLIENT the handler type will be RECVFROM*/ + user_params.fd_handler_type = SELECT; + } + } + + if (strlen(mcg_filename) > 0 || user_params.mthread_server) { + if (strlen(mcg_filename) > 0) { + set_mcgroups_fromfile(mcg_filename); + } + if (user_params.mthread_server) { + if (user_params.threads_num > sockets_num || user_params.threads_num == 0) { + user_params.threads_num = sockets_num; + } + pid_arr = (int*)malloc(sizeof(int)*(user_params.threads_num + 1)); + if(!pid_arr) { + log_err("Failed to allocate memory for pid array"); + exit(1); + } + log_msg("Running %d threads to manage %d sockets",user_params.threads_num,sockets_num); + } + } + else { + fds_data *tmp = (struct fds_data *)malloc(sizeof(struct fds_data)); + memset(tmp, 0, sizeof(struct fds_data)); + memcpy(&tmp->addr, &addr, sizeof(struct sockaddr_in)); + tmp->is_multicast = IN_MULTICAST(ntohl(tmp->addr.sin_addr.s_addr)); + fd_min = fd_max = 
prepare_socket(&tmp->addr); + fd_num = 1; + fds_array[fd_min] = tmp; + fds_array[fd_min]->next_fd = fd_min; + } + + if ((user_params.fd_handler_type != RECVFROM) && (user_params.fd_handler_type == EPOLL) && user_params.threads_num == 1) { + epfd = epoll_create(sockets_num); + } + + max_buff_size = max(user_params.msg_size+1, vma_dgram_desc_size); + + if(user_params.msg_size_range > 0){ + update_min_max_msg_sizes(); + max_buff_size = max_msg_size + 1; + } + +#ifdef USING_VMA_EXTRA_API + if (user_params.is_vmazcopyread && vma_api){ + pkt_buf = malloc(max_buff_size); + } +#endif + + msgbuf = malloc(max_buff_size); + msgbuf[0] = '$'; + + pattern = malloc(max_buff_size); + pattern[0] = '$'; + write_pattern(pattern, max_buff_size - 1); + pattern[1] = CLIENT_MASK; + + set_signal_action(); + + + /* + ** TEST START + */ + switch (user_params.mode) { + case MODE_CLIENT: + client_handler(); + break; + case MODE_SERVER: + if (user_params.mthread_server) { + server_select_per_thread(); + break; + } + case MODE_BRIDGE: + server_handler(fd_min, fd_max, fd_num); + break; + } + /* + ** TEST END + */ + + return 0; +} diff --git a/tests/listen/tcp_client.py b/tests/listen/tcp_client.py new file mode 100755 index 0000000..9f94d1d --- /dev/null +++ b/tests/listen/tcp_client.py @@ -0,0 +1,21 @@ +#!/usr/bin/python +import sys +from socket import * +import time +from collections import deque + +serverHost = sys.argv[1] +serverPort = int(sys.argv[2]) +l = [] +d = deque() +i = 0 +while True: + time.sleep(0.001) + sock = socket(AF_INET, SOCK_STREAM) + sock.setblocking(0) + sock.connect_ex((serverHost, serverPort)) # connect to server on the port + #sock.send("Hello world") # send the data + i += 1 + print "%s: Connecting #%03d..." 
% (time.strftime("%Y-%m-%d %H:%M:%S"), i) + if len(d) == 10000: d.pop().close() + d.append(sock) diff --git a/tests/listen/tcp_server.py b/tests/listen/tcp_server.py new file mode 100755 index 0000000..22d143a --- /dev/null +++ b/tests/listen/tcp_server.py @@ -0,0 +1,23 @@ +#!/usr/bin/python +from socket import * +import fcntl, os, sys +import time +from collections import deque + + +BACKLOG=10 +myHost = sys.argv[1] +myPort = int(sys.argv[2]) +s = socket(AF_INET, SOCK_STREAM) # create a TCP socket +s.bind((myHost, myPort)) +s.listen(BACKLOG) +d = deque() +while True: + #time.sleep(2) + conn, addr = s.accept() + if len(d) == 100: + time.sleep(0.001) + sock=d.pop() + print "%s: Closing an accepted socket %s..." % (time.strftime("%Y-%m-%d %H:%M:%S"), str(sock)) + sock.close() + d.append(conn) diff --git a/tests/low_pps_tcp_send_test/exchange.cpp b/tests/low_pps_tcp_send_test/exchange.cpp new file mode 100755 index 0000000..ebe9a33 --- /dev/null +++ b/tests/low_pps_tcp_send_test/exchange.cpp @@ -0,0 +1,337 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + ** Build command: g++ -lpthread exchange.cpp -o exchange + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NUM_SOCKETS 3 +#define MC_SOCKET 0 +#define TCP_SOCKET 1 +#define NUM_PACKETS 200000 +#define IF_ADDRESS "1.1.1.18" +#define UC_SERVER_ADDRESS "1.1.1.19" +#define MC_ADDRESS "224.0.1.2" +#define MC_DEST_PORT 15111 +#define TCP_LOCAL_PORT 15222 +#define UC_SERVER_PORT 15333 +#define MC_BUFFLEN 200 +#define UC_BUFFLEN 4 +#define MIN_UC_BUFFLEN 10 +#define SLEEP_TIME_USEC 10 +#define MAX_PARAM_LENGTH 20 + +int fd_list[NUM_SOCKETS]; +uint64_t tx_pkt_count, delta_usec_quote; +struct timeval tv_quote_start, tv_quote_end; + +char if_address[MAX_PARAM_LENGTH] = "NO IF ADDRESS!!!"; +int num_packets = NUM_PACKETS; +char mc_address[MAX_PARAM_LENGTH] = MC_ADDRESS; +uint16_t mc_dest_port = MC_DEST_PORT; +uint16_t tcp_local_port = TCP_LOCAL_PORT; +int mc_bufflen = MC_BUFFLEN; +int uc_bufflen = UC_BUFFLEN; +uint64_t sleep_time_usec = SLEEP_TIME_USEC; + + +void usage(void) +{ + printf("Usage:\n"); + printf("\t-l\t\n"); + printf("\t[-n]\t\n"); + printf("\t[-m]\t\n"); + printf("\t[-pm]\t\n"); + printf("\t[-lp]\t\n"); + printf("\t[-sm]\t\n"); + printf("\t[-su]\t\n"); + printf("\t[-u]\t\n"); +} + + +int prepare_socket() +{ + struct sockaddr_in groupsock; + struct in_addr localInterface; + + int fd = socket(AF_INET, SOCK_DGRAM, 0); + if(fd < 0) + { + 
perror("Opening datagram socket error"); + exit(1); + } + + memset(&groupsock, 0, sizeof(groupsock)); + groupsock.sin_family = AF_INET; + groupsock.sin_addr.s_addr = inet_addr(mc_address); + groupsock.sin_port = htons(mc_dest_port); + + /* Disable loopback so you do not receive your own datagrams.*/ + char loopch = 0; + if(setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, (char *)&loopch, sizeof(loopch)) < 0) + { + perror("Setting IP_MULTICAST_LOOP error"); + close(fd); + exit(1); + } + + /* Set local interface for outbound multicast datagrams. */ + localInterface.s_addr = inet_addr(if_address); + if(setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF, (char *)&localInterface, sizeof(localInterface)) < 0) + { + perror("Setting local interface error"); + exit(1); + } + + printf("Connecting..\n"); + if(connect(fd, (struct sockaddr *) &groupsock, sizeof(struct sockaddr))) + { + perror("connect"); + close(fd); + exit(1); + } + + return fd; +} + + +void* send_mc_loop(void* num) +{ + int ret; + char databuf[] = "NOTHING"; + char quote[] = "QUOTE"; + uint64_t delta_usec, delta_usec_sleep; + + /* Prepare MC socket */ + printf("Opening datagram MC socket\n"); + fd_list[MC_SOCKET] = prepare_socket(); + + // Prepare to start measurements + tx_pkt_count = 0; + struct timeval tv_start, tv_sleep_start, tv_sleep_end; + gettimeofday(&tv_start, NULL); + gettimeofday(&tv_sleep_start, NULL); + gettimeofday(&tv_sleep_end, NULL); + + while(true) + { + delta_usec_sleep = ((tv_sleep_end.tv_sec - tv_sleep_start.tv_sec) * 1000000) + (tv_sleep_end.tv_usec - tv_sleep_start.tv_usec); + if (delta_usec_sleep > sleep_time_usec) + { + ret = send(fd_list[MC_SOCKET], databuf, sizeof(databuf), 0); // Can use send with UDP socket because called connect() before... 
+ if (ret < 0) + printf("ERROR on SEND errno = %s\n", strerror(errno)); + tx_pkt_count++; + tv_sleep_start = tv_sleep_end; + } + else + { + gettimeofday(&tv_sleep_end, NULL); + } + + + if ((tx_pkt_count != 0) && (tx_pkt_count % num_packets) == 0) { + struct timeval tv_now; + gettimeofday(&tv_now, NULL); + delta_usec = ((tv_now.tv_sec - tv_start.tv_sec) * 1000000) + (tv_now.tv_usec - tv_start.tv_usec); + tv_start = tv_now; + + double mps = 1000000 * (tx_pkt_count/(double)delta_usec); + double bwGbps = mps * mc_bufflen * 8/(1024*1024*1024); + printf("BW(Gbps)=%6.3f, MPS=%10.0f\n", bwGbps, mps); + tx_pkt_count = 0; + + gettimeofday(&tv_quote_start, NULL); + ret = send(fd_list[MC_SOCKET], quote, sizeof(quote), 0); + if (ret < 0) + printf("ERROR on SEND errno = %s\n", strerror(errno)); + } + } + + return 0; +} + +void * tcp_func(void * num) +{ + struct sockaddr_in localSock, servaddr; + socklen_t servaddrlen = sizeof(struct sockaddr); + char buf[uc_bufflen], ord_ack[] = "ORD_ACK", ka_ack[] = "KAA_ACK"; + int ret, print = 0; + + fd_list[TCP_SOCKET] = socket(AF_INET, SOCK_STREAM, 0); + if(fd_list[TCP_SOCKET] < 0) + { + perror("Opening TCP socket error"); + exit(1); + } + printf("Opening TCP socket....OK.\n"); + memset((char *) &localSock, 0, sizeof(localSock)); + localSock.sin_family = AF_INET; + localSock.sin_addr.s_addr = inet_addr(if_address); + localSock.sin_port = htons(tcp_local_port); + + if(bind(fd_list[TCP_SOCKET], (struct sockaddr*)&localSock, sizeof(struct sockaddr))) + { + perror("Binding TCP socket error"); + close(fd_list[TCP_SOCKET]); + exit(1); + } + else + { + printf("Binding TCP socket...OK.\n"); + } + + if (listen(fd_list[TCP_SOCKET], 5) == 0){ + printf("listen TCP socket...OK.\n"); + } else { + perror("listen TCP socket error"); + close(fd_list[TCP_SOCKET]); + exit(1); + } + + int new_fd = accept(fd_list[TCP_SOCKET], NULL, 0); + + if (new_fd == -1) { + perror("accept TCP socket error"); + close(fd_list[TCP_SOCKET]); + exit(1); + } else { + 
printf("accept TCP socket...OK. new_fd=%d\n", new_fd); + } + + while(1) + { + + ret = recv(new_fd, buf, uc_bufflen, MSG_WAITALL); + if (ret < 0) + { + printf("ERROR on TCP RECV errno = %s \n", strerror(errno)); + printf("errno value = %d\n", errno); + } + else + { + if (strcmp(buf, "KAA") == 0){ + ret = send(new_fd, ka_ack, sizeof(ka_ack), 0); + if (ret < 0) + { + printf("ERROR on SEND KA errno = %s \n", strerror(errno)); + printf("errno value = %d\n", errno); + } + } + else if (strcmp(buf, "ORD") == 0) + { + ret = send(new_fd, ord_ack, sizeof(ord_ack), 0); + if (ret < 0) + { + printf("ERROR on SEND TCP ORD errno = %s \n", strerror(errno)); + printf("errno value = %d\n", errno); + } + } + else{ + printf("Internal error: Exchange received TCP packet- not ORD or KA, ret=%d, buf=%s\n", ret, buf); + close(fd_list[TCP_SOCKET]); + close(new_fd); + exit(1); + } + } + } + + close(new_fd); + close(fd_list[TCP_SOCKET]); + printf("closed TCP socket\n"); + return 0; +} + +int main(int argc, char *argv[]) +{ + pthread_t tcp_thread; + int nThreadId_tcp = 1, i; + + for (i=1; ig++ -lpthread exchange.cpp -o exchange +>g++ -lpthread -lrt trader.cpp -o trader + +The test app includes two applications - exchange and trader: +Exchange: +1. Opens a MC socket and sends MC packets in a predefined rate (one every 10usec by default). +2. Open a UC TCP socket and blocks on recvfrom(). +a. If an ORDER packet is received --> send ORD_ACK +b. If a keep alive packet is received --> send KA_ACK +3. Every X MC packets (configurable) --> send a MC QUOTE packet. + +Trader: +1. Open one MC socket and one TCP socket. +2. On thread #1, the MC socket blocks on recv(). If it encounters the QUOTE packet it immediately sends a TPC ORDER packet through the TCP socket, and and measure the time it the send operation took. +3. On thread #2 (optional), the TCP socket blocks on recv(): +a. Receives reply for ORDER packet (i.e. ORD_ACK) +b. Receives reply for keep alive packet (i.e. KA_ACK) +4. 
On thread #3 (optional), the TCP socket send keep alive packet every X usecs (configurable) + +Running the application: +1. First run the exchange app on one server and then the trader on another. +2. If you run the app with no parameters (or with --help flag) you will get a usage print. +All of the configurable parameters are described along with their default parameters. +3. There are only 2 mandatory parameters for the trader app (local interface IP and peer UC IP), and one mandatory parameter for the exchange app (local interface IP). +4. Make sure to attach each thread to different core, there are parameters to control it. + diff --git a/tests/low_pps_tcp_send_test/trader.cpp b/tests/low_pps_tcp_send_test/trader.cpp new file mode 100755 index 0000000..f1dcde1 --- /dev/null +++ b/tests/low_pps_tcp_send_test/trader.cpp @@ -0,0 +1,549 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +/* + ** Build command: g++ -lpthread -lrt trader.cpp -o trader + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#define NUM_PAIR_OF_THREADS 1 +#define IF_ADDRESS "1.1.1.19" +#define UC_SERVER_ADDRESS "1.1.1.18" +#define MC_ADDRESS "224.0.1.2" +#define UC_SERVER_PORT 15222 +#define MC_LOCAL_PORT 15111 +#define UC_LOCAL_PORT 15333 +#define MC_BUFFLEN 200 +#define UC_BUFFLEN 8 +#define MIN_UC_BUFFLEN 10 +#define RECV_PACKETS_AMOUNT 300000 +#define SEND_PACKETS_AMOUNT 1 +#define MAX_PARAM_LENGTH 20 +#define MAX_THREADS_PAIRS 50 +#define KEEP_ALIVE_INTERVAL 20 +#define TCP_DUMMY_SEND_RATE 100 +#define TCP_KA_CPU 1 +#define MC_CPU 2 +#define TCP_RECV_CPU 3 + + +char if_address[MAX_PARAM_LENGTH] = "NO IF ADDRESS!!!"; +char uc_server_address[MAX_PARAM_LENGTH] = "NO UC SERV ADDRESS!"; +int num_pair_of_threads = NUM_PAIR_OF_THREADS; +uint64_t recv_packets_amount = RECV_PACKETS_AMOUNT; +uint64_t send_packets_amount = SEND_PACKETS_AMOUNT; +char mc_address[MAX_PARAM_LENGTH] = MC_ADDRESS; +uint16_t mc_local_port = MC_LOCAL_PORT; +uint16_t uc_server_port = UC_SERVER_PORT; +uint16_t uc_local_port = UC_LOCAL_PORT; +int mc_bufflen = MC_BUFFLEN; +int uc_bufflen = UC_BUFFLEN; +int keep_alive_interval = KEEP_ALIVE_INTERVAL; +int keep_alive_cpu = TCP_KA_CPU; +int mc_cpu = MC_CPU; +int tcp_recv_cpu = TCP_RECV_CPU; +int tcp_dummy_send_rate = TCP_DUMMY_SEND_RATE; +int disable_ka = 0; +int disable_tcp_recv = 0; + + +struct ThreadsPair +{ + int mc_fd; + int uc_fd; +}; + +ThreadsPair fd_list[MAX_THREADS_PAIRS]; +struct timeval tv_order_start, tv_order_end; +pthread_spinlock_t 
uc_spinlock_arr[MAX_THREADS_PAIRS]; + +void usage(void) +{ + printf("Usage:\n"); + printf("\t-l\t\n"); + printf("\t-ua\t\n"); + //printf("\t[-nt]\t\n"); + //printf("\t[-n]\t\n"); + printf("\t[-ns]\t\n"); + printf("\t[-m]\t\n"); + printf("\t[-pm]\t\n"); + printf("\t[-up]\t\n"); + printf("\t[-lp]\t\n"); + printf("\t[-sm]\t\n"); + printf("\t[-su]\t\n"); + printf("\t[-ka]\t\n"); + printf("\t[-kac]\t\n"); + printf("\t[-mcc]\t\n"); + printf("\t[-trc]\t\n"); + printf("\t[-tds]\t\n"); + printf("\t[-dtr]\t\n"); + printf("\t[-dka]\t\n"); +} + + +int prepare_mc_socket(int sock_num) +{ + struct sockaddr_in localSock; + struct ip_mreq group; + int fd; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if(fd < 0) + { + printf("Opening MC datagram socket num = %d error", sock_num); + exit(1); + } + else + { + printf("Opening MC datagram socket num = %d....OK.\n", sock_num); + } + + /* Enable SO_REUSEADDR to allow multiple instances of this */ + /* application to receive copies of the multicast datagrams. */ + int reuse = 1; + if(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *)&reuse, sizeof(reuse)) < 0) + { + printf("Setting SO_REUSEADDR for MC datagram socket num = %d error!!!", sock_num); + close(fd); + exit(1); + } + else + printf("Setting SO_REUSEADDR on MC socket num = %d...OK.\n", sock_num); + + /* Bind to the proper port number with the IP address */ + /* specified as INADDR_ANY. */ + memset((char *)&localSock, 0, sizeof(localSock)); + localSock.sin_family = AF_INET; + localSock.sin_addr.s_addr = INADDR_ANY; + localSock.sin_port = htons(mc_local_port); + if(bind(fd, (struct sockaddr*)&localSock, sizeof(struct sockaddr))) + { + printf("Binding MC datagram socket num = %d error", sock_num); + close(fd); + exit(1); + } + else + { + printf("Binding MC datagram socket num = %d...OK.\n", sock_num); + } + + /* Join the multicast group on the local 1.1.1.19 */ + /* interface. 
Note that this IP_ADD_MEMBERSHIP option must be */ + /* called for each local interface over which the multicast */ + /* datagrams are to be received. */ + group.imr_multiaddr.s_addr = inet_addr(mc_address); + group.imr_interface.s_addr = inet_addr(if_address); + if(setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, (char *)&group, sizeof(group)) < 0) + { + printf("Adding multicast group for socket num = %d error", sock_num); + close(fd); + exit(1); + } + else + { + printf("Adding multicast group for socket num = %d...OK.\n", sock_num); + } + + return fd; +} + +#include +int prepare_tcp_socket(int sock_num) +{ + struct sockaddr_in localSock; + int fd; + + fd = socket(AF_INET, SOCK_STREAM, 0); + if(fd < 0) + { + perror("Opening tcp socket error"); + exit(1); + } + printf("Opening tcp socket fd=%d....OK.\n", fd); + memset((char *) &localSock, 0, sizeof(localSock)); + localSock.sin_family = AF_INET; + localSock.sin_addr.s_addr = inet_addr(if_address); + localSock.sin_port = htons(uc_local_port+sock_num-1); + + int flag = 1; + if (setsockopt(fd,IPPROTO_TCP,TCP_NODELAY,(char *)&flag,sizeof(flag)) == -1) + { + perror("SETSOCKOPT tcp socket error"); + close(fd); + exit(1); + } + + if(bind(fd, (struct sockaddr*)&localSock, sizeof(struct sockaddr))) + { + perror("Binding tcp socket error"); + close(fd); + exit(1); + } + + printf("Binding tcp socket num %d....OK.\n", sock_num); + + + struct sockaddr_in remoteSock; + memset((char *) &remoteSock, 0, sizeof(remoteSock)); + remoteSock.sin_family = AF_INET; + remoteSock.sin_addr.s_addr = inet_addr(uc_server_address); + remoteSock.sin_port = htons(uc_server_port); + + if (connect(fd, (struct sockaddr*)&remoteSock, sizeof(struct sockaddr)) != 0) { + perror("Connect tcp socket error"); + close(fd); + exit(1); + } + + printf("Connect tcp socket num %d....OK.\n", sock_num); + + return fd; +} + +void * tcp_ka_func(void * num) +{ + int thread_num = (long int)num; + char ka[] = "KAA"; + int ret; + while(1) + { + if 
(!fd_list[thread_num].uc_fd) continue; + + ret = send(fd_list[thread_num].uc_fd, ka, sizeof(ka), 0); + if (ret < 0) + { + printf("ERROR on SEND KA errno = %s\n", strerror(errno)); + printf("errno value = %d\n", errno); + } + usleep(keep_alive_interval); + } + return 0; +} + +void * tcp_recv_func(void * num) +{ + struct sockaddr_in servaddr; + char buf[uc_bufflen], ka[] = "KAA"; + int ret; + int thread_num = (long int)num; + + uint64_t delta_usec; + + + while(1) + { + if (!fd_list[thread_num].uc_fd) continue; + /* Timeout on recvfrom using setsockopt */ + ret = recv(fd_list[thread_num].uc_fd, buf, uc_bufflen, MSG_WAITALL); + if (ret < 0) + { + if (errno == EAGAIN){ // meaning that Timeout occured + + ret = send(fd_list[thread_num].uc_fd, ka, sizeof(ka), 0); + + if (ret < 0) + { + printf("ERROR on SEND 1 errno = %s\n", strerror(errno)); + printf("errno value = %d\n", errno); + for (int i=0; i< num_pair_of_threads; i++) + { + close(fd_list[i].uc_fd); + } + exit(1); + } + } else { + printf("ERROR on SEND 2 errno = %s\n", strerror(errno)); + printf("errno value = %d\n", errno); + for (int i=0; i< num_pair_of_threads; i++) + { + close(fd_list[i].uc_fd); + } + exit(1); + } + } else { // packet received + if (strcmp(buf, "ORD_ACK") == 0) { + gettimeofday(&tv_order_end, NULL); + delta_usec = ((tv_order_end.tv_sec - tv_order_start.tv_sec) * 1000000) + (tv_order_end.tv_usec - tv_order_start.tv_usec); + //printf("#### Thread num = %d - ORDER sent and received ####. RTT time = %llu\n", thread_num+1, (long long unsigned int)delta_usec); + } else if (strcmp(buf, "KAA_ACK") == 0) { + //printf("DEBUG: *** Keep Alive sent and received ***\n"); + } else { + printf("Internal error! 
UC packet received, not ORD_ACK or KA_ACK, buf=%s\n", buf); + for (int i=0; i< num_pair_of_threads; i++) + { + close(fd_list[i].uc_fd); + } + exit(1); + } + } + } + + return 0; +} + +void * recv_loop(void * num) +{ + int ret; + int thread_num = (long int)num; + char buf[mc_bufflen], order[] = "ORD"; + struct sockaddr_in servaddr; + uint64_t rx_pkt_count, tx_pkt_count, delta_usec; + int t = 0; + char ka[] = "KAA"; + + printf("MC Thread number %d entered recv_loop\n", thread_num+1); + + int dummy = socket(AF_INET, SOCK_STREAM, 0); + int flag = 1; + setsockopt(dummy,IPPROTO_TCP,TCP_NODELAY,(char *)&flag,sizeof(flag)); + struct sockaddr_in remoteSock; + memset((char *) &remoteSock, 0, sizeof(remoteSock)); + remoteSock.sin_family = AF_INET; + remoteSock.sin_addr.s_addr = inet_addr(uc_server_address); + remoteSock.sin_port = htons(uc_server_port); + + connect(dummy, (struct sockaddr*)&remoteSock, sizeof(struct sockaddr)); + + memset(&servaddr, 0, sizeof(servaddr)); + servaddr.sin_family = AF_INET; + servaddr.sin_port = htons(uc_server_port); + if (inet_aton(uc_server_address, &(servaddr.sin_addr)) <= 0) + { + printf("ERROR: Invalid IP address.\n"); + exit(1); + } + + rx_pkt_count=0; + tx_pkt_count = 0; + + struct timeval tv_start, tv_end; + gettimeofday(&tv_start, NULL); + + while (true) + { + t++; + + ret = recv(fd_list[thread_num].mc_fd, buf, mc_bufflen, 0); + + if(ret == -1) + { + printf("ERROR in recv! 
errno = %s\n", strerror(errno)); + exit(1); + } + /* + rx_pkt_count++; + if (rx_pkt_count > recv_packets_amount) + { + gettimeofday(&tv_end, NULL); + delta_usec = ((tv_end.tv_sec - tv_start.tv_sec) * 1000000) + (tv_end.tv_usec - tv_start.tv_usec); + printf("MC thread num %d received %llu packets in usec = %llu\n", thread_num+1, (long long unsigned int)recv_packets_amount, (long long unsigned int)delta_usec); + rx_pkt_count=0; + gettimeofday(&tv_start, NULL); + } + */ + if (strcmp(buf, "QUOTE") == 0) { + gettimeofday(&tv_order_start, NULL); + + struct timespec ts_start = {0,0}, ts_end = {0,0}, ts_start1 = {0,0}, ts_end1 = {0,0}; + + tx_pkt_count++; + + clock_gettime(CLOCK_MONOTONIC, &ts_start); + ret = send(fd_list[thread_num].uc_fd, order, sizeof(order), 0); + clock_gettime(CLOCK_MONOTONIC, &ts_end); + + if (tx_pkt_count >= send_packets_amount) { + tx_pkt_count = 0; + uint64_t delta_usec = ((ts_end.tv_sec - ts_start.tv_sec) * 1000000000) + (ts_end.tv_nsec - ts_start.tv_nsec); + printf("MC thread number %d got QUOTE, sending TCP order (send time = %llu nsec) \n", thread_num+1, (long long unsigned int)delta_usec); + } + if (ret < 0) + { + printf("ERROR on SEND errno = %s\n", strerror(errno)); + printf("errno value = %d\n", errno); + } + } else if (t % tcp_dummy_send_rate == 0){ + //dummy send + send(fd_list[thread_num].uc_fd, NULL, 1, 0); + } + } + + return 0; +} + +#include + +int main(int argc, char *argv[]) +{ + int i; + + for (i=1; i // sockets +#include // sockets +#include // internet address manipulation +#include +#include +#include +#include +#include // random() +#include // getopt() and sleep() +#include // getopt() +#include + +#define DEFAULT_MC_ADDR "224.4.4.1" +#define DEFAULT_PORT 11111 + +#define IP_PORT_FORMAT_REG_EXP "^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}"\ + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?):"\ + "(6553[0-5]|655[0-2][0-9]|65[0-4][0-9]{2}|6[0-4][0-9]{3}|[0-5]?[0-9]{1,4})\n" + +struct sigaction sigact; +int fd; /* used when single mc 
group is given */ +char *msgbuf = 0; +char *pattern = 0; + +u_char is_loop = 0; + +struct user_params_t { + struct sockaddr_in addr; + uint16_t mc_dest_port; + int msg_size; + int server; +} user_params; + + +static void usage(const char *argv0) +{ + printf("\nMC Loop Test\n"); + printf("Usage:\n"); + printf("\t%s [OPTIONS]\n", argv0); + printf("\t%s -s [-i ip] [-p port] [-m message_size] [-l]\n", argv0); + printf("\t%s -c [-i ip] [-p port] [-m message_size] [-l]\n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -s, --server\t\t\trun server (default - unicast)\n"); + printf(" -c, --client\t\t\trun client\n"); + printf(" -i, --ip=\t\t\tlisten on/send to ip (default %s)\n", DEFAULT_MC_ADDR); + printf(" -l, --loop\t\t\tto enable mc loop (in the default it's disabled)\n"); + printf(" -p, --port=\t\tlisten on/connect to port (default %d)\n", DEFAULT_PORT); + printf(" -m, --msg_size=\t\tuse messages of size bytes\n"); + printf(" -h, --help\t\t\tprint this help message\n"); +} + +void cleanup() +{ + close(fd); +} + +void server_sig_handler(int signum) +{ + printf("Got signal %d - exiting.\n", signum); + cleanup(); + exit(0); +} + +void client_sig_handler(int signum) +{ + cleanup(); + exit(0); +} + + +/* set the action taken when signal received */ +void set_signal_action() +{ + sigact.sa_handler = + user_params.server ? server_sig_handler : client_sig_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = 0; + sigaction(user_params.server ? 
SIGINT : SIGALRM, &sigact, NULL); +} + +void set_defaults() +{ + memset(&user_params, 0, sizeof(struct user_params_t)); + inet_aton(DEFAULT_MC_ADDR, &user_params.addr.sin_addr); + user_params.mc_dest_port = DEFAULT_PORT; + user_params.addr.sin_family = AF_INET; + user_params.addr.sin_port = htons(user_params.mc_dest_port); + user_params.msg_size = 1; + user_params.server = 1; +} + + +/* write a pattern to buffer */ +void write_pattern(char * buf, int buf_size) +{ + int len = 0; + char c; + + srand((unsigned)time(NULL)); + while (len < buf_size) { + c = (char) (rand() % 128); + //buf[len] = c; + pattern[len] = c; + len++; + } +} + + +int check_empty_addr(struct in_addr in){ + return !(strcmp("0.0.0.0", inet_ntoa(in))); + +} + +void prepare_network(int is_server) +{ + u_int yes = 1; + struct ip_mreq mreq; + uint32_t in_addr; + struct sockaddr_in client_addr; + u_char i_loop = is_loop; + + memset(&mreq,0,sizeof(struct ip_mreq)); + printf(" %s port %d\n", inet_ntoa(user_params.addr.sin_addr), user_params.mc_dest_port); + /* create a UDP socket */ + if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) { + perror("mc_loop_test: socket()"); + exit(1); + } + + /* allow multiple sockets to use the same PORT number */ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) { + perror("mc_loop_test: Reusing ADDR failed"); + exit(1); + } + + in_addr = ntohl(((struct sockaddr_in *)&user_params.addr)->sin_addr.s_addr); + + /* bind to receive address */ + if (is_server){ + /* check if the ip is 0.0.0.0 and if so insert INADDR_ANY to user_params.addr */ + if (check_empty_addr(user_params.addr.sin_addr)){ + user_params.addr.sin_addr.s_addr = htonl(INADDR_ANY); + } + } + + if (IN_MULTICAST(in_addr)){ + if (bind(fd, (struct sockaddr *)&user_params.addr, sizeof(user_params.addr)) < 0) { + perror("mc_loop_test: bind()"); + exit(1); + } + + /* use setsockopt() to request that the kernel join a multicast group */ + mreq.imr_multiaddr.s_addr = user_params.addr.sin_addr.s_addr; + 
mreq.imr_interface.s_addr = htonl(INADDR_ANY); + if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { + perror("mc_loop_test: setsockopt()"); + exit(1); + } + + if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, &i_loop, sizeof(i_loop)) == (-1)) { + perror("mc_loop_test: setsockopt()"); + exit(1); + } + } + else { + if (!is_server){ + client_addr.sin_family = AF_INET; + client_addr.sin_port = user_params.addr.sin_port; + client_addr.sin_addr.s_addr = htonl( INADDR_ANY ); + memset(&(client_addr.sin_zero), '\0', 8); // zero the rest of the struct + + //printf ("IP to bind: %s\n",inet_ntoa(client_addr.sin_addr)); + if (bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)) < 0) { + perror("mc_loop_test: bind()"); + exit(1); + } + } + else { //server - unicast + if (bind(fd, (struct sockaddr *)&user_params.addr, sizeof(user_params.addr)) < 0) { + perror("mc_loop_test: bind()"); + exit(1); + } + } + } + //printf("udp_lat: %s: exit\n", __func__); +} + +void server_handler() +{ + int nbytes; + socklen_t size = sizeof(struct sockaddr); + struct sockaddr_in client_addr; + printf("mc_loop_test: [SERVER] Listen on: "); + prepare_network(1); + + printf("Waiting to receive from FD %d\n", fd); + if ((nbytes = recvfrom(fd, msgbuf, user_params.msg_size, 0, (struct sockaddr *)&client_addr, &size)) < 0) { + perror("mc_loop_test: recvfrom()"); + exit(1); + } + printf("server:Message received...\n"); + + printf("mc_loop_test: %s: exit\n", __func__); +} + +void client_handler() +{ + printf("mc_loop_test: [CLIENT] Start sending on: "); + prepare_network(0); + + sleep(2); + + //printf("Sending to: FD = %d; IP = %s; PORT = %d\n",fd, inet_ntoa(user_params.addr.sin_addr), ntohs(user_params.addr.sin_port)); + if (sendto(fd, pattern/*msgbuf*/, user_params.msg_size, 0, + (struct sockaddr *)&(user_params.addr), sizeof(user_params.addr)) < 0) { + perror("mc_loop_test: sendto()"); + exit(1); + } + printf("mc_loop_test: Client done sending.\n") ; +} + +int main(int 
argc, char *argv[]) { + if (argc == 1){ + usage(argv[0]); + return 1; + } + /* set default values */ + set_defaults(); + + /* Parse the parameters */ + while (1) { + int c = 0; + + static struct option long_options[] = { + {.name = "port", .has_arg = 1,.val = 'p'}, + {.name = "loop", .has_arg = 0,.val = 'l'}, + {.name = "msg_size", .has_arg = 1,.val = 'm'}, + {.name = "ip", .has_arg = 1,.val = 'i'}, + {.name = "client", .has_arg = 0,.val = 'c'}, + {.name = "server", .has_arg = 0,.val = 's'}, + {.name = "help", .has_arg = 0,.val = 'h'}, + {0} + }; + + if ((c = getopt_long(argc, argv, "p:m:i:lsch", + long_options, NULL)) == -1) + break; + + switch (c) { + case 'p': + user_params.mc_dest_port = strtol(optarg, NULL, 0); + /* strtol() returns 0 if there were no digits at all */ + if (user_params.mc_dest_port <= 0) { + printf("mc_loop_test: Invalid port: %d \n", user_params.mc_dest_port); + usage(argv[0]); + return 1; + } + user_params.addr.sin_port = htons(user_params.mc_dest_port); + break; + + case 'l': + is_loop=1; + break; + case 'm': + user_params.msg_size = strtol(optarg, NULL, 0); + if (user_params.msg_size <= 0) { + printf("mc_loop_test: Invalid message size: %d \n", + user_params.msg_size); + usage(argv[0]); + return 1; + } + break; + case 'i': + if (!inet_aton(optarg, &user_params.addr.sin_addr)) { // already in network byte order + printf("mc_loop_test: Invalid address: %s\n", optarg); + usage(argv[0]); + return 1; + } + break; + case 's': + user_params.server = 1; + break; + case 'c': + user_params.server = 0; + break; + case 'h': + usage(argv[0]); + return 0; + break; + default: + usage(argv[0]); + return 1; + } + } + if (optind < argc) { + printf("mc_loop_test: non-option ARGV-elements: "); + while (optind < argc) + printf("%s ", argv[optind++]); + printf("\n"); + usage(argv[0]); + return 1; + } + + msgbuf = malloc(user_params.msg_size+1); + msgbuf[0] = '$'; + + pattern = malloc(user_params.msg_size+1); + pattern[0] = '$'; + + write_pattern(msgbuf, 
user_params.msg_size); + + set_signal_action(); + + if (user_params.server) { + server_handler(); + } + else { + client_handler(); + } + + if (msgbuf) { + free(msgbuf); + msgbuf = 0; + } + if (pattern) { + free(pattern); + pattern = 0; + } + return 0; +} + diff --git a/tests/multithread_test/exchange.cpp b/tests/multithread_test/exchange.cpp new file mode 100755 index 0000000..19b16a9 --- /dev/null +++ b/tests/multithread_test/exchange.cpp @@ -0,0 +1,293 @@ +/* +** Build command: g++ -lpthread exchange.cpp -o exchange +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#define NUM_SOCKETS 2 +#define MC_SOCKET 0 +#define UC_SOCKET 1 +#define NUM_PACKETS 200000 +#define IF_ADDRESS "1.1.1.18" +#define UC_SERVER_ADDRESS "1.1.1.19" +#define MC_ADDRESS "224.0.1.2" +#define MC_DEST_PORT 15111 +#define UC_LOCAL_PORT 15222 +#define UC_SERVER_PORT 15333 +#define MC_BUFFLEN 200 +#define UC_BUFFLEN 12 +#define MIN_UC_BUFFLEN 10 +#define SLEEP_TIME_USEC 10 +#define MAX_PARAM_LENGTH 20 + +int fd_list[NUM_SOCKETS]; +uint64_t tx_pkt_count, delta_usec_quote; +struct timeval tv_quote_start, tv_quote_end; + +char if_address[MAX_PARAM_LENGTH] = "NO IF ADDRESS!!!"; +int num_packets = NUM_PACKETS; +char mc_address[MAX_PARAM_LENGTH] = MC_ADDRESS; +uint16_t mc_dest_port = MC_DEST_PORT; +uint16_t uc_local_port = UC_LOCAL_PORT; +int mc_bufflen = MC_BUFFLEN; +int uc_bufflen = UC_BUFFLEN; +uint64_t sleep_time_usec = SLEEP_TIME_USEC; + + +void usage(void) +{ + printf("Usage:\n"); + printf("\t-l\t\n"); + printf("\t[-n]\t\n"); + printf("\t[-m]\t\n"); + printf("\t[-pm]\t\n"); + printf("\t[-lp]\t\n"); + printf("\t[-sm]\t\n"); + printf("\t[-su]\t\n"); + printf("\t[-u]\t\n"); +} + + +int prepare_socket() +{ + struct sockaddr_in groupsock; + struct in_addr localInterface; + + int fd = socket(AF_INET, SOCK_DGRAM, 0); + if(fd < 0) + { + perror("Opening datagram socket error"); + exit(1); + } + + 
memset(&groupsock, 0, sizeof(groupsock)); + groupsock.sin_family = AF_INET; + groupsock.sin_addr.s_addr = inet_addr(mc_address); + groupsock.sin_port = htons(mc_dest_port); + + /* Disable loopback so you do not receive your own datagrams.*/ + char loopch = 0; + if(setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, (char *)&loopch, sizeof(loopch)) < 0) + { + perror("Setting IP_MULTICAST_LOOP error"); + close(fd); + exit(1); + } + + /* Set local interface for outbound multicast datagrams. */ + localInterface.s_addr = inet_addr(if_address); + if(setsockopt(fd, IPPROTO_IP, IP_MULTICAST_IF, (char *)&localInterface, sizeof(localInterface)) < 0) + { + perror("Setting local interface error"); + exit(1); + } + + printf("Connecting..\n"); + if(connect(fd, (struct sockaddr *) &groupsock, sizeof(struct sockaddr))) + { + perror("connect"); + close(fd); + exit(1); + } + + return fd; +} + + +void* send_mc_loop(void* num) +{ + int ret; + char databuf[mc_bufflen]; + char quote[] = "QUOTE"; + uint64_t delta_usec, delta_usec_sleep; + + /* Prepare MC socket */ + printf("Opening datagram MC socket\n"); + fd_list[MC_SOCKET] = prepare_socket(); + + // Prepare to start measurements + tx_pkt_count = 0; + struct timeval tv_start, tv_sleep_start, tv_sleep_end; + gettimeofday(&tv_start, NULL); + gettimeofday(&tv_sleep_start, NULL); + gettimeofday(&tv_sleep_end, NULL); + + while(true) + { + delta_usec_sleep = ((tv_sleep_end.tv_sec - tv_sleep_start.tv_sec) * 1000000) + (tv_sleep_end.tv_usec - tv_sleep_start.tv_usec); + if (delta_usec_sleep > sleep_time_usec) + { + ret = send(fd_list[MC_SOCKET], databuf, mc_bufflen, 0); // Can use send with UDP socket because called connect() before... 
+ if (ret < 0) + printf("ERROR on SEND errno = %s\n", strerror(errno)); + tx_pkt_count++; + tv_sleep_start = tv_sleep_end; + } + else + { + gettimeofday(&tv_sleep_end, NULL); + } + + + if ((tx_pkt_count != 0) && (tx_pkt_count % num_packets) == 0) { + struct timeval tv_now; + gettimeofday(&tv_now, NULL); + delta_usec = ((tv_now.tv_sec - tv_start.tv_sec) * 1000000) + (tv_now.tv_usec - tv_start.tv_usec); + tv_start = tv_now; + + double mps = 1000000 * (tx_pkt_count/(double)delta_usec); + double bwGbps = mps * mc_bufflen * 8/(1024*1024*1024); + printf("BW(Gbps)=%6.3f, MPS=%10.0f\n", bwGbps, mps); + tx_pkt_count = 0; + + gettimeofday(&tv_quote_start, NULL); + ret = send(fd_list[MC_SOCKET], quote, sizeof(quote), 0); + if (ret < 0) + printf("ERROR on SEND errno = %s\n", strerror(errno)); + } + } + + return 0; +} + + +void * uc_func(void * num) +{ + struct sockaddr_in localSock, servaddr; + socklen_t servaddrlen = sizeof(struct sockaddr); + char buf[uc_bufflen], ord_ack[] = "ORD_ACK", ka_ack[] = "KA_ACK"; + int ret, print = 0; + + fd_list[UC_SOCKET] = socket(AF_INET, SOCK_DGRAM, 0); + if(fd_list[UC_SOCKET] < 0) + { + perror("Opening datagram UC socket error"); + exit(1); + } + printf("Opening datagram UC socket....OK.\n"); + memset((char *) &localSock, 0, sizeof(localSock)); + localSock.sin_family = AF_INET; + localSock.sin_addr.s_addr = inet_addr(if_address); + localSock.sin_port = htons(uc_local_port); + + if(bind(fd_list[UC_SOCKET], (struct sockaddr*)&localSock, sizeof(struct sockaddr))) + { + perror("Binding datagram UC socket error"); + close(fd_list[UC_SOCKET]); + exit(1); + } + else + { + printf("Binding datagram UC socket...OK.\n"); + } + + while(1) + { + ret = recvfrom(fd_list[UC_SOCKET], buf, uc_bufflen, 0, (struct sockaddr *)&servaddr, &servaddrlen); + gettimeofday(&tv_quote_end, NULL); + if (ret < 0) + { + printf("ERROR on RECV errno = %s \n", strerror(errno)); + printf("errno value = %d\n", errno); + } + else + { + if (strcmp(buf, "ORD") == 0) + { + ret = 
sendto(fd_list[UC_SOCKET], ord_ack, sizeof(ord_ack), 0, (struct sockaddr *) &servaddr, sizeof(struct sockaddr)); + if (ret < 0) + { + printf("ERROR on SEND UC errno = %s \n", strerror(errno)); + printf("errno value = %d\n", errno); + } + print = 1; + } + else if (strcmp(buf, "KA") == 0){ + ret = sendto(fd_list[UC_SOCKET], ka_ack, sizeof(ka_ack), 0, (struct sockaddr *) &servaddr, sizeof(struct sockaddr)); + if (ret < 0) + { + printf("ERROR on SEND UC errno = %s \n", strerror(errno)); + printf("errno value = %d\n", errno); + } + } + else{ + printf("Internal error: Exchange received UC packet- not ORD or KA\n"); + } + + if (print) + { + delta_usec_quote = ((tv_quote_end.tv_sec - tv_quote_start.tv_sec) * 1000000) + (tv_quote_end.tv_usec - tv_quote_start.tv_usec); + printf("@@@@@@@ QUOTE from port %u RTT in usec = %llu @@@@@@@\n", ntohs(servaddr.sin_port), (long long unsigned int)delta_usec_quote); + print = 0; + } + } + } + + close(fd_list[UC_SOCKET]); + printf("closed UC socket\n"); + return 0; +} + + +int main(int argc, char *argv[]) +{ + pthread_t uc_thread; + int nThreadId = 1, i; + + for (i=1; ig++ -lpthread exchange.cpp -o exchange +>g++ -lpthread trader.cpp -o trader + +It is very important for us to better understand the spikes situation you experienced. +Me and Alex made some changes today to the test and we plan to further improve the test till we get the desired results. +I would like to describe the test application, and hopefully with your help refine it so we could succeed with pin pointing the issue. +It's going to be a bit long but it's important for us that you will have all the information and could easily help refine the test. + + +The test app includes two applications - exchange and trader: +Exchange: +1. Opens a MC socket and sends MC packets in a predefined rate. +2. Open a UC socket and blocks on recvfrom(). +a. If an ORDER packet is received --> send ORD_ACK +b. If a keep alive packet is received --> send KA_ACK +3. 
Every X packets (configurable) --> send a MC QUOTE packet. +4. Time measurement is performed in usec. It starts upon sending the MC QUOTE packet. It ends and prints upon receipt of the UC ORDER packet. + +Trader: +1. Opens X pairs of threads (configurable). +2. Each pair opens one MC socket and one UC socket. +3. The MC socket blocks on recv(). If it encounters the QUOTE packet it immediately sends a UC ORDER packet through the UC socket, and starts time measurement. +4. The UC socket blocks on recvfrom() with SO_RCVTIMEO (configurable): +a. If it times out then it sends a keep-alive packet. +b. If it receives a reply to the ORDER packet (i.e. ORD_ACK) --> it stops and prints the time measurement (for ORDER RTT). +c. If it receives a reply to the keep-alive packet (i.e. KA_ACK) --> it does nothing. + +Running the application: +1. First run the exchange app on one server and then the trader on another. +2. If you run the app with no parameters (or with --help flag) you will get a usage printout. +All of the configurable parameters are described along with their default values. +3. There are only 2 mandatory parameters for the trader app (local interface IP and peer UC IP), and one mandatory parameter for the exchange app (local interface IP). +4.
I am now adding execution example with some printouts (it uses all of the defaults, meaning 2 pair of threads on the trader side): + + +Exchange side printout: +======================= + +[odeds@hail18 Debug]$ VMA_RX_POLL=-1 LD_PRELOAD=/.autodirect/mtrswgwork/odeds/workspace/vma/6.1/last/src/vma/.libs/libvma.so exchange -l 1.1.1.18 + VMA INFO : ------------------------------------------------- + VMA INFO : Version: 6.1.7.0 + VMA INFO : Current Time: Thu May 3 15:28:26 2012 + VMA INFO : Cmd Line: exchange -l 1.1.1.18 + VMA INFO : Pid: 25628 + VMA INFO : OFED Version: OFED-VMA-1.5.3-0010: + VMA INFO : System: 2.6.32-71.el6.x86_64 + VMA INFO : Architecture: x86_64 + VMA INFO : Node: hail18 + VMA INFO : --------------------------------------------------------- + VMA INFO : Log Level 3 [VMA_TRACELEVEL] + VMA INFO : Log File [VMA_LOG_FILE] + VMA INFO : Rx Poll Loops -1 [VMA_RX_POLL] + VMA INFO : --------------------------------------------------------- + VMA INFO : *************************************************************** + VMA INFO : * This VMA license was granted to: cust * + VMA INFO : * Successfully passed license validation, starting VMA. * + VMA INFO : *************************************************************** +Opening datagram MC socket + VMA WARNING: *************************************************************** + VMA WARNING: * NO IMMEDIATE ACTION NEEDED! * + VMA WARNING: * Not enough hugepage resources for VMA memory allocation. * + VMA WARNING: * VMA will continue working with regular memory allocation. * + VMA INFO : * Optional: 1. Disable VMA's hugepage support (VMA_HUGETBL=0) * + VMA INFO : * 2. 
Restart process after increasing the number of * + VMA INFO : * hugepages resources in the system: * + VMA INFO : * "cat /proc/meminfo | grep -i HugePage" * + VMA INFO : * "echo 1000000000 > /proc/sys/kernel/shmmax" * + VMA INFO : * "echo 800 > /proc/sys/vm/nr_hugepages" * + VMA WARNING: * Read more about the Huge Pages in the VMA's User Manual * + VMA WARNING: *************************************************************** +Connecting.. +Opening datagram UC socket....OK. +Binding datagram UC socket...OK. +BW(Gbps)= 0.134, MPS= 90076 +BW(Gbps)= 0.135, MPS= 90898 +BW(Gbps)= 0.135, MPS= 90899 +BW(Gbps)= 0.135, MPS= 90899 +BW(Gbps)= 0.135, MPS= 90899 +BW(Gbps)= 0.135, MPS= 90899 +BW(Gbps)= 0.135, MPS= 90899 +BW(Gbps)= 0.135, MPS= 90900 +BW(Gbps)= 0.135, MPS= 90900 +BW(Gbps)= 0.135, MPS= 90900 +BW(Gbps)= 0.135, MPS= 90900 +BW(Gbps)= 0.135, MPS= 90899 +BW(Gbps)= 0.135, MPS= 90899 +BW(Gbps)= 0.135, MPS= 90899 +BW(Gbps)= 0.135, MPS= 90899 +@@@@@@@ QUOTE from port 15333 RTT in usec = 14738 @@@@@@@ +@@@@@@@ QUOTE from port 15334 RTT in usec = 14757 @@@@@@@ +BW(Gbps)= 0.135, MPS= 90899 +@@@@@@@ QUOTE from port 15334 RTT in usec = 14736 @@@@@@@ +@@@@@@@ QUOTE from port 15333 RTT in usec = 14746 @@@@@@@ +BW(Gbps)= 0.135, MPS= 90898 +@@@@@@@ QUOTE from port 15333 RTT in usec = 14781 @@@@@@@ +@@@@@@@ QUOTE from port 15334 RTT in usec = 14791 @@@@@@@ +BW(Gbps)= 0.135, MPS= 90899 +@@@@@@@ QUOTE from port 15333 RTT in usec = 14706 @@@@@@@ +@@@@@@@ QUOTE from port 15334 RTT in usec = 14717 @@@@@@@ +BW(Gbps)= 0.135, MPS= 90898 +@@@@@@@ QUOTE from port 15333 RTT in usec = 14801 @@@@@@@ +@@@@@@@ QUOTE from port 15334 RTT in usec = 14812 @@@@@@@ +BW(Gbps)= 0.135, MPS= 90899 +@@@@@@@ QUOTE from port 15333 RTT in usec = 14766 @@@@@@@ +@@@@@@@ QUOTE from port 15334 RTT in usec = 14779 @@@@@@@ +BW(Gbps)= 0.135, MPS= 90899 +@@@@@@@ QUOTE from port 15333 RTT in usec = 14745 @@@@@@@ +BW(Gbps)= 0.135, MPS= 90899 +@@@@@@@ QUOTE from port 15334 RTT in usec = 14738 @@@@@@@ +BW(Gbps)= 0.135, MPS= 
90899 +BW(Gbps)= 0.135, MPS= 90899 +@@@@@@@ QUOTE from port 15334 RTT in usec = 14736 @@@@@@@ +@@@@@@@ QUOTE from port 15333 RTT in usec = 14746 @@@@@@@ +BW(Gbps)= 0.135, MPS= 90898 + + + +Trader side printout: +===================== + +[odeds@hail19 Debug]$ VMA_RX_POLL=-1 LD_PRELOAD=/.autodirect/mtrswgwork/odeds/workspace/vma/6.1/last/src/vma/.libs/libvma.so ./trader -l 1.1.1.19 -ua 1.1.1.18 + VMA INFO : ------------------------------------------------- + VMA INFO : Version: 6.1.7.0 + VMA INFO : Current Time: Thu May 3 15:28:57 2012 + VMA INFO : Cmd Line: ./trader -l 1.1.1.19 -ua 1.1.1.18 + VMA INFO : Pid: 27087 + VMA INFO : OFED Version: OFED-VMA-1.5.3-0010: + VMA INFO : System: 2.6.32-71.el6.x86_64 + VMA INFO : Architecture: x86_64 + VMA INFO : Node: hail19 + VMA INFO : --------------------------------------------------------- + VMA INFO : Log Level 3 [VMA_TRACELEVEL] + VMA INFO : Log File [VMA_LOG_FILE] + VMA INFO : Rx Poll Loops -1 [VMA_RX_POLL] + VMA INFO : --------------------------------------------------------- + VMA INFO : *************************************************************** + VMA INFO : * This VMA license was granted to: Mellanox internal evaluation license. Not for external use! * + VMA INFO : * Successfully passed license validation, starting VMA. * + VMA INFO : *************************************************************** +Opening MC datagram socket 1 + VMA WARNING: *************************************************************** + VMA WARNING: * NO IMMEDIATE ACTION NEEDED! * + VMA WARNING: * Not enough hugepage resources for VMA memory allocation. * + VMA WARNING: * VMA will continue working with regular memory allocation. * + VMA INFO : * Optional: 1. Disable VMA's hugepage support (VMA_HUGETBL=0) * + VMA INFO : * 2. 
Restart process after increasing the number of * + VMA INFO : * hugepages resources in the system: * + VMA INFO : * "cat /proc/meminfo | grep -i HugePage" * + VMA INFO : * "echo 1000000000 > /proc/sys/kernel/shmmax" * + VMA INFO : * "echo 800 > /proc/sys/vm/nr_hugepages" * + VMA WARNING: * Read more about the Huge Pages in the VMA's User Manual * + VMA WARNING: *************************************************************** +Opening MC datagram socket num = 1....OK. +Setting SO_REUSEADDR on MC socket num = 1...OK. +Binding MC datagram socket num = 1...OK. +Adding multicast group for socket num = 1...OK. +Opening datagram uc socket fd=23....OK. +Binding datagram uc socket num 1....OK. +Opening MC datagram socket 2 +Opening MC datagram socket num = 2....OK. +Setting SO_REUSEADDR on MC socket num = 2...OK. +Binding MC datagram socket num = 2...OK. +Adding multicast group for socket num = 2...OK. +Opening datagram uc socket fd=27....OK. +Binding datagram uc socket num 2....OK. +MC Thread number 1 entered recv_loop +MC Thread number 2 entered recv_loop +#### Thread num = 1 - ORDER sent and received ####. RTT time = 398011 +#### Thread num = 2 - ORDER sent and received ####. RTT time = 398028 +#### Thread num = 2 - ORDER sent and received ####. RTT time = 597608 +#### Thread num = 1 - ORDER sent and received ####. RTT time = 597612 +#### Thread num = 1 - ORDER sent and received ####. RTT time = 14781 +#### Thread num = 2 - ORDER sent and received ####. RTT time = 14794 +#### Thread num = 1 - ORDER sent and received ####. RTT time = 14704 +#### Thread num = 2 - ORDER sent and received ####. RTT time = 14715 +#### Thread num = 1 - ORDER sent and received ####. RTT time = 199447 +#### Thread num = 2 - ORDER sent and received ####. RTT time = 199461 +#### Thread num = 1 - ORDER sent and received ####. RTT time = 399008 +#### Thread num = 2 - ORDER sent and received ####. RTT time = 399017 +#### Thread num = 1 - ORDER sent and received ####. 
RTT time = 14742 +#### Thread num = 2 - ORDER sent and received ####. RTT time = 799166 +#### Thread num = 2 - ORDER sent and received ####. RTT time = 200353 +#### Thread num = 1 - ORDER sent and received ####. RTT time = 200359 + + + + +Suggestions for further testing with this test app: +=================================================== + +1. TCP connection (instead of the UC UDP) +2. Add a mutex lock on the UC socket on the trader side + + +Open issues found with this tool: +================================= + +1. with the default SO_RCVTIMEO value - 20usec - there is a 15000usec overhead to the time measurement. + If the value is larger (i.e. 20000usec) it doesn't happen. + It doesn't happen with OS. +2. There are spikes of upto 1 sec, only on the trader UC socket RTT. + With the OS such spikes also happen but less frequent. diff --git a/tests/multithread_test/trader.cpp b/tests/multithread_test/trader.cpp new file mode 100755 index 0000000..b762c6e --- /dev/null +++ b/tests/multithread_test/trader.cpp @@ -0,0 +1,377 @@ +/* +** Build command: g++ -lpthread trader.cpp -o trader +*/ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +using namespace std; + +#define NUM_PAIR_OF_THREADS 2 +#define IF_ADDRESS "1.1.1.19" +#define UC_SERVER_ADDRESS "1.1.1.18" +#define MC_ADDRESS "224.0.1.2" +#define UC_SERVER_PORT 15222 +#define MC_LOCAL_PORT 15111 +#define UC_LOCAL_PORT 15333 +#define MC_BUFFLEN 200 +#define UC_BUFFLEN 12 +#define MIN_UC_BUFFLEN 10 +#define RECV_PACKETS_AMOUNT 300000 +#define MAX_PARAM_LENGTH 20 +#define MAX_THREADS_PAIRS 50 +#define KEEP_ALIVE_INTERVAL 20 + + +char if_address[MAX_PARAM_LENGTH] = "NO IF ADDRESS!!!"; +char uc_server_address[MAX_PARAM_LENGTH] = "NO UC SERV ADDRESS!"; +int num_pair_of_threads = NUM_PAIR_OF_THREADS; +uint64_t recv_packets_amount = RECV_PACKETS_AMOUNT; +char mc_address[MAX_PARAM_LENGTH] = MC_ADDRESS; +uint16_t mc_local_port = 
MC_LOCAL_PORT; +uint16_t uc_server_port = UC_SERVER_PORT; +uint16_t uc_local_port = UC_LOCAL_PORT; +int mc_bufflen = MC_BUFFLEN; +int uc_bufflen = UC_BUFFLEN; +int keep_alive_interval = KEEP_ALIVE_INTERVAL; + + +struct ThreadsPair +{ + int mc_fd; + int uc_fd; +}; + +ThreadsPair fd_list[MAX_THREADS_PAIRS]; +struct timeval tv_order_start, tv_order_end; +pthread_spinlock_t uc_spinlock_arr[MAX_THREADS_PAIRS]; + +void usage(void) +{ + printf("Usage:\n"); + printf("\t-l\t\n"); + printf("\t-ua\t\n"); + printf("\t[-nt]\t\n"); + printf("\t[-n]\t\n"); + printf("\t[-m]\t\n"); + printf("\t[-pm]\t\n"); + printf("\t[-up]\t\n"); + printf("\t[-lp]\t\n"); + printf("\t[-sm]\t\n"); + printf("\t[-su]\t\n"); + printf("\t[-ka]\t\n"); +} + + +int prepare_mc_socket(int sock_num) +{ + struct sockaddr_in localSock; + struct ip_mreq group; + int fd; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if(fd < 0) + { + printf("Opening MC datagram socket num = %d error", sock_num); + exit(1); + } + else + { + printf("Opening MC datagram socket num = %d....OK.\n", sock_num); + } + + /* Enable SO_REUSEADDR to allow multiple instances of this */ + /* application to receive copies of the multicast datagrams. */ + int reuse = 1; + if(setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *)&reuse, sizeof(reuse)) < 0) + { + printf("Setting SO_REUSEADDR for MC datagram socket num = %d error!!!", sock_num); + close(fd); + exit(1); + } + else + printf("Setting SO_REUSEADDR on MC socket num = %d...OK.\n", sock_num); + + /* Bind to the proper port number with the IP address */ + /* specified as INADDR_ANY. 
*/ + memset((char *)&localSock, 0, sizeof(localSock)); + localSock.sin_family = AF_INET; + localSock.sin_addr.s_addr = INADDR_ANY; + localSock.sin_port = htons(mc_local_port); + if(bind(fd, (struct sockaddr*)&localSock, sizeof(struct sockaddr))) + { + printf("Binding MC datagram socket num = %d error", sock_num); + close(fd); + exit(1); + } + else + { + printf("Binding MC datagram socket num = %d...OK.\n", sock_num); + } + + /* Join the multicast group on the local 1.1.1.19 */ + /* interface. Note that this IP_ADD_MEMBERSHIP option must be */ + /* called for each local interface over which the multicast */ + /* datagrams are to be received. */ + group.imr_multiaddr.s_addr = inet_addr(mc_address); + group.imr_interface.s_addr = inet_addr(if_address); + if(setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, (char *)&group, sizeof(group)) < 0) + { + printf("Adding multicast group for socket num = %d error", sock_num); + close(fd); + exit(1); + } + else + { + printf("Adding multicast group for socket num = %d...OK.\n", sock_num); + } + + return fd; +} + + +int prepare_uc_socket(int sock_num) +{ + struct sockaddr_in localSock; + int fd; + + fd = socket(AF_INET, SOCK_DGRAM, 0); + if(fd < 0) + { + perror("Opening datagram socket error"); + exit(1); + } + printf("Opening datagram uc socket fd=%d....OK.\n", fd); + memset((char *) &localSock, 0, sizeof(localSock)); + localSock.sin_family = AF_INET; + localSock.sin_addr.s_addr = inet_addr(if_address); + localSock.sin_port = htons(uc_local_port+sock_num-1); + + struct timeval tv_keep_alive_interval; + tv_keep_alive_interval.tv_sec = 0; + tv_keep_alive_interval.tv_usec = keep_alive_interval; + setsockopt(fd, SOL_SOCKET, SO_RCVTIMEO, (struct timeval *)&tv_keep_alive_interval, sizeof(struct timeval)); + + if(bind(fd, (struct sockaddr*)&localSock, sizeof(struct sockaddr))) + { + perror("Binding datagram uc socket error"); + close(fd); + exit(1); + } + printf("Binding datagram uc socket num %d....OK.\n", sock_num); + + return fd; +} + + 
+void * uc_func(void * num) +{ + struct sockaddr_in servaddr; + char buf[uc_bufflen], ka[] = "KA"; + int ret; + int thread_num = (long int)num; + + uint64_t delta_usec; + + memset(&servaddr, 0, sizeof(servaddr)); + servaddr.sin_family = AF_INET; + servaddr.sin_port = htons(uc_server_port); + if (inet_aton(uc_server_address, &(servaddr.sin_addr)) <= 0) + { + printf("ERROR: Invalid IP address.\n"); + exit(1); + } + int serveraddrlen = sizeof(servaddr); + +/* + printf("Connecting uc socket..\n"); + if(connect(fd_list[UC_SOCKET], (struct sockaddr *) &servaddr, sizeof(struct sockaddr))) + { + printf("error connecting uc socket"); + close(fd_list[UC_SOCKET]); + exit(1); + } +*/ + + while(1) + { + /* Timeout on recvfrom using setsockopt */ + ret = recvfrom(fd_list[thread_num].uc_fd, buf, uc_bufflen, 0, (struct sockaddr *) &servaddr, (socklen_t *)&serveraddrlen); + if (ret < 0) + { + if (errno == EAGAIN){ // meaning that Timeout occured + //printf("Debug: Keep alive timeout occured, sending KA packet\n"); +//DDD pthread_spin_lock(&uc_spinlock_arr[thread_num]); + ret = sendto(fd_list[thread_num].uc_fd, ka, sizeof(ka), 0, (struct sockaddr *) &servaddr, sizeof(struct sockaddr)); +//DDD pthread_spin_unlock(&uc_spinlock_arr[thread_num]); + if (ret < 0) + { + printf("ERROR on SEND errno = %s\n", strerror(errno)); + printf("errno value = %d\n", errno); + } + } else { + printf("ERROR on SEND errno = %s\n", strerror(errno)); + printf("errno value = %d\n", errno); + } + } else { // packet received + if (strcmp(buf, "ORD_ACK") == 0) { + gettimeofday(&tv_order_end, NULL); + delta_usec = ((tv_order_end.tv_sec - tv_order_start.tv_sec) * 1000000) + (tv_order_end.tv_usec - tv_order_start.tv_usec); + printf("#### Thread num = %d - ORDER sent and received ####. RTT time = %llu\n", thread_num+1, (long long unsigned int)delta_usec); + } else if (strcmp(buf, "KA_ACK") == 0) { + //printf("DEBUG: *** Keep Alive sent and received ***\n"); + } else { + printf("Internal error! 
UC packet received, not ORD_ACK or KA_ACK\n"); + } + } + } + + close(fd_list[thread_num].uc_fd); + printf("closed UC socket\n"); + return 0; +} + + +void * recv_loop(void * num) +{ + int ret; + int thread_num = (long int)num; + char buf[mc_bufflen], order[] = "ORD"; + struct sockaddr_in servaddr; + uint64_t rx_pkt_count, delta_usec; + + printf("MC Thread number %d entered recv_loop\n", thread_num+1); + + memset(&servaddr, 0, sizeof(servaddr)); + servaddr.sin_family = AF_INET; + servaddr.sin_port = htons(uc_server_port); + if (inet_aton(uc_server_address, &(servaddr.sin_addr)) <= 0) + { + printf("ERROR: Invalid IP address.\n"); + exit(1); + } + + rx_pkt_count=0; + struct timeval tv_start, tv_end; + gettimeofday(&tv_start, NULL); + + while (true) + { + ret = recv(fd_list[thread_num].mc_fd, buf, mc_bufflen, 0); + if(ret == -1) + { + printf("ERROR in recv! errno = %s\n", strerror(errno)); + } + rx_pkt_count++; + if (rx_pkt_count > recv_packets_amount) + { + gettimeofday(&tv_end, NULL); + delta_usec = ((tv_end.tv_sec - tv_start.tv_sec) * 1000000) + (tv_end.tv_usec - tv_start.tv_usec); +// printf("MC thread num %d received %llu packets in usec = %llu\n", thread_num+1, (long long unsigned int)recv_packets_amount, (long long unsigned int)delta_usec); + rx_pkt_count=0; + gettimeofday(&tv_start, NULL); + } + if (strcmp(buf, "QUOTE") == 0) { +// printf("MC thread number %d got QUOTE, sending order via UC thread... 
\n", thread_num+1); + gettimeofday(&tv_order_start, NULL); +//DDD pthread_spin_lock(&uc_spinlock_arr[thread_num]); + ret = sendto(fd_list[thread_num].uc_fd, order, sizeof(order), 0, (struct sockaddr *) &servaddr, sizeof(struct sockaddr)); +//DDD pthread_spin_unlock(&uc_spinlock_arr[thread_num]); + if (ret < 0) + { + printf("ERROR on SEND errno = %s\n", strerror(errno)); + printf("errno value = %d\n", errno); + } + } + } + + return 0; +} + + +int main(int argc, char *argv[]) +{ + int i; + + for (i=1; i +#include +#include +#include +#include +#include +#include +#include +#include + +#define BUFSIZE 258 + + +/** + * This is a simple test, designed to measure UDP multicast send/receive rate. + * Can be used in sender mode or receiver mode. + * + */ +int main(int argc, char** argv) +{ + int sock, status; + socklen_t socklen; + char buffer[BUFSIZE]; + struct sockaddr_in saddr; + int count, realcount, i; + struct timeval tv_before, tv_after; + double sec; + + if (argc < 3) { + fprintf(stderr, "Usage: pps_test [ srv ]\n"); + exit(1); + } + + // set content of struct saddr and imreq to zero + memset(&saddr, 0, sizeof(struct sockaddr_in)); + + // open a UDP socket + sock = socket(PF_INET,SOCK_DGRAM, 0); + if (sock < 0) { + perror("Error creating socket"); + exit(0); + } + + // set destination multicast address + saddr.sin_family = AF_INET; + saddr.sin_port = htons(11111); + inet_pton(AF_INET, argv[1], &saddr.sin_addr); + + status = bind(sock, (struct sockaddr *) &saddr, + sizeof(struct sockaddr_in)); + + count = atoi(argv[2]); + realcount = 0; + + socklen = sizeof(struct sockaddr_in); + + if (status < 0) + perror("Error binding socket to interface"), exit(0); + + if (argc <= 3) { + struct in_addr iaddr; + + memset(&iaddr, 0, sizeof(struct in_addr)); + iaddr.s_addr = INADDR_ANY; // use DEFAULT interface + + // Set the outgoing interface to DEFAULT + setsockopt(sock, IPPROTO_IP, IP_MULTICAST_IF, &iaddr, + sizeof(struct in_addr)); + + + // warm-up + for (i = 0; i < 5; ++i) { 
+ status = sendto(sock, buffer, BUFSIZE, 0, + (struct sockaddr *) &saddr, socklen); + } + + gettimeofday(&tv_before, NULL); + for (i = 0; i < count; ++i) { + status = sendto(sock, buffer, BUFSIZE, 0, + (struct sockaddr *) &saddr, socklen); + if (status > 0) + ++realcount; + } + gettimeofday(&tv_after, NULL); + } + else { + struct ip_mreq imreq; + + imreq.imr_multiaddr.s_addr = inet_addr(argv[1]); + imreq.imr_interface.s_addr = INADDR_ANY; // use DEFAULT interface + + // JOIN multicast group on default interface + status = setsockopt(sock, IPPROTO_IP, IP_ADD_MEMBERSHIP, + (const void *)&imreq, sizeof(struct ip_mreq)); + + // first packet + status = recvfrom(sock, buffer, BUFSIZE, 0, + (struct sockaddr *) &saddr, &socklen); + + // receive packet from socket + gettimeofday(&tv_before, NULL); + for (i = 0; i < count; ++i) { + status = recvfrom(sock, buffer, BUFSIZE, 0, + (struct sockaddr *) &saddr, &socklen); + if (status > 0) + ++realcount; + } + gettimeofday(&tv_after, NULL); + } + + sec = (tv_after.tv_sec - tv_before.tv_sec) + + (tv_after.tv_usec - tv_before.tv_usec) / 1000000.0; + + printf("%d packets in %.3f seconds. PPS=%.2f\n", realcount, sec, realcount / sec); + + // close socket + close(sock); + + return 0; +} diff --git a/tests/resource_release_checker/server_socket_receive_and_recreate_loop.py b/tests/resource_release_checker/server_socket_receive_and_recreate_loop.py new file mode 100755 index 0000000..6a97317 --- /dev/null +++ b/tests/resource_release_checker/server_socket_receive_and_recreate_loop.py @@ -0,0 +1,129 @@ +#!/usr/bin/env python +# +# +#@copyright: +# Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. 
You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. 
+# +#@author: Alex Rosenbaum + +#@date: 20160520 +# +# +import socket, select, os, time, sys, fcntl, errno + +EPOLL_TIMEOUT=1 # infinity + +def echo_server(argv): + if (len(argv) <4): + print "Incorrect parameter : " + argv[0] + " server-ip server-port-lower num-socket packet-count-to-restart" + sys.exit(-1) + + # read configuration + IP = argv[1] + PORT = int(argv[2]) + SKT_COUNT=100 + PKT_TO_RESTART_COUNT = 100000 + if (len(argv) > 3): + SKT_COUNT = int(argv[3]) + if (len(argv) > 4): + PKT_TO_RESTART_COUNT = int(argv[4]) + + loops = 10 + while loops > 0: + + # init structures + sock = None + sock_fd = 0 + streams = {} + epoll = select.epoll() + + # create socket and add to epoll() + counter = 0 + while counter < SKT_COUNT: + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + print IP, int(PORT + counter) + sock.bind((IP, int(PORT + counter))) + fd = sock.fileno() + epoll.register(fd, select.EPOLLIN) + streams[fd] = sock + counter += 1 + + # block on epoll until received expected packets + counter = 0 + print "expected to process ", PKT_TO_RESTART_COUNT, " ingress packet before leaving loop..." + while counter < PKT_TO_RESTART_COUNT: + # print "calling epoll ..." + eevents = epoll.poll(EPOLL_TIMEOUT) + if len(eevents) == 0: + # print "wakeup from epoll (timeout)" + continue # epoll timeout + else: + # check epoll ready events + # print "wakeup from epoll (rc=", eevents, ")" + for fd, evt in eevents: + if evt & select.EPOLLIN: # error on socket close it and restart from begining + sock = streams[fd] + data = sock.recv(1500) + counter += 1 + # print "Rx counter=", counter + continue + print "done epoll Rx of ", counter, " packets" + + print "... 4s sleep before continueing..." + time.sleep (4) + + # close before restart session + print "starting disconnect..." + for fd in streams: + sock = streams[fd] + sock.close() + print "closed sockets .. 4s sleep before continueing..." + time.sleep (4) + epoll.close() + print "closed epoll .. 
4s sleep before continueing..." + time.sleep (4) + + print "Done...(loop=", loops, ")" + loops -= 1 + + print "... 1s sleep before continueing..." + time.sleep (1) + + continue + + print "... big sleep before exit..." + time.sleep (100) + +def main(): + echo_server(sys.argv) + +if __name__ == "__main__": + main() diff --git a/tests/resource_release_checker/server_socket_recreate_loop.py b/tests/resource_release_checker/server_socket_recreate_loop.py new file mode 100755 index 0000000..7a382fe --- /dev/null +++ b/tests/resource_release_checker/server_socket_recreate_loop.py @@ -0,0 +1,67 @@ +#!/usr/bin/env python +# +# +#@copyright: +# Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. +# +#@author: Alex Rosenbaum + +#@date: 20160520 +# +# +import socket, select, os, time, sys, fcntl, errno + +def main(): + + argv = sys.argv + if (len(argv) < 2): + print "Incorrect parameter : " + argv[0] + " server-ip server-port-lower" + sys.exit(-1) + + # read configuration + IP = argv[1] + PORT = int(argv[2]) + + loops = 4 + while loops > 0: + print "socket create..." + sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + sock.bind((IP, int(PORT))) + print ".. created ... sleep before continueing..." + time.sleep (4) + print "socket closing ..." + sock.close() + print ".. closed ... sleep before continueing..." + time.sleep (4) + loops -= 1 + +if __name__ == "__main__": + main() diff --git a/tests/reuse_ud_test.c b/tests/reuse_ud_test.c new file mode 100644 index 0000000..85ee0fd --- /dev/null +++ b/tests/reuse_ud_test.c @@ -0,0 +1,221 @@ +#include +#include +#include +#include +#include +#include +#include +#include /* superset of previous */ +#include +#include +#include +#include +#include +#include + +#define NOT_IN_USE(a) { if (a) {}; } + +void usage() +{ + printf( +"Usage: reuse_ud_test [option]
\n" +"\t-v\t\tShow test desription\n" +"\t-f\t\tUse fork() instead of threads\n" +"\t-h\t\tThis message\n" +); + exit(1); +} + +void describe() +{ + printf( +"Socket reuse test:\n" +" - create datagram socket\n" +" - receive msg on it\n" +" - close socket\n" +" - repeat\n" +); + exit(1); +} + +#define BIND_PORT 4242 + +#define MSG_HELLO 0xcafebabe +#define READ_TIMEOUT 30 + +struct tmsg { + int m; +} __attribute__ ((packed)); + +struct sockaddr_in rem_addr; +int status = 0; +int use_fork = 0; + +void *client_main(void *arg) +{ + int s; + struct tmsg msg; + //struct sockaddr_in addr; + int ret; + + NOT_IN_USE(arg); + + s = socket(PF_INET, SOCK_DGRAM, 0); + assert(s >= 0); + + if (connect(s, (struct sockaddr *)&rem_addr, sizeof(rem_addr))) { + printf("connect failed: %m\n"); + goto out; + } + + msg.m = MSG_HELLO; + + ret = write(s, &msg, sizeof(msg)); + if (ret != sizeof(msg)) { + printf("write failed: %m, len=%d\n", ret); + goto out; + } + + close(s); + return 0; +out: + close(s); + status++; + return 0; +} + + +void *srv_main(void *arg) +{ + int s, ret; + struct sockaddr_in addr; + struct tmsg msg; + int val = 1; + int n; + fd_set readfds; + struct timeval to; + + NOT_IN_USE(arg); + + s = socket(PF_INET, SOCK_DGRAM, 0); + assert(s >= 0); + + + addr.sin_family = AF_INET; + addr.sin_port = htons(BIND_PORT); + addr.sin_addr.s_addr = INADDR_ANY; + + if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val))) { + printf("setsockopt failed: %m\n"); + goto out; + } + + if (bind(s, (struct sockaddr *)&addr, sizeof(addr))) { + printf("bind failed: %m\n"); + goto out; + } + + FD_ZERO(&readfds); + to.tv_sec = READ_TIMEOUT; + to.tv_usec = 0; + FD_SET(s, &readfds); + + n = select(s+1, &readfds, 0, 0, &to); + if (n <= 0) { + printf("select errno or timeout\n"); + goto out; + } + + if (!FD_ISSET(s, &readfds)) { + printf("FD is not ready\n"); + goto out; + } + + ret = read(s, &msg, sizeof(msg)); + if (ret < 0) { + printf("read failed: %m\n"); + goto out; + } + + if (ret != 
sizeof(msg)) { + printf("read %d, expected %d\n", ret, (int)sizeof(msg)); + goto out; + } + + if (msg.m != (int)MSG_HELLO) { + printf("Bad message 0x%x\n", msg.m); + goto out; + } + + close(s); + return 0; +out: + status++; + close(s); + return 0; +} + + +int main(int argc, char **argv) +{ + int op; + pthread_t cl_th, srv_th; + + while ((op = getopt(argc, argv, "hvf")) != -1) { + switch (op) { + case 'f': + use_fork = 1; + break; + case 'v': + describe(); + break; + case 'h': + default: + usage(); + } + + } + + if (optind >= argc) + usage(); + + printf("will connect to address: %s\n", argv[optind]); + rem_addr.sin_family = AF_INET; + rem_addr.sin_port = htons(BIND_PORT); + if (!inet_aton(argv[optind], &rem_addr.sin_addr)) { + printf("address is invalid!!!\n"); + return 1; + } + + if (!use_fork) { + pthread_create(&srv_th, 0, srv_main, 0); + sleep(1); + pthread_create(&cl_th, 0, client_main, 0); + + pthread_join(cl_th, 0); + pthread_join(srv_th, 0); + } + else { + pid_t cl_pid, srv_pid; + int stat; + + srv_pid = fork(); + if(srv_pid == 0) { + srv_main(0); + exit(status); + } + sleep(1); + cl_pid = fork(); + if(cl_pid == 0) { + client_main(0); + exit(status); + } + waitpid(cl_pid, &stat, 0); + status += WEXITSTATUS(stat); + waitpid(srv_pid, &stat, 0); + status += WEXITSTATUS(stat); + } + + printf("exit status: %d\n", status); + return status; +} diff --git a/tests/select_t1.c b/tests/select_t1.c new file mode 100644 index 0000000..a63f3e0 --- /dev/null +++ b/tests/select_t1.c @@ -0,0 +1,125 @@ +#include +#include +#include +#include +#include +#include +#include +#include /* superset of previous */ +#include +#include +#include +#include +#include +#include +#include + +void usage() +{ + printf( +"Usage: select_t1 [option]
\n" +"\t-v\t\tShow test desription\n" +"\t-h\t\tThis message\n" +); + exit(1); +} + +void describe() +{ + printf( +"Select timeout regression:\n" +" - read select on the socket and wait 10seconds. Verify that it is indeed 10seconds\n" +); + exit(1); +} + +#define BIND_PORT 4242 + +#define READ_TIMEOUT 10 + + +int t1() +{ + int s; + struct sockaddr_in addr; + int val = 1; + int n; + fd_set readfds; + struct timeval to, st,dt,et; + + s = socket(PF_INET, SOCK_DGRAM, 0); + assert(s >= 0); + + + addr.sin_family = AF_INET; + addr.sin_port = htons(BIND_PORT); + addr.sin_addr.s_addr = INADDR_ANY; + + if (setsockopt(s, SOL_SOCKET, SO_REUSEADDR, &val, sizeof(val))) { + printf("setsockopt failed: %m\n"); + goto out; + } + + if (bind(s, (struct sockaddr *)&addr, sizeof(addr))) { + printf("bind failed: %m\n"); + goto out; + } + + FD_ZERO(&readfds); + to.tv_sec = READ_TIMEOUT; + to.tv_usec = 0; + FD_SET(s, &readfds); + + gettimeofday(&st, 0); + n = select(s+1, &readfds, 0, 0, &to); + if (n < 0) { + printf("select errno: %m\n"); + goto out; + } + gettimeofday(&et, 0); + timersub(&et, &st, &dt); + if (abs(dt.tv_sec - READ_TIMEOUT) > 1) { + printf("select does not honor timeout: delta: %d\n", + abs(dt.tv_sec - READ_TIMEOUT)); + goto out; + } + else { + printf("select timeout OK\n"); + } + + close(s); + return 0; +out: + close(s); + return 1; +} + +void oops() +{ + printf("Test did not complete in expected time\n"); + exit(1); +} + +int main(int argc, char **argv) +{ + int op; + int status; + + while ((op = getopt(argc, argv, "hv")) != -1) { + switch (op) { + case 'v': + describe(); + break; + case 'h': + default: + usage(); + } + + } + signal(SIGALRM, oops); + alarm(2*READ_TIMEOUT); + status = t1(); + + printf("exit status: %d\n", status); + return status; +} diff --git a/tests/server_test/Makefile.am b/tests/server_test/Makefile.am new file mode 100644 index 0000000..b026e0b --- /dev/null +++ b/tests/server_test/Makefile.am @@ -0,0 +1,11 @@ +noinst_PROGRAMS = server_perf + 
+AM_CPPFLAGS := \ + -I$(top_builddir)/. \ + -I$(top_builddir)/include + +server_perf_LDADD = -lboost_thread +server_perf_SOURCES = server.cc client.cc main.cc options.cc vtime.cc +server_perf_CXXFLAGS = -g + + diff --git a/tests/server_test/client.cc b/tests/server_test/client.cc new file mode 100644 index 0000000..8353c39 --- /dev/null +++ b/tests/server_test/client.cc @@ -0,0 +1,206 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "client.h" +#include "options.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + + +client::client(const options& opts) { + for (unsigned id = 0; id < opts.num_threads(); ++id) { + m_connections.push_back(boost::make_shared(id, + opts.server(), + opts.port(), + opts.packet_rate())); + } +} + +void client::run() { + boost::thread_group tg; + + BOOST_FOREACH(const connection_ptr& conn, m_connections) { + tg.create_thread(boost::ref(*conn.get())); + } + + tg.join_all(); +} + +client::connection::connection(unsigned id, const std::string& server, + unsigned port, size_t packet_rate) : + m_id(id), + m_packet_rate(packet_rate), + m_psn(0) +{ + struct hostent *he = gethostbyname(server.c_str()); + if (!he) { + throw std::runtime_error(std::string("failed to resolve ") + server); + } + + m_dest_addr.sin_family = he->h_addrtype; + m_dest_addr.sin_port = htons(port); + if (he->h_length != sizeof(m_dest_addr.sin_addr)) { + throw std::runtime_error("invalid address length"); + } + + memcpy(&m_dest_addr.sin_addr, he->h_addr_list[0], he->h_length); + memset(m_dest_addr.sin_zero, 0, sizeof(m_dest_addr.sin_zero)); + m_sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (m_sockfd < 0) { + throw std::runtime_error("failed to create socket"); + } + + m_epfd = epoll_create(1); + if (m_epfd < 0) { + throw std::runtime_error("failed to create epfd"); + } + + // Add the socket to the main epoll set + struct epoll_event evt; + evt.events = EPOLLIN|EPOLLOUT; + evt.data.fd = m_sockfd; + int ret = epoll_ctl(m_epfd, EPOLL_CTL_ADD, m_sockfd, &evt); + if (ret < 0) { + throw std::runtime_error("failed to add socket fd to epoll set"); + } +} + +client::connection::~connection() { + close(m_epfd); + close(m_sockfd); +} + +std::string client::connection::destination() const { + char buf[256] = {0}; + inet_ntop(m_dest_addr.sin_family, &m_dest_addr.sin_addr, buf, sizeof(buf) - 1); + return (boost::format("%s:%d") 
% buf % ntohs(m_dest_addr.sin_port)).str(); +} + +void client::connection::operator()() { + const size_t maxevents = 2; + struct epoll_event events[maxevents]; + unsigned next_worker = 0; + + m_start_time = vtime::current(); + m_recv_count = 0; + m_send_count = 0; + m_rtt_sum = 0; + + size_t sent_prev = 0; + size_t recv_prev = 0; + vtime::time_t time_prev = m_start_time; + vtime::time_t prev_rtt_sum = 0; + vtime::time_t packet_interval = vtime::time_from_sec(1.0 / m_packet_rate); + vtime::time_t last_send_time = m_start_time; + size_t print_rate = std::min(200000ul, m_packet_rate); + + std::cout << "connection " << m_id << ": sending to " << destination() << std::endl; + do { + int nevents = epoll_wait(m_epfd, events, maxevents, -1); + if (nevents < 0) { + throw std::runtime_error("epoll_wait failed"); + } + + vtime:time_t current_time = vtime::current(); + + for (int i = 0; i < nevents; ++i) { + if (events[i].data.fd == m_sockfd) { + if (events[i].events & EPOLLIN) { + int nrecvd = recvfrom(m_sockfd, &m_recvbuf, sizeof(m_recvbuf), + 0, NULL, NULL); + if (nrecvd != sizeof(m_sendbuf)) { + throw std::runtime_error("recvfrom failed"); + } + + m_rtt_sum += (current_time - m_recvbuf.send_time); + ++m_recv_count; + } + + if (events[i].events & EPOLLOUT) { + if (current_time >= last_send_time + packet_interval) { + // TODO maintain packet rate + m_sendbuf.psn = m_psn++; + m_sendbuf.send_time = current_time; + int nsent = sendto(m_sockfd, &m_sendbuf, sizeof(m_sendbuf), 0, + reinterpret_cast(&m_dest_addr), + sizeof(m_dest_addr)); + if (nsent != sizeof(m_sendbuf)) { + throw std::runtime_error("sendto failed"); + } + + ++m_send_count; + /*last_send_time += packet_interval;*/ + last_send_time = + (current_time + last_send_time + packet_interval) / 2; + } + } + } + } + + if (m_send_count - sent_prev >= print_rate) { + double rtt = (m_recv_count) > 0 ? 
+ vtime::time_to_sec( + (m_rtt_sum - prev_rtt_sum) * 1000000.0 / + (m_recv_count - recv_prev)) : + 0; + + double packet_rate = (m_send_count - sent_prev) / + vtime::time_to_sec(current_time - time_prev); + + double recv_ratio = (m_recv_count - recv_prev) / + static_cast(m_send_count - sent_prev); + + printf("sent: %Zu rate: %7.2f recvd: %Zu (%5.2f%%) rtt: %5.2f\n", + m_send_count, packet_rate, m_recv_count, recv_ratio * 100.0, + rtt); + + sent_prev = m_send_count; + recv_prev = m_recv_count; + time_prev = current_time; + prev_rtt_sum = m_rtt_sum; + } + + } while (1); +} + diff --git a/tests/server_test/client.h b/tests/server_test/client.h new file mode 100644 index 0000000..a2c55fe --- /dev/null +++ b/tests/server_test/client.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef _CLIENT_H_ +#define _CLIENT_H_ + +#include "vtime.h" + +#include +#include +#include +#include +#include + +class options; + +class client { +public: + client(const options& opts); + + void run(); +private: + + struct request { + uint32_t id; + uint64_t send_time; + }; + + + class connection : private boost::noncopyable { + public: + connection(unsigned id, const std::string& server, unsigned port, + size_t packet_rate); + ~connection(); + + void operator()(); + + std::string destination() const; + + private: + struct packet { + uint64_t psn; + uint64_t send_time; + }; + + const unsigned m_id; + size_t m_packet_rate; + size_t m_head, m_tail; + struct sockaddr_in m_dest_addr; + int m_sockfd; + int m_epfd; + packet m_sendbuf; + packet m_recvbuf; + uint64_t m_psn; + size_t m_recv_count; + size_t m_send_count; + vtime::time_t m_start_time; + vtime::time_t m_rtt_sum; + }; + + typedef boost::shared_ptr connection_ptr; + + std::vector m_connections; +}; + +#endif diff --git a/tests/server_test/main.cc b/tests/server_test/main.cc new file mode 100644 index 0000000..294f329 --- /dev/null +++ b/tests/server_test/main.cc @@ -0,0 +1,51 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "client.h" +#include "server.h" +#include "options.h" + + +int main(int argc, char **argv) +{ + options opts(argc, argv); + + if (opts.is_server()) { + server s(opts); + s.run(); + } else { + client c(opts); + c.run(); + } + return 0; +} diff --git a/tests/server_test/options.cc b/tests/server_test/options.cc new file mode 100644 index 0000000..75f21cb --- /dev/null +++ b/tests/server_test/options.cc @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include "options.h" + +#include +#include + + +options::options (int argc, char **argv) : + m_port(13737), + m_packet_rate(0), + m_num_threads(1), + m_window(65536) +{ + char c; + while ((c = getopt (argc, argv, "t:p:r:w:")) != -1) { + switch (c) { + case 't': + m_num_threads = atoi(optarg); + break; + case 'p': + m_port = atoi(optarg); + break; + case 'r': + m_packet_rate = atol(optarg); + break; + case 'w': + m_window = atol(optarg); + break; + } + } + + if (optind < argc) { + m_server = argv[optind]; + } +} + +const std::string options::server() const { + return m_server; +} + +unsigned options::port() const { + return m_port; +} + +unsigned options::packet_rate() const { + return m_packet_rate; +} + +unsigned options::num_threads() const { + return m_num_threads; +} + +size_t options::window() const { + return m_window; +} + +bool options::is_server() const { + return m_server.empty(); +} diff --git a/tests/server_test/options.h b/tests/server_test/options.h new file mode 100644 index 0000000..1bc3b0f --- /dev/null +++ b/tests/server_test/options.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef _OPTIONS_H_ +#define _OPTIONS_H_ + +#include + +class options { +public: + options (int argc, char **argv); + + const std::string server() const; + + unsigned port() const; + + unsigned packet_rate() const; + + unsigned num_threads() const; + + size_t window() const; + + bool is_server() const; + +private: + std::string m_server; + unsigned m_port; + unsigned m_packet_rate; + unsigned m_num_threads; + size_t m_window; /* NUmber of requests to remember */ +}; + +#endif diff --git a/tests/server_test/server.cc b/tests/server_test/server.cc new file mode 100644 index 0000000..5fc8a01 --- /dev/null +++ b/tests/server_test/server.cc @@ -0,0 +1,233 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "server.h" +#include "options.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +server::server(const options& opts) { + + struct sockaddr_in bind_addr; + int ret; + + // Create the UDP socket + m_udp_sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (m_udp_sockfd < 0) { + throw std::runtime_error("failed to create socket"); + } + + // Create the UDP socket + m_tcp_sockfd = socket(AF_INET, SOCK_STREAM, 0); + if (m_udp_sockfd < 0) { + throw std::runtime_error("failed to create socket"); + } + + // Bind the socket to the given port + bind_addr.sin_family = AF_INET; + bind_addr.sin_port = htons(opts.port()); + bind_addr.sin_addr.s_addr = INADDR_ANY; + ret = bind(m_udp_sockfd, reinterpret_cast(&bind_addr), sizeof(bind_addr)); + if (ret < 0) { + throw std::runtime_error("failed to bind the UDP socket"); + } + + bind_addr.sin_family = AF_INET; + bind_addr.sin_port = htons(opts.port()); + bind_addr.sin_addr.s_addr = INADDR_ANY; + ret = bind(m_tcp_sockfd, reinterpret_cast(&bind_addr), sizeof(bind_addr)); + if (ret < 0) { + throw std::runtime_error("failed to bind the TCP socket"); + } + + ret = listen(m_tcp_sockfd, 100); + if (ret < 0) { + throw std::runtime_error("failed to 
listen"); + } + + ret = fcntl(m_udp_sockfd, F_SETFL, fcntl(m_udp_sockfd, F_GETFL) | O_NONBLOCK); + if (ret < 0) { + throw std::runtime_error("failed to make UDP socket nonblocking"); + } + + ret = fcntl(m_tcp_sockfd, F_SETFL, fcntl(m_tcp_sockfd, F_GETFL) | O_NONBLOCK); + if (ret < 0) { + throw std::runtime_error("failed to make TCP socket nonblocking"); + } + + // Create main epoll set + m_epfd = epoll_create(2); + if (m_epfd < 0) { + throw std::runtime_error("failed to create epfd"); + } + + // Add the socket to the main epoll set + struct epoll_event evt; + evt.events = EPOLLIN; + evt.data.fd = m_tcp_sockfd; + ret = epoll_ctl(m_epfd, EPOLL_CTL_ADD, m_tcp_sockfd, &evt); + if (ret < 0) { + throw std::runtime_error("failed to add socket fd to epoll set"); + } + + // Create the workers + for (unsigned i = 0; i < opts.num_threads(); ++i) { + m_workers.push_back(boost::make_shared(i, m_udp_sockfd)); + } +} + +server::~server() { + close(m_epfd); + close(m_udp_sockfd); +} + +void server::run() { + boost::thread_group tg; + + BOOST_FOREACH(const worker_ptr& worker, m_workers) { + tg.create_thread(boost::ref(*worker.get())); + } + + do { + const size_t maxevents = 2; + struct epoll_event events[maxevents]; + + epoll_wait(m_epfd, events, maxevents, 1000); + } while(1); + + tg.join_all(); +} + +server::worker::worker(int id, int sockfd) : + m_id(id), m_recv_sockfd(sockfd), m_buffer(1024) +{ + int ret; + + m_reply_sockfd = socket(AF_INET, SOCK_DGRAM, 0); + if (m_reply_sockfd < 0) { + throw std::runtime_error("failed to create socket"); + } + + // Create worker epoll set + m_epfd = epoll_create(1); + if (m_epfd < 0) { + throw std::runtime_error("failed to create epfd"); + } + + // Add the socket to the main epoll set + struct epoll_event evt; + evt.events = EPOLLIN; + evt.data.fd = m_recv_sockfd; + ret = epoll_ctl(m_epfd, EPOLL_CTL_ADD, m_recv_sockfd, &evt); + if (ret < 0) { + throw std::runtime_error("failed to add socket fd to epoll set"); + } + + // Add a pipe to the main 
epoll set + int pipefds[2]; + pipe(pipefds); + + evt.events = EPOLLIN; + evt.data.fd = pipefds[0]; + ret = epoll_ctl(m_epfd, EPOLL_CTL_ADD, pipefds[0], &evt); + if (ret < 0) { + throw std::runtime_error("failed to add pipe fd to epoll set"); + } +} + +server::worker::~worker() { + close(m_epfd); + close(m_reply_sockfd); +} + +void server::worker::operator()() { + const size_t maxevents = 2; + struct epoll_event events[maxevents]; + unsigned next_worker = 0; + + do { + int nevents = epoll_wait(m_epfd, events, maxevents, 1000); + if (nevents < 0 && errno == EINTR) { + continue; + } + + if (nevents < 0) { + std::cerr << "errno=" << errno << std::endl; + throw std::runtime_error("epoll_wait failed, errno"); + } + + for (int i = 0; i < nevents; ++i) { + if (events[i].data.fd == m_recv_sockfd) { + process_message(); + } + } + + } while (1); +} + +void server::worker::process_message() { + struct sockaddr_in sender_addr; + socklen_t sender_addrlen = sizeof(sender_addr); + + ssize_t recvd = recvfrom(m_recv_sockfd, &m_buffer[0], m_buffer.size(), 0, + reinterpret_cast(&sender_addr), + &sender_addrlen); + if (recvd == -1 && errno == EAGAIN) { + /* Some other time ... */ + return; + } + + if (recvd < 0) { + std::cerr << "recvfrom returned " << recvd << " errno=" << errno << std::endl; + throw std::runtime_error("recvfrom failed"); + } + + ssize_t sent = sendto(m_reply_sockfd, &m_buffer[0], recvd, 0, + reinterpret_cast(&sender_addr), + sender_addrlen); + if (recvd <= 0) { + throw std::runtime_error("sendto failed"); + } + +} diff --git a/tests/server_test/server.h b/tests/server_test/server.h new file mode 100644 index 0000000..32cc7d3 --- /dev/null +++ b/tests/server_test/server.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef _SERVER_H_ +#define _SERVER_H_ + +#include +#include +#include + +class options; + +class server { +public: + server(const options& opts); + ~server(); + void run(); +private: + + class worker : private boost::noncopyable { + public: + worker(int id, int sockfd); + ~worker(); + + void operator()(); + + void process_message(); + + private: + int m_id; + int m_recv_sockfd; + int m_reply_sockfd; + int m_epfd; + std::vector m_buffer; + }; + + typedef boost::shared_ptr worker_ptr; + + int m_udp_sockfd; + int m_tcp_sockfd; + int m_epfd; + std::vector m_workers; +}; + +#endif diff --git a/tests/server_test/vtime.cc b/tests/server_test/vtime.cc new file mode 100644 index 0000000..5cdd4b8 --- /dev/null +++ b/tests/server_test/vtime.cc @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include "vtime.h" + +#include +#include + + +#define MEASUREMENTS 200 +#define USECSTEP 10 +#define USECSTART 100 + +vtime::vtime() : m_clocks_per_sec(get_cpu_mhz(0) * 1000000.0) { +} + +/* + Use linear regression to calculate cycles per microsecond. + http://en.wikipedia.org/wiki/Linear_regression#Parameter_estimation + */ +double vtime::sample_get_cpu_mhz(void) { + struct timeval tv1, tv2; + time_t start; + double sx = 0, sy = 0, sxx = 0, syy = 0, sxy = 0; + double tx, ty; + int i; + + /* Regression: y = a + b x */ + long x[MEASUREMENTS]; + time_t y[MEASUREMENTS]; + double a; /* system call overhead in cycles */ + double b; /* cycles per microsecond */ + double r_2; + + for (i = 0; i < MEASUREMENTS; ++i) { + start = current(); + + if (gettimeofday(&tv1, NULL)) { + throw std::runtime_error("gettimeofday failed"); + } + + do { + if (gettimeofday(&tv2, NULL)) { + throw std::runtime_error("gettimeofday failed"); + } + } while ((tv2.tv_sec - tv1.tv_sec) * 1000000 + + (tv2.tv_usec - tv1.tv_usec) < USECSTART + i * USECSTEP); + + x[i] = (tv2.tv_sec - tv1.tv_sec) * 1000000 + + tv2.tv_usec - tv1.tv_usec; + y[i] = current() - start; + } + + for (i = 0; i < MEASUREMENTS; ++i) { + tx = x[i]; + ty = y[i]; + sx += tx; + sy += ty; + sxx += tx * tx; + syy += ty * ty; + sxy += tx * ty; + } + + b = (MEASUREMENTS * sxy - sx * sy) / (MEASUREMENTS * sxx - sx * sx); + a = (sy - b * sx) / MEASUREMENTS; + + r_2 = (MEASUREMENTS * sxy - sx * sy) * (MEASUREMENTS * sxy - sx * sy) / + (MEASUREMENTS * sxx - sx * sx) / + (MEASUREMENTS * syy - sy * sy); + + if (r_2 < 0.9) { + fprintf(stderr,"Correlation coefficient r^2: %g < 0.9\n", r_2); + return 0; + } + + return b; +} + +double vtime::proc_get_cpu_mhz(int 
no_cpu_freq_fail) { + FILE* f; + char buf[256]; + double mhz = 0.0; + + f = fopen("/proc/cpuinfo","r"); + if (!f) { + return 0.0; + } + + while(fgets(buf, sizeof(buf), f)) { + double m; + int rc; + +#if defined (__ia64__) + /* Use the ITC frequency on IA64 */ + rc = sscanf(buf, "itc MHz : %lf", &m); +#elif defined (__PPC__) || defined (__PPC64__) + /* PPC has a different format as well */ + rc = sscanf(buf, "clock : %lf", &m); +#else + rc = sscanf(buf, "cpu MHz : %lf", &m); +#endif + if (rc != 1) { + continue; + } + if (mhz == 0.0) { + mhz = m; + continue; + } + if (mhz != m) { + fprintf(stderr, "Conflicting CPU frequency values" + " detected: %lf != %lf\n", mhz, m); + if (no_cpu_freq_fail) { + fprintf(stderr, "Test integrity may be harmed !\n"); + } else { + return 0.0; + } + continue; + } + } + fclose(f); + return mhz; +} + + +double vtime::get_cpu_mhz(int no_cpu_freq_fail) { + double sample, proc, mhz, delta; + + sample = sample_get_cpu_mhz(); + proc = proc_get_cpu_mhz(no_cpu_freq_fail); + + if (!proc || !sample) + return 0; + + delta = proc > sample ? proc - sample : sample - proc; + if (delta / proc > 0.01) { + fprintf(stderr, "Warning: measured timestamp frequency " + "%g differs from nominal %g MHz\n", + sample, proc); + mhz = sample; + } else { + mhz = proc; + } + + printf("cpu mhz is %.2f\n", mhz); + return mhz; +} + +double vtime::get_cpu_clocks_per_sec() { + static vtime t; + return t.m_clocks_per_sec; +} + + diff --git a/tests/server_test/vtime.h b/tests/server_test/vtime.h new file mode 100644 index 0000000..ccbd3bc --- /dev/null +++ b/tests/server_test/vtime.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifndef VTIME_H_ +#define VTIME_H_ + +#include + +class vtime { +public: + + typedef long long time_t; + + static const time_t MXM_MSEC_PER_SEC = 1000ull; /* Milli */ + static const time_t MXM_USEC_PER_SEC = 1000000ul; /* Micro */ + static const time_t MXM_NSEC_PER_SEC = 1000000000ul; /* Nano */ + + + static inline time_t current() { + time_t time; +#if defined (__x86_64__) || defined(__i386__) + uint32_t low, high; + asm volatile ("rdtsc" : "=a" (low), "=d" (high)); + time = ((time_t)high << 32) | (time_t)low; +# define MXM_TIME_CPU_CLOCK 1 +#elif defined(__PPC__) || defined(__PPC64__) + asm volatile ("mftb %0" : "=r" (time) : ); +# define MXM_TIME_CPU_CLOCK 1 +#elif defined(__ia64__) + asm volatile ("mov %0=ar.itc" : "=r" (time)); /* bug fix: output operand named undeclared 'ret'; must store into 'time' like the other branches */ +# define MXM_TIME_CPU_CLOCK 1 +#else + /* Fallback - use microseconds from gettimeofday() */ + struct timeval tv; + gettimeofday(&tv, NULL); + time = tv.tv_usec + tv.tv_sec * MXM_USEC_PER_SEC; +# define MXM_TIME_CPU_CLOCK 0 +#endif + return time; + } + + + /** + * @return The clock value of a single second. + */ + static inline double time_sec_value() { +#if MXM_TIME_CPU_CLOCK + return get_cpu_clocks_per_sec(); +#else + return MXM_USEC_PER_SEC; +#endif + } + + + /** + * Convert seconds to time units. + */ + static inline time_t time_from_sec(double sec) { + return sec * time_sec_value(); + } + + /** + * Convert MXM time units to seconds.
+ */ + static inline double time_to_sec(time_t time) { + return time / time_sec_value(); + } + +private: + typedef unsigned long long cycles_t; + + vtime(); + + static double get_cpu_clocks_per_sec(); + static double sample_get_cpu_mhz(void); + static double proc_get_cpu_mhz(int no_cpu_freq_fail); + static double get_cpu_mhz(int no_cpu_freq_fail); + + double m_clocks_per_sec; + +}; + +#endif diff --git a/tests/simple_fork/fork.py b/tests/simple_fork/fork.py new file mode 100755 index 0000000..19a9f3b --- /dev/null +++ b/tests/simple_fork/fork.py @@ -0,0 +1,31 @@ +#!/usr/bin/python + +#LD_PRELOAD=libvma.so ./fork.py +#VMA_MEM_ALLOC_TYPE=2 LD_PRELOAD=libvma.so ./fork.py +#VMA_MEM_ALLOC_TYPE=2 VMA_LOG_FILE="/tmp/libvma.log.%d" VMA_TRACELEVEL=4 LD_PRELOAD=libvma.so ./fork.py + +import os +import socket + +def child(): + print 'A new child ', os.getpid( ) + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + s.close() + os._exit(0) + +def parent(): + i = 0 + s = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + while True: + i = i + 1 + newpid = os.fork() + if newpid == 0: + child() + else: + pids = (os.getpid(), newpid) + print "parent: %d, child: %d" % pids + if i == 5: break + s.close() + +parent() + diff --git a/tests/tcp_window_size_exerciser/README b/tests/tcp_window_size_exerciser/README new file mode 100644 index 0000000..7c1fd01 --- /dev/null +++ b/tests/tcp_window_size_exerciser/README @@ -0,0 +1,47 @@ +Test Name: + TCP window size exerciser + +Author(s): + Daniel Libenson danielli@mellanox.com + +Short description: + This is a client server test that built to exercise the tcp window size. + +Supported OSes: + Linux + +Description: + This is a client server test that built to exercise the TCP window size changing during traffic. + - TCP window size changed due to the use of setsockopt function with SO_RCVBUF parameter. + +How to use: + 1. 
Compile the code using the Make file or running next commands from Shell command line: + Client -> $ gcc -lrt tcp_hang_test_client.c -o client + Server -> $ gcc -lrt tcp_hang_test_server.c -o server + + 2. Run server side - don't forget to update all relevant parameters like IP and port + $ ./server -i 9.9.9.4 -p 5000 -s 1000000 -t 10 -m 500 -M 30000 -c 122 + + -i: Server IP + -p: Server port + -s: Sleep time interval [msec] + -t: Update receive window size every # seconds + -m: Minimal receive window size [bytes] + -M: Maximum receive window size [bytes] + -c: Client message size + Example: ./server -i 9.9.9.4 -p 5000 -s 1000000 -t 10 -m 500 -M 30000 -c 122 + + 4. Run client side - don't forget to update LD_PRELOAD path and all other parameters like IP and port + $ LD_PRELOAD=libvma.so ./client.o -i 9.9.9.3 -s 9.9.9.4 -p 5000 -m 122 + + -i: Client IP + -s: Server IP + -p: Server port + -m: Client -> Server message size [bytes](1000> X >=4) + Example: ./client -i 9.9.9.3 -s 9.9.9.4 -p 5000 -m 122 + +Known issues: + The hang may appear after 10-15 seconds if VMA version is lower than 8.1.4. + +To do: + None \ No newline at end of file diff --git a/tests/tcp_window_size_exerciser/tcp_wnd_test.h b/tests/tcp_window_size_exerciser/tcp_wnd_test.h new file mode 100644 index 0000000..652ca69 --- /dev/null +++ b/tests/tcp_window_size_exerciser/tcp_wnd_test.h @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifndef _TCP_WND_TEST_H_ +#define _TCP_WND_TEST_H_ + +#define BUFFER_SIZE (0x400) +#define MAX_MESSAGE_SIZE (255) +#define MIN_MESSAGE_SIZE (4) + +#endif diff --git a/tests/tcp_window_size_exerciser/tcp_wnd_test_client.c b/tests/tcp_window_size_exerciser/tcp_wnd_test_client.c new file mode 100644 index 0000000..d69c17f --- /dev/null +++ b/tests/tcp_window_size_exerciser/tcp_wnd_test_client.c @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses.
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "tcp_wnd_test.h" + +/* gcc -lrt tcp_wnd_test_client.c -o client */ +/* LD_PRELOAD=libvma.so ./client -i 9.9.9.3 -s 9.9.9.4 -p 5000 -m 122 */ +/* VMA_TX_BUFS=8192 VMA_RX_BUFS=204800 VMA_TRACELEVEL=3 VMA_STATS_FD_NUM=1024 VMA_RX_POLL=-1 VMA_SELECT_POLL=-1 VMA_SELECT_POLL_OS_RATIO=0 VMA_TCP_3T_RULES=1 VMA_TCP_CTL_THREAD=2 VMA_AVOID_SYS_CALLS_ON_TCP_FD=1 VMA_BUFFER_BATCHING_MODE=0 LD_PRELOAD=libvma.so ./client -i 9.9.9.3 -s 9.9.9.4 -p 5000 -m 122 */ + +int main(int argc, char* argv[]) +{ + int option = 0, msgSize = 4; + int clientIp = 0, serverPort = 0, serverIp = 0; + char *pClientIp = NULL, *pServerIp = NULL; + int clientfd = 0; + char buffer[BUFFER_SIZE] = {0}; + struct sockaddr_in server; + struct sockaddr_in client; + int i = 0; + + if (2 > argc) { + printf("Wrong parameters!!!\n"); + exit(1); + } + + opterr = 0; + while (EOF != (option = getopt(argc, argv, "i:p:s:m:h")) ) { + switch (option) { + case 'i': { + pClientIp = optarg; + clientIp = inet_addr(optarg); + break; + } + case 's': { + pServerIp = optarg; + serverIp = inet_addr(optarg); + break; + } + case 'p': { + serverPort = atoi(optarg); + break; + } + case 'm': { + msgSize = atoi(optarg); + if((MIN_MESSAGE_SIZE > msgSize) || (MAX_MESSAGE_SIZE < msgSize)) { + printf("Message size should be: %d >= message size >= %d\n",MIN_MESSAGE_SIZE, MAX_MESSAGE_SIZE); + exit(1); + } + break; + } + case 'h': { + printf("-i: Client IP\n"); + printf("-s: Server IP\n"); + printf("-p: Server port\n"); + printf("-m: Client -> Server message size(%d>= X >=4)\n", MAX_MESSAGE_SIZE); + printf("\nExample: ./client -i 9.9.9.3 -s 9.9.9.4 -p 5000 -m 122\n"); + exit(0); + break; + } + default : { + printf("Incorrect option!!!\n"); + exit(1); + break; + } + } + } + + printf("Client IP: %s [atoi:%x]\n", pClientIp, clientIp); + printf("Server IP: %s [atoi:%x]\n", pServerIp, serverIp); + printf("Server Port: 
%d\n", serverPort); + printf("Client -> Server message size: %d\n", msgSize); + + /* Init send uffer */ + for (i=0; i < BUFFER_SIZE;++i) { + buffer[i] = (char)(i+1); + } + + /* Create client socket */ + clientfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (0 > clientfd) { + printf("ERROR opening socket\n"); + exit(1); + } + printf("Client Socket: OK\n"); + + /* Built client Internet address */ + bzero( &client, sizeof(client)); + client.sin_family = AF_INET; + client.sin_port = htons(INADDR_ANY); + inet_pton( AF_INET, pClientIp, &client.sin_addr); + + if (0 != bind(clientfd, (struct sockaddr*) &client, sizeof(client))) { + printf("ERROR on binding!\n"); + exit(1); + } + printf("Bind: OK\n"); + + /* Set server address */ + bzero( &server, sizeof(server)); + server.sin_family = AF_INET; + server.sin_port = htons(serverPort); + inet_pton( AF_INET, pServerIp, &server.sin_addr); + + /* Connect socket to server */ + if (0 > connect(clientfd, ( struct sockaddr*)&server, sizeof(server))) { + printf("ERROR on connect\n"); + exit(1); + } + printf("Connect: OK\n"); + + /* Setsockopt */ + option = 1; + setsockopt(clientfd, IPPROTO_TCP, TCP_NODELAY, &option, sizeof(option)); + + option = msgSize * 1024; + setsockopt(clientfd, SOL_SOCKET, SO_SNDBUF, &option, sizeof(option));/* Sets the maximum socket send buffer in bytes */ + + fcntl(clientfd, F_SETFL, O_NONBLOCK); + + while (1) { + int sentsize = 0; + int rc = 0; + + do{ + rc = write(clientfd, buffer, msgSize); + if (msgSize != rc) { + sentsize = rc; + while(msgSize > sentsize) { + rc = write(clientfd, buffer + sentsize, msgSize - sentsize); + if(rc > 0) { + sentsize += rc; + } + } + } + } while (0 > rc); + + usleep(1000);/* sleep for 1 msec */ + } + + if (0 != close(clientfd)) { + printf("ERROR - close socket!\n"); + exit(1); + } + + return 0; +} diff --git a/tests/tcp_window_size_exerciser/tcp_wnd_test_server.c b/tests/tcp_window_size_exerciser/tcp_wnd_test_server.c new file mode 100644 index 0000000..1df51e9 --- 
/dev/null +++ b/tests/tcp_window_size_exerciser/tcp_wnd_test_server.c @@ -0,0 +1,232 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include /* for clock_gettime */ +#include + +#include "tcp_wnd_test.h" + +/* gcc -lrt tcp_wnd_test_server.c -o server */ +/* ./server -i 9.9.9.4 -p 5000 -s 1000000 -t 10 -m 500 -M 30000 -c 122 */ + +int main(int argc, char* argv[]) +{ + int option = 0, sleeptimeusec = 1000000, windowtimesec = 10, minwndsize = 500, maxwndsize = 30000, clientmsgsize = 122, rcvwnd = 1450; + int port = 0, serverIp = 0; + char *pServerIp = NULL; + int socketfd = 0, clientfd = 0, clientsize = 0, readsize = 0; + unsigned char buffer[BUFFER_SIZE] = {0}; + struct sockaddr_in server, client; + int optval; /* flag value for setsockopt */ + struct timespec start, end; + long long diff = 0; + + if (2 > argc) { + printf("Wrong parameters!!!\n"); + exit(1); + } + + opterr = 0; + while (EOF != (option = getopt(argc, argv, "i:p:s:t:m:M:c:h")) ) { + switch(option) { + case 'i': { + pServerIp = optarg; + serverIp = inet_addr(optarg); + break; + } + case 'p': { + port = atoi(optarg); + break; + } + case 's': { + if(atoi(optarg)) { + sleeptimeusec = atoi(optarg); + } + break; + } + case 't': { + if(atoi(optarg)) { + windowtimesec = atoi(optarg); + } + break; + } + case 'm': { + if(atoi(optarg)) { + minwndsize = atoi(optarg); + } + break; + } + case 'M': { + if(atoi(optarg)) { + maxwndsize = atoi(optarg); + } + break; + } + case 'c': { + if(atoi(optarg)) { + clientmsgsize = atoi(optarg); + } + if((MIN_MESSAGE_SIZE > clientmsgsize) || (MAX_MESSAGE_SIZE < clientmsgsize)) { + printf("Message size should be: %d >= message size >= %d\n",MIN_MESSAGE_SIZE, MAX_MESSAGE_SIZE); + exit(1); + } + break; + } + case 'h': { + printf("-i: Server IP\n"); + printf("-p: Server port\n"); + printf("-s: Sleep time interval [usec]\n"); + printf("-t: Update receive window size every # seconds"); + printf("-m: Minimal receive window size [bytes]\n"); + printf("-M: Maximum receive window size [bytes]\n"); + printf("-c: Client message size 
[message integrity validation]. Should be: %d > message size > %d\n", MIN_MESSAGE_SIZE, MAX_MESSAGE_SIZE); + printf("\nExample: ./server -i 9.9.9.4 -p 5000 -s 1000000 -t 10 -m 500 -M 30000 -c 122\n"); + exit(0); + break; + } + default : { + printf("%c - Incorrect option!!!\n", option); + exit(1); + break; + } + } + } + + printf("Server IP: %s [atoi:%x]\n", pServerIp, serverIp); + printf("Server Port: %d\n", port); + printf("Sleep time interval [usec]: %d\n", sleeptimeusec); + printf("Window update time interval [sec]: %d\n", windowtimesec); + printf("Minimum receive window size [bytes]: %d\n", minwndsize); + printf("Maximum receive window size [bytes]: %d\n", maxwndsize); + printf("Client message size [bytes]: %d\n", clientmsgsize); + + /*Create a socket*/ + socketfd = socket(AF_INET, SOCK_STREAM, IPPROTO_TCP); + if (0 > socketfd) { + printf("ERROR opening socket!\n"); + exit(1); + } + printf("Socket: OK\n"); + + optval = 1; + setsockopt(socketfd, SOL_SOCKET, SO_REUSEADDR,(const void *)&optval , sizeof(int)); + + /* Built the server Internet address */ + bzero(&server, sizeof(server)); + server.sin_family = AF_INET; + server.sin_port = htons(port); + inet_pton(AF_INET, pServerIp, &server.sin_addr); + + if (0 != bind(socketfd, (struct sockaddr*) &server, sizeof(server))) { + printf("ERROR on binding!\n"); + exit(1); + } + printf("Bind: OK\n"); + + if (0 > listen(socketfd, 6)) { + printf("ERROR on listen!\n"); + exit(1); + } + printf("Listen: OK\n"); + + clientsize = sizeof(struct sockaddr_in); + + clientfd = accept(socketfd, ( struct sockaddr*)&client, (socklen_t*)&clientsize); + if (0 > clientfd) { + printf("ERROR on accept!\n"); + exit(1); + } + printf("Connection accepted: OK [clientfd:%d]\n", clientfd); + + /* Set receive window size to 30k */ + setsockopt(socketfd, SOL_SOCKET, SO_RCVBUF,(const void *)&maxwndsize, sizeof(maxwndsize)); + setsockopt(clientfd, SOL_SOCKET, SO_RCVBUF,(const void *)&maxwndsize, sizeof(maxwndsize)); + rcvwnd = maxwndsize; + 
clock_gettime(CLOCK_MONOTONIC, &start); /* mark start time */ + + while (1) { + readsize = recvfrom(clientfd, buffer, (rand() % sizeof(buffer)), 0, (struct sockaddr*)&client, (socklen_t*)&client); + if (0 < readsize) { + printf("readsize: %d\n", readsize); + } + else { + printf("Something went wrong with recvfrom()! %s\n", strerror(errno)); + } + + clock_gettime(CLOCK_MONOTONIC, &end); /* mark the end time */ + diff = (end.tv_sec - start.tv_sec) + (end.tv_nsec - start.tv_nsec)/1000000000; + if (diff > windowtimesec) { + rcvwnd = rcvwnd == minwndsize ? maxwndsize : minwndsize; + printf("Setsockopt: [SO_RCVBUF] window size:%d\n", rcvwnd); + setsockopt(socketfd, SOL_SOCKET, SO_RCVBUF,(const void *)&rcvwnd , sizeof(rcvwnd)); + setsockopt(clientfd, SOL_SOCKET, SO_RCVBUF,(const void *)&rcvwnd , sizeof(rcvwnd)); + clock_gettime(CLOCK_MONOTONIC, &start); /* mark start time */ + } + + if (0 < readsize) { + /* Buffer validation */ + static unsigned int counter = 0; + int i; + /* message integrity validation */ + for(i=0; iengine->sender: + For example: sender and receiver are launched on 10.0.0.9 engine is launched on 10.0.0.10 + + ./testbed.out --receiver=:10.0.0.9 --scount=20 --rcount=10 --msg-size=500 --msg-rate=2000 -d4 -n8000 + ./testbed.out --engine=10.0.0.9:10.0.0.10 --scount=20 --rcount=10 --msg-size=500 --msg-rate=2000 -d4 -n8000 + ./testbed.out --sender=10.0.0.10:10.0.0.9 --scount=20 --rcount=10 --msg-size=500 --msg-rate=2000 -d4 -n8000 diff --git a/tests/testbed/testbed.c b/tests/testbed/testbed.c new file mode 100644 index 0000000..ed0c446 --- /dev/null +++ b/tests/testbed/testbed.c @@ -0,0 +1,1822 @@ +/* + * testbed.c + * + * This application includes sender, engine, receiver units. + * Where sender and receiver are located on the same node. 
+ * + * gcc testbed.c -o testbed.out -g -Wall -Werror -DTIMESTAMP_ENABLED=1 -DTIMESTAMP_RDTSC=1 -DNDEBUG -lrt + * + * Additional compilation options: + * + * -DTIMESTAMP_ENABLED=1 + * -DTIMESTAMP_ENABLED=0 (default) + * + * -DTIMESTAMP_RDTSC=1 - rdtsc based time (default) + * -DTIMESTAMP_RDTSC=0 - clock_gettime() + * + * -DVMA_ZCOPY_ENABLED=1 + * -DVMA_ZCOPY_ENABLED=0 (default) + * + * -DNDEBUG � ON/OFF assert and log_trace() + * + * How to use (launch using this order): + * sender and receiver are launched on 10.0.0.9 + * engine is launched on 10.0.0.10 + * + * ./testbed.out --receiver=:10.0.0.9 --scount=20 --rcount=10 --msg-size=500 --msg-rate=2000 -d4 -n8000 + * ./testbed.out --engine=10.0.0.9:10.0.0.10 --scount=20 --rcount=10 --msg-size=500 --msg-rate=2000 -d4 -n8000 + * ./testbed.out --sender=10.0.0.10:10.0.0.9 --scount=20 --rcount=10 --msg-size=500 --msg-rate=2000 -d4 -n8000 + */ + + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include /* mlock */ +#include +#include +#include +#include +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) +#include +#endif /* VMA_ZCOPY_ENABLED */ + +#ifndef __LINUX__ +#define __LINUX__ +#endif +#ifndef TIMESTAMP_RDTSC +#define TIMESTAMP_RDTSC 1 +#endif +#ifndef TIMESTAMP_ENABLED +#define TIMESTAMP_ENABLED 0 +#endif +#ifndef BLOCKING_READ_ENABLED +#define BLOCKING_READ_ENABLED 0 +#endif +#ifndef BLOCKING_WRITE_ENABLED +#define BLOCKING_WRITE_ENABLED 0 +#endif +#ifndef VMA_ZCOPY_ENABLED +#define VMA_ZCOPY_ENABLED 0 +#endif +#ifndef PONG_ENABLED +#define PONG_ENABLED 0 +#endif +#ifndef UDP_ENABLED +#define UDP_ENABLED 0 +#endif + + +struct testbed_config { + enum { + MODE_ENGINE = 0, + MODE_SENDER, + MODE_RECEIVER + } mode; + struct sockaddr_in addr; + struct sockaddr_in bind_addr; + uint16_t port; + int scount; + int rcount; + uint32_t msg_size; + int msg_count; + int msg_rate; + int msg_skip; + int 
log_level; +}; + +#define MSG_MAGIC 0xAA +#define MSG_BAD 0xFF +#define MSG_IN 1 +#define MSG_OUT 2 +#define NANOS_IN_SEC 1000000000L +#define NANOS_IN_MSEC 1000000L +#define NANOS_IN_USEC 1000L + +#define MAX_FD 1024 + +#pragma pack(push, 1) +struct msg_header { + uint8_t magic_num; + uint8_t msg_type; + uint16_t len; + int32_t seq_num; + int16_t client_id; + int16_t receiver; +#if defined(TIMESTAMP_ENABLED) && (TIMESTAMP_ENABLED == 1) + int64_t time_start; + int64_t time_end; +#endif /* TIMESTAMP_ENABLED */ +}; +#pragma pack( pop ) + +struct testbed_stat { +#if defined(TIMESTAMP_ENABLED) && (TIMESTAMP_ENABLED == 1) + struct msg_header *data; + int size; + int count; +#endif /* TIMESTAMP_ENABLED */ + int tx_count; + int rx_count; +}; + +#define log_fatal(fmt, ...) \ + do { \ + if (_config.log_level > 0) \ + fprintf(stderr, "[FATAL ] " fmt, ##__VA_ARGS__); \ + exit(1); \ + } while (0) + +#define log_error(fmt, ...) \ + do { \ + if (_config.log_level > 1) \ + fprintf(stderr, "[ERROR ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#define log_warn(fmt, ...) \ + do { \ + if (_config.log_level > 2) \ + fprintf(stderr, "[WARN ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#define log_info(fmt, ...) \ + do { \ + if (_config.log_level > 3) \ + fprintf(stderr, "[INFO ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#if defined(NDEBUG) +#define log_trace(fmt, ...) ((void)0) +#else +#define log_trace(fmt, ...) \ + do { \ + if (_config.log_level > 4) \ + fprintf(stderr, "[TRACE ] " fmt, ##__VA_ARGS__); \ + } while (0) +#endif /* NDEBUG */ + +#define _min(a, b) ((a) > (b) ? (b) : (a)) +#define _max(a, b) ((a) < (b) ? 
(b) : (a)) + +static int _set_config(int argc, char **argv); +static int _def_config(void); +static void _usage(void); +static int _proc_sender(void); +static int _proc_engine(void); +static int _proc_receiver(void); +static void _proc_signal(int signal_id); + +static inline int64_t _get_time_ns(void); +static inline char *_addr2str(struct sockaddr_in *addr); +static int _get_addr(char *dst, struct sockaddr_in *addr); +static int _set_noblock(int fd); + +static int _write(int fd, uint8_t *buf, int count, int block); +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) +#else +static int _read(int fd, uint8_t *buf, int count, int block); +#endif /* VMA_ZCOPY_ENABLED */ + +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) +static int _udp_client_init(struct sockaddr_in *addr); +static int _udp_server_init(int fd); +static int _udp_create_and_bind(struct sockaddr_in *addr); +#else +static int _tcp_client_init(struct sockaddr_in *addr); +static int _tcp_server_init(int fd); +static int _tcp_create_and_bind(struct sockaddr_in *addr); +#endif /* UDP_ENABLED */ + +static void _ini_stat(void); +static void _fin_stat(void); + +static struct testbed_config _config; +static struct testbed_stat _stat; +static volatile int _done; +#if defined(BLOCKING_READ_ENABLED) && (BLOCKING_READ_ENABLED == 1) +static int _rb = 1; +#else +static int _rb = 0; +#endif /* BLOCKING_READ_ENABLED */ +#if defined(BLOCKING_WRITE_ENABLED) && (BLOCKING_WRITE_ENABLED == 1) +static int _wb = 1; +#else +static int _wb = 0; +#endif /* BLOCKING_WRITE_ENABLED */ + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) +static struct vma_api_t *_vma_api = NULL; +static int _vma_ring_fd = -1; +#endif /* VMA_ZCOPY_ENABLED */ + +#if defined(PONG_ENABLED) && (PONG_ENABLED == 1) +#if defined(UDP_ENABLED) && (UDP_ENABLED == 0) +static int _udp_create_and_bind(struct sockaddr_in *addr); +#endif +#define PONG_PORT 41794 +#endif /* PONG_ENABLED */ + +int main(int argc, char **argv) +{ + int rc = 0; + struct 
sigaction sa; + + if (argc < 2) { + rc = -EINVAL; + _usage(); + goto err; + } + + srand(time(0)); + + _rb = _rb; + _wb = _wb; + rc = _def_config(); + if (0 != rc) { + goto err; + } + + rc = _set_config(argc, argv); + if (0 != rc) { + goto err; + } + + /* catch SIGINT to exit */ + memset(&sa, 0, sizeof(sa)); + sa.sa_handler = _proc_signal; + sa.sa_flags = 0; + sigemptyset(&(sa.sa_mask)); + if (sigaction(SIGINT, &sa, NULL) != 0) { + goto err; + } + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + _vma_api = vma_get_api(); + if (_vma_api == NULL) { + log_fatal("VMA Extra API not found\n"); + } +#endif /* VMA_ZCOPY_ENABLED */ + + _done = 0; + _ini_stat(); + switch ((int)_config.mode) { + case MODE_ENGINE: + rc = _proc_engine(); + break; + case MODE_SENDER: + rc = _proc_sender(); + break; + case MODE_RECEIVER: + rc = _proc_receiver(); + break; + default: + break; + } + _fin_stat(); +err: + return rc; +} + +static void _usage(void) +{ + fprintf(stderr, "Usage: testbed [options]\n" + "\t--engine Engine mode (receiver ip)\n" + "\t--sender Sender mode (engine ip)\n" + "\t--receiver Receiver mode\n" + "\t--port Listen/connect to port (default %d).\n" + "\t--scount,-s Total number of senders (default %d).\n" + "\t--rcount,-r Total number of receivers (default %d).\n" + "\t--msg-size,-l Message size in bytes (default %d).\n" + "\t--msg-count,-n Total number of messages to send (default %d).\n" + "\t--msg-rate,-f Number of messages per second (default %d).\n" + "\t--msg-skip,-i Skip number of messages in statistic (default %d).\n" + "\t--debug,-d Output verbose level (default: %d).\n" + "\t--help,-h Print help and exit\n", + + _config.port, + _config.scount, + _config.rcount, + _config.msg_size, + _config.msg_count, + _config.msg_rate, + _config.msg_skip, + _config.log_level); +} + +static void _proc_signal(int signal_id) +{ + _done = signal_id; +} + +static int _def_config(void) +{ + int rc = 0; + + memset(&_config, 0, sizeof(_config)); + _config.mode = -1; + 
_config.port = 12345; + _config.bind_addr.sin_family = PF_INET; + _config.bind_addr.sin_addr.s_addr = INADDR_ANY; + _config.msg_size = 500; + _config.scount = 20; + _config.rcount = 10; + _config.msg_count = 8000; + _config.msg_rate = 2000; + _config.msg_skip = 500; + _config.log_level = 4; + + return rc; +} + +static int _set_config(int argc, char **argv) +{ + int rc = 0; + static struct option long_options[] = { + {"engine", required_argument, 0, MODE_ENGINE}, + {"sender", required_argument, 0, MODE_SENDER}, + {"receiver", optional_argument, 0, MODE_RECEIVER}, + {"port", required_argument, 0, 'p'}, + {"scount", required_argument, 0, 's'}, + {"rcount", required_argument, 0, 'r'}, + {"msg-size", required_argument, 0, 'l'}, + {"msg-count", required_argument, 0, 'n'}, + {"msg-rate", required_argument, 0, 'f'}, + {"msg-skip", required_argument, 0, 'i'}, + {"debug", required_argument, 0, 'd'}, + {"help", no_argument, 0, 'h'}, + }; + int op; + int option_index; + + while ((op = getopt_long(argc, argv, "p:s:r:l:n:f:i:d:h", long_options, &option_index)) != -1) { + switch (op) { + case MODE_ENGINE: + case MODE_SENDER: + case MODE_RECEIVER: + if ((int)_config.mode < 0) { + char *token1 = NULL; + char *token2 = NULL; + const char s[2] = ":"; + if (optarg) { + if (optarg[0] != ':') { + token1 = strtok(optarg, s); + token2 = strtok(NULL, s); + } else { + token1 = NULL; + token2 = strtok(optarg, s); + } + } + + if (token1) { + rc = _get_addr(token1, &_config.addr); + if (rc < 0) { + rc = -EINVAL; + log_fatal("Failed to resolve ip address %s\n", token1); + } + } + if (token2) { + if (0 == inet_aton(token2, &_config.bind_addr.sin_addr)) { + log_fatal("Failed to resolve ip address %s\n", token2); + } + } + _config.mode = op; + } else { + rc = -EINVAL; + log_error("Wrong option usage \'%c\'\n", op); + } + break; + case 'p': + errno = 0; + _config.port = strtol(optarg, NULL, 0); + assert(errno == 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", 
optarg); + } + break; + case 's': + errno = 0; + _config.scount = strtol(optarg, NULL, 0); + assert(errno == 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", optarg); + } + break; + case 'r': + errno = 0; + _config.rcount = strtol(optarg, NULL, 0); + assert(errno == 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", optarg); + } + break; + case 'l': + errno = 0; + _config.msg_size = strtol(optarg, NULL, 0); + assert(errno == 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", optarg); + } + if (_config.msg_size < sizeof(struct msg_header)) { + rc = -EINVAL; + log_error("Message size can not be less than <%d>\n", (int)sizeof(struct msg_header)); + } + break; + case 'n': + errno = 0; + _config.msg_count = strtol(optarg, NULL, 0); + assert(errno == 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", optarg); + } + break; + case 'f': + errno = 0; + _config.msg_rate = strtol(optarg, NULL, 0); + assert(errno == 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", optarg); + } + break; + case 'i': + errno = 0; + _config.msg_skip = strtol(optarg, NULL, 0); + assert(errno == 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", optarg); + } + break; + case 'd': + errno = 0; + _config.log_level = strtol(optarg, NULL, 0); + assert(errno == 0); + if (0 != errno) { + rc = -EINVAL; + log_error("Invalid option value <%s>\n", optarg); + } + break; + case 'h': + default: + log_error("Unknown option <%c>\n", op); + _usage(); + break; + } + } + + if (0 != rc) { + _usage(); + } else { + _config.addr.sin_port = htons(_config.port); + _config.bind_addr.sin_port = htons(_config.port); + log_info("CONFIGURATION:\n"); + log_info("mode: %d\n", _config.mode); + log_info("senders: %d\n", _config.scount); + log_info("receivers: %d\n", _config.rcount); + log_info("log level: %d\n", _config.log_level); + 
log_info("msg size: %d\n", _config.msg_size); + log_info("msg count: %d\n", _config.msg_count); + log_info("msg rate: %d\n", _config.msg_rate); + log_info("msg skip: %d\n", _config.msg_skip); + log_info("connect to ip: %s\n", _addr2str(&_config.addr)); + log_info("listen on ip: %s\n", _addr2str(&_config.bind_addr)); + } + + return rc; +} + +static int _proc_sender(void) +{ + int rc = 0; + int efd; + struct epoll_event *events = NULL; + int max_events; + struct conn_info { + int id; + int fd; + int msg_len; + struct per_sender_connection { + int msgs_sent; + int64_t begin_send_time; + } stat; + uint8_t msg[1]; + } *conns_out = NULL; + struct per_sender_connection *stat; + struct msg_header *msg_hdr; + int i; + int total_msg_count; + int conns_size = sizeof(struct conn_info) + _config.msg_size + 1; +#if defined(PONG_ENABLED) && (PONG_ENABLED == 1) + struct sockaddr_in addr; + memcpy(&addr, &_config.bind_addr, sizeof(addr)); + addr.sin_port = htons(PONG_PORT); + int fd_pong = _udp_create_and_bind(&addr); + assert(fd_pong >= 0); +#endif /* PONG_ENABLED */ + + log_trace("Launching mode...\n"); + + efd = epoll_create1(0); + assert(efd >= 0); + + conns_out = calloc(_config.scount, conns_size); + assert(conns_out); + for (i = 0; i < _config.scount; i++) { + struct epoll_event event; + struct conn_info *conn; + + conn = (struct conn_info *)((uint8_t *)conns_out + i * conns_size); + conn->stat.msgs_sent = 0; + conn->stat.begin_send_time = 0; + + msg_hdr = (struct msg_header *)conn->msg; + msg_hdr->magic_num = MSG_MAGIC; + msg_hdr->msg_type = MSG_IN; + msg_hdr->len = _config.msg_size; + msg_hdr->seq_num = 0; + msg_hdr->client_id = i; + msg_hdr->receiver = 0; + + conn->id = i; +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + conn->fd = _udp_client_init(&_config.addr); +#else + conn->fd = _tcp_client_init(&_config.addr); +#endif /* UDP_ENABLED */ + conn->msg_len = 0; + if (_done) { + goto err; + } + assert(conn->fd >= 0); + + event.data.ptr = conn; + event.events = EPOLLOUT; + 
rc = epoll_ctl(efd, EPOLL_CTL_ADD, conn->fd, &event); + assert(rc == 0); + } + + log_trace(" established %d connections with \n", _config.scount); + + max_events = _config.scount * 10; + events = calloc(max_events, sizeof(*events)); + assert(events); + + total_msg_count = _config.scount * _config.msg_count; + while (!_done) { + int n; + int j; + + n = epoll_wait(efd, events, max_events, 0); + for (j = 0; j < n; j++) { + struct conn_info *conn = NULL; + uint32_t event; + + event = events[j].events; + conn = (struct conn_info *)events[j].data.ptr; + assert(conn); + + if ((event & EPOLLERR) || + (event & EPOLLHUP) || + (!(event & EPOLLOUT))) { + log_error("epoll error\n"); + goto err; + } + + /* Check message count threshold */ + if (_config.msg_count > 0 && + conn->stat.msgs_sent >= _config.msg_count) { + continue; + } + +#if defined(PONG_ENABLED) && (PONG_ENABLED == 1) + recv(fd_pong, "pong", sizeof("pong"), 0); +#else + usleep(0); +#endif /* PONG_ENABLED */ + + if (event & EPOLLOUT) { + int fd; + int64_t time_now = _get_time_ns(); + + fd = conn->fd; + stat = &conn->stat; + msg_hdr = (struct msg_header *)conn->msg; + + if (stat->begin_send_time > 0) { + int expected_msg_count = 0; + int ret; + + /* check if this connection hasn�t reached begin time yet */ + if (stat->begin_send_time > time_now) { + continue; + } + + /* check if it is new message */ + if (0 == conn->msg_len) { + /* calculate the expected number of sent message */ + expected_msg_count = _config.msg_rate * (time_now - stat->begin_send_time) / NANOS_IN_SEC; + if (stat->msgs_sent >= expected_msg_count) { + continue; + } + + /* Each time while sending messages to engine, sender connection + * randomly picks up a integer X in range of 0 to N-1 (inclusive), + * and put into receiver filed so that engine program will forward + * the message to its Xth connection with receiver. 
+ */ + msg_hdr->receiver = rand() % _config.rcount; +#if defined(TIMESTAMP_ENABLED) && (TIMESTAMP_ENABLED == 1) + msg_hdr->time_start = time_now; +#endif /* TIMESTAMP_ENABLED */ + } + + ret = _write(fd, + ((uint8_t *)msg_hdr) + conn->msg_len, + _config.msg_size - conn->msg_len, _wb); + if (ret < 0) { + goto err; + } + conn->msg_len += ret; + if (conn->msg_len != _config.msg_size) { + continue; + } else { + conn->msg_len = 0; + } + log_trace(" [%d]->[%d] Send %d bytes fd=%d ret=%d\n", + msg_hdr->client_id, msg_hdr->receiver, msg_hdr->len, fd, ret); + + /* send message */ + msg_hdr->seq_num++; + stat->msgs_sent++; + _stat.tx_count++; + + /* check exit condition */ + if (total_msg_count > 0) { + total_msg_count--; + if (total_msg_count == 0) { + _done++; + sleep(3); + } + }; + } else { + int64_t interval = NANOS_IN_SEC / _config.msg_rate; + + /* pick a random time for each connection so they don�t + * start at the same time + */ + stat->begin_send_time = time_now + rand() % interval; + } + } + } + } + +err: + + if (conns_out) { + for (i = 0; i < _config.scount; i++) { + struct conn_info *conn; + + conn = (struct conn_info *)((uint8_t *)conns_out + i * conns_size); +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + _write(conn->fd, (uint8_t *)"?", sizeof("?"), 0); +#endif /* UDP_ENABLED */ + epoll_ctl(efd, EPOLL_CTL_DEL, conn->fd, NULL); + close(conn->fd); + conn->fd = -1; + } + free(conns_out); + } + + if (events) { + free(events); + } + + close(efd); + + return rc; +} + +static int _proc_engine(void) +{ + int rc = 0; + int efd; + struct epoll_event *events = NULL; + int max_events; + struct conn_info { + int id; + int fd; +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + struct vma_packet_desc_t vma_packet; + struct vma_buff_t *vma_buf; + int vma_buf_offset; +#endif /* VMA_ZCOPY_ENABLED */ + int msg_len; + uint8_t msg[1]; + } *conns_out, **conns_in; + struct conn_info *conn = NULL; + struct msg_header *msg_hdr; + int i; + int sfd = -1; + int conns_size = 
sizeof(struct conn_info) + _config.msg_size + 1; + + log_trace("Launching mode...\n"); + + conns_out = NULL; + conns_in = NULL; + efd = epoll_create1(0); + assert(efd >= 0); + +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + sfd = _udp_create_and_bind(&_config.bind_addr); +#else + sfd = _tcp_create_and_bind(&_config.bind_addr); +#endif /* UDP_ENABLED */ + if (sfd < 0) { + rc = -EBUSY; + log_fatal("Failed to create socket\n"); + goto err; + } + + conns_in = calloc(MAX_FD, sizeof(*conns_in)); + assert(conns_in); + for (i = 0; i < _config.scount; i++) { + struct epoll_event event; + int fd; + +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + fd = _udp_server_init(sfd); +#else + fd = _tcp_server_init(sfd); +#endif /* UDP_ENABLED */ + if (fd >= MAX_FD) { + log_error("fd(%d) >= MAX_FD(%d)\n", fd, MAX_FD); + goto err; + } + conn = (struct conn_info *)calloc(1, conns_size); + assert(conn); + + msg_hdr = (struct msg_header *)conn->msg; + msg_hdr->msg_type = MSG_BAD; + + conn->id = i; + conn->fd = fd; + conn->msg_len = 0; +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + conns_in[i] = conn; +#else + conns_in[fd] = conn; +#endif /* UDP_ENABLED */ + + if (_done) { + goto err; + } + assert(conn->fd >= 0); + + event.data.ptr = conn; + event.events = EPOLLIN; +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + event = event; +#else + rc = epoll_ctl(efd, EPOLL_CTL_ADD, conn->fd, &event); + assert(rc == 0); +#endif /* VMA_ZCOPY_ENABLED */ + } + + log_trace(" established %d connections with \n", _config.scount); + + conns_out = calloc(_config.rcount, conns_size); + assert(conns_out); + for (i = 0; i < _config.rcount; i++) { + conn = (struct conn_info *)((uint8_t *)conns_out + i * conns_size); + + msg_hdr = (struct msg_header *)conn->msg; + msg_hdr->msg_type = MSG_BAD; + + conn->id = i; +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + conn->fd = _udp_client_init(&_config.addr); +#else + conn->fd = _tcp_client_init(&_config.addr); +#endif /* UDP_ENABLED */ + conn->msg_len 
= 0; + if (_done) { + goto err; + } + assert(conn->fd >= 0); + } + + log_trace(" established %d connections with \n", _config.rcount); + + max_events = (_config.scount + _config.rcount) * 10; + events = calloc(max_events, sizeof(*events)); + assert(events); + + conn = NULL; + while (!_done) { + uint32_t event = 0; + int n = 0; + int j = 0; + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + if (conn) { + if (conn->vma_buf && (conn->vma_buf_offset < conn->vma_buf->len)) { + n = 1; + } else if (conn->vma_buf && conn->vma_buf->next) { + conn->vma_buf = conn->vma_buf->next; + conn->vma_buf_offset = 0; + n = 1; + } else if (conn->vma_buf && !conn->vma_buf->next) { + _vma_api->socketxtreme_free_vma_packets(&conn->vma_packet, 1); + conn->vma_buf = NULL; + conn->vma_buf_offset = 0; + conn = NULL; + n = 0; + } + } + while (0 == n) { + struct vma_completion_t vma_comps; + n = _vma_api->socketxtreme_poll(_vma_ring_fd, &vma_comps, 1, 0); + if (n > 0) { + event = (uint32_t)vma_comps.events; + if (vma_comps.events & VMA_SOCKETXTREME_PACKET) { + event |= EPOLLIN; +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + if (vma_comps.packet.buff_lst->len >= sizeof(struct msg_header)) { + msg_hdr = (struct msg_header *)vma_comps.packet.buff_lst->payload; + vma_comps.user_data = msg_hdr->client_id; + } else { + event |= EPOLLERR; + log_error("event=0x%x user_data size=%d\n", event, vma_comps.packet.buff_lst->len); + log_error("EOF?\n"); + goto err; + } +#endif /* UDP_ENABLED */ + conn = conns_in[vma_comps.user_data]; + conn->vma_packet.num_bufs = vma_comps.packet.num_bufs; + conn->vma_packet.total_len = vma_comps.packet.total_len; + conn->vma_packet.buff_lst = vma_comps.packet.buff_lst; + conn->vma_buf = conn->vma_packet.buff_lst; + conn->vma_buf_offset = 0; + } else if ((event & EPOLLERR) || (event & EPOLLRDHUP) || + (event & EPOLLHUP)) { + event |= EPOLLERR; + log_error("event=0x%x user_data=%ld\n", event, vma_comps.user_data); + log_error("EOF?\n"); + goto err; + } else { + 
log_warn("event=0x%x user_data=%ld\n", event, vma_comps.user_data); + n = 0; + } + } + } +#else + n = epoll_wait(efd, events, max_events, 0); +#endif /* VMA_ZCOPY_ENABLED */ + + for (j = 0; j < n; j++) { + int fd = 0; + int ret = 0; + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) +#else + event = events[j].events; + conn = (struct conn_info *)events[j].data.ptr; + assert(conn); +#endif /* VMA_ZCOPY_ENABLED */ + + fd = conn->fd; + fd = fd; + msg_hdr = (struct msg_header *)conn->msg; + + if ((event & EPOLLERR) || + (event & EPOLLHUP)) { + log_error("epoll error\n"); + goto err; + } + + if (event & EPOLLIN) { + struct conn_info *conn_peer = NULL; + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + ret = _min((_config.msg_size - conn->msg_len), (conn->vma_buf->len - conn->vma_buf_offset)); + memcpy(((uint8_t *)msg_hdr) + conn->msg_len, + ((uint8_t *)conn->vma_buf->payload) + conn->vma_buf_offset, + ret); + conn->vma_buf_offset += ret; +#else + ret = _read(fd, + ((uint8_t *)msg_hdr) + conn->msg_len, + _config.msg_size - conn->msg_len, _wb); +#endif /* VMA_ZCOPY_ENABLED */ + if (ret < 0) { + goto err; + } + conn->msg_len += ret; + if (conn->msg_len != _config.msg_size) { + continue; + } else { + conn->msg_len = 0; + } + log_trace(" [%d]<- Read %d bytes fd=%d ret=%d\n", + msg_hdr->client_id, msg_hdr->len, fd, ret); + assert(msg_hdr->msg_type == MSG_IN); + _stat.rx_count++; + + msg_hdr->msg_type = MSG_OUT; + conn_peer = (struct conn_info *)((uint8_t *)conns_out + msg_hdr->receiver * conns_size); + /* use blocking operation */ + ret = _write(conn_peer->fd, (uint8_t *)msg_hdr, msg_hdr->len, 1); + log_trace(" [%d]-> Send %d bytes fd=%d ret=%d\n", + msg_hdr->receiver, msg_hdr->len, conn_peer->fd, ret); + if (ret != msg_hdr->len) { + goto err; + } + _stat.tx_count++; + } + } + } + +err: + + close(sfd); + + if (conns_in) { + for (i = 0; i < MAX_FD; i++) { + struct conn_info *conn; + + conn = conns_in[i]; + if (conn) { +#if defined(VMA_ZCOPY_ENABLED) && 
(VMA_ZCOPY_ENABLED == 1) +#else + epoll_ctl(efd, EPOLL_CTL_DEL, conn->fd, NULL); +#endif /* VMA_ZCOPY_ENABLED */ + close(conn->fd); + conn->fd = -1; + free(conn); + } + } + free(conns_in); + } + + if (conns_out) { + for (i = 0; i < _config.rcount; i++) { + struct conn_info *conn; + + conn = (struct conn_info *)((uint8_t *)conns_out + i * conns_size); +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + _write(conn->fd, (uint8_t *)"?", sizeof("?"), 0); +#endif /* UDP_ENABLED */ + epoll_ctl(efd, EPOLL_CTL_DEL, conn->fd, NULL); + close(conn->fd); + conn->fd = -1; + } + free(conns_out); + } + + if (events) { + free(events); + } + + close(efd); + + return rc; +} + +static int _proc_receiver(void) +{ + int rc = 0; + int efd; + struct epoll_event *events = NULL; + int max_events; + struct conn_info { + int id; + int fd; +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + struct vma_packet_desc_t vma_packet; + struct vma_buff_t *vma_buf; + int vma_buf_offset; +#endif /* VMA_ZCOPY_ENABLED */ + int msg_len; + uint8_t msg[1]; + } **conns_in = NULL; + struct conn_info *conn; + struct msg_header *msg_hdr; + int i; + int sfd = -1; + int conns_size = sizeof(struct conn_info) + _config.msg_size + 1; + + log_trace("Launching mode...\n"); + + efd = epoll_create1(0); + assert(efd >= 0); + +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + sfd = _udp_create_and_bind(&_config.bind_addr); +#else + sfd = _tcp_create_and_bind(&_config.bind_addr); +#endif /* UDP_ENABLED */ + if (sfd < 0) { + rc = -EBUSY; + log_fatal("Failed to create socket\n"); + goto err; + } + + conns_in = calloc(MAX_FD, sizeof(*conns_in)); + assert(conns_in); + for (i = 0; i < _config.rcount; i++) { + struct epoll_event event; + int fd; + +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + fd = _udp_server_init(sfd); +#else + fd = _tcp_server_init(sfd); +#endif /* UDP_ENABLED */ + if (fd >= MAX_FD) { + log_error("fd(%d) >= MAX_FD(%d)\n", fd, MAX_FD); + goto err; + } + conn = (struct conn_info *)calloc(1, 
conns_size); + assert(conn); + + conn->id = i; + conn->fd = fd; + conn->msg_len = 0; +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + conns_in[i] = conn; +#else + conns_in[fd] = conn; +#endif /* UDP_ENABLED */ + + if (_done) { + goto err; + } + assert(conn->fd >= 0); + + event.data.ptr = conn; + event.events = EPOLLIN; +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + event = event; +#else + rc = epoll_ctl(efd, EPOLL_CTL_ADD, conn->fd, &event); + assert(rc == 0); +#endif /* VMA_ZCOPY_ENABLED */ + } + + log_trace(" established %d connections with \n", _config.rcount); + + max_events = _config.rcount * 10; + events = calloc(max_events, sizeof(*events)); + assert(events); + + conn = NULL; + while (!_done) { + uint32_t event = 0; + int n = 0; + int j = 0; + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + if (conn) { + if (conn->vma_buf && (conn->vma_buf_offset < conn->vma_buf->len)) { + n = 1; + } else if (conn->vma_buf && conn->vma_buf->next) { + conn->vma_buf = conn->vma_buf->next; + conn->vma_buf_offset = 0; + n = 1; + } else if (conn->vma_buf && !conn->vma_buf->next) { + _vma_api->socketxtreme_free_vma_packets(&conn->vma_packet, 1); + conn->vma_buf = NULL; + conn->vma_buf_offset = 0; + conn = NULL; + n = 0; + } + } + while (0 == n) { + struct vma_completion_t vma_comps; + n = _vma_api->socketxtreme_poll(_vma_ring_fd, &vma_comps, 1, 0); + if (n > 0) { + event = (uint32_t)vma_comps.events; + if (vma_comps.events & VMA_SOCKETXTREME_PACKET) { + event |= EPOLLIN; +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) + if (vma_comps.packet.buff_lst->len >= sizeof(struct msg_header)) { + msg_hdr = (struct msg_header *)vma_comps.packet.buff_lst->payload; + vma_comps.user_data = msg_hdr->receiver; + } else { + event |= EPOLLERR; + log_error("event=0x%x user_data size=%d\n", event, vma_comps.packet.buff_lst->len); + log_error("EOF?\n"); + goto err; + } +#endif /* UDP_ENABLED */ + conn = conns_in[vma_comps.user_data]; + conn->vma_packet.num_bufs = 
vma_comps.packet.num_bufs; + conn->vma_packet.total_len = vma_comps.packet.total_len; + conn->vma_packet.buff_lst = vma_comps.packet.buff_lst; + conn->vma_buf = conn->vma_packet.buff_lst; + conn->vma_buf_offset = 0; + } else if ((event & EPOLLERR) || (event & EPOLLRDHUP) || + (event & EPOLLHUP)) { + event |= EPOLLERR; + log_error("event=0x%x user_data=%ld\n", event, vma_comps.user_data); + log_error("EOF?\n"); + goto err; + } else { + log_warn("event=0x%x user_data=%ld\n", event, vma_comps.user_data); + n = 0; + } + } + } +#else + n = epoll_wait(efd, events, max_events, 0); +#endif /* VMA_ZCOPY_ENABLED */ + for (j = 0; j < n; j++) { + int fd = 0; + int ret = 0; + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) +#else + event = events[j].events; + conn = (struct conn_info *)events[j].data.ptr; + assert(conn); +#endif /* VMA_ZCOPY_ENABLED */ + + fd = conn->fd; + fd = fd; + msg_hdr = (struct msg_header *)conn->msg; + + if ((event & EPOLLERR) || + (event & EPOLLHUP)) { + log_error("epoll error\n"); + goto err; + } + + if (event & EPOLLIN) { +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + ret = _min((_config.msg_size - conn->msg_len), (conn->vma_buf->len - conn->vma_buf_offset)); + memcpy(((uint8_t *)msg_hdr) + conn->msg_len, + ((uint8_t *)conn->vma_buf->payload) + conn->vma_buf_offset, + ret); + conn->vma_buf_offset += ret; +#else + ret = _read(fd, + ((uint8_t *)msg_hdr) + conn->msg_len, + _config.msg_size - conn->msg_len, _rb); +#endif /* VMA_ZCOPY_ENABLED */ + if (ret < 0) { + goto err; + } + conn->msg_len += ret; + if (conn->msg_len != _config.msg_size) { + continue; + } else { + conn->msg_len = 0; + } + log_trace(" [%d]<-[%d] Read %d bytes fd=%d ret=%d\n", + msg_hdr->receiver, msg_hdr->client_id, msg_hdr->len, fd, ret); + assert(msg_hdr->msg_type == MSG_OUT); + _stat.rx_count++; +#if defined(TIMESTAMP_ENABLED) && (TIMESTAMP_ENABLED == 1) + msg_hdr->time_end = _get_time_ns(); + memcpy(_stat.data + _stat.count, msg_hdr, sizeof(*msg_hdr)); + 
_stat.count++; +#endif /* TIMESTAMP_ENABLED */ + } + } + } + +err: + + close(sfd); + + if (conns_in) { + for (i = 0; i < MAX_FD; i++) { + struct conn_info *conn; + + conn = conns_in[i]; + if (conn) { +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) +#else + epoll_ctl(efd, EPOLL_CTL_DEL, conn->fd, NULL); +#endif /* VMA_ZCOPY_ENABLED */ + close(conn->fd); + conn->fd = -1; + free(conn); + } + } + free(conns_in); + } + + if (events) { + free(events); + } + + close(efd); + + return rc; +} + +#if defined(TIMESTAMP_RDTSC) && (TIMESTAMP_RDTSC == 1) +static inline double __get_cpu_clocks_per_sec(void) +{ + static double clocks_per_sec = 0.0; + static int initialized = 0; + + if (!initialized) { + double mhz = 0.0; +#if defined(__LINUX__) + FILE* f; + char buf[256]; + + f = fopen("/proc/cpuinfo", "r"); + if (!f) { + return 0.0; + } + + while (fgets(buf, sizeof(buf), f)) { + double m; + int rc; + +#if defined(__ia64__) + rc = sscanf(buf, "itc MHz : %lf", &m); +#elif defined(__powerpc__) + rc = sscanf(buf, "clock : %lf", &m); +#else + rc = sscanf(buf, "cpu MHz : %lf", &m); +#endif + if (rc != 1) { + continue; + } + if (mhz == 0.0) { + mhz = m; + continue; + } + if (mhz != m) { + double mm = (mhz < m ? 
m : mhz); + mhz = mm; + } + } + fclose(f); +#endif + clocks_per_sec = mhz * 1.0e6; + initialized = 1; + } + + return clocks_per_sec; +} +#endif /* TIMESTAMP_RDTSC */ + +static inline int64_t _get_time_ns(void) +{ +#if defined(TIMESTAMP_RDTSC) && (TIMESTAMP_RDTSC == 1) + unsigned long long int result=0; + +#if defined(__LINUX__) +#if defined(__i386__) + __asm volatile(".byte 0x0f, 0x31" : "=A" (result) : ); + +#elif defined(__x86_64__) + unsigned hi, lo; + __asm volatile("rdtsc" : "=a"(lo), "=d"(hi)); + result = hi; + result = result<<32; + result = result|lo; + +#elif defined(__powerpc__) + unsigned long int hi, lo, tmp; + __asm volatile( + "0: \n\t" + "mftbu %0 \n\t" + "mftb %1 \n\t" + "mftbu %2 \n\t" + "cmpw %2,%0 \n\t" + "bne 0b \n" + : "=r"(hi),"=r"(lo),"=r"(tmp) + ); + result = hi; + result = result<<32; + result = result|lo; + +#endif +#endif /* __LINUX__ */ + + return ((int64_t)((double)result * NANOS_IN_SEC / __get_cpu_clocks_per_sec())); +#else + static struct timespec time; + clock_gettime(CLOCK_REALTIME, &time); + return (NANOS_IN_SEC * time.tv_sec + time.tv_nsec); +#endif /* TIMESTAMP_RDTSC */ +} + +static inline char *_addr2str(struct sockaddr_in *addr) +{ + static __thread char addrbuf[100]; + inet_ntop(AF_INET, &addr->sin_addr, addrbuf, sizeof(addrbuf)); + sprintf(addrbuf,"%s:%d", addrbuf, ntohs(addr->sin_port)); + + return addrbuf; +} + +static int _get_addr(char *dst, struct sockaddr_in *addr) +{ + int rc = 0; + struct addrinfo *res; + + rc = getaddrinfo(dst, NULL, NULL, &res); + if (rc) { + log_error("getaddrinfo failed - invalid hostname or IP address\n"); + return rc; + } + + if (res->ai_family != PF_INET) { + rc = -1; + goto out; + } + + *addr = *(struct sockaddr_in *)res->ai_addr; +out: + freeaddrinfo(res); + return rc; +} + +static int _set_noblock(int fd) +{ + int rc = 0; + int flag; + + flag = fcntl(fd, F_GETFL); + if (flag < 0) { + rc = -errno; + log_error("failed to get socket flags %s\n", strerror(errno)); + } + flag |= O_NONBLOCK; + rc 
= fcntl(fd, F_SETFL, flag); + if (rc < 0) { + rc = -errno; + log_error("failed to set socket flags %s\n", strerror(errno)); + } + + return rc; +} + + +#if defined(UDP_ENABLED) && (UDP_ENABLED == 1) +static int _udp_client_init(struct sockaddr_in *addr) +{ + int rc = 0; + int fd = -1; + struct sockaddr_in bind_addr; + + memcpy(&bind_addr, &_config.bind_addr, sizeof(bind_addr)); + bind_addr.sin_port = 0; + + fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_IP); + if (!fd) { + rc = -EBUSY; + log_fatal("Failed to create socket\n"); + goto err; + } + + rc = _set_noblock(fd); + if (rc < 0) { + log_error("Configure failed: %s\n", strerror(errno)); + goto err; + } + + rc = bind(fd, (struct sockaddr *) &bind_addr, sizeof(bind_addr)); + if (rc < 0) { + rc = -EBUSY; + log_fatal("Failed to bind socket\n"); + goto err; + } + + rc = connect(fd, (struct sockaddr *)addr, sizeof(*addr)); + if (rc < 0 && errno != EINPROGRESS) { + log_error("Connect failed: %s\n", strerror(errno)); + goto err; + } + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + /* Need to get ring after listen() or nonblocking connect() */ + if (_vma_ring_fd < 0) { + _vma_api->get_socket_rings_fds(fd, &_vma_ring_fd, 1); + assert((-1) != _vma_ring_fd); + } +#endif /* VMA_ZCOPY_ENABLED */ + + log_trace("Established connection: fd=%d to %s\n", fd, _addr2str(addr)); + +err: + return (rc == 0 ? 
fd : (-1));
+}
+
+/* UDP has no per-client accept(); just fetch the VMA ring (zero-copy mode)
+ * and hand back the shared datagram socket itself. */
+static int _udp_server_init(int fd)
+{
+
+#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1)
+ /* Need to get ring after listen() or nonblocking connect() */
+ if (_vma_ring_fd < 0) {
+ _vma_api->get_socket_rings_fds(fd, &_vma_ring_fd, 1);
+ assert((-1) != _vma_ring_fd);
+ }
+#endif /* VMA_ZCOPY_ENABLED */
+
+ return fd;
+}
+
+/* Create a nonblocking, SO_REUSEADDR UDP socket bound to *addr.
+ * Returns the descriptor on success, -1 on any failure. */
+static int _udp_create_and_bind(struct sockaddr_in *addr)
+{
+ int rc = 0;
+ int fd = -1;
+ int flag;
+
+ fd = socket(PF_INET, SOCK_DGRAM, IPPROTO_UDP);
+ /* socket() returns -1 on error; 0 is a valid descriptor, so the old
+  * (!fd) test both missed real failures and rejected fd 0 */
+ if (fd < 0) {
+ rc = -EBUSY;
+ log_fatal("Failed to create socket\n");
+ goto err;
+ }
+
+ flag = 1;
+ rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &flag, sizeof(int));
+ if (rc < 0) {
+ log_error("Failed to setsockopt: %s\n", strerror(errno));
+ goto err;
+ }
+
+ rc = _set_noblock(fd);
+ if (rc < 0) {
+ log_error("Failed to nonblocking: %s\n", strerror(errno));
+ goto err;
+ }
+
+ rc = bind(fd, (struct sockaddr *) addr, sizeof(*addr));
+ if (rc < 0) {
+ rc = -EBUSY;
+ log_fatal("Failed to bind socket\n");
+ goto err;
+ }
+
+err:
+ /* do not leak the socket when a step after socket() failed */
+ if (0 != rc && fd >= 0) {
+ close(fd);
+ }
+ return (rc == 0 ? 
fd : (-1)); +} +#else +static int _tcp_client_init(struct sockaddr_in *addr) +{ + int rc = 0; + int fd = -1; + int flag; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + if (!fd) { + rc = -EBUSY; + log_fatal("Failed to create socket\n"); + goto err; + } + + flag = 1; + rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); + if (rc < 0) { + log_error("Failed to disable NAGLE: %s\n", strerror(errno)); + goto err; + } + + rc = _set_noblock(fd); + if (rc < 0) { + log_error("Configure failed: %s\n", strerror(errno)); + goto err; + } + + rc = connect(fd, (struct sockaddr *)addr, sizeof(*addr)); + if (rc < 0 && errno != EINPROGRESS) { + log_error("Connect failed: %s\n", strerror(errno)); + goto err; + } + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + /* Need to get ring after listen() or nonblocking connect() */ + if (_vma_ring_fd < 0) { + _vma_api->get_socket_rings_fds(fd, &_vma_ring_fd, 1); + assert((-1) != _vma_ring_fd); + } +#endif /* VMA_ZCOPY_ENABLED */ + + /* do this for non-blocking socket */ + rc = 0; +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + while (0 == rc) { + uint32_t event; + struct vma_completion_t vma_comps; + rc = _vma_api->socketxtreme_poll(_vma_ring_fd, &vma_comps, 1, 0); + if (rc > 0) { + event = (uint32_t)vma_comps.events; + if (vma_comps.events & EPOLLOUT) { + fd = vma_comps.user_data; + rc = 0; + break; + } else { + log_warn("event=0x%x user_data=%ld\n", event, vma_comps.user_data); + rc = 0; + } + } + } +#else + /* wait for setting connection */ + if (0) { + fd_set rset, wset; + FD_ZERO(&rset); + FD_SET(fd, &rset); + wset = rset; + + if (select(fd + 1, &rset, &wset, NULL, NULL) == 0) { + close(fd); + errno = ETIMEDOUT; + rc = -ETIMEDOUT; + log_error("select failed: %s\n", strerror(errno)); + goto err; + } + } else { + int efd; + struct epoll_event event; + int n; + struct epoll_event events[10]; + + efd = epoll_create1(0); + event.events = EPOLLOUT | EPOLLIN; + event.data.fd = fd; + rc = 
epoll_ctl(efd, EPOLL_CTL_ADD, fd, &event); + n = epoll_wait(efd, events, 10, -1); + epoll_ctl(efd, EPOLL_CTL_DEL, fd, NULL); + close(efd); + if (n <= 0 || events[0].events != EPOLLOUT || events[0].data.fd != fd) { + log_error("epoll_wait event=0x%x fd=%d\n", events[0].events, events[0].data.fd); + goto err; + } + } +#endif /* VMA_ZCOPY_ENABLED */ + + log_trace("Established connection: fd=%d to %s\n", fd, _addr2str(addr)); + +err: + return (rc == 0 ? fd : (-1)); +} + +static int _tcp_server_init(int fd) +{ + int rc = 0; + struct sockaddr in_addr; + socklen_t in_len; + int flag; + + /* Need to get ring after listen() or nonblocking connect() */ +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + if (_vma_ring_fd < 0) { + _vma_api->get_socket_rings_fds(fd, &_vma_ring_fd, 1); + assert((-1) != _vma_ring_fd); + } +#endif /* VMA_ZCOPY_ENABLED */ + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) + while (0 == rc) { + uint32_t event; + struct vma_completion_t vma_comps; + rc = _vma_api->socketxtreme_poll(_vma_ring_fd, &vma_comps, 1, 0); + if (rc > 0) { + event = (uint32_t)vma_comps.events; + if (vma_comps.events & VMA_SOCKETXTREME_NEW_CONNECTION_ACCEPTED) { + fd = vma_comps.user_data; + in_len = sizeof(in_addr); + memcpy(&in_addr, &vma_comps.src, in_len); + } else { + log_warn("event=0x%x user_data=%ld\n", event, vma_comps.user_data); + rc = 0; + } + } + } +#else + in_len = sizeof(in_addr); + fd = accept(fd, &in_addr, &in_len); + if (fd < 0) { + log_error("Accept failed: %s\n", strerror(errno)); + goto err; + } +#endif /* VMA_ZCOPY_ENABLED */ + + log_trace("Accepted connection: fd=%d from %s\n", fd, _addr2str((struct sockaddr_in *)&in_addr)); + + flag = 1; + rc = setsockopt(fd, IPPROTO_TCP, TCP_NODELAY, (char *) &flag, sizeof(int)); + if (rc < 0) { + log_error("Failed to disable NAGLE: %s\n", strerror(errno)); + goto err; + } + + rc = _set_noblock(fd); + +err: + return fd; +} + +static int _tcp_create_and_bind(struct sockaddr_in *addr) +{ + int rc = 0; 
+ int fd; + int flag; + + fd = socket(PF_INET, SOCK_STREAM, IPPROTO_IP); + if (!fd) { + rc = -EBUSY; + log_fatal("Failed to create socket\n"); + goto err; + } + + flag = 1; + rc = setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, (char *) &flag, sizeof(int)); + if (rc < 0) { + log_error("Failed to setsockopt: %s\n", strerror(errno)); + goto err; + } + + rc = bind(fd, (struct sockaddr *) addr, sizeof(*addr)); + if (rc < 0) { + rc = -EBUSY; + log_fatal("Failed to bind socket\n"); + goto err; + } + + listen(fd, SOMAXCONN); + +err: + return (rc == 0 ? fd : (-1)); +} +#endif /* UDP_ENABLED */ + +static int _write(int fd, uint8_t *buf, int count, int block) +{ + int n, nb; + + nb = 0; + do { + n = write(fd, buf, count); + if (n <= 0) { + if (errno == EAGAIN) { + log_trace("blocking write fd=%d ret=%d written %d of %d %s\n", + fd, n, nb, count, strerror(errno)); + if (block) { + continue; + } + return nb; + } + log_error("bad write fd=%d ret=%d written %d of %d %s\n", + fd, n, nb, count, strerror(errno)); + return nb; + } + count -= n; + buf += n; + nb += n; + } while (block && (count > 0)); + + return nb; +} + +#if defined(VMA_ZCOPY_ENABLED) && (VMA_ZCOPY_ENABLED == 1) +#else +static int _read(int fd, uint8_t *buf, int count, int block) +{ + int n; + int nb; + + nb = 0; + do { + n = read(fd, buf, count); + if (n == 0) { + log_error("EOF?\n"); + return -1; + } + if (n < 0) { + if (errno == EAGAIN) { + log_trace("blocking read fd=%d ret=%d read %d of %d %s\n", + fd, n, nb, count, strerror(errno)); + if (block) { + continue; + } + return nb; + } + log_error("bad read fd=%d ret=%d read %d of %d %s\n", + fd, n, nb, count, strerror(errno)); + return nb; + } + count -= n; + buf += n; + nb += n; + } while (block && (count > 0)); + + return nb; +} +#endif /* VMA_ZCOPY_ENABLED */ + +static void _ini_stat(void) +{ + memset(&_stat, 0, sizeof(_stat)); + +#if defined(TIMESTAMP_ENABLED) && (TIMESTAMP_ENABLED == 1) + if (_config.mode == MODE_RECEIVER) { + _stat.count = 0; + _stat.size = 
_config.scount * (_config.msg_count < 0 ? 10000 : _config.msg_count); + _stat.data = malloc(_stat.size * sizeof(*_stat.data) + _config.msg_size); + if (!_stat.data) { + log_fatal("Can not allocate memory for statistic\n"); + exit(1); + } + memset(_stat.data, 0, _stat.size * sizeof(*_stat.data) + _config.msg_size); + mlock(_stat.data, _stat.size * sizeof(*_stat.data) + _config.msg_size); + } +#endif /* TIMESTAMP_ENABLED */ +} + +#if defined(TIMESTAMP_ENABLED) && (TIMESTAMP_ENABLED == 1) +static int _cmpfunc (const void *a, const void *b) +{ + return ( *(int64_t*)a - *(int64_t*)b ); +} +#endif /* TIMESTAMP_ENABLED */ + +static void _fin_stat(void) +{ + log_info("STATISTIC:\n"); + log_info("mode: %d\n", _config.mode); + log_info("tx: %d\n", _stat.tx_count); + log_info("rx: %d\n", _stat.rx_count); +#if defined(TIMESTAMP_ENABLED) && (TIMESTAMP_ENABLED == 1) + if (_config.mode == MODE_RECEIVER) { + int64_t *values; + int64_t values_count = 0; + int64_t values_sum = 0; + int i, j, k; + + values = calloc(_stat.count, sizeof(*values)); + values_count = 0; + for (i = 0; i < _config.scount; i++) { + k = 0; + for (j = 0; j < _stat.count; j++) { + if (i == _stat.data[j].client_id) { + if (k < _config.msg_skip) { + k++; + continue; + } + /* calculate RTD/2 */ + values[values_count] = (_stat.data[j].time_end - _stat.data[j].time_start) / 2; + values_sum += values[values_count]; + values_count++; + } + } + } + assert(values_count <= _stat.count); + + if (values_count > 0) { + double percentile[] = {0.9999, 0.999, 0.995, 0.99, 0.95, 0.90, 0.75, 0.50, 0.25}; + int num = sizeof(percentile) / sizeof(percentile[0]); + double observationsInPercentile = (double)values_count / 100; + + qsort(values, values_count, sizeof(*values), _cmpfunc); + + log_info("====> avg-lat=%7.3lf\n", (double)values_sum / (values_count * (double)NANOS_IN_USEC)); + log_info("Total %lu observations; each percentile contains %.2lf observations\n", (long unsigned)values_count, observationsInPercentile); + + 
log_info("---> observation = %8.3lf\n", (double)values[values_count - 1] / (double)NANOS_IN_USEC); + for (j = 0; j < num; j++) { + int index = (int)( 0.5 + percentile[j] * values_count ) - 1; + if (index >= 0) { + log_info("---> percentile %6.2lf = %8.3lf\n", 100 * percentile[j], (double)values[index] / (double)NANOS_IN_USEC); + } + } + log_info("---> observation = %8.3lf\n", (double)values[0] / (double)NANOS_IN_USEC); + } else { + log_info("Total %lu observations\n", (long unsigned)values_count); + } + free(values); + if (_stat.data) { + munlock(_stat.data, _stat.size * sizeof(*_stat.data) + _config.msg_size); + free(_stat.data); + } + } +#endif /* TIMESTAMP_ENABLED */ +} diff --git a/tests/throughput_test/Makefile.am b/tests/throughput_test/Makefile.am new file mode 100644 index 0000000..29b7ad8 --- /dev/null +++ b/tests/throughput_test/Makefile.am @@ -0,0 +1,10 @@ +noinst_PROGRAMS = udp_perf + +AM_CPPFLAGS := \ + -I$(top_builddir)/. -I$(top_srcdir)/. + +udp_perf_LDADD = -lrt +udp_perf_SOURCES = bandwidth_test.c +udp_perf_DEPENDENCIES = Makefile.am Makefile.in Makefile + + diff --git a/tests/throughput_test/bandwidth_test.c b/tests/throughput_test/bandwidth_test.c new file mode 100644 index 0000000..781d90c --- /dev/null +++ b/tests/throughput_test/bandwidth_test.c @@ -0,0 +1,726 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + + +#include // sockets +#include // sockets +#include +#include +#include +#include // internet address manipulation +#include +#include +#include +#include +#include // random() +#include // timers +#include // clock_gettime() +#include // getopt() and sleep() +#include // getopt() +#include // isprint() +#include // select() According to POSIX 1003.1-2001 #include +#include +#include //Linux RTC +#include +#include +#include + + + + + +#define UDP_PERF_VERSION "1.2" +#define UDP_PERF_VERSION_DATE "21 November 2007" +#define HEADER (sizeof(struct iphdr)+sizeof(struct udphdr)) +#define MSG_RATE "5MB" +#define RTC_HZ 1024 +#define MIN_PAYLOAD_SIZE 17 +#define DEFAULT_PAYLOAD_SIZE 1470 +#define MAX_STREAM_SIZE (50*1024*1024) +#define DEFAULT_MC_ADDR "0.0.0.0" +#define DEFAULT_PORT 11111 +#define DEFAULT_TEST_DURATION 10 /* [sec] */ +#define MAX_TEST_DURATION_ON_i386 4 /* [sec] */ +#define MS 1000000 +#define KB 1024 +#define MB (KB*1024) +#define GB (MB*1024) +#define END_OF_PACKETS 9 +#define BYTE 1 +#define KBYTE 2 +#define MBYTE 3 +#define GBYTE 4 +#ifndef MAX_PATH_LENGTH + #define MAX_PATH_LENGTH 1024 +#endif +#define IP_PORT_FORMAT_REG_EXP "^((25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}"\ + "(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?):"\ + 
"(6553[0-5]|655[0-2][0-9]|65[0-4][0-9]{2}|6[0-4][0-9]{3}|[0-5]?[0-9]{1,4})\n" + +//long msg_rate=0; +bool b_exit = false; +struct sigaction sigact; +unsigned int client_counter = 0; +struct timespec ts; +struct timespec start_time, end_time; +int max_fd = 0; +int fd; /* used when single mc group is given */ +fd_set readfds; +double totaltime=0; + + +regex_t regexpr; + +typedef struct tagPKT { + long seqnum; + char buf; + int size; +} PKT; + +PKT *pkt=NULL; + +struct user_params_t { + struct sockaddr_in addr; + uint16_t mc_dest_port; + int sec; /* test duration */ + int msg_size; + int server; + double sendRate; + char sendRateDetails; + +} user_params; + +static void usage(const char *argv0) +{ + printf("\nUdp Bandwidth Test\n"); + printf("Usage:\n"); + printf("\t%s [OPTIONS]\n", argv0); + printf("\t%s -s [-i ip [-p port] [-m message_size]\n", argv0); + printf("\t%s -c -i ip [-p port] [-m message_size] [-t time] \n", argv0); + printf("\t%s -c -r message_rate [-m message_size] [-t time] \n", argv0); + printf("\n"); + printf("Options:\n"); + printf(" -s, --server\t\t\trun server (default - unicast)\n"); + printf(" -c, --client\t\t\trun client\n"); + printf(" -i, --ip=\t\t\tlisten on/send to ip \n"); + printf(" -p, --port=\t\tlisten on/connect to port (default %d)\n", DEFAULT_PORT); + printf(" -t, --time=\t\trun for seconds (default %d, max = 3600)\n", DEFAULT_TEST_DURATION); + printf(" -m, --msg_size=\t\tuse messages of size bytes\n"); + printf(" -r, --msg rate expected\n"); + printf(" -v, --version\t\t\tprint the version\n"); + printf(" -h, --help\t\t\tprint this help message\n"); +} + + +void cleanup() +{ + if (pkt) { + free(pkt); + } + close(fd); +} + +void server_sig_handler(int signum) +{ + printf("Got signal %d - exiting.\n", signum); + b_exit = true; + //exit(0); +} + +void client_sig_handler(int signum) +{ + if (signum) {}; + + if (clock_gettime(CLOCK_REALTIME, &end_time)) { + perror("udp_perf: clock_gettime()"); + exit(1); + } + + if (!pkt) { + 
printf("packet not allocated\n"); + } + else if (pkt->seqnum) { + printf("udp_perf: Total time taken:%.3lf sec, total packet sent %ld, avg msg rate %.0lf pps,\n",totaltime/1000 ,pkt->seqnum, (pkt->seqnum*1000/totaltime)); + } + + b_exit = true; + //exit(0); +} + +/* set the timer on client to the [-t sec] parameter given by user */ + +void set_client_timer(struct itimerval *timer) +{ + + timer->it_value.tv_sec = user_params.sec; + timer->it_value.tv_usec = 0; + timer->it_interval.tv_sec = 0; + timer->it_interval.tv_usec = 0; + +} + +/* set the action taken when signal received */ + +void set_signal_action() +{ + sigact.sa_handler = + user_params.server ? server_sig_handler : client_sig_handler; + sigemptyset(&sigact.sa_mask); + sigact.sa_flags = 0; + sigaction(user_params.server ? SIGINT : SIGALRM, &sigact, NULL); +} + + +int get_send_rate(char *tmp) +{ + int i; + char ez[MAX_PATH_LENGTH]; + for (i=0;tmp[i];i++) { + if (tmp[i] < '0' || tmp[i] > '9' ) + break; + else + ez[i]=tmp[i]; + } + ez[i]='\0'; + if (strstr(tmp,"M") || strstr(tmp,"m")) + user_params .sendRateDetails = MBYTE; + else if (strstr(tmp,"K") || strstr(tmp,"k")) + user_params .sendRateDetails = KBYTE; + else if (strstr(tmp,"G") || strstr(tmp,"g")) + user_params .sendRateDetails = GBYTE; + else if (strstr(tmp,"B") || strstr(tmp,"b")) + user_params .sendRateDetails = BYTE; + else + user_params .sendRateDetails = BYTE; + + // printf("user_params.sendRateDetails=%d\n",user_params.sendRateDetails); + return atoi(ez); +} + + +void set_defaults() +{ + memset(&user_params, 0, sizeof(struct user_params_t)); + user_params.addr.sin_family = AF_INET; + inet_aton(DEFAULT_MC_ADDR, &user_params.addr.sin_addr); + user_params.mc_dest_port = DEFAULT_PORT; + user_params.addr.sin_port = htons(user_params.mc_dest_port); + user_params.sec = DEFAULT_TEST_DURATION; + user_params.msg_size = DEFAULT_PAYLOAD_SIZE; + user_params.server = 1; + //user_params.use_select = 0; + user_params.sendRate = get_send_rate(MSG_RATE); + + if 
(user_params.sendRateDetails == KBYTE) + user_params.sendRate *= KB; + else if (user_params.sendRateDetails == MBYTE) + user_params.sendRate *= MB; + else if (user_params.sendRateDetails == GBYTE) + user_params.sendRate *= GB; + +} + +void print_version() +{ + printf("udp_perf version %s (%s)\n", UDP_PERF_VERSION, UDP_PERF_VERSION_DATE); +} + + +int check_empty_addr(struct in_addr in){ + return !(strcmp(DEFAULT_MC_ADDR, inet_ntoa(in))); + +} + +void prepare_network(int is_server) +{ + u_int yes = 1; + u_char i_loop = 0; + struct ip_mreq mreq; + uint32_t in_addr; + struct sockaddr_in client_addr; + //printf("udp_lat: %s: entry\n", __func__); + + memset(&mreq,0,sizeof(struct ip_mreq)); + printf(" %s port %d\n", inet_ntoa(user_params.addr.sin_addr), user_params.mc_dest_port); + /* create a UDP socket */ + if ((fd = socket(AF_INET, SOCK_DGRAM, 0)) < 0) { + perror("udp_lat: socket()"); + exit(1); + } + + /* allow multiple sockets to use the same PORT number */ + if (setsockopt(fd, SOL_SOCKET, SO_REUSEADDR, &yes, sizeof(yes)) < 0) { + perror("udp_lat: Reusing ADDR failed"); + exit(1); + } + + in_addr = ntohl(((struct sockaddr_in *)&user_params.addr)->sin_addr.s_addr); + + /* bind to receive address */ + if (is_server) { + /* check if the ip is 0.0.0.0 and if so insert INADDR_ANY to user_params.addr */ + if (check_empty_addr(user_params.addr.sin_addr)) { + user_params.addr.sin_addr.s_addr = htonl(INADDR_ANY); + } + } + + if (IN_MULTICAST(in_addr)) { + if (bind(fd, (struct sockaddr *)&user_params.addr, sizeof(user_params.addr)) < 0) { + perror("udp_lat: bind()"); + exit(1); + } + + /* use setsockopt() to request that the kernel join a multicast group */ + mreq.imr_multiaddr.s_addr = user_params.addr.sin_addr.s_addr; + mreq.imr_interface.s_addr = htonl(INADDR_ANY); + if (setsockopt(fd, IPPROTO_IP, IP_ADD_MEMBERSHIP, &mreq, sizeof(mreq)) < 0) { + perror("udp_lat: setsockopt()"); + exit(1); + } + + if (setsockopt(fd, IPPROTO_IP, IP_MULTICAST_LOOP, &i_loop, sizeof(i_loop)) == 
(-1)) { + perror("udp_lat: setsockopt()"); + exit(1); + } + } else { + if (!is_server) { + client_addr.sin_family = AF_INET; + client_addr.sin_port = user_params.addr.sin_port; + client_addr.sin_addr.s_addr = htonl( INADDR_ANY ); + memset(&(client_addr.sin_zero), '\0', 8); // zero the rest of the struct + + //printf ("IP to bind: %s\n",inet_ntoa(client_addr.sin_addr)); + if (bind(fd, (struct sockaddr *)&client_addr, sizeof(client_addr)) < 0) { + perror("udp_lat: bind()"); + exit(1); + } + } else { //server - unicast + if (bind(fd, (struct sockaddr *)&user_params.addr, sizeof(user_params.addr)) < 0) { + perror("udp_lat: bind()"); + exit(1); + } + } + } + +} + +long get_current_time_us() +{ + struct timeval tv; + long usec; + /* Obtain the time of day, and convert it to a tm struct. */ + gettimeofday (&tv, NULL); + usec = tv.tv_sec * MS; + usec += tv.tv_usec; + return usec; +} + +float get_recieve(PKT *pkt,float recv) +{ + if (pkt ->buf == MBYTE) + recv/=MB; + else if (pkt ->buf == KBYTE) + recv/=KB; + else if (pkt ->buf == GBYTE) + recv/=GB; + + return recv; + +} + +void set_msg_size() +{ + + pkt=(PKT *)malloc(user_params.msg_size); + if (!pkt) { + printf("Error due memmory allocation\n"); + exit(1); + } + + + +} + + + +void server_handler() +{ + + struct timeval; + int nbytes; + socklen_t size = sizeof(struct sockaddr); + struct sockaddr_in client_addr; + printf("udp_perf: [SERVER] listen on: "); + prepare_network(1); + long now = 0; + int missed=0,totalMissed=0,lastMissed=0; + float timeTaken=0,actualRecieve=0; + long totalPkt = 0,lastTotalPkt = 0; + char detail[3]; + while (!b_exit) { + nbytes = recvfrom(fd, pkt, user_params.msg_size, 0, (struct sockaddr *)&client_addr, &size); + if (b_exit) + goto get_out_s; + if (nbytes < 0) { + perror("udp_perf: recvfrom()"); + cleanup(); + exit(1); + } + //printf("nbytes= %d\n",nbytes); + //exit(1); + + if (nbytes < (pkt ->size)) { + printf("Error:Expected %d,recieved=%d\n",pkt->size ,nbytes); + cleanup(); + exit(1); + } + + 
if (pkt->buf == END_OF_PACKETS) { + timeTaken = (get_current_time_us()-now)/1000; + actualRecieve = get_recieve(pkt ,actualRecieve); + totalMissed=pkt->seqnum - totalPkt; + missed=totalMissed-lastMissed; + printf("Missed %d pkt, Actual rate = %.2lf%s, Actual packets recived %.0lf pps, Time %.2lf ms\n", + missed,actualRecieve,detail ,(totalPkt-lastTotalPkt)*1000/timeTaken,timeTaken); + printf("Expected packets:%ld ,Total recieved :%ld ,Total missed %d \n",pkt->seqnum,totalPkt,totalMissed); + continue; + } + + if (0 == pkt->seqnum) { + printf("New instance of server started. Resetting counters\n"); + now = get_current_time_us(); + totalPkt = 0; + lastTotalPkt=0; + lastMissed=0; + if (pkt->buf == 1) + strcpy(detail,"B"); + else if (pkt->buf == 2) + strcpy(detail,"KB"); + else if (pkt->buf == 3) + strcpy(detail,"MB"); + else if (pkt->buf == 4) + strcpy(detail,"GB"); + + totalMissed=0; + } + + if ((get_current_time_us()-now) / MS >= 1) { + + timeTaken = (get_current_time_us()-now)/1000; + actualRecieve = (totalPkt-lastTotalPkt)*((nbytes)/(timeTaken/1000)); + totalMissed=pkt->seqnum - totalPkt; //(sizeof(struct iphdr)+sizeof(struct udphdr)) + actualRecieve = get_recieve(pkt ,actualRecieve); + missed=totalMissed-lastMissed; + printf("Missed %d pkt, Actual rate = %.2lf%s, Actual packets recived %.0lf pps, Time %.2lf ms\n", + missed,actualRecieve,detail,(totalPkt-lastTotalPkt)*1000/timeTaken,timeTaken); + lastMissed = totalMissed; + lastTotalPkt = totalPkt; + now = get_current_time_us(); + } + + totalPkt++; + } + +get_out_s: + return; +} + +void client_handler() +{ + int retval,fd_delay; + unsigned long data; + struct itimerval timer; + int ret; + float timeTaken,actualSend,total_pkt; + long sent_pkt; + int i,j; + long now; + char detail[3]; + if (!pkt) { + printf("pkt not allocated"); + exit(1); + } + pkt->size = user_params.msg_size; + prepare_network(0); + sleep(2); + + if (user_params.sendRateDetails == 1) + strcpy(detail,"B"); + else if (user_params.sendRateDetails == 
2) + strcpy(detail,"KB"); + else if (user_params.sendRateDetails == 3) + strcpy(detail,"MB"); + else if (user_params.sendRateDetails == 4) + strcpy(detail,"GB"); + + printf("udp_perf: Client Start sending ...\n"); + fd_delay = open("/dev/rtc", O_RDONLY); + if (fd_delay == -1) { + perror("/dev/rtc"); + exit(1); + } + //printf("Turning RTC interrupts (%d HZ)\n",RTC_HZ); + /* Turn on update interrupts (RTC_HZ per second) */ + retval = ioctl(fd_delay, RTC_IRQP_SET,RTC_HZ); + if (retval == -1) { + perror("ioctl"); + exit(1); + } + + /* Enable periodic interrupts */ + retval = ioctl(fd_delay,RTC_PIE_ON, 0); + if (retval == -1) { + perror("ioctl"); + exit(1); + } + total_pkt = (user_params.sendRate / (user_params.msg_size)); + if(total_pktbuf = user_params.sendRateDetails; + pkt->seqnum=0; + set_client_timer(&timer); + if (clock_gettime(CLOCK_REALTIME, &start_time)) { + perror("udp_perf: clock_gettime()"); + exit(1); + } + ret = setitimer(ITIMER_REAL, &timer, NULL); + if (ret) { + perror("udp_perf: setitimer()"); + exit(1); + } + while (!b_exit) { + sent_pkt = 0; + now = get_current_time_us(); + for (j=0; sent_pkt < total_pkt;j++) { + /* + if (total_pkt < RTC_HZ) { + //now = get_current_time_us(); + if (sendto(fd, pkt, user_params.msg_size , 0, + (struct sockaddr *)&(user_params.addr), sizeof(user_params.addr)) seqnum++; + for (i=0; iseqnum++; + + } + } + retval = read(fd_delay, &data, sizeof(unsigned long)); + if (retval == -1) { + perror("read"); + exit(1); + } + } + + + + timeTaken = (get_current_time_us() - now) / 1000; + totaltime+=timeTaken; + if (total_pkt==RTC_HZ) + actualSend=(RTC_HZ * user_params .msg_size) / (timeTaken / 1000 ); + else + actualSend = (user_params.sendRate / (timeTaken / 1000)); + + if (user_params.sendRateDetails == KBYTE) + actualSend /= KB; + else if (user_params.sendRateDetails == MBYTE) + actualSend /= MB; + else if (user_params.sendRateDetails == GBYTE) + actualSend /= GB; + printf("Time taken = %.0lf ms, Actual sent Rate = %.2lf%s, Actual 
packets sent %ld pps \n", timeTaken,actualSend,detail ,sent_pkt*1000/(long)timeTaken); + } + +get_out_c: + pkt->buf = (char)END_OF_PACKETS; + if (sendto(fd, pkt, user_params.msg_size, 0, (struct sockaddr *)&(user_params.addr), sizeof(user_params.addr)) < user_params .msg_size ) { + perror("udp_perf: sendto()"); + exit(1); + } + return; +} + + + +int main(int argc, char *argv[]) { + char send_rate[1024]; + if (argc == 1){ + usage(argv[0]); + return 1; + } + set_defaults(); + + /* Parse the parameters */ + while (1) { + int c = 0; + + static struct option long_options[] = { + {.name = "port", .has_arg = 1,.val = 'p'}, + {.name = "time", .has_arg = 1,.val = 't'}, + {.name = "rate", .has_arg = 1,.val = 'r'}, + {.name = "msg_size", .has_arg = 1,.val = 'm'}, + {.name = "ip", .has_arg = 1,.val = 'i'}, + {.name = "client", .has_arg = 0,.val = 'c'}, + {.name = "server", .has_arg = 0,.val = 's'}, + {.name = "help", .has_arg = 0,.val = 'h'}, + {.name = "version", .has_arg = 0,.val = 'v'}, + {0,0,0,0} + }; + + if ((c = getopt_long(argc, argv, "p:t:r:m:i:schv", + long_options, NULL)) == -1) + break; + + switch (c) { + case 'p': + user_params.mc_dest_port = strtol(optarg, NULL, 0); + /* strtol() returns 0 if there were no digits at all */ + if (user_params.mc_dest_port <= 0) { + printf("udp_perf: Invalid port: %d \n", + user_params.mc_dest_port); + usage(argv[0]); + return 1; + } + user_params.addr.sin_port = + htons(user_params.mc_dest_port); + break; + case 't': + user_params.sec = strtol(optarg, NULL, 0); + if (user_params.sec <= 0 || user_params.sec > 3600) { + printf("udp_perf: Invalid duration: %d \n", + user_params.sec); + usage(argv[0]); + return 1; + } + break; + case 'r': + strncpy(send_rate, optarg, MAX_PATH_LENGTH); + user_params.sendRate=get_send_rate(send_rate); + + if (user_params.sendRateDetails == (char)KBYTE) + user_params.sendRate *= KB; + else if (user_params.sendRateDetails == (char)MBYTE) + user_params.sendRate *= MB; + else if (user_params.sendRateDetails 
== (char)GBYTE) + user_params.sendRate *= GB; + if (user_params.sendRate <= 0) { + printf("udp_perf: Invalid message rate %fd\n",user_params.sendRate); + usage(argv[0]); + return 1; + } + + break; + + case 'm': + user_params.msg_size = strtol(optarg, NULL, 0); + if (user_params.msg_size < MIN_PAYLOAD_SIZE) { + printf("udp_perf: Invalid message size: %d (min: %d)\n", + user_params.msg_size, MIN_PAYLOAD_SIZE); + usage(argv[0]); + return 1; + } + break; + case 'i': + if (!inet_aton(optarg, &user_params.addr.sin_addr)) { // already in network byte order + printf("udp_perf: Invalid address: %s\n", + optarg); + usage(argv[0]); + return 1; + } + break; + case 's': + user_params.server = 1; + break; + case 'c': + user_params.server = 0; + break; + case 'h': + usage(argv[0]); + return 1; + break; + case 'v': + print_version(); + return 0; + default: + usage(argv[0]); + return 1; + } + } + if (optind < argc) { + printf("udp_perf: non-option ARGV-elements: "); + while (optind < argc) + printf("%s\n", argv[optind++]); + printf("\n"); + usage(argv[0]); + return 1; + } + set_msg_size(); + if (user_params.sendRate <= user_params.msg_size ) { + printf("udp_perf: Invalid message rate, should be bigger than msg size\n"); + return 1; + } + + set_signal_action(); + + + + + if (user_params.server) + server_handler(); + else + client_handler(); + + return 0; +} + diff --git a/tests/timetest/Makefile.am b/tests/timetest/Makefile.am new file mode 100644 index 0000000..b850b32 --- /dev/null +++ b/tests/timetest/Makefile.am @@ -0,0 +1,6 @@ +noinst_PROGRAMS = timetest + +AM_CPPFLAGS := -I$(top_builddir)/src + +timetest_LDADD = -lrt $(top_builddir)/src/utils/libutils.la +timetest_SOURCES = timetest.cpp diff --git a/tests/timetest/timetest.cpp b/tests/timetest/timetest.cpp new file mode 100644 index 0000000..b94fbeb --- /dev/null +++ b/tests/timetest/timetest.cpp @@ -0,0 +1,221 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. 
+ * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include +#include +#include +#include +#include "utils/clock.h" +#include "utils/rdtsc.h" + +#define ITERATION_NUM 10000000 +#define ITERATION_NUM_LOW_PPS 100 +#define LOW_PPS_SLEEP_USEC 10000 + +int main(int argc, char* argv[]) +{ + if (argc) {}; + if (argv) {}; + + struct sched_param sp; + sp.sched_priority = 30; + sched_setscheduler(0, SCHED_FIFO, &sp); + + std::cout << "--------------------------------------------------------------------------------" << std::endl; + std::cout << "" << std::endl; + std::cout << "" << std::endl; + + std::cout << "Get time using clock_gettime(CLOCK_MONOTONIC):" << std::endl; + + + int64_t timeall = 0; + + timespec* times1 = new timespec[ITERATION_NUM]; + + for(int i=0;i 0) timeall += ts_to_nsec(&m_elapsed); + //std::cout << i << ": " << ts_to_nsec(&m_elapsed) << std::endl; + } + double rdtsc_avg = ((double)timeall)/(ITERATION_NUM-2); + std::cout << "RDTSC AVG: " << rdtsc_avg << " nsec" << std::endl; + + std::cout << "--------------------------------------------------------------------------------" << std::endl; + std::cout << "" << std::endl; + std::cout << "" << std::endl; + + std::cout << "Get time using gettimeofday:" << std::endl; + + timeall = 0; + + timeval* times = new timeval[ITERATION_NUM]; + for(int i=0;i 0) timeall += ts_to_nsec(&m_elapsed); + //std::cout << i << ": " << ts_to_nsec(&m_elapsed) << std::endl; + } + + double rdtsc_avg_lowpps = ((double)timeall)/(ITERATION_NUM_LOW_PPS-2); + std::cout << "RDTSC - low pps AVG: " << rdtsc_avg_lowpps << " nsec" << std::endl; + + std::cout << "--------------------------------------------------------------------------------" << std::endl; + std::cout << "" << std::endl; + std::cout << "" << std::endl; + + std::cout << "Get time using gettimeofday - low pps:" << std::endl; + + timeall = 0; + + for(int i=0;iUDP_LAT Using Select/Poll/Epoll<==============" "$log_file" + if [[ "$OVER_VMA" = yes ]]; then + vma_select_poll_info="With VMA_SELECT_POLL=0" + print_message 
"|----------------------------------|" "$log_file" + print_message "|VMA_SELECT_POLL=0" "$log_file" + print_message "|----------------------------------|" "$log_file" + fi + run_udp_lat_using_select_helper "$vma_select_poll_info" + run_udp_lat_using_poll_helper "$vma_select_poll_info" + run_udp_lat_using_epoll_helper "$vma_select_poll_info" + recreate_coalesce_params + tests_finish +} + +function run_udp_lat_using_select_epoll_poll_with_full_polling_vma_only +{ + if [[ "$OVER_VMA" = yes ]]; then + local vma_select_poll_old=$VMA_SELECT_POLL + vma_select_poll_info="" + save_coalesce_params + update_coalesce_4_udp_lat + append_tmp_file_and_delete "$TMP_DIR/$log_file.prep" "$log_file" + change_command_prefix VMA_SELECT_POLL "$VMA_SELECT_POLL_MAX_VAL" + vma_select_poll_info="With VMA_SELECT_POLL=$VMA_SELECT_POLL_MAX_VAL" + print_message "===============>UDP_LAT Using Select/Poll/Epoll<==============" "$log_file" + print_message "|----------------------------------|" "$log_file" + print_message "|VMA_SELECT_POLL=$VMA_SELECT_POLL_MAX_VAL" "$log_file" + print_message "|----------------------------------|" "$log_file" + run_udp_lat_using_select_helper "$vma_select_poll_info" + run_udp_lat_using_poll_helper "$vma_select_poll_info" + run_udp_lat_using_epoll_helper "$vma_select_poll_info" + change_command_prefix VMA_SELECT_POLL vma_select_poll_old + recreate_coalesce_params + tests_finish + fi +} + +function run_udp_lat_using_select_helper +{ + command_str="s" + log_str="Select (default timeout 1msec)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" + + command_str="s --timeout 0" + log_str="Select (timeout zero)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" + + command_str="s --timeout 1000" + log_str="Select (timeout 1 sec)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files 
"$command_str" "$log_str" + + command_str="s --timeout -1" + log_str="Select (timeout infinite)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" +} + +function run_udp_lat_using_poll_helper +{ + command_str="p" + log_str="Poll (default timeout 1msec)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" + + command_str="p --timeout 0" + log_str="Poll (timeout zero)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" + + command_str="p --timeout 1000" + log_str="Poll (timeout 1 sec)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" + + command_str="p --timeout -1" + log_str="Poll (timeout infinite)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" +} + +function run_udp_lat_using_epoll_helper +{ + command_str="e" + log_str="Epoll (default timeout 1msec)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" + + command_str="e --timeout 0" + log_str="Epoll (timeout zero)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" + + command_str="e --timeout 1000" + log_str="Epoll (timeout 1 sec)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" + + command_str="e --timeout -1" + log_str="Epoll (timeout infinite)" + prepare_udp_lat_using_feed_file_headlines "$log_str" "$1" + run_udp_lat_with_diff_mc_feed_files "$command_str" "$log_str" +} + +function run_udp_lat_with_diff_mc_feed_files +{ + local size_arr_len=${#MC_GROUP_SIZES[*]} + print_udp_lat_with_feed_files_header "$2" + for((i=0; $i < 
$size_arr_len; i=$((i=$i+1)))) + do + curr_feed_file_size=${MC_GROUP_SIZES[$i]} + feed_file_name="$TMP_DIR/feed_file_$curr_feed_file_size" + print_cycle_info $curr_feed_file_size "mc group" + create_mc_feed_files "$curr_feed_file_size" "$feed_file_name" + run_udp_lat_with_feed_file "$feed_file_name" "$1" + parse_udp_lat_test_results "${MC_GROUP_SIZES[$i]}" + remove_mc_feed_files "$feed_file_name" + done + clean_after_udp_lat +} + + +function run_udp_lat_with_feed_file +{ + upd_lat_command_line_srv=${PREFIX}"${UDP_LAT_APP} -s -m $UDP_LAT_MSG_SIZE -f $1 -F $2" + upd_lat_command_line_clt=${PREFIX}"${UDP_LAT_APP} -c -m $UDP_LAT_MSG_SIZE -f $1 -F $2 -t $DURATION" + + (echo ${SRV_CMMND_LINE_PREF}$upd_lat_command_line_srv | tee -a $log_file) >& /dev/null + (eval "$upd_lat_command_line_srv 2>&1 | tee >> $log_file &") + sleep 15 + (ssh $REM_HOST_IP "killall udp_lat") >& /dev/null + (echo "${CLT_CMMND_LINE_PREF} $upd_lat_command_line_clt" | tee -a "$TMP_DIR/$log_file.tmp") >& /dev/null + (ssh $REM_HOST_IP "sleep 10;$upd_lat_command_line_clt 2>&1 | tee >> $TMP_FILE") + pkill -2 -f udp_lat >& /dev/null + sleep 5 +} + +function print_udp_lat_with_feed_files_header +{ + print_message "=====================>UDP_LAT With $1<====================" "$log_file" +} + +function tests_finish +{ + eval "cat $TMP_DIR/$log_file.post" | tee -a $log_file >& /dev/null + echo "---------------------------------------------------------------" |tee -a $log_file + clean +} + +function print_cycle_info +{ + let "cycle_num=$i+1" + echo "##################### cycle $cycle_num of $size_arr_len #####################" + echo "#$2 size is $1" +} + +function parse_udp_lat_test_results +{ + check_udp_lat_succss + if [[ $success -eq $TRUE ]]; then + + latency=`ssh $REM_HOST_IP cat $TMP_FILE |tr A-Z a-z|grep latency|tr [="="=] " " |tr -s " "|tr " " "\n"|tail -2|head -1` + echo "#average latency is $latency usec" + echo $1,$latency >> $res_file + else + echo "#$ERROR_MESSAGE" + echo "$1,${ERROR_RESULT}" >> 
$res_file + + fi + + ssh $REM_HOST_IP "cat $TMP_FILE" | tee -a "$TMP_DIR/$log_file.tmp" >& /dev/null + ssh $REM_HOST_IP "rm -rf $TMP_FILE" >& /dev/null +} + +function check_udp_lat_succss +{ + local res=0 + + res=`ssh $REM_HOST_IP "cat $TMP_FILE |tr A-Z a-z |grep latency | wc -l"` + + if [[ $res -gt 0 ]]; then + success=$TRUE + else + success=$FALSE + fi + +} + +function prepare_output_files +{ + date=`date +%Y_%m_%d_%H_%M_%S` + log_file="${OUTPUT_FILES_PATH}vma_perf_${date}_logs.txt" + res_file="${OUTPUT_FILES_PATH}vma_perf_${date}_results.csv" + + touch $log_file + touch $res_file +} + +function prepare_udp_lat_using_feed_file_headlines +{ + echo "" >> $res_file + echo Udp_lat Using $1 $2 Test Results >> $res_file + echo MC Group Num,Latency >> $res_file +} + +function update_command_line_pref_in_log_file +{ + SRV_CMMND_LINE_PREF="[$srv_hostname "`pwd`"]" + CLT_CMMND_LINE_PREF="[$clt_hostname "`ssh $REM_HOST_IP pwd`"]" +} + +function get_hostnames +{ + clt_hostname=`ssh $REM_HOST_IP hostname` + srv_hostname=`hostname` + + update_command_line_pref_in_log_file + +} + +function update_command_prefix +{ + PREFIX="" + + if [[ "$OVER_VMA" = yes ]] ; then + + if [[ $VMA_IGMP_ENABLE -eq 0 ]] ; then + PREFIX="$PREFIX VMA_IGMP=0 " + fi + + if [[ $VMA_SELECT_POLL -ne 0 ]] ; then + PREFIX="$PREFIX VMA_SELECT_POLL=$VMA_SELECT_POLL " + fi + + if [[ $VMA_RX_BUFS -ne 0 ]] ; then + PREFIX="$PREFIX VMA_RX_BUFS=$VMA_RX_BUFS " + fi + + PREFIX=${PREFIX}"LD_PRELOAD=$VMA_LIB " + fi +} + +function change_command_prefix +{ + eval "$1=$2" + update_command_prefix +} + +function remove_ifaces +{ + add_curr_route_table_2_log + iface_arr_local=`route | grep 2[24][40].0.0.0 | tr -s ' ' | cut -d ' ' -f 8` + iface_arr_remote=`ssh $REM_HOST_IP "route | grep 2[24][40].0.0.0 | tr -s ' ' | cut -d ' ' -f 8"` + + echo "" >> "$TMP_DIR/$log_file.prep" + echo "============>Remove interfaces from route table <=============" >> "$TMP_DIR/$log_file.prep" + + for iface in $iface_arr_local + do + 
command="route del -net $DST_NET netmask $DST_MASK dev $iface" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a $TMP_DIR/$log_file.prep) >& /dev/null + eval "$command 2>&1 | tee >> $TMP_DIR/$log_file.prep" + done + + for iface in $iface_arr_remote + do + command="route del -net $DST_NET netmask $DST_MASK dev $iface" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "ssh $REM_HOST_IP "$command" 2>&1 | tee >> $TMP_DIR/$log_file.prep" + done +} + +function add_curr_route_table_2_log +{ + (echo "${SRV_CMMND_LINE_PREF} route" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "route 2>&1 | tee >> $TMP_DIR/$log_file.prep" + + (echo "${CLT_CMMND_LINE_PREF} route" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "ssh $REM_HOST_IP "route" 2>&1 | tee >> $TMP_DIR/$log_file.prep" +} + +function recreate_route_table +{ + echo "" >> "$TMP_DIR/$log_file.post" + echo "===================>Recreate route table <====================" >> "$TMP_DIR/$log_file.post" + command="route del -net $DST_NET netmask $DST_MASK dev $INTERFACE" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + eval "$command 2>&1 | tee >> $TMP_DIR/$log_file.post" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + ssh $REM_HOST_IP "$command" 2>&1 | tee >> "$TMP_DIR/$log_file.post" + + for iface in $iface_arr_local + do + command="route add -net $DST_NET netmask $DST_MASK dev $iface" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + eval "$command 2>&1 | tee >> $TMP_DIR/$log_file.post" + done + + for iface in $iface_arr_remote + do + command="route add -net $DST_NET netmask $DST_MASK dev $iface" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + (eval "ssh $REM_HOST_IP "$command" 2>&1 | tee >> $TMP_DIR/$log_file.post") >& /dev/null + done + eval "cat $TMP_DIR/$log_file.post" | tee -a 
$log_file >& /dev/null + clean +} + +function prepare_route_table +{ + echo "" >> "$TMP_DIR/$log_file.prep" + echo "=====================>Route table info <======================" >> "$TMP_DIR/$log_file.prep" + remove_ifaces + echo "" >> "$TMP_DIR/$log_file.prep" + echo "============>Add work interface to route table <==============" >> "$TMP_DIR/$log_file.prep" + command="route add -net $DST_NET netmask $DST_MASK dev $INTERFACE" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "$command 2>&1 | tee >> $TMP_DIR/$log_file.prep" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "ssh $REM_HOST_IP "$command" 2>&1 | tee >> $TMP_DIR/$log_file.prep" + eval "cat $TMP_DIR/$log_file.prep" | tee -a $log_file >& /dev/null + clean +} + +function save_coalesce_params +{ + local_coalesce_params_saved=$TRUE + remote_coalesce_params_saved=$TRUE + command1="ethtool -c $INTERFACE" + + echo "" >> "$TMP_DIR/$log_file.prep" + echo "===================>Coalesce params info<=====================" >> "$TMP_DIR/$log_file.prep" + + save_local_coalesce_params "$command1" rx-frames: initial_rx_frames_local + save_remote_coalesce_params "$command1" rx-frames: initial_rx_frames_remote + + rm -f $TMP_FILE >& /dev/null + rm -f $TMP_FILE.err >& /dev/null + +} + +function save_local_coalesce_params +{ + (echo "${SRV_CMMND_LINE_PREF} $1" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "$1 $COMMAND_REDIRECT" + check_succsess_and_save_param $2 get_coalesce_param + eval "$3=$?" + if [[ $SUCCSESS -eq $FALSE ]] ;then + local_coalesce_params_saved=$FALSE + fi + +} + +function save_remote_coalesce_params +{ + (echo "${CLT_CMMND_LINE_PREF} $1" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "(ssh $REM_HOST_IP "$1") $COMMAND_REDIRECT" + check_succsess_and_save_param $2 get_coalesce_param + eval "$3=$?" 
+ if [[ $SUCCSESS -eq $FALSE ]]; then + remote_coalesce_params_saved=$FALSE + fi +} + +function get_coalesce_param +{ + ret_val=`cat $TMP_FILE | grep $1 | cut -d " " -f 2 2>/dev/null` +} + +function get_umcast_val +{ + umcast_val=`cat $TMP_FILE | tr -d "\n" 2>/dev/null` +} + +function update_coalesce_4_udp_lat +{ + local_coalesce_params_changed=$FALSE + remote_coalesce_params_changed=$FALSE + echo "" >> "$TMP_DIR/$log_file.prep" + echo "============>Prepare coalesce params for udp_lat<=============" >> "$TMP_DIR/$log_file.prep" + update_coalesce_params $RX_FRAMES_4_UDP_LAT +} + +function update_coalesce_params +{ + command="ethtool -C $INTERFACE rx-frames $1" + + if [[ $local_coalesce_params_saved -eq $TRUE ]]; then + if [[ $initial_rx_frames_local -ne $1 ]]; then + update_local_coalesce_params + fi + fi + + if [[ $remote_coalesce_params_saved -eq $TRUE ]]; then + if [[ $initial_rx_frames_remote -ne $1 ]]; then + update_remote_coalesce_params + fi + fi +} + +function update_local_coalesce_params +{ + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "$command $COMMAND_REDIRECT" + check_command_succss + + if [[ $SUCCSESS -eq $TRUE ]]; then + local_coalesce_params_changed=$TRUE + else + local_coalesce_params_changed=$FALSE + fi + +} + +function update_remote_coalesce_params +{ + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "(ssh $REM_HOST_IP "$command") $COMMAND_REDIRECT" + check_command_succss + if [[ $SUCCSESS -eq $TRUE ]]; then + remote_coalesce_params_changed=$TRUE + else + remote_coalesce_params_changed=$FALSE + fi +} + +function recreate_coalesce_params +{ + echo "" >> "$TMP_DIR/$log_file.post" + echo "==================>Recreate coalesce params<==================" >> "$TMP_DIR/$log_file.post" + + if [[ $local_coalesce_params_changed -eq $TRUE ]]; then + recreate_local_coalesce_params + fi + + if [[ $remote_coalesce_params_changed -eq $TRUE ]]; then + 
recreate_remote_coalesce_params + fi +} + +function recreate_local_coalesce_params +{ + local command="ethtool -C $INTERFACE rx-frames $initial_rx_frames_local" + + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + eval $command >& /dev/null +} + +function recreate_remote_coalesce_params +{ + local command="ethtool -C $INTERFACE rx-frames $initial_rx_frames_remote" + + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + ssh $REM_HOST_IP "$command" >& /dev/null +} + +function save_umcast +{ + local_umcast_saved=$FALSE + remote_umcast_saved=$FALSE + echo "" >> "$TMP_DIR/$log_file.prep" + echo "========================>Umcast info<=========================" >> "$TMP_DIR/$log_file.prep" + check_if_infbnd_iface + if [[ $is_infiniband -eq $TRUE ]]; then + save_local_umcast_val + save_remote_umcast_val + fi + eval "cat $TMP_DIR/$log_file.prep" | tee -a $log_file >& /dev/null + clean +} + +function save_local_umcast_val +{ + local command="cat /sys/class/net/$INTERFACE/umcast" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "cat /sys/class/net/$INTERFACE/umcast 1>$TMP_FILE 2>$TMP_FILE.err " + check_succsess_and_save_param initial_local_umcast_val get_umcast_val + if [[ $SUCCSESS -eq $FALSE ]]; then + local_umcast_saved=$FALSE + else + initial_local_umcast_val=$umcast_val + local_umcast_saved=$TRUE + fi +} + +function save_remote_umcast_val +{ + local command="cat /sys/class/net/$INTERFACE/umcast" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + (eval "ssh $REM_HOST_IP cat /sys/class/net/$INTERFACE/umcast") 1>$TMP_FILE 2>$TMP_FILE.err + check_succsess_and_save_param initial_remote_umcast_val get_umcast_val + if [[ $SUCCSESS -eq $FALSE ]]; then + remote_umcast_saved=$FALSE + else + initial_remote_umcast_val=$umcast_val + remote_umcast_saved=$TRUE + fi +} + + +function update_umcast +{ + echo "" >> 
"$TMP_DIR/$log_file.prep" + echo "===================>Prepare umcast param<====================" >> "$TMP_DIR/$log_file.prep" + local_umcast_changed=$FALSE + remote_umcast_changed=$FALSE + + if [[ $initial_local_umcast_val -ne $UMCAST_VAL ]]; then + update_local_umcast + fi + + if [[ $initial_remote_umcast_val -ne $UMCAST_VAL ]]; then + update_remote_umcast + fi + + eval "cat $TMP_DIR/$log_file.prep" | tee -a $log_file >& /dev/null + clean +} + +function update_local_umcast +{ + local command="echo $UMCAST_VAL 1>&/sys/class/net/$INTERFACE/umcast" + if [[ $local_umcast_saved -eq $TRUE ]]; then + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "$command 1>$TMP_FILE 2>$TMP_FILE.err" + check_command_succss + if [[ $SUCCSESS -eq $TRUE ]]; then + local_umcast_changed=$TRUE + else + local_umcast_changed=$FALSE + fi + fi +} + +function update_remote_umcast +{ + local command="echo $UMCAST_VAL 1>&/sys/class/net/$INTERFACE/umcast" + if [[ $local_umcast_saved -eq $TRUE ]]; then + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + ssh $REM_HOST_IP "$command" | eval "1>$TMP_FILE 2>$TMP_FILE.err " + check_command_succss + if [[ $SUCCSESS -eq $TRUE ]]; then + remote_umcast_changed=$TRUE + else + remote_umcast_changed=$FALSE + fi + fi +} + +function recreate_umcast +{ + echo "" >> "$TMP_DIR/$log_file.post" + echo "====================>Recreate umcast value<===================" >> "$TMP_DIR/$log_file.post" + + if [[ $local_umcast_changed -eq $TRUE ]]; then + recreate_local_umcast_val + fi + + if [[ $remote_umcast_changed -eq $TRUE ]]; then + recreate_remote_umcast_val + fi + + eval "cat $TMP_DIR/$log_file.post" | tee -a $log_file >& /dev/null +} + +function recreate_local_umcast_val +{ + local command="echo $initial_local_umcast_val 1>&/sys/class/net/$INTERFACE/umcast" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + eval "$command" >& /dev/null +} + 
+function recreate_remote_umcast_val +{ + local command="echo $initial_remote_umcast_val 1>&/sys/class/net/$INTERFACE/umcast" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + ssh $REM_HOST_IP $command >& /dev/null +} + +function clean_after_udp_lat +{ + ssh $REM_HOST_IP killall udp_lat >& /dev/null + pkill -2 -f udp_lat >& /dev/null + sleep 10 + cat "$TMP_DIR/$log_file.tmp" | tee -a $log_file >& /dev/null + rm -f "$TMP_DIR/$log_file.tmp" >& /dev/null + +} + +function collect_nodes_info_to_file +{ + collect_local_node_info_to_file "$1" + collect_remote_node_info_to_file "$1" +} + +function collect_local_node_info_to_file +{ + (echo "======================>Local node info<======================" | tee -a $1) >& /dev/null + (echo "--------------------" | tee -a $1) >& /dev/null + (hostname | tee -a $1) >& /dev/null + (echo -n "OS: " >> $1;cat /etc/issue | grep We | tee -a $1) >& /dev/null + (echo -n "CPU: " >> $1;cat /proc/cpuinfo | grep 'model name' | sort -u | awk '{print $4, $5, $6, $7, $9}' | tee -a $1) >& /dev/null + (echo -n "Number of CPUs: " >> $1;cat /proc/cpuinfo |grep proce |wc |awk '{print $1}' | tee -a $1) >& /dev/null + (echo -n "CPU Type: " >> $1;uname -a | awk '{print $12}' | tee -a $1) >& /dev/null + (cat /proc/meminfo |grep [M,m]em | tee -a $1) >& /dev/null + (echo -n "Kernel: " >> $1;uname -a | awk '{print $3}' | tee -a $1) >& /dev/null + (cat /usr/voltaire/version | tee -a $1) >& /dev/null + (ibstat | grep -e "CA type" -e "Firmware version" | tee -a $1) >& /dev/null + (ibstatus | grep -e rate -e state | grep -v 'phys state' | tee -a $1) >& /dev/null + check_if_infbnd_iface + if [[ $is_infiniband -eq $TRUE ]]; then + (echo -n "IPoIB mode: " >> $1 ; cat "/sys/class/net/$INTERFACE/mode" | tee -a $1) >& /dev/null + fi + (ifconfig $INTERFACE | grep MTU | awk '{print $5}' | tee -a $1) >& /dev/null + (echo -n "OFED:" >> $1;ofed_info | head -6 | grep OFED | tee -a $1) >& /dev/null + (echo -n "VMA:" >> $1;rpm -qa 
| grep $VMA | tee -a $1) >& /dev/null + +} + +function collect_remote_node_info_to_file +{ + (echo "=====================>Remote node info<======================" | tee -a $1) >& /dev/null + (echo "--------------------" | tee -a $1) >& /dev/null + (ssh $REM_HOST_IP "hostname" | tee -a $1) >& /dev/null + (echo -n "OS: " >> $1;ssh $REM_HOST_IP cat /etc/issue | grep We | tee -a $1) >& /dev/null + (echo -n "CPU: " >> $1;ssh $REM_HOST_IP cat /proc/cpuinfo | grep 'model name' | sort -u | awk '{print $4, $5, $6, $7, $9}' | tee -a $1) >& /dev/null + (echo -n "Number of CPUs: " >> $1;ssh $REM_HOST_IP cat /proc/cpuinfo |grep proce |wc |awk '{print $1}' | tee -a $1) >& /dev/null + (echo -n "CPU Type: " >> $1;ssh $REM_HOST_IP uname -a | awk '{print $12}' | tee -a $1) >& /dev/null + (ssh $REM_HOST_IP cat /proc/meminfo |grep [M,m]em | tee -a $1) >& /dev/null + (echo -n "Kernel: " >> $1;ssh $REM_HOST_IP uname -a | awk '{print $3}' | tee -a $1) >& /dev/null + (ssh $REM_HOST_IP "cat /usr/voltaire/version" | tee -a $1) >& /dev/null + (ssh $REM_HOST_IP "ibstat | grep -e "CA type" -e 'Firmware version'" | tee -a $1) >& /dev/null + (ssh $REM_HOST_IP "ibstatus | grep -e rate -e state | grep -v 'phys state'" | tee -a $1) >& /dev/null + to_print="/sys/class/net/$INTERFACE/mode" + check_if_infbnd_iface + if [[ $is_infiniband -eq $TRUE ]]; then + (echo -n "IPoIB mode: " >> $1 ;ssh $REM_HOST_IP "cat $to_print" | tee -a $1) >& /dev/null + fi + (ssh $REM_HOST_IP ifconfig $INTERFACE | grep MTU | awk '{print $5}' | tee -a $1) >& /dev/null + (echo -n "OFED:" >> $1;ssh $REM_HOST_IP ofed_info | head -6 | grep OFED | tee -a $1) >& /dev/null + (echo -n "VMA:" >> $1;ssh $REM_HOST_IP rpm -qa | grep $VMA | tee -a $1) >& /dev/null +} + +function print_message +{ + echo $1 | tee -a $2 + echo ""| tee -a $2 +} + +function append_tmp_file_and_delete +{ + cat $1 | tee -a $2 >& /dev/null + rm -f $1 +} + +function check_command_succss +{ + if [ -s $TMP_FILE.err ]; then + eval "cat $TMP_FILE.err 2>&1 | tee >> 
$TMP_DIR/$log_file.prep" + LAST_ERROR=`cat $TMP_FILE.err` + SUCCSESS=$FALSE + else + if [ -s $TMP_FILE ]; then + eval "cat $TMP_FILE 2>&1 | tee >> $TMP_DIR/$log_file.prep" + fi + SUCCSESS=$TRUE + fi + + rm -f $TMP_FILE.err >& /dev/null +} + +function check_succsess_and_save_param +{ + local ret_val=0 + + check_command_succss + + if [[ $SUCCSESS -eq $TRUE ]]; then + $2 $1 + fi + + return $ret_val +} + +function check_if_infbnd_iface +{ + is_infiniband=$FALSE + + if [[ $INTERFACE =~ "ib*" ]]; then + is_infiniband=$TRUE + fi +} + +function discover_local_work_if_ip +{ + LOCAL_IP=`ifconfig $INTERFACE | grep inet |grep -v inet6 | cut -d ':' -f 2| cut -d " " -f 1` >& /dev/null +} + +function calc_file_age +{ + creation_time=`stat -c %Z /tmp/vma_utils_block_file` + now=`date +%s` + block_file_age=$(($now-$creation_time)) +} + +function get_operator_pid +{ + pid=$$ + #pid=`ps -eF |grep $script_name| tr -s ' ' | cut -d ' ' -f 2|head -1` +} + +function write_to_file_operator_details +{ + operating_machine_hostname=`hostname` + get_operator_pid + rm -f "$2" >& /dev/null + touch "$2" >& /dev/null + echo -n "$1 " >> "$2" 2>/dev/null + echo -n "$pid " >> "$2" 2>/dev/null + echo -n "$script_name " >> "$2" 2>/dev/null + echo -n "$user_name " >> "$2" 2>/dev/null + echo -n "$user_id " >> "$2" 2>/dev/null + echo -n "$operating_machine_hostname " >> "$2" 2>/dev/null + + if [[ $1 == "local" ]]; then + st_timestamp=`date` + else + st_timestamp=`ssh $REM_HOST_IP "date" ` + fi + + echo "$st_timestamp" | tr " " "_" >> "$2" 2>/dev/null +} + +function read_block_file +{ + blocking_pid=`awk -F " " '{print $2}' "$1"` + blocking_app=`awk -F " " '{print $3}' "$1"` + blocking_username=`awk -F " " '{print $4}' "$1"` + blocking_id=`awk -F " " '{print $5}' "$1"` + blocking_hostname=`awk -F " " '{print $6}' "$1"` + blocking_st_time=`awk -F " " '{print $7}' "$1"` +} + +function print_block_files_details +{ + echo "Blocked Host:$blocked_host" + echo "You blocked by:" + echo "- application: 
${blocking_app} " + echo "- user: ${blocking_username} " + echo "- users local host ip:${blocking_id} " + echo "- blocking proccess with pid: ${blocking_pid} running on host ${blocking_hostname} " + echo "- starting time:${blocking_st_time} " + echo "- blocking file:${BLOCK_FILE}" +} + +function update_remote_block_file +{ + write_to_file_operator_details "$LOCAL_IP" "${BLOCK_FILE}.rem" + scp "${BLOCK_FILE}.rem" "${REM_HOST_IP}:${BLOCK_FILE}" >& /dev/null + rm -f "${BLOCK_FILE}.rem" >& /dev/null +} + +function update_local_block_file +{ + write_to_file_operator_details "local" "$BLOCK_FILE" +} + +function update_block_files +{ + discover_local_work_if_ip + if [ "$LOCAL_IP" == "" ]; then + echo "WARNING: Will be executed without blocking..." + else + update_local_block_file + update_remote_block_file + fi +} + +function unblock_local_host +{ + rm -f $BLOCK_FILE >& /dev/null + rm -f "${BLOCK_FILE}.rem" >& /dev/null +} + +function unblock_remote_host +{ + ssh $REM_HOST_IP "rm -f $BLOCK_FILE >& /dev/null" >& /dev/null + rm -f "${BLOCK_FILE}.rem" >& /dev/null +} + +function unblock +{ + unblock_local_host + unblock_remote_host +} + +function check_connection_to_remote_host +{ + ping -w 3 "$1" >& /dev/null + test $? 
-eq 0 && RES=OK || RES=NOT +} + +function check_if_another_proccess_running_on_local_host +{ + eval "ps -eF| grep '$blocking_pid'|grep -v grep|wc -l" > $TMP_FILE + RES=`cat $TMP_FILE` +} + +function check_if_another_proccess_running_on_remote_host +{ + RES=0 + check_connection_to_remote_host "$1" + if [ $RES == "OK" ]; then + RES=`sudo ssh ${SUPER_USR}@${1} "ps -eF| grep ${blocking_pid}|grep -v grep|wc -l"` + else + RES=1 + fi +} + +function get_operating_host_ip_or_hostname +{ + RES=0 + operating_host_ip=`awk -F " " '{print $1}' "$1"` + if [[ $operating_host_ip != "local" ]]; then + check_connection_to_remote_host "$operating_host_ip" + if [ $RES != "OK" ]; then + operating_host_ip=`awk -F " " '{print $6}' "$1"` + fi + fi +} + +function check_if_operating_host_running_another_proccess +{ + if [ $1 == "local" ]; then + check_if_another_proccess_running_on_local_host + else + check_if_another_proccess_running_on_remote_host "$1" + fi +} + +function adjust_operating_host_ip_of_remote_machine +{ + local tmp="" + + if [ "$1" == "local" ]; then + operating_host_ip=$REM_HOST_IP + else + tmp=`ifconfig|grep $1` + if [ "$tmp" != "" ]; then + operating_host_ip="local" + fi + fi +} + +function check_if_remote_host_is_blocked +{ + RES=0 + ssh $REM_HOST_IP "cat ${BLOCK_FILE} 2>/dev/null" > "${BLOCK_FILE}.rem" + + if [ -s "${BLOCK_FILE}.rem" ]; then + read_block_file "${BLOCK_FILE}.rem" + get_operating_host_ip_or_hostname "${BLOCK_FILE}.rem" + adjust_operating_host_ip_of_remote_machine "$operating_host_ip" + check_if_operating_host_running_another_proccess "$operating_host_ip" + + if [[ $RES -le 0 ]]; then + unblock_remote_host + else + read_block_file "$BLOCK_FILE.rem" + blocked_host=$REM_HOST_IP + print_block_files_details + rm -f "${BLOCK_FILE}.rem" >& /dev/null + clean + exit 1 + fi + fi +} + +function check_if_local_host_is_blocked +{ + RES=0 + if [[ -e $BLOCK_FILE ]]; then + read_block_file "$BLOCK_FILE" + get_operating_host_ip_or_hostname "${BLOCK_FILE}" + 
check_if_operating_host_running_another_proccess "$operating_host_ip" + if [[ $RES -le 0 ]]; then + unblock_local_host + else + blocked_host=`hostname` + print_block_files_details + clean + exit 1 + fi + fi +} + +function block +{ + check_if_local_host_is_blocked + check_if_remote_host_is_blocked + update_block_files +} + + +function pre_test_checks +{ + check_connection_2_remote_ip + clean +} + +function check_connection_2_remote_ip +{ + ssh -o "BatchMode yes" $REM_HOST_IP exit 2>$TMP_FILE.err + check_command_succss + if [[ $SUCCSESS -ne $TRUE ]]; then + echo "vma_perf_envelope error:$LAST_ERROR" + clean + unblock + exit 1 + fi +} + +function write_mc_feed_file +{ + x=0 # num of mc groups + y=1 # the third number of the mc group + port=10005 + num=3 # the last number of the mc group + + + while [ $x -lt $1 ] + do + if [ $num -ge 254 ]; then + y=$(($y+1)) + num=3 + fi + + echo 224.4.$y.$num:$port >> $2 + x=$(($x+1)) + port=$(($port+1)) + num=$(($num+1)) + done +} + +function create_mc_feed_files +{ + write_mc_feed_file $1 $2 + copy_feed_file_2_remote_machine $2 >& /dev/null +} + +function copy_feed_file_2_remote_machine +{ + scp $1 "$REM_HOST_IP:/$TMP_DIR" +} + +function remove_mc_feed_files +{ + rm -f $1 >& /dev/null + ssh $REM_HOST_IP "rm -f $1" >& /dev/null +} + +function clean +{ + rm -f $TMP_FILE.err >& /dev/null + rm -f "$TMP_DIR/$log_file.prep" >& /dev/null + rm -f "$TMP_DIR/$log_file.post" >& /dev/null +} + +function write_date_2_log_file +{ + echo "=============================>Date<===========================" >> "$log_file" + (echo "${SRV_CMMND_LINE_PREF} date" | tee -a "$log_file") >& /dev/null + (date | tee -a $log_file) >& /dev/null +} + +function pre_vma_perf +{ + pre_test_checks + get_hostnames + prepare_output_files + write_date_2_log_file + collect_nodes_info_to_file "$log_file" + prepare_route_table + save_umcast + update_umcast + clean + update_command_prefix +} + +function final_test_message +{ + echo "####################### Generated files 
#######################" + echo "#Test results : $res_file" + echo "#Test logs : $log_file" + echo "---------------------------------------------------------------" +} + +function post_vma_perf +{ + collect_nodes_info_to_file "$res_file" + recreate_route_table + recreate_umcast + final_test_message + write_date_2_log_file + clean +} + +function vma_perf +{ + run_udp_lat_using_select_epoll_poll_with_zero_polling + run_udp_lat_using_select_epoll_poll_with_full_polling_vma_only +} + +#main + +if [ $# = 1 ]; then + block + pre_vma_perf + vma_perf + post_vma_perf + unblock +else + echo "Usage: perf " + exit +fi diff --git a/tests/vma_perf_envelope/vma_perf_envelope.sh b/tests/vma_perf_envelope/vma_perf_envelope.sh new file mode 100755 index 0000000..fc516fa --- /dev/null +++ b/tests/vma_perf_envelope/vma_perf_envelope.sh @@ -0,0 +1,1590 @@ +#!/bin/bash + +# Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. +# +# This software is available to you under a choice of one of two +# licenses. You may choose to be licensed under the terms of the GNU +# General Public License (GPL) Version 2, available from the file +# COPYING in the main directory of this source tree, or the +# BSD license below: +# +# Redistribution and use in source and binary forms, with or +# without modification, are permitted provided that the following +# conditions are met: +# +# - Redistributions of source code must retain the above +# copyright notice, this list of conditions and the following +# disclaimer. +# +# - Redistributions in binary form must reproduce the above +# copyright notice, this list of conditions and the following +# disclaimer in the documentation and/or other materials +# provided with the distribution. +# +# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +# EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +# MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND +# NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS +# BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN +# ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN +# CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +# SOFTWARE. + +#configurable parameters +#--------------------------------------------------- +MC_GROUP=224.18.7.81 +PORT=5001 +DURATION=10 #in seconds +BW=10G +OUTPUT_FILES_PATH="./" +INTERFACE="ib0" +OVER_VMA="yes" #[yes | not] +UDP_LAT_MSG_SIZE=(2 8 20 60 100 200 400 800 1470 2000 3000 4000 8000 16000 32000 65000) #Bytes +UDP_LAT_BURST_SIZE=(2 5 10 25 50 100 250 500 1000 2500 5000 10000 16000 25000 50000 100000 250000 500000) #Bytes +IPERF_MSG_SIZE=(12 20 60 100 200 400 800 1470 2000 3000 4000 8000 16000 32000 65000) #Bytes +MC_GROUP_SIZES=(1 10 20 30 50 60 64 65 70 80 90 100 150 200) + +#path +#---------------------------------------------------- +UDP_LAT_APP=${UDP_LAT_PATH:-udp_lat} +VMA_LIB=${VMA_PATH:-libvma.so} + +##################################################### +#vma default values +#--------------------------------------------------- +DEFAULT_VMA_IGMP_ENABLE=1 +DEFAULT_VMA_RX_POLL_OS_RATIO=10 +DEFAULT_VMA_RX_SKIP_OS=100 +DEFAULT_VMA_SELECT_POLL=0 +DEFAULT_VMA_RX_BUFS=200000 +DEFAULT_VMA_THREAD_MODE=1 +DEFAULT_VMA_RX_WRE=16000 +DEFAULT_VMA_SELECT_POLL=0 +DEFAULT_VMA_SELECT_SKIP_OS=4 +DEFAULT_VMA_HUGETLB=1 +##################################################### +#initial vma values in test +#--------------------------------------------------- +VMA_IGMP_ENABLE=0 +VMA_RX_POLL_OS_RATIO=$DEFAULT_VMA_RX_POLL_OS_RATIO +VMA_RX_SKIP_OS=$DEFAULT_VMA_RX_SKIP_OS +VMA_RX_BUFS=$DEFAULT_VMA_RX_BUFS +VMA_RX_WRE=$DEFAULT_VMA_RX_WRE +VMA_SELECT_POLL=$DEFAULT_VMA_SELECT_POLL +VMA_SELECT_SKIP_OS=$DEFAULT_VMA_SELECT_SKIP_OS +VMA_THREAD_MODE=$DEFAULT_VMA_THREAD_MODE +VMA_HUGETLB=$DEFAULT_VMA_HUGETLB +########################################################################### +#other Optimal Val 
+#-------------------------------------------------------------------------- +VMA_SELECT_POLL_MAX_VAL=1000000 +VMA_RX_BUFS_MAX_VAL=200000 +VMA_IOMUX_RX_WRE=$DEFAULT_VMA_RX_WRE #3200 +VMA_IOMUX_RX_SKIP_OS=$DEFAULT_VMA_RX_SKIP_OS #1000 +VMA_IOMUX_SELECT_SKIP_OS=$DEFAULT_VMA_SELECT_SKIP_OS #500 +VMA_IOMUX_HUGETLB=$DEFAULT_VMA_HUGETLB #1 +MAX_UDP_LAT_MSG_SIZE=65000 +ACTIVITY=100000 +RX_FRAMES_4_UDP_LAT=1 +RX_FRAMES_4_IPERF=16 +RX_USEC_4_LAT_TEST=0 +RX_USEC_4_BW_TEST=10 +UMCAST_VAL=1 +DST_NET="224.0.0.0" +DST_MASK="240.0.0.0" +VMA="vma" +TMP_DIR=/tmp +TMP_FILE="$TMP_DIR/perf_tmp" +ERROR_MESSAGE="!!! Test Failed !!!" +ERORR_PROMT="vma_perf_envelope:" +ERROR_RESULT="null" +PREFIX="" +REM_HOST_IP=$1 +COMMAND_REDIRECT="1>$TMP_FILE 2>$TMP_FILE.err" +TRUE=1 +FALSE=0 +SUCCSESS=1 +BLOCK_FILE="$TMP_DIR/vma_tests_block_file" +script_name=$(basename $0) +user_name=`whoami` +user_id=`who -am | tr " " "\n" | tail -1 | tr -d "(|)"` +SUPER_USR=root +##################################################### + +function run_iperf_with_diff_msg_len +{ + wait_time=0 + local size_arr_len=${#IPERF_MSG_SIZE[*]} + + prepare_iperf_headlines + save_coalesce_params + update_coalesce_4_tr_test + append_tmp_file_and_delete "$TMP_DIR/$log_file.prep" "$log_file" + + print_message "============================>IPERF<============================" "$log_file" + + for((i=0; $i < $size_arr_len; i=$((i=$i+1)))) + do + curr_msg_size=${IPERF_MSG_SIZE[$i]} + iperf_cycle + parse_iperf_test_results + done + recreate_coalesce_params + clean_after_iperf + tests_finish +} + +function iperf_cycle +{ + print_cycle_info $curr_msg_size message + killall iperf >& /dev/null + ssh $REM_HOST_IP killall iperf >& /dev/null + iperf_command_line_srv=${PREFIX}"iperf -usB $MC_GROUP -p $PORT -l $curr_msg_size -i 1 -f M" + iperf_command_line_clt=${PREFIX}"iperf -uc $MC_GROUP -p $PORT -l $curr_msg_size -t $DURATION -b $BW -i 1 -f M" + (echo "${SRV_CMMND_LINE_PREF}$iperf_command_line_srv" | tee -a $log_file) >& /dev/null + (ssh 
$REM_HOST_IP "echo ${CLT_CMMND_LINE_PREF}$iperf_command_line_clt|tee -a $TMP_DIR/$log_file.tmp")>& /dev/null + (ssh $REM_HOST_IP "sleep 10;$iperf_command_line_clt 2>&1 | tee >> $TMP_DIR/$log_file.tmp " &) + wait_time=$DURATION + let "wait_time += 20" + (sleep $wait_time;killall -9 iperf >& /dev/null)| (eval "$iperf_command_line_srv 2>&1 | tee >> $TMP_FILE") +} + +function run_udp_lat_with_diff_msg_len +{ + prepare_udp_lat_headlines + save_coalesce_params + update_coalesce_4_udp_lat + append_tmp_file_and_delete "$TMP_DIR/$log_file.prep" "$log_file" + print_message "===========================>UDP_LAT<==========================" "$log_file" + + local size_arr_len=${#UDP_LAT_MSG_SIZE[*]} + upd_lat_command_line_srv=${PREFIX}"${UDP_LAT_APP} -s -i $MC_GROUP -p $PORT -m $MAX_UDP_LAT_MSG_SIZE" + (echo ${SRV_CMMND_LINE_PREF}$upd_lat_command_line_srv | tee -a $log_file) >& /dev/null + (eval "$upd_lat_command_line_srv 2>&1 | tee >> $log_file &") + + for((i=0; $i < $size_arr_len; i=$((i=$i+1)))) + do + curr_msg_size=${UDP_LAT_MSG_SIZE[$i]} + udp_lat_cycle + sleep 5 + parse_udp_lat_test_results "${UDP_LAT_MSG_SIZE[$i]}" + done + clean_after_udp_lat + recreate_coalesce_params + tests_finish +} + +function run_udp_lat_tx_bw_with_diff_msg_len +{ + prepare_udp_lat_tx_bw_headlines + save_coalesce_params + update_coalesce_4_tr_test + append_tmp_file_and_delete "$TMP_DIR/$log_file.prep" "$log_file" + print_message "========================>UDP_LAT TX BW<=======================" "$log_file" + + local size_arr_len=${#UDP_LAT_MSG_SIZE[*]} + + for((i=0; $i < $size_arr_len; i=$((i=$i+1)))) + do + curr_msg_size=${UDP_LAT_MSG_SIZE[$i]} + udp_lat_tx_bw_cycle + sleep 5 + parse_udp_lat_tx_bw_test_results "${UDP_LAT_MSG_SIZE[$i]}" + done + clean_after_udp_lat + recreate_coalesce_params + tests_finish +} + +function run_udp_lat_bw_with_diff_msg_len +{ + prepare_udp_lat_bw_headlines + save_coalesce_params + update_coalesce_4_tr_test + append_tmp_file_and_delete "$TMP_DIR/$log_file.prep" 
"$log_file" + print_message "========================>UDP_LAT BW<=========================" "$log_file" + + local size_arr_len=${#UDP_LAT_MSG_SIZE[*]} + + for((i=0; $i < $size_arr_len; i=$((i=$i+1)))) + do + curr_msg_size=${UDP_LAT_MSG_SIZE[$i]} + udp_lat_bw_cycle + sleep 5 + parse_udp_lat_bw_test_results "${UDP_LAT_MSG_SIZE[$i]}" + done + clean_after_udp_lat + recreate_coalesce_params + tests_finish +} + +function run_udp_lat_with_diff_burst_size +{ + prepare_udp_lat_sending_bursts_headlines + save_coalesce_params + update_coalesce_4_udp_lat + append_tmp_file_and_delete "$TMP_DIR/$log_file.prep" "$log_file" + print_message "===================>UDP_LAT SENDING BURSTS<===================" "$log_file" + local size_arr_len=${#UDP_LAT_BURST_SIZE[*]} + local initial_rx_buffs_val=$VMA_RX_BUFS + VMA_RX_BUFS=$VMA_RX_BUFS_MAX_VAL + update_command_prefix + upd_lat_command_line_srv=${PREFIX}"${UDP_LAT_APP} -s -i $MC_GROUP -p $PORT -m $MAX_UDP_LAT_MSG_SIZE" + (echo ${SRV_CMMND_LINE_PREF}$upd_lat_command_line_srv | tee -a $log_file) >& /dev/null + (eval "$upd_lat_command_line_srv 2>&1 | tee >> $log_file &") + + for((i=0; $i < $size_arr_len; i=$((i=$i+1)))) + do + curr_burst_size=${UDP_LAT_BURST_SIZE[$i]} + udp_lat_sending_bursts_cycle + sleep 5 + parse_udp_lat_test_results "${UDP_LAT_BURST_SIZE[$i]}" + done + VMA_RX_BUFS=$initial_rx_buffs_val + update_command_prefix + clean_after_udp_lat + recreate_coalesce_params + tests_finish +} + +function run_udp_lat_using_select_epoll_poll_with_zero_polling +{ + local vma_select_poll_old=$VMA_SELECT_POLL + vma_select_poll_info="" + save_coalesce_params + update_coalesce_4_udp_lat + append_tmp_file_and_delete "$TMP_DIR/$log_file.prep" "$log_file" + print_message "===============>UDP_LAT Using Select/Poll/Epoll<==============" "$log_file" + if [[ "$OVER_VMA" = yes ]]; then + vma_select_poll_info="With VMA_SELECT_POLL=0" + print_message "|----------------------------------|" "$log_file" + print_message "|VMA_SELECT_POLL=0" "$log_file" + 
print_message "|----------------------------------|" "$log_file" + fi + run_udp_lat_using_select_epoll_poll_helper "$vma_select_poll_info" + recreate_coalesce_params + tests_finish +} + +function save_shmem_prop +{ + eval "save_local_hugetlb=`cat /proc/sys/vm/nr_hugepages 2>/dev/null`;save_local_shmax=`cat /proc/sys/kernel/shmmax 2>/dev/null`" + eval "save_remote_hugetlb=`ssh $REM_HOST_IP 'cat /proc/sys/vm/nr_hugepages 2>/dev/null'`;save_remote_shmax=`ssh $REM_HOST_IP 'cat /proc/sys/kernel/shmmax 2>/dev/null'`" +} + +function recreate_mem_prop +{ + echo "" >> "$TMP_DIR/$log_file.post" + echo "================>Recreate number of huge pages <==============" >> "$TMP_DIR/$log_file.post" + command="echo $save_local_hugetlb > /proc/sys/kernel/shmmax;echo $save_local_shmax > /proc/sys/vm/nr_hugepages" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + eval "$command 2>&1 | tee >> $TMP_DIR/$log_file.post" + command="echo $save_remote_hugetlb > /proc/sys/kernel/shmmax;echo $save_remote_shmax > /proc/sys/vm/nr_hugepages" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + eval "ssh $REM_HOST_IP "$command" 2>&1 | tee >> $TMP_DIR/$log_file.post" + print_huge_tlb_info "${TMP_DIR}/${log_file}.post" + eval "cat $TMP_DIR/$log_file.post" | tee -a $log_file >& /dev/null + clean +} + +function print_huge_tlb_info +{ + local file=$1 + echo "" >> "$file" + echo "=======================>Huge pages info<======================" >> "$file" + command="cat /proc/meminfo | grep -i HugePage" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$file") >& /dev/null + eval "$command 2>&1 | tee >> $file" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$file") >& /dev/null + eval "ssh $REM_HOST_IP "$command" 2>&1 | tee >> $file" +} + +function increase_number_of_hugetlb +{ + print_huge_tlb_info "${TMP_DIR}/${log_file}.prep" + echo "" >> "$TMP_DIR/$log_file.prep" + echo "================>Update number of huge 
pages <================" >> "$TMP_DIR/$log_file.prep" + command="echo 1000000000 > /proc/sys/kernel/shmmax;echo 400 > /proc/sys/vm/nr_hugepages;cat /proc/meminfo | grep -i HugePage" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "$command 2>&1 | tee >> $TMP_DIR/$log_file.prep" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "ssh $REM_HOST_IP "$command" 2>&1 | tee >> $TMP_DIR/$log_file.prep" + eval "cat $TMP_DIR/$log_file.prep" | tee -a $log_file >& /dev/null + clean +} + +function run_udp_lat_using_select_epoll_poll_with_full_polling_vma_only +{ + if [[ "$OVER_VMA" = yes ]]; then + local vma_select_poll_old=$VMA_SELECT_POLL + local vma_select_skip_os_old=$VMA_SELECT_SKIP_OS + local vma_rx_skip_os_old=$VMA_RX_SKIP_OS + local vma_rx_wre_old=$VMA_RX_WRE + local vma_hugetlb_old=$VMA_HUGETLB + + vma_select_poll_info="" + save_coalesce_params + update_coalesce_4_udp_lat + append_tmp_file_and_delete "$TMP_DIR/$log_file.prep" "$log_file" + change_command_prefix VMA_SELECT_POLL=$VMA_SELECT_POLL_MAX_VAL VMA_SELECT_SKIP_OS=$VMA_IOMUX_SELECT_SKIP_OS VMA_RX_WRE=$VMA_IOMUX_RX_WRE VMA_HUGETLB=$VMA_IOMUX_HUGETLB VMA_RX_SKIP_OS=$VMA_IOMUX_RX_SKIP_OS + vma_select_poll_info="With VMA_SELECT_POLL=$VMA_SELECT_POLL_MAX_VAL" + print_message "===============>UDP_LAT Using Select/Poll/Epoll<==============" "$log_file" + print_message "|----------------------------------|" "$log_file" + print_message "|VMA_SELECT_POLL=$VMA_SELECT_POLL_MAX_VAL" "$log_file" + print_message "|----------------------------------|" "$log_file" + run_udp_lat_using_select_epoll_poll_helper "$vma_select_poll_info" + change_command_prefix VMA_SELECT_POLL=$vma_select_poll_old VMA_SELECT_SKIP_OS=$vma_select_skip_os_old VMA_RX_WRE=$vma_rx_wre_old VMA_HUGETLB=$vma_hugetlb_old VMA_RX_SKIP_OS=$vma_rx_skip_os_old + recreate_coalesce_params + tests_finish + fi +} + +function run_udp_lat_using_select_epoll_poll_helper +{ + 
prepare_udp_lat_using_feed_file_headlines "Select" "$1" + run_udp_lat_with_diff_mc_feed_files "s" "Select" + prepare_udp_lat_using_feed_file_headlines "Epoll" "$1" + run_udp_lat_with_diff_mc_feed_files "e" "Epoll " + prepare_udp_lat_using_feed_file_headlines "Poll" "$1" + run_udp_lat_with_diff_mc_feed_files "p" "Poll " +} + +function run_udp_lat_with_diff_mc_feed_files +{ + local size_arr_len=${#MC_GROUP_SIZES[*]} + print_udp_lat_with_feed_files_header "$2" + for((i=0; $i < $size_arr_len; i=$((i=$i+1)))) + do + curr_feed_file_size=${MC_GROUP_SIZES[$i]} + feed_file_name="$TMP_DIR/feed_file_$curr_feed_file_size" + print_cycle_info "$curr_feed_file_size" "mc group" + create_mc_feed_files "$curr_feed_file_size" "$feed_file_name" + run_udp_lat_with_feed_file "$feed_file_name" "$1" + parse_udp_lat_test_results "${MC_GROUP_SIZES[$i]}" + remove_mc_feed_files "$feed_file_name" + done + clean_after_udp_lat +} + + +function run_udp_lat_with_feed_file +{ + upd_lat_command_line_srv=${PREFIX}"${UDP_LAT_APP} -s -f $1 -F $2" + upd_lat_command_line_clt=${PREFIX}"${UDP_LAT_APP} -c -f $1 -F $2 -t $DURATION" + + (echo ${SRV_CMMND_LINE_PREF}$upd_lat_command_line_srv | tee -a $log_file) >& /dev/null + (eval "$upd_lat_command_line_srv 2>&1 | tee >> $log_file &") + sleep 15 + (ssh $REM_HOST_IP "killall udp_lat") >& /dev/null + (echo "${CLT_CMMND_LINE_PREF} $upd_lat_command_line_clt" | tee -a "$TMP_DIR/$log_file.tmp") >& /dev/null + (ssh $REM_HOST_IP "sleep 10;$upd_lat_command_line_clt 2>&1 | tee >> $TMP_FILE") + pkill -2 -f udp_lat >& /dev/null + sleep 5 +} + +function print_udp_lat_with_feed_files_header +{ + print_message "=====================>UDP_LAT With $1<====================" "$log_file" +} + +function tests_finish +{ + eval "cat $TMP_DIR/$log_file.post" | tee -a $log_file >& /dev/null + echo "---------------------------------------------------------------" |tee -a $log_file + clean +} + +function udp_lat_cycle +{ + print_cycle_info $curr_msg_size message + (ssh $REM_HOST_IP 
"killall udp_lat") >& /dev/null + upd_lat_command_line_clt=${PREFIX}"${UDP_LAT_APP} -c -i $MC_GROUP -p $PORT -m $curr_msg_size -t $DURATION" + (echo "${CLT_CMMND_LINE_PREF} $upd_lat_command_line_clt" | tee -a "$TMP_DIR/$log_file.tmp") >& /dev/null + (ssh $REM_HOST_IP "sleep 10;$upd_lat_command_line_clt 2>&1 | tee >> $TMP_FILE") + +} + +function udp_lat_tx_bw_cycle +{ + print_cycle_info $curr_msg_size message + (ssh $REM_HOST_IP "killall udp_lat") >& /dev/null + upd_lat_command_line_clt=${PREFIX}"${UDP_LAT_APP} -c -i $MC_GROUP -p $PORT -m $curr_msg_size -t $DURATION -k -A $ACTIVITY" + (echo "${CLT_CMMND_LINE_PREF} $upd_lat_command_line_clt" | tee -a "$TMP_DIR/$log_file.tmp") >& /dev/null + (ssh $REM_HOST_IP "sleep 10;$upd_lat_command_line_clt 2>&1 | tee >> $TMP_FILE") +} + +function udp_lat_bw_cycle +{ + print_cycle_info $curr_msg_size message + killall -9 udp_lat >& /dev/null + ssh $REM_HOST_IP killall -9 udp_lat >& /dev/null + + upd_lat_command_line_srv=${PREFIX}"${UDP_LAT_APP} -s -i $MC_GROUP -p $PORT -m $curr_msg_size -k -A $ACTIVITY" + upd_lat_command_line_clt=${PREFIX}"${UDP_LAT_APP} -c -i $MC_GROUP -p $PORT -m $curr_msg_size -t $DURATION -k -A $ACTIVITY" + + (echo ${SRV_CMMND_LINE_PREF}$upd_lat_command_line_srv | tee -a $log_file) >& /dev/null + (ssh $REM_HOST_IP "echo ${CLT_CMMND_LINE_PREF} $upd_lat_command_line_clt | tee -a $TMP_DIR/$log_file.tmp") >& /dev/null + (eval "$upd_lat_command_line_srv 2>&1 | tee >> $log_file &") + (ssh $REM_HOST_IP "killall udp_lat") >& /dev/null + (echo "${CLT_CMMND_LINE_PREF} $upd_lat_command_line_clt" | tee -a "$TMP_DIR/$log_file.tmp") >& /dev/null + (ssh $REM_HOST_IP "sleep 5;$upd_lat_command_line_clt 2>&1 | tee >> $TMP_FILE") + local wait_time=$DURATION + let "wait_time += 20" + sleep $wait_time + killall -2 udp_lat +} + +function udp_lat_sending_bursts_cycle +{ + print_cycle_info $curr_burst_size burst + (ssh $REM_HOST_IP "killall udp_lat") >& /dev/null + upd_lat_command_line_clt=${PREFIX}"${UDP_LAT_APP} -c -i $MC_GROUP -p 
$PORT -t $DURATION -b $curr_burst_size" + (echo "${CLT_CMMND_LINE_PREF} $upd_lat_command_line_clt" | tee -a "$TMP_DIR/$log_file.tmp") >& /dev/null + (ssh $REM_HOST_IP "sleep 10;$upd_lat_command_line_clt 2>&1 | tee >> $TMP_FILE") + +} + +function print_cycle_info +{ + let "cycle_num=$i+1" + echo "##################### cycle $cycle_num of $size_arr_len #####################" + echo "#$2 size is $1" +} + +function parse_iperf_test_results +{ + local start_time=0 + local end_time=0 + local warning_msg="" + check_iperf_succss + if [[ $success -eq $TRUE ]]; then + + loss=`cat $TMP_FILE | grep % | tail -1 | tr " " "\n" | tail -1 | tr -d '(-)'` + avg_pps=`cat $TMP_FILE | grep % | tail -1 | tr "-" " " | tr -s " " | tr '/' '\n' | tail -1 | tr " " '\n' | tail -2 | head -1` + #avg_pps=`cat $TMP_FILE | grep % | tail -1 | tr "-" " " | tr -s " " | cut --d=" " -f 12 | cut -d "/" -f 2` + start_time=`cat $TMP_FILE|grep %|tail -1|tr "-" " "|tr -s " " | cut --d=" " -f 3 | cut --d="." -f 1` + end_time=`cat $TMP_FILE|grep %|tail -1|tr "-" " "|tr -s " " | cut --d=" " -f 4 | cut --d="." 
-f 1` + let "actual_duration=$end_time-$start_time" + let "avg_pps=avg_pps/$actual_duration" + avg_bw=`cat $TMP_FILE | grep % | tail -1 | tr "-" " " | tr -s " " | cut --d=" " -f 8` + echo "#average loss is $loss" + echo "#average BW is $avg_bw MBps" + echo "#average packet rate is $avg_pps pps" + if [[ "$actual_duration" -ne $DURATION ]]; then + warning_msg="#vma_perf_envelope:WARNING:missing summarize in iperf" + echo "$warning_msg" + warning_msg=",$warning_msg" + fi + + echo "${IPERF_MSG_SIZE[$i]},$avg_bw,$loss,${avg_pps}${warning_msg}" >> $res_file + else + echo "#$ERROR_MESSAGE" + echo "${IPERF_MSG_SIZE[$i]},${ERROR_RESULT},${ERROR_RESULT},${ERROR_RESULT}" >> $res_file + fi + + cat $TMP_FILE | tee -a $log_file >& /dev/null + rm -rf $TMP_FILE >& /dev/null +} + +function parse_udp_lat_test_results +{ + check_udp_lat_succss latency $TMP_FILE + if [[ $success -eq $TRUE ]]; then + + latency=`ssh $REM_HOST_IP cat $TMP_FILE |tr A-Z a-z|grep latency|tr [="="=] " " |tr -s " "|tr " " "\n"|tail -2|head -1` + echo "#average latency is $latency usec" + echo $1,$latency >> $res_file + else + echo "#$ERROR_MESSAGE" + echo "$1,${ERROR_RESULT}" >> $res_file + + fi + + ssh $REM_HOST_IP "cat $TMP_FILE" | tee -a "$TMP_DIR/$log_file.tmp" >& /dev/null + ssh $REM_HOST_IP "rm -rf $TMP_FILE" >& /dev/null +} + +function parse_udp_lat_tx_bw_test_results +{ + check_udp_lat_succss rate $TMP_FILE + if [[ $success -eq $TRUE ]]; then + + local pps=`ssh $REM_HOST_IP cat $TMP_FILE |tr A-Z a-z|tail -2|grep "rate"| tr -s " " |cut -d " " -f 6` + local bw=`ssh $REM_HOST_IP cat $TMP_FILE |tr A-Z a-z|tail -2|grep "bandwidth"| tr -s " " |cut -d " " -f 5` + echo "#average message rate is $pps [msg/sec]" + echo "#average bw is $bw MBps" + echo $1,$pps,$bw >> $res_file + else + echo "#$ERROR_MESSAGE" + echo "$1,${ERROR_RESULT}" >> $res_file + + fi + + ssh $REM_HOST_IP "cat $TMP_FILE" | tee -a "$TMP_DIR/$log_file.tmp" >& /dev/null + ssh $REM_HOST_IP "rm -rf $TMP_FILE" >& /dev/null +} + +function 
parse_udp_lat_bw_test_results +{ + check_udp_lat_succss total $log_file + if [[ $success -eq $TRUE ]]; then + + local pps=`ssh $REM_HOST_IP cat $TMP_FILE |tr A-Z a-z|tail -2|grep "rate"| tr -s " " |cut -d " " -f 6` + local bw=`ssh $REM_HOST_IP cat $TMP_FILE |tr A-Z a-z|tail -2|grep "bandwidth"| tr -s " " |cut -d " " -f 5` + local rx_recived=`cat $log_file| grep received |tail -1|tr -s " "|cut -d " " -f 4` + local tx_send=`ssh $REM_HOST_IP cat $TMP_FILE| grep sent| tail -1 |tr -s " "| cut -d " " -f 4` + local diff=$(($tx_send-$rx_recived)) + + if [ $diff -lt 0 ]; then + diff=0 + fi + + local loss=$(echo "scale=5;($diff/$tx_send)*100"| bc) + + if [[ $loss =~ "^\." ]]; + then loss="0${loss}" + fi + + local d_point=`expr index $loss "\."` + d_point=$(($d_point+3)) + loss=`expr substr $loss 1 $d_point` + + echo "#message rate is $pps [msg/sec]" + echo "#bw is $bw MBps" + echo "#packet loss is ${loss}%" + echo $1,$pps,$bw,$loss >> $res_file + else + echo "#$ERROR_MESSAGE" + echo "$1,${ERROR_RESULT}" >> $res_file + + fi + + ssh $REM_HOST_IP "cat $TMP_FILE" | tee -a "$TMP_DIR/$log_file.tmp" >& /dev/null + ssh $REM_HOST_IP "rm -rf $TMP_FILE" >& /dev/null +} + +function check_udp_lat_succss +{ + local res=0 + local look_for=$1 + local res_file=$2 + + if [ $res_file = $log_file ]; then + res=`cat $res_file |tr A-Z a-z |grep $look_for | wc -l` + else + res=`ssh $REM_HOST_IP "cat $res_file |tr A-Z a-z |grep $look_for | wc -l"` + fi + + if [[ $res -gt 0 ]]; then + success=$TRUE + else + success=$FALSE + fi +} + +function check_iperf_succss +{ + local res=0 + + res=`cat $TMP_FILE | grep % | wc -l` + + if [ $res -gt 0 ]; then + success=$TRUE + else + success=$FALSE + fi +} + +function prepare_output_files +{ + date=`date +%Y_%m_%d_%H_%M_%S` + log_file="${OUTPUT_FILES_PATH}vma_perf_${date}_logs.txt" + res_file="${OUTPUT_FILES_PATH}vma_perf_${date}_results.csv" + + touch $log_file + touch $res_file +} + +function prepare_iperf_headlines +{ + echo "" >> $res_file + echo Iperf Test 
Results >> $res_file + echo Msg size,Averag RX BW,Average Loss,Packet Rate >> $res_file +} + +function prepare_udp_lat_headlines +{ + echo "" >> $res_file + echo Udp_lat Test Results >> $res_file + echo Message size,Latency >> $res_file +} + +function prepare_udp_lat_tx_bw_headlines +{ + echo "" >> $res_file + echo Udp_lat TX BW Test Results >> $res_file + echo Msg size,TX Packet Rate,TX BW >> $res_file +} + +function prepare_udp_lat_bw_headlines +{ + echo "" >> $res_file + echo Udp_lat BW Test Results >> $res_file + echo Msg size,TX Packet Rate,TX BW,RX Average Loss %>> $res_file +} + +function prepare_udp_lat_sending_bursts_headlines +{ + echo "" >> $res_file + echo Udp_lat Sending Bursts Test Results >> $res_file + echo Burst size,Latency >> $res_file +} + +function prepare_udp_lat_using_feed_file_headlines +{ + echo "" >> $res_file + echo Udp_lat Using $1 $2 Test Results >> $res_file + echo MC Group Num,Latency >> $res_file +} + +function update_command_line_pref_in_log_file +{ + SRV_CMMND_LINE_PREF="[$srv_hostname "`pwd`"]" + CLT_CMMND_LINE_PREF="[$clt_hostname "`ssh $REM_HOST_IP pwd`"]" +} + +function get_hostnames +{ + clt_hostname=`ssh $REM_HOST_IP hostname` + srv_hostname=`hostname` + + update_command_line_pref_in_log_file + +} + +function update_vma_igmp_flag +{ + check_if_infbnd_iface + if [[ $is_infiniband -eq $FALSE ]]; then + VMA_IGMP_ENABLE=1 + fi +} + +function update_command_prefix +{ + PREFIX="" + + update_vma_igmp_flag + + if [[ "$OVER_VMA" = yes ]] ; then + + if [[ $VMA_IGMP_ENABLE -ne $DEFAULT_VMA_IGMP_ENABLE ]] ; then + PREFIX="$PREFIX VMA_IGMP=$VMA_IGMP_ENABLE " + fi + + if [[ $VMA_SELECT_POLL -ne $DEFAULT_VMA_SELECT_POLL ]] ; then + PREFIX="$PREFIX VMA_SELECT_POLL=$VMA_SELECT_POLL " + fi + + if [[ $VMA_RX_SKIP_OS -ne $DEFAULT_VMA_RX_SKIP_OS ]] ; then + PREFIX="$PREFIX VMA_RX_SKIP_OS=$VMA_RX_SKIP_OS " + fi + + if [[ $VMA_RX_BUFS -ne $DEFAULT_VMA_RX_BUFS ]] ; then + PREFIX="$PREFIX VMA_RX_BUFS=$VMA_RX_BUFS " + fi + + if [[ $VMA_THREAD_MODE -ne 
$DEFAULT_VMA_THREAD_MODE ]] ; then + PREFIX="$PREFIX VMA_THREAD_MODE=$VMA_THREAD_MODE " + fi + + if [[ $VMA_RX_WRE -ne $DEFAULT_VMA_RX_WRE ]] ; then + PREFIX="$PREFIX VMA_RX_WRE=$VMA_RX_WRE " + fi + + if [[ $VMA_SELECT_SKIP_OS -ne $DEFAULT_VMA_SELECT_SKIP_OS ]] ; then + PREFIX="$PREFIX VMA_SELECT_SKIP_OS=$VMA_SELECT_SKIP_OS " + fi + + if [[ $VMA_HUGETLB -ne $DEFAULT_VMA_HUGETLB ]] ; then + PREFIX="$PREFIX VMA_HUGETLB=$VMA_HUGETLB " + fi + + PREFIX=${PREFIX}"LD_PRELOAD=$VMA_LIB " + fi +} + +function change_command_prefix +{ + for curr in $*; + do + eval "$curr" + done; + update_command_prefix +} + +function remove_ifaces +{ + add_curr_route_table_2_log + iface_arr_local=`route | grep 2[24][40].0.0.0 | tr -s ' ' | cut -d ' ' -f 8` + iface_arr_remote=`ssh $REM_HOST_IP "route | grep 2[24][40].0.0.0 | tr -s ' ' | cut -d ' ' -f 8"` + + echo "" >> "$TMP_DIR/$log_file.prep" + echo "============>Remove interfaces from route table <=============" >> "$TMP_DIR/$log_file.prep" + + for iface in $iface_arr_local + do + command="route del -net $DST_NET netmask $DST_MASK dev $iface" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a $TMP_DIR/$log_file.prep) >& /dev/null + eval "$command 2>&1 | tee >> $TMP_DIR/$log_file.prep" + done + + for iface in $iface_arr_remote + do + command="route del -net $DST_NET netmask $DST_MASK dev $iface" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "ssh $REM_HOST_IP "$command" 2>&1 | tee >> $TMP_DIR/$log_file.prep" + done +} + +function add_curr_route_table_2_log +{ + (echo "${SRV_CMMND_LINE_PREF} route" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "route 2>&1 | tee >> $TMP_DIR/$log_file.prep" + + (echo "${CLT_CMMND_LINE_PREF} route" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "ssh $REM_HOST_IP "route" 2>&1 | tee >> $TMP_DIR/$log_file.prep" +} + +function recreate_route_table +{ + echo "" >> "$TMP_DIR/$log_file.post" + echo "===================>Recreate route table 
<====================" >> "$TMP_DIR/$log_file.post" + command="route del -net $DST_NET netmask $DST_MASK dev $INTERFACE" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + eval "$command 2>&1 | tee >> $TMP_DIR/$log_file.post" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + ssh $REM_HOST_IP "$command" 2>&1 | tee >> "$TMP_DIR/$log_file.post" + + for iface in $iface_arr_local + do + command="route add -net $DST_NET netmask $DST_MASK dev $iface" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + eval "$command 2>&1 | tee >> $TMP_DIR/$log_file.post" + done + + for iface in $iface_arr_remote + do + command="route add -net $DST_NET netmask $DST_MASK dev $iface" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + (eval "ssh $REM_HOST_IP "$command" 2>&1 | tee >> $TMP_DIR/$log_file.post") >& /dev/null + done + eval "cat $TMP_DIR/$log_file.post" | tee -a $log_file >& /dev/null + clean +} + +function prepare_route_table +{ + echo "" >> "$TMP_DIR/$log_file.prep" + echo "=====================>Route table info <======================" >> "$TMP_DIR/$log_file.prep" + remove_ifaces + echo "" >> "$TMP_DIR/$log_file.prep" + echo "============>Add work interface to route table <==============" >> "$TMP_DIR/$log_file.prep" + command="route add -net $DST_NET netmask $DST_MASK dev $INTERFACE" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "$command 2>&1 | tee >> $TMP_DIR/$log_file.prep" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "ssh $REM_HOST_IP "$command" 2>&1 | tee >> $TMP_DIR/$log_file.prep" + eval "cat $TMP_DIR/$log_file.prep" | tee -a $log_file >& /dev/null + clean +} + +function save_coalesce_params +{ + local_coalesce_params_saved=$TRUE + remote_coalesce_params_saved=$TRUE + command1="ethtool -c $INTERFACE" + 
+ echo "" >> "$TMP_DIR/$log_file.prep" + echo "===================>Coalesce params info<=====================" >> "$TMP_DIR/$log_file.prep" + + save_local_coalesce_params "$command1" rx-frames: initial_rx_frames_local + save_remote_coalesce_params "$command1" rx-frames: initial_rx_frames_remote + save_local_coalesce_params "$command1" rx-usecs: initial_rx_usecs_local + save_remote_coalesce_params "$command1" rx-usecs: initial_rx_usecs_remote + rm -f $TMP_FILE >& /dev/null + rm -f $TMP_FILE.err >& /dev/null + +} + +function save_local_coalesce_params +{ + (echo "${SRV_CMMND_LINE_PREF} $1" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "$1 $COMMAND_REDIRECT" + check_succsess_and_save_param $2 get_coalesce_param + eval "$3=$?" + if [[ $SUCCSESS -eq $FALSE ]] ;then + local_coalesce_params_saved=$FALSE + fi + +} + +function save_remote_coalesce_params +{ + (echo "${CLT_CMMND_LINE_PREF} $1" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "(ssh $REM_HOST_IP "$1") $COMMAND_REDIRECT" + check_succsess_and_save_param $2 get_coalesce_param + eval "$3=$?" 
+ if [[ $SUCCSESS -eq $FALSE ]]; then + remote_coalesce_params_saved=$FALSE + fi +} + +function get_coalesce_param +{ + ret_val=`cat $TMP_FILE | grep $1 | cut -d " " -f 2 2>/dev/null` +} + +function get_umcast_val +{ + umcast_val=`cat $TMP_FILE | tr -d "\n" 2>/dev/null` +} + +function update_coalesce_4_udp_lat +{ + local_coalesce_params_changed=$FALSE + remote_coalesce_params_changed=$FALSE + echo "" >> "$TMP_DIR/$log_file.prep" + echo "============>Prepare coalesce params for udp_lat<=============" >> "$TMP_DIR/$log_file.prep" + update_coalesce_params $RX_FRAMES_4_UDP_LAT $RX_USEC_4_LAT_TEST +} + +function update_coalesce_4_tr_test +{ + local_coalesce_params_changed=$FALSE + remote_coalesce_params_changed=$FALSE + echo "" >> "$TMP_DIR/$log_file.prep" + echo "========>Prepare coalesce params for throughput test<=========" >> "$TMP_DIR/$log_file.prep" + update_coalesce_params $RX_FRAMES_4_IPERF $RX_USEC_4_BW_TEST +} + +function update_coalesce_params +{ + local rx_frames_val=$1 + local rx_usecs_val=$2 + + update_coalesce_param rx-frames $rx_frames_val + update_coalesce_param rx-usecs $rx_usecs_val +} + +function update_coalesce_param +{ + local param_name=$1 + local param_val=$2 + command="ethtool -C $INTERFACE $param_name $param_val" + + if [[ $local_coalesce_params_saved -eq $TRUE ]]; then + if [[ $initial_rx_frames_local -ne $1 ]]; then + update_local_coalesce_params + fi + fi + + if [[ $remote_coalesce_params_saved -eq $TRUE ]]; then + if [[ $initial_rx_frames_remote -ne $1 ]]; then + update_remote_coalesce_params + fi + fi +} + +function update_local_coalesce_params +{ + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "$command $COMMAND_REDIRECT" + check_command_succss + + if [[ $SUCCSESS -eq $TRUE ]]; then + local_coalesce_params_changed=$TRUE + else + local_coalesce_params_changed=$FALSE + fi + +} + +function update_remote_coalesce_params +{ + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a 
"$TMP_DIR/$log_file.prep") >& /dev/null + eval "(ssh $REM_HOST_IP "$command") $COMMAND_REDIRECT" + check_command_succss + if [[ $SUCCSESS -eq $TRUE ]]; then + remote_coalesce_params_changed=$TRUE + else + remote_coalesce_params_changed=$FALSE + fi +} + +function recreate_coalesce_params +{ + echo "" >> "$TMP_DIR/$log_file.post" + echo "==================>Recreate coalesce params<==================" >> "$TMP_DIR/$log_file.post" + + if [[ $local_coalesce_params_changed -eq $TRUE ]]; then + recreate_local_coalesce_params + fi + + if [[ $remote_coalesce_params_changed -eq $TRUE ]]; then + recreate_remote_coalesce_params + fi +} + +function recreate_local_coalesce_params +{ + local command="ethtool -C $INTERFACE rx-frames $initial_rx_frames_local rx-usecs $initial_rx_usecs_local" + + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + eval $command >& /dev/null +} + +function recreate_remote_coalesce_params +{ + local command="ethtool -C $INTERFACE rx-frames $initial_rx_frames_remote rx-usecs $initial_rx_usecs_remote" + + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + ssh $REM_HOST_IP "$command" >& /dev/null +} + +function save_umcast +{ + local_umcast_saved=$FALSE + remote_umcast_saved=$FALSE + echo "" >> "$TMP_DIR/$log_file.prep" + echo "========================>Umcast info<=========================" >> "$TMP_DIR/$log_file.prep" + check_if_infbnd_iface + + if [[ "$OVER_VMA" = not ]]; then + UMCAST_VAL=0 + fi + + if [[ $is_infiniband -eq $TRUE ]]; then + save_local_umcast_val + save_remote_umcast_val + fi + eval "cat $TMP_DIR/$log_file.prep" | tee -a $log_file >& /dev/null + clean +} + +function save_local_umcast_val +{ + local command="cat /sys/class/net/$INTERFACE/umcast" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "cat /sys/class/net/$INTERFACE/umcast 1>$TMP_FILE 2>$TMP_FILE.err " + check_succsess_and_save_param 
initial_local_umcast_val get_umcast_val + if [[ $SUCCSESS -eq $FALSE ]]; then + local_umcast_saved=$FALSE + else + initial_local_umcast_val=$umcast_val + local_umcast_saved=$TRUE + fi +} + +function save_remote_umcast_val +{ + local command="cat /sys/class/net/$INTERFACE/umcast" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + (eval "ssh $REM_HOST_IP cat /sys/class/net/$INTERFACE/umcast") 1>$TMP_FILE 2>$TMP_FILE.err + check_succsess_and_save_param initial_remote_umcast_val get_umcast_val + if [[ $SUCCSESS -eq $FALSE ]]; then + remote_umcast_saved=$FALSE + else + initial_remote_umcast_val=$umcast_val + remote_umcast_saved=$TRUE + fi +} + + +function update_umcast +{ + echo "" >> "$TMP_DIR/$log_file.prep" + echo "===================>Prepare umcast param<====================" >> "$TMP_DIR/$log_file.prep" + local_umcast_changed=$FALSE + remote_umcast_changed=$FALSE + + if [[ $initial_local_umcast_val -ne $UMCAST_VAL ]]; then + update_local_umcast + fi + + if [[ $initial_remote_umcast_val -ne $UMCAST_VAL ]]; then + update_remote_umcast + fi + + eval "cat $TMP_DIR/$log_file.prep" | tee -a $log_file >& /dev/null + clean +} + +function update_local_umcast +{ + local command="echo $UMCAST_VAL 1>&/sys/class/net/$INTERFACE/umcast" + if [[ $local_umcast_saved -eq $TRUE ]]; then + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + eval "$command 1>$TMP_FILE 2>$TMP_FILE.err" + check_command_succss + if [[ $SUCCSESS -eq $TRUE ]]; then + local_umcast_changed=$TRUE + else + local_umcast_changed=$FALSE + fi + fi +} + +function update_remote_umcast +{ + local command="echo $UMCAST_VAL 1>&/sys/class/net/$INTERFACE/umcast" + if [[ $local_umcast_saved -eq $TRUE ]]; then + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.prep") >& /dev/null + ssh $REM_HOST_IP "$command" | eval "1>$TMP_FILE 2>$TMP_FILE.err " + check_command_succss + if [[ $SUCCSESS -eq $TRUE ]]; then + 
remote_umcast_changed=$TRUE + else + remote_umcast_changed=$FALSE + fi + fi +} + +function recreate_umcast +{ + echo "" >> "$TMP_DIR/$log_file.post" + echo "====================>Recreate umcast value<===================" >> "$TMP_DIR/$log_file.post" + + if [[ $local_umcast_changed -eq $TRUE ]]; then + recreate_local_umcast_val + fi + + if [[ $remote_umcast_changed -eq $TRUE ]]; then + recreate_remote_umcast_val + fi + + eval "cat $TMP_DIR/$log_file.post" | tee -a $log_file >& /dev/null +} + +function recreate_local_umcast_val +{ + local command="echo $initial_local_umcast_val 1>&/sys/class/net/$INTERFACE/umcast" + (echo "${SRV_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + eval "$command" >& /dev/null +} + +function recreate_remote_umcast_val +{ + local command="echo $initial_remote_umcast_val 1>&/sys/class/net/$INTERFACE/umcast" + (echo "${CLT_CMMND_LINE_PREF} $command" | tee -a "$TMP_DIR/$log_file.post") >& /dev/null + ssh $REM_HOST_IP $command >& /dev/null +} + +function clean_after_iperf +{ + killall iperf >& /dev/null + ssh $REM_HOST_IP killall iperf >& /dev/null + ssh $REM_HOST_IP "cat $TMP_DIR/$log_file.tmp" | tee -a $log_file >& /dev/null + ssh $REM_HOST_IP rm -f "$TMP_DIR/$log_file.tmp" >& /dev/null + rm -f $TMP_FILE.err >& /dev/null + rm -f "$TMP_DIR/$log_file.prep" >& /dev/null +} + +function clean_after_udp_lat +{ + ssh $REM_HOST_IP killall udp_lat >& /dev/null + pkill -2 -f udp_lat >& /dev/null + sleep 10 + cat "$TMP_DIR/$log_file.tmp" | tee -a $log_file >& /dev/null + rm -f "$TMP_DIR/$log_file.tmp" >& /dev/null +} + +function collect_nodes_info_to_file +{ + collect_local_node_info_to_file "$1" + collect_remote_node_info_to_file "$1" +} + +function collect_local_node_info_to_file +{ + (echo "======================>Local node info<======================" | tee -a $1) >& /dev/null + (echo "--------------------" | tee -a $1) >& /dev/null + (hostname | tee -a $1) >& /dev/null + (echo -n "OS: " >> $1;cat /etc/issue | grep We | 
tee -a $1) >& /dev/null + (echo -n "CPU: " >> $1;cat /proc/cpuinfo | grep 'model name' | sort -u | awk '{print $4, $5, $6, $7, $9}' | tee -a $1) >& /dev/null + (echo -n "Number of CPUs: " >> $1;cat /proc/cpuinfo |grep proce |wc |awk '{print $1}' | tee -a $1) >& /dev/null + (echo -n "CPU Type: " >> $1;uname -a | awk '{print $12}' | tee -a $1) >& /dev/null + (cat /proc/meminfo |grep [M,m]em | tee -a $1) >& /dev/null + (echo -n "Kernel: " >> $1;uname -a | awk '{print $3}' | tee -a $1) >& /dev/null + (cat /usr/voltaire/version | tee -a $1) >& /dev/null + (ibstat | grep -e "CA type" -e "Firmware version" | tee -a $1) >& /dev/null + (ibstatus | grep -e rate -e state | grep -v 'phys state' | tee -a $1) >& /dev/null + check_if_infbnd_iface + if [[ $is_infiniband -eq $TRUE ]]; then + (echo -n "IPoIB mode: " >> $1 ; cat "/sys/class/net/$INTERFACE/mode" | tee -a $1) >& /dev/null + fi + (ifconfig $INTERFACE | grep MTU | awk '{print $5}' | tee -a $1) >& /dev/null + (echo -n "OFED:" >> $1;ofed_info | head -6 | grep OFED | tee -a $1) >& /dev/null + (echo -n "VMA:" >> $1;rpm -qa | grep $VMA | tee -a $1) >& /dev/null + +} + +function collect_remote_node_info_to_file +{ + (echo "=====================>Remote node info<======================" | tee -a $1) >& /dev/null + (echo "--------------------" | tee -a $1) >& /dev/null + (ssh $REM_HOST_IP "hostname" | tee -a $1) >& /dev/null + (echo -n "OS: " >> $1;ssh $REM_HOST_IP cat /etc/issue | grep We | tee -a $1) >& /dev/null + (echo -n "CPU: " >> $1;ssh $REM_HOST_IP cat /proc/cpuinfo | grep 'model name' | sort -u | awk '{print $4, $5, $6, $7, $9}' | tee -a $1) >& /dev/null + (echo -n "Number of CPUs: " >> $1;ssh $REM_HOST_IP cat /proc/cpuinfo |grep proce |wc |awk '{print $1}' | tee -a $1) >& /dev/null + (echo -n "CPU Type: " >> $1;ssh $REM_HOST_IP uname -a | awk '{print $12}' | tee -a $1) >& /dev/null + (ssh $REM_HOST_IP cat /proc/meminfo |grep [M,m]em | tee -a $1) >& /dev/null + (echo -n "Kernel: " >> $1;ssh $REM_HOST_IP uname -a | awk 
'{print $3}' | tee -a $1) >& /dev/null + (ssh $REM_HOST_IP "cat /usr/voltaire/version" | tee -a $1) >& /dev/null + (ssh $REM_HOST_IP "ibstat | grep -e "CA type" -e 'Firmware version'" | tee -a $1) >& /dev/null + (ssh $REM_HOST_IP "ibstatus | grep -e rate -e state | grep -v 'phys state'" | tee -a $1) >& /dev/null + to_print="/sys/class/net/$INTERFACE/mode" + check_if_infbnd_iface + if [[ $is_infiniband -eq $TRUE ]]; then + (echo -n "IPoIB mode: " >> $1 ;ssh $REM_HOST_IP "cat $to_print" | tee -a $1) >& /dev/null + fi + (ssh $REM_HOST_IP ifconfig $INTERFACE | grep MTU | awk '{print $5}' | tee -a $1) >& /dev/null + (echo -n "OFED:" >> $1;ssh $REM_HOST_IP ofed_info | head -6 | grep OFED | tee -a $1) >& /dev/null + (echo -n "VMA:" >> $1;ssh $REM_HOST_IP rpm -qa | grep $VMA | tee -a $1) >& /dev/null +} + +function print_message +{ + echo $1 | tee -a $2 + echo ""| tee -a $2 +} + +function append_tmp_file_and_delete +{ + cat $1 | tee -a $2 >& /dev/null + rm -f $1 +} + +function check_command_succss +{ + if [ -s $TMP_FILE.err ]; then + eval "cat $TMP_FILE.err 2>&1 | tee >> $TMP_DIR/$log_file.prep" + LAST_ERROR=`cat $TMP_FILE.err` + SUCCSESS=$FALSE + else + if [ -s $TMP_FILE ]; then + eval "cat $TMP_FILE 2>&1 | tee >> $TMP_DIR/$log_file.prep" + fi + SUCCSESS=$TRUE + fi + + rm -f $TMP_FILE.err >& /dev/null +} + +function check_succsess_and_save_param +{ + local ret_val=0 + + check_command_succss + + if [[ $SUCCSESS -eq $TRUE ]]; then + $2 $1 + fi + + return $ret_val +} + +function check_if_infbnd_iface +{ + is_infiniband=$FALSE + + if [[ $INTERFACE =~ "ib*" ]]; then + is_infiniband=$TRUE + fi +} + +function discover_local_work_if_ip +{ + LOCAL_IP=`ifconfig $INTERFACE | grep inet |grep -v inet6 | cut -d ':' -f 2| cut -d " " -f 1` >& /dev/null +} + +function calc_file_age +{ + creation_time=`stat -c %Z /tmp/vma_utils_block_file` + now=`date +%s` + block_file_age=$(($now-$creation_time)) +} + +function get_operator_pid +{ + pid=$$ +} + +function write_to_file_operator_details +{ 
+ operating_machine_hostname=`hostname` + get_operator_pid + rm -f "$2" >& /dev/null + touch "$2" >& /dev/null + echo -n "$1 " >> "$2" 2>/dev/null + echo -n "$pid " >> "$2" 2>/dev/null + echo -n "$script_name " >> "$2" 2>/dev/null + echo -n "$user_name " >> "$2" 2>/dev/null + echo -n "$user_id " >> "$2" 2>/dev/null + echo -n "$operating_machine_hostname " >> "$2" 2>/dev/null + + if [[ $1 == "local" ]]; then + st_timestamp=`date` + else + st_timestamp=`ssh $REM_HOST_IP "date" ` + fi + + echo "$st_timestamp" | tr " " "_" >> "$2" 2>/dev/null +} + +function read_block_file +{ + blocking_pid=`awk -F " " '{print $2}' "$1"` + blocking_app=`awk -F " " '{print $3}' "$1"` + blocking_username=`awk -F " " '{print $4}' "$1"` + blocking_id=`awk -F " " '{print $5}' "$1"` + blocking_hostname=`awk -F " " '{print $6}' "$1"` + blocking_st_time=`awk -F " " '{print $7}' "$1"` +} + +function print_block_files_details +{ + echo "Blocked Host:$blocked_host" + echo "You blocked by:" + echo "- application: ${blocking_app} " + echo "- user: ${blocking_username} " + echo "- users local host ip:${blocking_id} " + echo "- blocking proccess with pid: ${blocking_pid} running on host ${blocking_hostname} " + echo "- starting time:${blocking_st_time} " + echo "- blocking file:${BLOCK_FILE}" +} + +function update_remote_block_file +{ + write_to_file_operator_details "$LOCAL_IP" "${BLOCK_FILE}.rem" + scp "${BLOCK_FILE}.rem" "${REM_HOST_IP}:${BLOCK_FILE}" >& /dev/null + rm -f "${BLOCK_FILE}.rem" >& /dev/null +} + +function update_local_block_file +{ + write_to_file_operator_details "local" "$BLOCK_FILE" +} + +function update_block_files +{ + discover_local_work_if_ip + if [ "$LOCAL_IP" == "" ]; then + echo "WARNING: Will be executed without blocking..." 
+ else + update_local_block_file + update_remote_block_file + fi +} + +function unblock_local_host +{ + rm -f $BLOCK_FILE >& /dev/null + rm -f "${BLOCK_FILE}.rem" >& /dev/null +} + +function unblock_remote_host +{ + ssh $REM_HOST_IP "rm -f $BLOCK_FILE >& /dev/null" >& /dev/null + rm -f "${BLOCK_FILE}.rem" >& /dev/null +} + +function unblock +{ + unblock_local_host + unblock_remote_host +} + +function check_connection_to_remote_host +{ + ping -w 3 "$1" >& /dev/null + test $? -eq 0 && RES=OK || RES=NOT +} + +function check_if_another_proccess_running_on_local_host +{ + eval "ps -eF| grep '$blocking_pid'|grep -v grep|wc -l" > $TMP_FILE + RES=`cat $TMP_FILE` +} + +function check_if_another_proccess_running_on_remote_host +{ + RES=0 + check_connection_to_remote_host "$1" + if [ $RES == "OK" ]; then + RES=`sudo ssh ${SUPER_USR}@${1} "ps -eF| grep ${blocking_pid}|grep -v grep|wc -l"` + else + RES=1 + fi +} + +function get_operating_host_ip_or_hostname +{ + RES=0 + operating_host_ip=`awk -F " " '{print $1}' "$1"` + if [[ $operating_host_ip != "local" ]]; then + check_connection_to_remote_host "$operating_host_ip" + if [ $RES != "OK" ]; then + operating_host_ip=`awk -F " " '{print $6}' "$1"` + fi + fi +} + +function check_if_operating_host_running_another_proccess +{ + if [ $1 == "local" ]; then + check_if_another_proccess_running_on_local_host + else + check_if_another_proccess_running_on_remote_host "$1" + fi +} + +function adjust_operating_host_ip_of_remote_machine +{ + local tmp="" + + if [ "$1" == "local" ]; then + operating_host_ip=$REM_HOST_IP + else + tmp=`ifconfig|grep $1` + if [ "$tmp" != "" ]; then + operating_host_ip="local" + fi + fi +} + +function check_if_remote_host_is_blocked +{ + RES=0 + ssh $REM_HOST_IP "cat ${BLOCK_FILE} 2>/dev/null" > "${BLOCK_FILE}.rem" + + if [ -s "${BLOCK_FILE}.rem" ]; then + read_block_file "${BLOCK_FILE}.rem" + get_operating_host_ip_or_hostname "${BLOCK_FILE}.rem" + adjust_operating_host_ip_of_remote_machine "$operating_host_ip" + 
check_if_operating_host_running_another_proccess "$operating_host_ip" + + if [[ $RES -le 0 ]]; then + unblock_remote_host + else + read_block_file "$BLOCK_FILE.rem" + blocked_host=$REM_HOST_IP + print_block_files_details + rm -f "${BLOCK_FILE}.rem" >& /dev/null + clean + exit 1 + fi + fi +} + +function check_if_local_host_is_blocked +{ + RES=0 + if [[ -e $BLOCK_FILE ]]; then + read_block_file "$BLOCK_FILE" + get_operating_host_ip_or_hostname "${BLOCK_FILE}" + check_if_operating_host_running_another_proccess "$operating_host_ip" + if [[ $RES -le 0 ]]; then + unblock_local_host + else + blocked_host=`hostname` + print_block_files_details + clean + exit 1 + fi + fi +} + +function block +{ + check_if_local_host_is_blocked + check_if_remote_host_is_blocked + update_block_files +} + + +function pre_test_checks +{ + check_connection_2_remote_ip + check_if_iperf_avaibale + clean +} + +function check_connection_2_remote_ip +{ + ssh -o "BatchMode yes" $REM_HOST_IP exit 2>$TMP_FILE.err + check_command_succss + if [[ $SUCCSESS -ne $TRUE ]]; then + echo "vma_perf_envelope error:$LAST_ERROR" + clean + unblock + exit 1 + fi +} + +function write_mc_feed_file +{ + local mc_grp_ctr=0 # num of mc groups + local mc_addr_part_3=1 # the third number of the mc group + local port=10005 + local mc_addr_part_4=3 # the last number of the mc group + local mc_grp_num=$1 + local file_name=$2 + + if [ -e $file_name ]; then + rm -f $file_name >& /dev/null + fi + + while [ $mc_grp_ctr -lt $mc_grp_num ] + do + if [ $mc_addr_part_4 -ge 254 ]; then + mc_addr_part_3=$(($mc_addr_part_3+1)) + mc_addr_part_4=3 + fi + + echo 224.4.$mc_addr_part_3.$mc_addr_part_4:$port >> $file_name + mc_grp_ctr=$(($mc_grp_ctr+1)) + port=$(($port+1)) + mc_addr_part_4=$(($mc_addr_part_4+1)) + done +} + +function create_mc_feed_files +{ + write_mc_feed_file $1 $2 + copy_feed_file_2_remote_machine $2 >& /dev/null +} + +function copy_feed_file_2_remote_machine +{ + scp $1 "$REM_HOST_IP:/$TMP_DIR" +} + +function 
remove_mc_feed_files +{ + rm -f $1 >& /dev/null + ssh $REM_HOST_IP "rm -f $1" >& /dev/null +} + +function run_iperf +{ + if [[ $iperf_is_installed -ne $TRUE ]]; then + echo "$ERORR_PROMT iperf tool not found on one of the machines, skipping iperf test" + else + run_iperf_with_diff_msg_len + fi +} + + +function check_if_iperf_avaibale +{ + local_iperf_is_installed=$FALSE + remote_iperf_is_installed=$FALSE + iperf_is_installed=$FALSE + which iperf 2>$TMP_FILE.err 1>/dev/null + check_command_succss + if [[ $SUCCSESS -ne $FALSE ]]; then + local_iperf_is_installed=$TRUE + else + echo "$ERORR_PROMT iperf not found on local machine " + fi + + ssh $REM_HOST_IP "which iperf" 2>$TMP_FILE.err 1>/dev/null + check_command_succss + if [[ $SUCCSESS -ne $FALSE ]]; then + remote_iperf_is_installed=$TRUE + else + echo "$ERORR_PROMT iperf not found on remote machine " + fi + + if [[ $local_iperf_is_installed -eq $TRUE ]]; then + if [[ $remote_iperf_is_installed -eq $TRUE ]]; then + iperf_is_installed=$TRUE + fi + fi +} + +function clean +{ + rm -f $TMP_FILE.err >& /dev/null + rm -f "$TMP_DIR/$log_file.prep" >& /dev/null + rm -f "$TMP_DIR/$log_file.post" >& /dev/null + ssh $REM_HOST_IP rm -f "$TMP_DIR/$log_file.tmp" >& /dev/null + rm -rf $TMP_FILE >& /dev/null +} + +function write_date_2_log_file +{ + echo "=============================>Date<===========================" >> "$log_file" + (echo "${SRV_CMMND_LINE_PREF} date" | tee -a "$log_file") >& /dev/null + (date | tee -a $log_file) >& /dev/null +} + +function pre_vma_perf +{ + pre_test_checks + get_hostnames + prepare_output_files + write_date_2_log_file + collect_nodes_info_to_file "$log_file" + prepare_route_table + save_shmem_prop + increase_number_of_hugetlb + save_umcast + update_umcast + clean + update_command_prefix +} + +function final_test_message +{ + echo "####################### Generated files #######################" + echo "#Test results : $res_file" + echo "#Test logs : $log_file" + echo 
"---------------------------------------------------------------" +} + +function post_vma_perf +{ + collect_nodes_info_to_file "$res_file" + recreate_route_table + recreate_mem_prop + recreate_umcast + final_test_message + write_date_2_log_file + clean +} + +function vma_perf +{ + run_udp_lat_with_diff_msg_len + run_udp_lat_tx_bw_with_diff_msg_len + run_udp_lat_bw_with_diff_msg_len + run_udp_lat_with_diff_burst_size + run_iperf + run_udp_lat_using_select_epoll_poll_with_zero_polling + run_udp_lat_using_select_epoll_poll_with_full_polling_vma_only +} + +#main + +if [ $# = 1 ]; then + block + pre_vma_perf + vma_perf + post_vma_perf + unblock +else + echo "Usage: perf " + exit +fi diff --git a/tools/Makefile.am b/tools/Makefile.am new file mode 100644 index 0000000..e65d4f9 --- /dev/null +++ b/tools/Makefile.am @@ -0,0 +1,5 @@ +SUBDIRS := \ + daemon + +EXTRA_DIST = \ + daemon diff --git a/tools/daemon/Makefile.am b/tools/daemon/Makefile.am new file mode 100644 index 0000000..1239cea --- /dev/null +++ b/tools/daemon/Makefile.am @@ -0,0 +1,28 @@ +sbin_PROGRAMS = vmad + +vmad_LDADD = + +vmad_CPPFLAGS = \ + -I$(top_srcdir)/src \ + -I$(top_srcdir)/src/vma + +vmad_LDFLAGS = +vmad_CFLAGS = + +vmad_SOURCES = \ + daemon.c \ + loop.c \ + hash.c \ + store.c \ + flow.c \ + message.c \ + notify.c \ + nl.c \ + tc.c + +noinst_HEADERS = \ + daemon.h \ + hash.h \ + bitmap.h \ + nl.h \ + tc.h diff --git a/tools/daemon/bitmap.h b/tools/daemon/bitmap.h new file mode 100644 index 0000000..ecfa281 --- /dev/null +++ b/tools/daemon/bitmap.h @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +/** + * @file ibwu_bitmap.h + * + * @brief Bitmap operations. + * + **/ +#ifndef TOOLS_DAEMON_BITMAP_H_ +#define TOOLS_DAEMON_BITMAP_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +typedef uint32_t bitmap_item_t; + +typedef struct bitmap bitmap_t; +struct bitmap { + bitmap_item_t *bitmap; /**< The actual bitmap array of characters */ + size_t size; /**< Bitmap size */ +}; + +/* Number of bits in single bitmap item */ +#define BITMAP_ITEM_SIZE (8 * sizeof(bitmap_item_t)) + +/* Number of items needed to store n bits */ +#define BITMAP_ARRAY_SIZE(n) (((n) + BITMAP_ITEM_SIZE - 1) / BITMAP_ITEM_SIZE) + +/** + * Initialize a bitmap. + * + * @param bm Bitmap to initialize. + * @param size Bit count in bitmap. 
+ * + * @retval @a none + ***************************************************************************/ +static inline void bitmap_create(bitmap_t **bm, size_t size) +{ + *bm = (bitmap_t *)malloc(sizeof(**bm)); + if (*bm) { + (*bm)->size = size; + (*bm)->bitmap = (bitmap_item_t *)calloc(BITMAP_ARRAY_SIZE(size), sizeof(bitmap_item_t)); + if (NULL == (*bm)->bitmap) { + free(*bm); + *bm = NULL; + } + } +} + +/** + * Destroy a bitmap. + * + * @param bm Bitmap to destroy. + * + * @retval @a none + ***************************************************************************/ +static inline void bitmap_destroy(bitmap_t *bm) +{ + free(bm->bitmap); + bm->size = 0; +} + +/** + * Returns the index of the element in internal array that contains the bit. + * + * @param bit Bit index. + * + * @retval Array index + ***************************************************************************/ +static inline size_t elem_idx(size_t bit) +{ + return (bit / BITMAP_ITEM_SIZE); +} + +/** + * Returns the value with one bit is on. + * + * @param bit Bit index. + * + * @retval Element value + ***************************************************************************/ +static inline bitmap_item_t bit_mask(size_t bit_idx) +{ + return (bitmap_item_t)(1 << (bit_idx % BITMAP_ITEM_SIZE)); +} + +/** + * Returns the size of the bitmap in bits. + * + * @param bm Bitmap handle. + * + * @retval Bitmap size + ***************************************************************************/ +static inline size_t bitmap_size(bitmap_t *bm) +{ + return (bm->size); +} + +/** + * Atomically sets the bit (set 1). + * + * @param bm Bitmap handle. + * @param bit Bit index. + * + * @retval @a none + ***************************************************************************/ +static inline void bitmap_set(bitmap_t *bm, size_t bit) +{ + size_t idx = elem_idx(bit); + bitmap_item_t mask = bit_mask(bit); + bm->bitmap[idx] |= mask; +} + +/** + * Atomically clears the bit (set 0). + * + * @param bm Bitmap handle. 
+ * @param bit Bit index. + * + * @retval @a none + ***************************************************************************/ +static inline void bitmap_clear(bitmap_t *bm, size_t bit) +{ + size_t idx = elem_idx(bit); + bitmap_item_t mask = bit_mask(bit); + bm->bitmap[idx] &= ~mask; +} + +/** + * Atomically inverse the bit. + * + * @param bm Bitmap handle. + * @param bit Bit index. + * + * @retval @a none + ***************************************************************************/ +static inline void bitmap_flip(bitmap_t *bm, size_t bit) +{ + size_t idx = elem_idx(bit); + bitmap_item_t mask = bit_mask(bit); + bm->bitmap[idx] ^= mask; +} + +/** + * Tests the bit. + * + * @param bm Bitmap handle. + * @param bit Bit index. + * + * @retval bit value + ***************************************************************************/ +static inline int bitmap_test(bitmap_t *bm, size_t bit) +{ + size_t idx = elem_idx(bit); + bitmap_item_t mask = bit_mask(bit); + return (0 != (bm->bitmap[idx] & mask)); +} + +/** + * Tests if defined interval is a group of bits with identical values. + * + * @param bm Bitmap handle. + * @param start Start bit index. + * @param count Number of bits in the group. + * + * @retval 0|1 - on success + * @retval -1 - on failure + ***************************************************************************/ +static inline int bitmap_test_group(bitmap_t *bm, size_t start, size_t count) +{ + size_t i; + int value = -1; + + if ((start + count) <= bm->size) { + value = bitmap_test(bm, start); + for (i = 1; i < count; i++) { + if (bitmap_test(bm, start + i) != value) { + return -1; + } + } + } + return value; +} + +/** + * Find a group of bits with identical values. + * + * @param bm Bitmap handle. + * @param start Start bit index. + * @param count Number of bits in the group. + * @param value Value of the group. 
+ * + * @retval index - on success + * @retval -1 - on failure + ***************************************************************************/ +static inline int bitmap_find_group(bitmap_t *bm, size_t start, size_t count, + int value) +{ + size_t i; + size_t last; + + if ((start + count) <= bm->size) { + last = bm->size - count; + for (i = start; i <= last; i++) { + if (value == bitmap_test_group(bm, i, count)) { + return i; + } + } + } + return -1; +} + +/** + * Find first unset. + * + * @param bm Bitmap handle. + * + * @retval index - on success + * @retval -1 - on failure + ***************************************************************************/ +static inline int bitmap_find_first_zero(bitmap_t *bm) +{ + size_t i; + + for (i = 0; i < BITMAP_ARRAY_SIZE(bm->size); i++) { + if (((bitmap_item_t)(-1)) != bm->bitmap[i]) { + return (i * BITMAP_ITEM_SIZE + ffs(~bm->bitmap[i]) - 1); + } + } + return -1; +} + +#ifdef __cplusplus +} +#endif + +#endif /* TOOLS_DAEMON_BITMAP_H_ */ + diff --git a/tools/daemon/daemon.c b/tools/daemon/daemon.c new file mode 100644 index 0000000..4444258 --- /dev/null +++ b/tools/daemon/daemon.c @@ -0,0 +1,494 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include + +#if HAVE_SYS_PRCTL_H +# include +#endif + +#include "hash.h" +#include "tc.h" +#include "daemon.h" + + +extern int proc_loop(void); + +static void handle_signal(int signo); +static void daemonize(void); +static int config_def(void); +static int config_set(int argc, char **argv); +static void usage(void); + +struct module_cfg daemon_cfg; + +int main(int argc, char *argv[]) +{ + int rc = 0; + + /* Setup syslog logging */ + openlog(MODULE_NAME, LOG_PID, LOG_LOCAL5); + + /* already a daemon */ + if (getppid() == 1) { + return 0; + } + + /* command line parsing... 
*/ + config_def(); + log_info("Starting\n"); + + config_set(argc, argv); + + /* Daemonize */ + if (0 == daemon_cfg.opt.mode) { + daemonize(); + } + + /* Change the file mode mask */ + umask(0); + + /* Set name of the process */ +#if HAVE_SYS_PRCTL_H + if (prctl(PR_SET_NAME, MODULE_NAME, NULL, NULL, NULL) < 0) { + log_error("cannot set process name to %s, errno=%d (%s)\n", + MODULE_NAME, errno, strerror(errno)); + goto err; + } +#endif + + /* Ensure only one copy */ + if (daemon_cfg.lock_file[0]) { + char str[10]; + + daemon_cfg.lock_fd = open(daemon_cfg.lock_file, O_RDWR | O_CREAT, S_IRUSR | S_IWUSR | S_IRGRP); + if (daemon_cfg.lock_fd < 0) { + log_error("could not open PID lock file %s, errno=%d (%s)\n", + daemon_cfg.lock_file, errno, strerror(errno)); + goto err; + } + + if (lockf(daemon_cfg.lock_fd, F_TLOCK, 0) < 0) { + log_error("could not lock PID lock file %s, errno=%d (%s)\n", + daemon_cfg.lock_file, errno, strerror(errno)); + goto err; + } + + /* Write pid to lockfile */ + sprintf(str,"%d\n", getpid()); + if (write(daemon_cfg.lock_fd, str, strlen(str)) < 0) { + log_error("could not write to PID lock file %s, errno=%d (%s)\n", + daemon_cfg.lock_file, errno, strerror(errno)); + goto err; + } + } + + /* Main loop */ + rc = proc_loop(); + + /* Finish up */ + close(daemon_cfg.lock_fd); + unlink(daemon_cfg.lock_file); + + log_info("Terminated with code %d\n", rc); + closelog(); + + return (rc < 0 ? 
EXIT_FAILURE : EXIT_SUCCESS); +err: + return EXIT_FAILURE; +} + +static void handle_signal(int signo) +{ + log_debug("Getting signal (%d)\n", signo); + + switch (signo) { + case SIGALRM: + case SIGCHLD: + case SIGUSR1: + daemon_cfg.sig = SIGUSR1; + _exit(EXIT_SUCCESS); + break; + default: + daemon_cfg.sig = signo; + return; + } +} + +static void daemonize(void) +{ + struct sigaction sa; + pid_t pid, sid, parent; + + /* Fork off the parent process */ + pid = fork(); + if (pid < 0) { + log_error("unable to fork daemon, code=%d (%s)\n", errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + /* If we got a good PID, then we can exit the parent process. */ + if (pid > 0) { + + /* Setup signal handling before we start */ + sa.sa_handler = &handle_signal; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + if (sigaction(SIGUSR1, &sa, NULL) < 0) { + log_error("cannot register SIGUSR1 signal handler, errno=%d (%s)\n", + errno, strerror(errno)); + exit(EXIT_FAILURE); + } + if (sigaction(SIGCHLD, &sa, NULL) < 0) { + log_error("cannot register SIGCHLD signal handler, errno=%d (%s)\n", + errno, strerror(errno)); + exit(EXIT_FAILURE); + } + if (sigaction(SIGALRM, &sa, NULL) < 0) { + log_error("cannot register SIGALRM signal handler, errno=%d (%s)\n", + errno, strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Wait for confirmation from the child via SIGTERM or SIGCHLD, or + * for two seconds to elapse (SIGALRM). + * pause() should not return. 
+ */ + alarm(2); + pause(); + exit(EXIT_FAILURE); + } + + /* At this point we are executing as the child process */ + parent = getppid(); + + /* Cancel certain signals */ + signal(SIGTSTP, SIG_IGN); /* Various TTY signals */ + signal(SIGTTOU, SIG_IGN); + signal(SIGTTIN, SIG_IGN); + signal(SIGALRM, SIG_IGN); + signal(SIGUSR1, SIG_IGN); + signal(SIGHUP, SIG_IGN); + signal(SIGCHLD, SIG_DFL); /* A child process dies */ + signal(SIGTERM, SIG_DFL); /* Die on SIGTERM */ + + /* Setup signal handling before we start */ + sa.sa_handler = &handle_signal; + sigemptyset(&sa.sa_mask); + sa.sa_flags = 0; + if (sigaction(SIGINT, &sa, NULL) < 0) { + log_error("cannot register SIGINT signal handler, errno=%d (%s)\n", + errno, strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Create a new SID for the child process */ + sid = setsid(); + if (sid < 0) { + log_error("unable to create a new session, errno %d (%s)\n", errno, + strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Change the current working directory */ + if ((chdir("/")) < 0) { + log_error("unable to change directory to %s, errno %d (%s)\n", "/", + errno, strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Redirect standard files to /dev/null */ + if (NULL == freopen("/dev/null", "r", stdin)) { + log_error("unable redirect stdin, errno %d (%s)\n", + errno, strerror(errno)); + exit(EXIT_FAILURE); + } + + if (NULL == freopen("/dev/null", "w", stdout)) { + log_error("unable redirect stdout, errno %d (%s)\n", + errno, strerror(errno)); + exit(EXIT_FAILURE); + } + + if (NULL == freopen("/dev/null", "w", stderr)) { + log_error("unable redirect stderr, errno %d (%s)\n", + errno, strerror(errno)); + exit(EXIT_FAILURE); + } + + /* Send a signal to the parent to it can terminate. 
*/ + kill(parent, SIGUSR1); +} + +static int config_def(void) +{ + int rc = 0; + + memset(&daemon_cfg, 0, sizeof(daemon_cfg)); + + daemon_cfg.opt.mode = 0; + daemon_cfg.opt.log_level = 4; + daemon_cfg.opt.max_pid_num = PID_MAX; + daemon_cfg.opt.max_fid_num = FID_MAX; + daemon_cfg.opt.force_rst = 0; + daemon_cfg.opt.retry_interval = 1000; + + + daemon_cfg.lock_file = "/var/lock/" MODULE_NAME ".lock"; + daemon_cfg.lock_fd = -1; + daemon_cfg.sock_file = VMA_AGENT_ADDR; + daemon_cfg.sock_fd = -1; + daemon_cfg.sig = 0; + daemon_cfg.raw_fd = -1; + daemon_cfg.notify_fd = -1; + daemon_cfg.notify_dir = VMA_AGENT_PATH; + daemon_cfg.ht = NULL; + daemon_cfg.tc = NULL; + + return rc; +} + +static int config_set(int argc, char **argv) +{ + int rc = 0; + static struct option long_options[] = { + {"console", no_argument, &daemon_cfg.opt.mode, 1}, + {"notify-dir", required_argument, 0, 'n'}, + {"verbose", required_argument, 0, 'v'}, + {"pid", required_argument, 0, 'p'}, + {"fid", required_argument, 0, 'f'}, + {"force-rst", no_argument, &daemon_cfg.opt.force_rst, 1}, + {"retry-interval", required_argument, &daemon_cfg.opt.retry_interval, 'r'}, + {"help", no_argument, 0, 'h'}, + { 0, 0, 0, 0 } + }; + int op; + int option_index; + + while ((op = getopt_long(argc, argv, "v:n:p:f:r:h", long_options, &option_index)) != -1) { + switch (op) { + case 'v': + errno = 0; + daemon_cfg.opt.log_level = strtol(optarg, NULL, 0); + if (0 != errno) { + rc = -EINVAL; + } + break; + case 'n': + errno = 0; + daemon_cfg.notify_dir = optarg; + if (0 != errno) { + rc = -EINVAL; + } + break; + case 'p': + errno = 0; + daemon_cfg.opt.max_pid_num = strtol(optarg, NULL, 0); + if (0 != errno) { + rc = -EINVAL; + } + break; + case 'f': + errno = 0; + daemon_cfg.opt.max_fid_num = strtol(optarg, NULL, 0); + if (0 != errno) { + rc = -EINVAL; + } + break; + case 'r': + errno = 0; + daemon_cfg.opt.retry_interval = strtol(optarg, NULL, 0); + if (0 != errno) { + rc = -EINVAL; + } + break; + case 'h': + usage(); + 
break; + case 0: + /* getopt_long() set a variable, just keep going */ + break; + case ':': + case '?': + default: + rc = -EINVAL; + break; + } + } + + log_debug("CONFIGURATION:\n"); + log_debug("package version: %s\n", PACKAGE_VERSION); + log_debug("mode: %d\n", daemon_cfg.opt.mode); + log_debug("log level: %d\n", daemon_cfg.opt.log_level); + log_debug("max pid: %d\n", daemon_cfg.opt.max_pid_num); + log_debug("max fid: %d\n", daemon_cfg.opt.max_fid_num); + log_debug("force rst: %d\n", daemon_cfg.opt.force_rst); + log_debug("retry interval: %d ms \n", daemon_cfg.opt.retry_interval); + log_debug("lock file: %s\n", daemon_cfg.lock_file); + log_debug("sock file: %s\n", daemon_cfg.sock_file); + log_debug("notify dir: %s\n", daemon_cfg.notify_dir); + log_debug("format version: 0x%X\n", VMA_AGENT_VER); + + if (0 != rc) { + usage(); + } + + return rc; +} + +static void usage(void) +{ + printf(MODULE_NAME " is a part of Mellanox's Messaging Accelerator (VMA) product\n" + "that boosts performance for message-based and streaming applications.\n"); + printf("version: %s (0x%X)\n\n", PACKAGE_VERSION, VMA_AGENT_VER); + + printf("Usage: " MODULE_NAME " [options]\n" + "\t--console Enable foreground mode (default: %s)\n" + "\t--notify-dir Sets the outout dir used by vmad (default: %s)\n" + "\t--pid,-p Set prime number as maximum of processes per node. (default: %d).\n" + "\t--fid,-f Set prime number as maximum of sockets per process. (default: %d).\n" + "\t--force-rst Force internal RST. (default: %s).\n" + "\t--verbose,-v Output verbose level (default: %d).\n" + "\t--retry-interval,-r Set SYN pkt retry interval in [ms] (default: %d).\n" + "\t--help,-h Print help and exit\n", + (daemon_cfg.opt.mode ? "on" : "off"), + VMA_AGENT_PATH, + daemon_cfg.opt.max_pid_num, + daemon_cfg.opt.max_fid_num, + (daemon_cfg.opt.force_rst ? "on" : "off"), + daemon_cfg.opt.log_level, + daemon_cfg.opt.retry_interval); + + exit(EXIT_SUCCESS); +} + + +void sys_log(int level, const char *format, ...) 
+{ + va_list args; + va_start(args, format); + + if (0 == daemon_cfg.opt.mode) { + vsyslog(level, format, args); + } else { + vprintf(format, args); + } + va_end(args); +} + +ssize_t sys_sendto(int sockfd, + const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen) +{ + char *data = (char *)buf; + int n, nb; + + nb = 0; + do { + n = sendto(sockfd, data, len, flags, dest_addr, addrlen); + if (n <= 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + if (flags & MSG_DONTWAIT) { + break; + } + continue; + } + return -errno; + } + len -= n; + data += n; + nb += n; + } while (!(flags & MSG_DONTWAIT) && (len > 0)); + + return nb; +} + +char *sys_exec(const char * format, ...) +{ + static __thread char outbuf[256]; + FILE *file = NULL; + va_list va; + char *cmd; + int ret; + + /* calculate needed size for command buffer */ + va_start(va, format); + ret = vsnprintf(NULL, 0, format, va); + va_end(va); + if (ret <= 0) { + goto err; + } + + /* allocate command buffer */ + ret += 1; + cmd = malloc(ret); + if (NULL == cmd) { + goto err; + } + + /* fill command buffer */ + va_start(va, format); + ret = vsnprintf(cmd, ret, format, va); + va_end(va); + if (ret <= 0) { + free(cmd); + goto err; + } + + /* execute command */ + file = popen(cmd, "r"); + log_trace("Run command: %s\n", cmd); + free(cmd); + if (NULL == file) { + goto err; + } + + /* save output */ + memset(outbuf, 0, sizeof(outbuf)); + if ((NULL == fgets(outbuf, sizeof(outbuf) - 1, file)) && (ferror(file))) { + pclose(file); + goto err; + } + pclose(file); + + return outbuf; +err: + return NULL; +} diff --git a/tools/daemon/daemon.h b/tools/daemon/daemon.h new file mode 100644 index 0000000..bb38829 --- /dev/null +++ b/tools/daemon/daemon.h @@ -0,0 +1,349 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef TOOLS_DAEMON_DAEMON_H_ +#define TOOLS_DAEMON_DAEMON_H_ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_LINUX_LIMITS_H +#include +#endif + +#include "vma/util/agent_def.h" +#include "vma/util/list.h" +#include "utils/clock.h" + + +#define MODULE_NAME "vmad" + +#define EXIT_SUCCESS 0 +#define EXIT_FAILURE 1 + +#ifndef NOT_IN_USE +#define NOT_IN_USE(P) ((void)(P)) +#endif + +#define INVALID_VALUE (-1) +#define STATE_ESTABLISHED 4 + +#define PID_MAX 499 /**< Default maximum number of processes + per node (should be prime number) */ +#define FID_MAX 65599 /**< Default maximum number of sockets + per process (should be prime number) */ + +#ifndef HAVE_LINUX_LIMITS_H +#define NAME_MAX 255 /**< chars in a file name */ +#define PATH_MAX 4096 /**< chars in a path name including null */ +#endif + +#define log_fatal(fmt, ...) \ + do { \ + if (daemon_cfg.opt.log_level > 0) \ + sys_log(LOG_ALERT, "[FATAL ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#define log_error(fmt, ...) \ + do { \ + if (daemon_cfg.opt.log_level > 1) \ + sys_log(LOG_ERR, "[ERROR ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#define log_warn(fmt, ...) \ + do { \ + if (daemon_cfg.opt.log_level > 2) \ + sys_log(LOG_WARNING, "[WARN ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#define log_info(fmt, ...) \ + do { \ + if (daemon_cfg.opt.log_level > 3) \ + sys_log(LOG_NOTICE, "[INFO ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#define log_debug(fmt, ...) \ + do { \ + if (daemon_cfg.opt.log_level > 4) \ + sys_log(LOG_INFO, "[DEBUG ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#define log_trace(fmt, ...) 
\ + do { \ + if (daemon_cfg.opt.log_level > 5) \ + sys_log(LOG_INFO, "[TRACE ] " fmt, ##__VA_ARGS__); \ + } while (0) + +#define log_hexdump(_ptr, _size) \ + do { \ + if (daemon_cfg.opt.log_level > 5) \ + sys_hexdump((_ptr), (_size)); \ + } while (0) + + +/** + * @struct module_cfg + * @brief Configuration parameters in global values + */ +struct module_cfg { + struct { + int mode; /**< 0 - daemon, 1 - console */ + int log_level; /**< 0..5 verbose level */ + int max_pid_num; /**< maximum number of processes per node */ + int max_fid_num; /**< maximum number of sockets per process */ + int force_rst; /**< RST method + * 0 - only system RST is sent as + * reaction on spoofed SYN + * 1 - form and send internal RST + * based on SeqNo */ + int retry_interval; /**< daemon time interval between spoofed SYN packets */ + } opt; + volatile sig_atomic_t sig; + const char *lock_file; + int lock_fd; + const char *sock_file; + int sock_fd; + int raw_fd; + int notify_fd; + const char *notify_dir; + hash_t ht; + tc_t tc; + struct list_head if_list; +}; + +extern struct module_cfg daemon_cfg; + +/** + * @struct store_pid + * @brief Describe process using pid as unique key + */ +struct store_pid { + pid_t pid; /**< Process id */ + hash_t ht; /**< Handle to socket store */ + struct list_head flow_list; /**< List of flows */ + uint32_t lib_ver; /**< Library version that the process uses */ + struct timeval t_start; /**< Start time of the process */ +}; + +/** + * @struct store_fid + * @brief Describe socket using fid as unique key + */ +struct store_fid { + int fid; /**< Socket id */ + uint32_t src_ip; /**< Source IP address */ + uint32_t dst_ip; /**< Destination IP address */ + uint16_t src_port; /**< Source port number */ + uint16_t dst_port; /**< Destination port number */ + uint8_t type; /**< Connection type */ + uint8_t state; /**< Current TCP state of the connection */ +}; + +/** + * @struct store_flow + * @brief Describe flow + */ +struct store_flow { + struct list_head item; 
/**< Link to use in queue */ + uint32_t handle; /**< Handle value in term of tc */ + int type; /**< Flow type */ + uint32_t if_id; /**< Interface index */ + uint32_t tap_id; /**< Tap device index */ + struct { + uint32_t dst_ip; + uint16_t dst_port; + struct { + uint32_t src_ip; + uint16_t src_port; + } t5; + } flow; +}; + + +void sys_log(int level, const char *format, ...); + +ssize_t sys_sendto(int sockfd, + const void *buf, size_t len, int flags, + const struct sockaddr *dest_addr, socklen_t addrlen); + +char *sys_exec(const char * format, ...); + +static inline char *sys_addr2str(struct sockaddr_in *addr) +{ + static char buf[100]; + static __thread char addrbuf[sizeof(buf) + sizeof(addr->sin_port) + 5]; + inet_ntop(AF_INET, &addr->sin_addr, buf, sizeof(buf) - 1); + sprintf(addrbuf, "%s:%d", buf, ntohs(addr->sin_port)); + + return addrbuf; +} + +static inline char *sys_ip2str(uint32_t ip) +{ + static __thread char ipbuf[100]; + struct in_addr value = {0}; + value.s_addr = ip; + inet_ntop(AF_INET, &value, ipbuf, sizeof(ipbuf) - 1); + + return ipbuf; +} + +static inline uint32_t sys_lo_ifindex(void) +{ + static __thread uint32_t lo_ifindex = 0; + struct ifaddrs *ifaddr, *ifa; + + if (lo_ifindex > 0) { + return lo_ifindex; + } + + if (!getifaddrs(&ifaddr)) { + for (ifa = ifaddr; NULL != ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_addr->sa_family == AF_INET && + (ifa->ifa_flags & IFF_LOOPBACK)) { + lo_ifindex = if_nametoindex(ifa->ifa_name); + break; + } + } + freeifaddrs(ifaddr); + } + + return lo_ifindex; +} + +static inline char *sys_lo_ifname(void) +{ + static __thread char lo_ifname[IF_NAMESIZE] = {0}; + + if (lo_ifname[0] > 0) { + return lo_ifname; + } + + if (NULL == if_indextoname(sys_lo_ifindex(), lo_ifname)) { + lo_ifname[0] = 0; + } + + return lo_ifname; +} + +static inline int sys_iplocal(uint32_t addr) +{ + int rc = 0; + struct ifaddrs *ifaddr, *ifa; + struct sockaddr_in *sa; + + if (!getifaddrs(&ifaddr)) { + for (ifa = ifaddr; NULL != ifa; ifa = 
ifa->ifa_next) { + if (ifa->ifa_addr->sa_family == AF_INET) { + sa = (struct sockaddr_in *) ifa->ifa_addr; + if (addr == sa->sin_addr.s_addr) { + rc = 1; + break; + } + } + } + freeifaddrs(ifaddr); + } + + return rc; +} + +static inline void sys_hexdump(void *ptr, int buflen) +{ + unsigned char *buf = (unsigned char *)ptr; + char out_buf[256]; + int ret = 0; + int out_pos = 0; + int i, j; + + log_trace("dump data at %p\n", ptr); + for (i = 0; i < buflen; i += 16) { + out_pos = 0; + ret = sprintf(out_buf + out_pos, "%06x: ", i); + if (ret < 0) { + return; + } + out_pos += ret; + for (j = 0; j < 16; j++) { + if (i + j < buflen) { + ret = sprintf(out_buf + out_pos, "%02x ", buf[i + j]); + } else { + ret = sprintf(out_buf + out_pos, " "); + } + if (ret < 0) { + return; + } + out_pos += ret; + } + ret = sprintf(out_buf + out_pos, " "); + if (ret < 0) { + return; + } + out_pos += ret; + for (j = 0; j < 16; j++) + if (i + j < buflen) { + ret = sprintf(out_buf + out_pos, "%c", + isprint(buf[i+j]) ? + buf[i + j] : + '.'); + if (ret < 0) { + return; + } + out_pos += ret; + } + ret = sprintf(out_buf + out_pos, "\n"); + if (ret < 0) { + return; + } + log_trace("%s", out_buf); + } +} + +#endif /* TOOLS_DAEMON_DAEMON_H_ */ diff --git a/tools/daemon/flow.c b/tools/daemon/flow.c new file mode 100644 index 0000000..aba8ad9 --- /dev/null +++ b/tools/daemon/flow.c @@ -0,0 +1,739 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include + + +#include "hash.h" +#include "bitmap.h" +#include "tc.h" +#include "daemon.h" + + +/** + * @struct htid_node_t + * @brief It is an object to be used for removal workaround. 
+ */ +struct htid_node_t { + struct list_head node; + int htid; + int prio; +}; + +/** + * @struct flow_ctx + * @brief It is an object described extra details for flow element + */ +struct flow_ctx { + bitmap_t *ht; /**< bitmap of used hash tables */ + struct list_head pending_list; + struct { + int prio; + int id; + } ht_prio[4]; /**< internal hash tables related priority (size should be set as number of possible priorities) */ +}; + +/** + * @struct flow_element + * @brief It is an object described tc element + */ +struct flow_element { + struct list_head item; /**< link sequence of elements in list */ + struct list_head list; /**< head of children list */ + int ref; /**< reference counter */ + uint32_t value[2]; /**< data */ + union { + struct flow_ctx *ctx; /**< data related if */ + uint32_t ht_id; /**< data related ip (16 bytes for internal ht id 16 bytes ht id) */ + }; +}; + +int open_flow(void); +void close_flow(void); +int add_flow(struct store_pid *pid_value, struct store_flow *value); +int del_flow(struct store_pid *pid_value, struct store_flow *value); + +static int add_flow_egress(struct store_pid *pid_value, struct store_flow *value); +static inline void get_htid(struct flow_ctx *ctx, int prio, int *ht_krn, int *ht_id); +static inline void free_htid(struct flow_ctx *ctx, int ht_id); +static inline void add_pending_list(pid_t pid, struct flow_ctx *ctx, int if_index, int ht_id, int prio, int *rc); +static inline void free_pending_list(pid_t pid, struct flow_ctx *ctx, int if_index); +static inline int get_prio(struct store_flow *value); +static inline int get_bkt(struct store_flow *value); +static inline int get_protocol(struct store_flow *value); +static inline int get_node(struct list_head **list); + + +int open_flow(void) +{ + int rc = 0; + + INIT_LIST_HEAD(&daemon_cfg.if_list); + daemon_cfg.tc = tc_create(); + if (NULL == daemon_cfg.tc) { + rc = -EFAULT; + log_error("Failed to create TC object %d (%s)\n", errno, + strerror(errno)); + goto err; + } + 
+err: + return rc; +} + +void close_flow(void) +{ + tc_destroy(daemon_cfg.tc); + daemon_cfg.tc = NULL; +} + +int add_flow(struct store_pid *pid_value, struct store_flow *value) +{ + int rc = 0; + pid_t pid = pid_value->pid; + struct list_head *cur_head = NULL; + struct flow_element *cur_element = NULL; + struct list_head *cur_entry = NULL; + uint32_t ip = value->flow.dst_ip; + int ht = HANDLE_HT(value->handle); + int bkt = HANDLE_BKT(value->handle); + int id = HANDLE_ID(value->handle); + int ht_internal = KERNEL_HT; + struct flow_ctx *ctx = NULL; + + /* Egress rules should be created for new tap device + */ + if (VMA_MSG_FLOW_EGRESS == value->type) { + return add_flow_egress(pid_value, value); + } + + errno = 0; + + /* interface list processing + * use interface index as unique identifier + * every network interface has qdisc + * so as first step let find if interface referenced in this flow exists + * in the if_list or allocate new element related one + */ + cur_head = &daemon_cfg.if_list; + list_for_each(cur_entry, cur_head) { + cur_element = list_entry(cur_entry, struct flow_element, item); + if (cur_element->value[0] == value->if_id) { + break; + } + } + if (cur_entry == cur_head) { + cur_element = (void *)calloc(1, sizeof(*cur_element)); + if (NULL == cur_element) { + rc = -ENOMEM; + goto err; + } + + /* Cleanup from possible failure during last daemon session */ + tc_del_qdisc(daemon_cfg.tc, value->if_id); + + /* Create filter to redirect traffic from netvsc device to tap device */ + if (tc_add_qdisc(daemon_cfg.tc, value->if_id) < 0) { + log_error("[%d] failed tc_add_qdisc() errno = %d\n", pid, errno); + free(cur_element); + rc = -EFAULT; + goto err; + } + + INIT_LIST_HEAD(&cur_element->list); + cur_element->ref = 0; + cur_element->value[0] = value->if_id; + cur_element->ctx = (void *)calloc(1, sizeof(*cur_element->ctx)); + if (NULL == cur_element->ctx) { + free(cur_element); + rc = -ENOMEM; + goto err; + } + /* tables from 0x800 are reserved by kernel */ + 
bitmap_create(&cur_element->ctx->ht, (KERNEL_HT - 1)); + if (NULL == cur_element->ctx->ht) { + free(cur_element->ctx); + free(cur_element); + rc = -ENOMEM; + goto err; + } + + /* table id = 0 is not used */ + bitmap_set(cur_element->ctx->ht, 0); + INIT_LIST_HEAD(&(cur_element->ctx->pending_list)); + list_add_tail(&cur_element->item, cur_head); + } + assert(cur_element); + cur_element->ref++; + ctx = cur_element->ctx; + + log_debug("[%d] add flow (if): 0x%p value: %d ref: %d\n", + pid, cur_element, cur_element->value[0], cur_element->ref); + + /* table list processing + * table id calculation is based on ip and type + * so as first step let find if hash table referenced in this flow exists + * in the list of tables related specific interface or allocate new element related one + */ + cur_head = &cur_element->list; + list_for_each(cur_entry, cur_head) { + cur_element = list_entry(cur_entry, struct flow_element, item); + if (cur_element->value[0] == (uint32_t)value->type && + cur_element->value[1] == ip) { + ht = cur_element->ht_id & 0x0000FFFF; + ht_internal = (cur_element->ht_id >> 16) & 0x0000FFFF; + break; + } + } + if (cur_entry == cur_head) { + cur_element = (void *)calloc(1, sizeof(*cur_element)); + if (NULL == cur_element) { + rc = -ENOMEM; + goto err; + } + + get_htid(ctx, get_prio(value), &ht_internal, &ht); + + if (tc_add_filter_divisor(daemon_cfg.tc, value->if_id, get_prio(value), ht) < 0) { + log_error("[%d] failed tc_add_filter_divisor() errno = %d\n", pid, errno); + free(cur_element); + rc = -EFAULT; + goto err; + } + if (tc_add_filter_link(daemon_cfg.tc, value->if_id, get_prio(value), ht_internal, ht, ip) < 0) { + log_error("[%d] failed tc_add_filter_link() errno = %d\n", pid, errno); + free(cur_element); + rc = -EFAULT; + goto err; + } + + INIT_LIST_HEAD(&cur_element->list); + cur_element->ref = 0; + cur_element->value[0] = value->type; + cur_element->value[1] = ip; + cur_element->ht_id = ((ht_internal << 16) & 0xFFFF0000) | (ht & 0x0000FFFF); + 
list_add_tail(&cur_element->item, cur_head); + } + assert(cur_element); + cur_element->ref++; + + log_debug("[%d] add flow (ht): 0x%p value: %d:%d ref: %d\n", + pid, cur_element, cur_element->value[0], cur_element->value[1], cur_element->ref); + + /* bucket list processing + * bucket number calculation can be different for flow types + * so as first step let find if bucket referenced in this flow exists + * in the list of buckets related specific hash table or allocate new element related one + */ + cur_head = &cur_element->list; + bkt = get_bkt(value); + if (bkt < 0) { + log_warn("[%d] invalid flow bkt: %d\n", + pid, bkt); + goto err; + } + list_for_each(cur_entry, cur_head) { + cur_element = list_entry(cur_entry, struct flow_element, item); + if ((int)cur_element->value[0] == bkt) { + break; + } + } + if (cur_entry == cur_head) { + cur_element = (void *)calloc(1, sizeof(*cur_element)); + if (NULL == cur_element) { + rc = -ENOMEM; + goto err; + } + + INIT_LIST_HEAD(&cur_element->list); + cur_element->ref = 0; + cur_element->value[0] = bkt; + list_add_tail(&cur_element->item, cur_head); + } + assert(cur_element); + cur_element->ref++; + + log_debug("[%d] add flow (bkt): 0x%p value: %d ref: %d\n", + pid, cur_element, cur_element->value[0], cur_element->ref); + + /* node list processing + * node number calculation can be different for flow types + * allocate new element related one + * cur_entry pointed by cur_head can depends on internal logic and + * direct a place in the list where new entry should be inserted + */ + cur_head = &cur_element->list; + id = get_node(&cur_head); + if (id <= 0) { + log_warn("[%d] invalid flow id: %d\n", + pid, id); + goto err; + } else { + cur_element = (void *)calloc(1, sizeof(*cur_element)); + if (NULL == cur_element) { + rc = -ENOMEM; + goto err; + } + + switch (value->type) { + case VMA_MSG_FLOW_TCP_3T: + case VMA_MSG_FLOW_UDP_3T: + rc = tc_add_filter_dev2tap(daemon_cfg.tc, value->if_id, + get_prio(value), ht, bkt, id, + 
get_protocol(value), value->flow.dst_ip, value->flow.dst_port, + 0, 0, value->tap_id); + break; + case VMA_MSG_FLOW_TCP_5T: + case VMA_MSG_FLOW_UDP_5T: + rc = tc_add_filter_dev2tap(daemon_cfg.tc, value->if_id, + get_prio(value), ht, bkt, id, + get_protocol(value), value->flow.dst_ip, value->flow.dst_port, + value->flow.t5.src_ip, value->flow.t5.src_port, value->tap_id); + break; + default: + break; + } + if (rc < 0) { + log_error("[%d] failed tc_add_filter_dev2tap() errno = %d\n", pid, errno); + free(cur_element); + rc = -EFAULT; + goto err; + } + + INIT_LIST_HEAD(&cur_element->list); + cur_element->ref = 0; + cur_element->value[0] = id; + list_add_tail(&cur_element->item, cur_head); + } + assert(cur_element); + cur_element->ref++; + + log_debug("[%d] add flow (node): 0x%p value: %d ref: %d\n", + pid, cur_element, cur_element->value[0], cur_element->ref); + + free_pending_list(pid, ctx, value->if_id); + +err: + + value->handle = HANDLE_SET(ht, bkt, id); + log_debug("[%d] add flow filter: %x:%x:%x rc=%d\n", + pid, ht, bkt, id, rc); + + return rc; +} + +int del_flow(struct store_pid *pid_value, struct store_flow *value) +{ + int rc = 0; + pid_t pid = pid_value->pid; + struct list_head *cur_head = NULL; + struct flow_element *cur_element = NULL; + struct list_head *cur_entry = NULL; + struct flow_element *save_element[3]; + struct list_head *save_entry[3]; + uint32_t ip = value->flow.dst_ip; + int ht = HANDLE_HT(value->handle); + int bkt = HANDLE_BKT(value->handle); + int id = HANDLE_ID(value->handle); + int ht_internal = KERNEL_HT; + struct flow_ctx *ctx = NULL; + int found = 0; + + errno = 0; + + /* interface list processing */ + found = 0; + cur_head = &daemon_cfg.if_list; + list_for_each(cur_entry, cur_head) { + cur_element = list_entry(cur_entry, struct flow_element, item); + if (cur_element->value[0] == value->if_id) { + found = 1; + break; + } + } + if (found) { + assert(cur_entry != cur_head); + assert(cur_element); + ctx = cur_element->ctx; + save_element[0] 
= cur_element; + save_entry[0] = cur_entry; + + /* table list processing */ + found = 0; + cur_head = &cur_element->list; + list_for_each(cur_entry, cur_head) { + cur_element = list_entry(cur_entry, struct flow_element, item); + if (cur_element->value[0] == (uint32_t)value->type && + cur_element->value[1] == ip) { + ht = cur_element->ht_id & 0x0000FFFF; + ht_internal = (cur_element->ht_id >> 16) & 0x0000FFFF; + found = 1; + break; + } + } + if (found) { + assert(cur_entry != cur_head); + assert(cur_element); + save_element[1] = cur_element; + save_entry[1] = cur_entry; + + /* bucket list processing */ + found = 0; + cur_head = &cur_element->list; + list_for_each(cur_entry, cur_head) { + cur_element = list_entry(cur_entry, struct flow_element, item); + if ((int)cur_element->value[0] == bkt) { + found = 1; + break; + } + } + if (found) { + assert(cur_entry != cur_head); + assert(cur_element); + save_element[2] = cur_element; + save_entry[2] = cur_entry; + + /* node list processing */ + found = 0; + cur_head = &cur_element->list; + list_for_each(cur_entry, cur_head) { + cur_element = list_entry(cur_entry, struct flow_element, item); + if ((int)cur_element->value[0] == id) { + found = 1; + break; + } + } + if (found) { + assert(cur_entry != cur_head); + assert(cur_element); + + cur_element->ref--; + + log_debug("[%d] del flow (node): 0x%p value: %d ref: %d\n", + pid, cur_element, cur_element->value[0], cur_element->value[1], cur_element->ref); + if (list_empty(&cur_element->list) && (cur_element->ref <=0 )) { + + if (tc_del_filter(daemon_cfg.tc, value->if_id, get_prio(value), ht, bkt, id) < 0) { + log_warn("[%d] failed tc_del_filter() errno = %d\n", pid, errno); + rc = -EFAULT; + } + + list_del_init(cur_entry); + free(cur_element); + } + } + + cur_element = save_element[2]; + cur_entry = save_entry[2]; + cur_element->ref--; + + log_debug("[%d] del flow (bkt): 0x%p value: %d ref: %d\n", + pid, cur_element, cur_element->value[0], cur_element->ref); + if 
(list_empty(&cur_element->list) && (cur_element->ref <=0 )) { + list_del_init(cur_entry); + free(cur_element); + } + } + + cur_element = save_element[1]; + cur_entry = save_entry[1]; + cur_element->ref--; + + log_debug("[%d] del flow (ht): 0x%p value: %d:%d ref: %d\n", + pid, cur_element, cur_element->value[0], cur_element->value[1], cur_element->ref); + if (list_empty(&cur_element->list) && (cur_element->ref <=0 )) { + + if (tc_del_filter(daemon_cfg.tc, value->if_id, get_prio(value), ht_internal, 0, ht) < 0) { + log_warn("[%d] failed tc_del_filter() errno = %d\n", pid, errno); + rc = -EFAULT; + } + + /* Device busy error is returned while trying to remove table in this location */ + add_pending_list(pid, ctx, value->if_id, ht, get_prio(value), &rc); + + list_del_init(cur_entry); + free(cur_element); + } + } + + cur_element = save_element[0]; + cur_entry = save_entry[0]; + cur_element->ref--; + + log_debug("[%d] del flow (if): 0x%p value: %d ref: %d\n", + pid, cur_element, cur_element->value[0], cur_element->ref); + if (list_empty(&cur_element->list) && (cur_element->ref <=0 )) { + + if (tc_del_qdisc(daemon_cfg.tc, value->if_id) < 0) { + log_warn("[%d] failed tc_del_qdisc() errno = %d\n", pid, errno); + rc = -EFAULT; + } + + bitmap_destroy(cur_element->ctx->ht); + free(cur_element->ctx); + list_del_init(cur_entry); + free(cur_element); + } + } + + free_pending_list(pid, ctx, value->if_id); + + log_debug("[%d] del flow filter: %x:%x:%x rc=%d\n", + pid, ht, bkt, id, rc); + + return rc; +} + +static int add_flow_egress(struct store_pid *pid_value, struct store_flow *value) +{ + int rc = 0; + pid_t pid = pid_value->pid; + struct list_head *cur_entry = NULL; + struct store_flow *cur_flow = NULL; + + errno = 0; + + /* Egress rules should be created for new tap device + */ + list_for_each(cur_entry, &pid_value->flow_list) { + cur_flow = list_entry(cur_entry, struct store_flow, item); + if (value->tap_id == cur_flow->tap_id) { + break; + } + } + if (cur_entry == 
&pid_value->flow_list) { + struct ifaddrs *ifaddr, *ifa; + int handle = 1; + + /* This cleanup is done just to support verification */ + tc_del_qdisc(daemon_cfg.tc, value->tap_id); + + /* Create rules to process ingress trafic on tap device */ + if (tc_add_qdisc(daemon_cfg.tc, value->tap_id) < 0) { + log_error("[%d] failed tc_add_qdisc() errno = %d\n", pid, errno); + rc = -EFAULT; + goto err; + } + + if (!getifaddrs(&ifaddr)) { + for (ifa = ifaddr; NULL != ifa; ifa = ifa->ifa_next) { + if (ifa->ifa_addr->sa_family == AF_INET && + !(ifa->ifa_flags & IFF_LOOPBACK) && + value->if_id == if_nametoindex(ifa->ifa_name)) { + + /* Create filter to redirect traffic from tap device to lo device + * in case destination address relates netvsc + */ + if (tc_add_filter_tap2dev(daemon_cfg.tc, value->tap_id, 1, handle, + ((struct sockaddr_in *)ifa->ifa_addr)->sin_addr.s_addr, sys_lo_ifindex()) < 0) { + log_error("[%d] failed tc_add_filter_tap2dev() errno = %d\n", pid, errno); + rc = -EFAULT; + goto err; + } + handle++; + } + } + freeifaddrs(ifaddr); + } + + /* Create filter to redirect traffic from tap device to netvsc device + * Use another prio value for common filter just to separate one + * actually the same value should work too + */ + if (tc_add_filter_tap2dev(daemon_cfg.tc, value->tap_id, 2, handle, + 0, value->if_id) < 0) { + log_error("[%d] failed tc_add_filter_tap2dev() errno = %d\n", pid, errno); + rc = -EFAULT; + goto err; + } + } + +err: + + return rc; +} + +static inline void get_htid(struct flow_ctx *ctx, int prio, int *ht_krn, int *ht_id) +{ + if (ht_krn) { + int i; + int free_index = -1; + int free_id = -1; + + *ht_krn = 0; + for (i = 0; i < (int)(sizeof(ctx->ht_prio) / sizeof(ctx->ht_prio[0])); i++) { + if (ctx->ht_prio[i].prio == prio) { + *ht_krn = (KERNEL_HT + ctx->ht_prio[i].id); + break; + } + if (ctx->ht_prio[i].prio == 0) { + free_index = i; + } else { + free_id = (free_id < ctx->ht_prio[i].id ? 
ctx->ht_prio[i].id : free_id); + } + } + + if ((0 == *ht_krn) && (0 <= free_index)) { + ctx->ht_prio[free_index].prio = prio; + ctx->ht_prio[free_index].id = free_id + 1; + + *ht_krn = (KERNEL_HT + ctx->ht_prio[free_index].id); + } + } + + if (ht_id) { + *ht_id = bitmap_find_first_zero(ctx->ht); + if (*ht_id >= 0) { + bitmap_set(ctx->ht, *ht_id); + } + } +} + +static inline void free_pending_list(pid_t pid, struct flow_ctx *ctx, int if_index) +{ + struct htid_node_t *cur_element = NULL; + struct list_head *cur_entry = NULL, *tmp_entry = NULL; + + if (ctx) { + list_for_each_safe(cur_entry, tmp_entry, &ctx->pending_list) { + cur_element = list_entry(cur_entry, struct htid_node_t, node); + + if (tc_del_filter(daemon_cfg.tc, if_index, cur_element->prio, cur_element->htid, 0, 0) < 0) { + continue; + } + + log_debug("[%d] del flow request was removed successfully: if %d htid %d prio %d\n", + pid, if_index, cur_element->htid, cur_element->prio); + + list_del_init(&cur_element->node); + free_htid(ctx, cur_element->htid); + free(cur_element); + } + } +} + +static inline void add_pending_list(pid_t pid, struct flow_ctx *ctx, int if_index, int ht_id, int prio, int *rc) +{ + struct htid_node_t *htid_node = (void *)calloc(1, sizeof(struct htid_node_t)); + if (NULL == htid_node) { + *rc = -ENOMEM; + return; + } + + INIT_LIST_HEAD(&htid_node->node); + htid_node->htid = ht_id; + htid_node->prio = prio; + + list_add(&htid_node->node, &ctx->pending_list); + + log_debug("[%d] del flow request was added to the pending list: if %d htid %d prio %d\n", + pid, if_index, ht_id, prio); +} + +static inline void free_htid(struct flow_ctx *ctx, int ht_id) +{ + bitmap_clear(ctx->ht, ht_id); +} + +static inline int get_prio(struct store_flow *value) +{ + return value->type; +} + +static inline int get_bkt(struct store_flow *value) +{ + return ntohs(value->flow.dst_port) & 0xFF; +} + +static inline int get_protocol(struct store_flow *value) +{ + switch (value->type) { + case VMA_MSG_FLOW_UDP_3T: 
+ case VMA_MSG_FLOW_UDP_5T: + return IPPROTO_UDP; + + case VMA_MSG_FLOW_TCP_3T: + case VMA_MSG_FLOW_TCP_5T: + return IPPROTO_TCP; + + default: + return -EINVAL; + } +} + +static inline int get_node(struct list_head **cur_head) +{ + int id = 1; + struct flow_element *cur_element = NULL; + struct list_head *cur_entry = NULL; + + /* node id logic is smart (keep list entry in ascending order) + * there are two ways as + * 1 - simply take last entry in the list and increment id value until + * maximum value is not achieved + * 2 - if last entry has maximum possible value try look for first free + * entry from start in the list + */ + if (!list_empty((*cur_head))) { + cur_entry = (*cur_head)->prev; + cur_element = list_entry(cur_entry, struct flow_element, item); + if (cur_element->value[0] < MAX_ID) { + id = cur_element->value[0] + 1; + } else { + id = 1; + list_for_each(cur_entry, (*cur_head)) { + cur_element = list_entry(cur_entry, struct flow_element, item); + if ((int)cur_element->value[0] > id) { + *cur_head = cur_entry; + break; + } + id++; + } + } + } + + if ((0 >= id) || (id > MAX_ID)) { + return -EINVAL; + } + + return id; +} diff --git a/tools/daemon/hash.c b/tools/daemon/hash.c new file mode 100644 index 0000000..f11b8eb --- /dev/null +++ b/tools/daemon/hash.c @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include + +#include "hash.h" + + +#define HASH_KEY_INVALID (hash_key_t)(-1) + +/** + * @struct hash_element + * @brief It is an object to be stored in a hash container + */ +struct hash_element { + hash_key_t key; /**< key */ + void *value; /**< value */ +}; + +/** + * @struct hash_object + * @brief hash container + */ +struct hash_object { + struct hash_element *hash_table; /**< hash table */ + struct hash_element *last; /**< last accessed */ + int size; /**< maximum number of elements */ + int count; /**< current count of elements */ + hash_freefunc_t free; /**< free function */ +}; + +static struct hash_element* hash_find(hash_t ht, hash_key_t key, int flag); +static int check_prime(int value); + + +hash_t hash_create(hash_freefunc_t free_func, size_t size) +{ + hash_t ht = NULL; + + /* Check passed size */ + if (!check_prime(size)) { + return NULL; + } + + ht = (struct hash_object *)malloc(sizeof(*ht)); + if (ht) { + ht->size = size; + ht->hash_table = (struct hash_element *)calloc(ht->size, + sizeof(*ht->hash_table)); + if (ht->hash_table) { + int i = 0; + + ht->last = NULL; + ht->count = 0; + ht->free = free_func; + for (i = 0; i < ht->size; i++) { + ht->hash_table[i].key = HASH_KEY_INVALID; + } + } else { + free(ht); + 
ht = NULL; + } + } + + return ht; +} + +void hash_destroy(hash_t ht) +{ + if (ht) { + if (ht->hash_table) { + int i = 0; + + for (i = 0; i < ht->size; i++) { + if (ht->hash_table[i].key != HASH_KEY_INVALID) { + if (ht->free && ht->hash_table[i].value) { + ht->free(ht->hash_table[i].value); + } + ht->hash_table[i].key = HASH_KEY_INVALID; + ht->hash_table[i].value = NULL; + } + } + free(ht->hash_table); + ht->hash_table = NULL; + } + free(ht); + ht = NULL; + } +} + +int hash_count(hash_t ht) +{ + return ht->count; +} + +int hash_size(hash_t ht) +{ + return ht->size; +} + +void *hash_get(hash_t ht, hash_key_t key) +{ + if (ht) { + struct hash_element *entry = NULL; + + entry = hash_find(ht, key, 0); + if (entry) { + return entry->value; + } + } + + return NULL; +} + +void *hash_enum(hash_t ht, size_t index) +{ + if (ht) { + struct hash_element *entry = NULL; + + entry = &(ht->hash_table[index]); + if (entry) { + return entry->value; + } + } + + return NULL; +} + +void *hash_put(hash_t ht, hash_key_t key, void *value) +{ + if (ht && (ht->count < ht->size)) { + struct hash_element *entry = NULL; + + entry = hash_find(ht, key, 0); + if (NULL == entry) { + entry = hash_find(ht, key, 1); + } + if (entry) { + if (ht->free && entry->value) { + ht->free(entry->value); + } + if (entry->key == HASH_KEY_INVALID) { + ht->count++; + } + entry->key = key; + entry->value = value; + return value; + } + } + + return NULL; +} + +void hash_del(hash_t ht, hash_key_t key) +{ + if (ht) { + struct hash_element *entry = NULL; + + entry = hash_find(ht, key, 0); + if (entry) { + if (ht->free && entry->value) { + ht->free(entry->value); + } + if (entry->key != HASH_KEY_INVALID) { + ht->count--; + } + entry->key = HASH_KEY_INVALID; + entry->value = NULL; + } + } +} + +/* hash_find(): + * + * Find a place (hash element) in the hash related key or + * new element. 
+ * @param ht - point to hash object + * @param key - key identified data + * @param flag - 1 - add new, 0 - find existing + * @return hash element or NULL in case there is no place. + */ +static struct hash_element* hash_find(hash_t ht, hash_key_t key, int flag) +{ + struct hash_element *entry = NULL; + int attempts = 0; + int idx = 0; + hash_key_t expect_key; + + if (ht->last && ht->last->key == key) + return ht->last; + + expect_key = (flag ? HASH_KEY_INVALID : key); + + idx = key % ht->size; + + do { + entry = &(ht->hash_table[idx]); + + if (entry->key == expect_key) { + break; + } else { + if (attempts >= (ht->size - 1)) { + entry = NULL; + break; + } + attempts++; + idx = (idx + 1) % ht->size; + } + } while (1); + + ht->last = (entry ? entry : ht->last); + + return entry; +} + +static int check_prime(int value) +{ + int i = 0; + + for (i = 2; i <= value / 2; i++) { + if ((value % i) == 0) { + return 0; + } + } + + return 1; +} diff --git a/tools/daemon/hash.h b/tools/daemon/hash.h new file mode 100644 index 0000000..ab4eaa3 --- /dev/null +++ b/tools/daemon/hash.h @@ -0,0 +1,130 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TOOLS_DAEMON_HASH_H_ +#define TOOLS_DAEMON_HASH_H_ + +#ifdef __cplusplus +extern "C" { +#endif + +/* The hash_t opaque data type + */ +typedef struct hash_object* hash_t; + +/* The hash key data type + */ +typedef uint32_t hash_key_t; + + +/* This type of function is used to free data inserted into hash table + */ +typedef void (*hash_freefunc_t)(void *); + + +/* hash_create(): + * + * Create a hash object. + * @param free_func - user defined function for destroy data + * inserted into hash + * @param size - size of hash that should be Prime number + * @return the newly allocated hash table. Must be freed with hash_destory. + */ +hash_t hash_create(hash_freefunc_t free_func, size_t size); + +/* hash_destroy(): + * + * Destroy a hash object. + * @param ht - hash to be freed + * @return @a none + */ +void hash_destroy(hash_t ht); + +/* hash_count(): + * + * Return number of valid elements in the hash object. + * @param ht - point to hash object + * @return number of elements + */ +int hash_count(hash_t ht); + +/* hash_size(): + * + * Return maximum number of elements in the hash object. 
+ * @param ht - point to hash object + * @return maximum number of elements + */ +int hash_size(hash_t ht); + +/* hash_get(): + * + * Return value stored in hash object by found by key. + * @param ht - point to hash object + * @param key - key identified data + * @return value + */ +void *hash_get(hash_t ht, hash_key_t key); + +/* hash_enum(): + * + * Return value stored in hash object by index. + * @param ht - point to hash object + * @param index - index in hash object + * @return value + */ +void *hash_enum(hash_t ht, size_t index); + +/* hash_put(): + * + * Store data in hash object. + * @param ht - point to hash object + * @param key - key identified data + * @param value - stored data + * @return value or NULL in case of error. + */ +void *hash_put(hash_t ht, hash_key_t key, void *value); + +/* hash_del(): + * + * Remove value stored in hash object and free memory + * if freefunc() is passed during hash object creation. + * @param ht - point to hash object + * @param key - key identified data + * @return @a none + */ +void hash_del(hash_t ht, hash_key_t key); + +#ifdef __cplusplus +} +#endif + +#endif /* TOOLS_DAEMON_HASH_H_ */ diff --git a/tools/daemon/loop.c b/tools/daemon/loop.c new file mode 100644 index 0000000..509dbf2 --- /dev/null +++ b/tools/daemon/loop.c @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include + +#include "hash.h" +#include "tc.h" +#include "daemon.h" + +extern int open_store(void); +extern void close_store(void); +extern int open_flow(void); +extern void close_flow(void); +extern int open_message(void); +extern void close_message(void); +extern int proc_message(void); +extern int open_notify(void); +extern void close_notify(void); +extern int proc_notify(void); + +int proc_loop(void) +{ + int rc = 0; + + log_debug("setting working directory ...\n"); + if ((mkdir(daemon_cfg.notify_dir, 0777) != 0) && (errno != EEXIST)) { + rc = -errno; + log_error("failed create folder %s (errno = %d)\n", + daemon_cfg.notify_dir, errno); + goto err; + } + + log_debug("setting store ...\n"); + rc = open_store(); + if (rc < 0) { + goto err; + } + + log_debug("setting flow ...\n"); + rc = open_flow(); + if (rc < 0) { + goto err; + } + + log_debug("setting notification ...\n"); + rc = open_notify(); + if (rc < 0) { + goto err; + } + + log_debug("setting message processing ...\n"); + rc = open_message(); + if (rc < 0) { + goto err; + } + + log_debug("starting loop ...\n"); + while ((0 == daemon_cfg.sig) && (errno != EINTR)) { + fd_set readfds; + struct 
timeval tv; + int max_fd = -1; + + FD_ZERO(&readfds); + FD_SET(daemon_cfg.sock_fd, &readfds); + max_fd = daemon_cfg.sock_fd; + FD_SET(daemon_cfg.notify_fd, &readfds); + max_fd = (max_fd < daemon_cfg.notify_fd ? daemon_cfg.notify_fd : max_fd); + + /* Use timeout for select() call */ + tv.tv_sec = 60; + tv.tv_usec = 0; + + rc = select(max_fd + 1, &readfds, NULL, NULL, &tv); + if (rc < 0) { + rc = 0; + if (errno != EINTR) { + rc = -errno; + log_error("Failed select() errno %d (%s)\n", errno, + strerror(errno)); + } + goto err; + } else if (rc == 0) { + continue; + } + + /* Check messages from processes */ + if (FD_ISSET(daemon_cfg.sock_fd, &readfds)) { + log_debug("message processing ...\n"); + rc = proc_message(); + } + + /* Check any events from file system monitor */ + if (FD_ISSET(daemon_cfg.notify_fd, &readfds)) { + log_debug("notification processing ...\n"); + rc = proc_notify(); + } + } + +err: + log_debug("finishing loop ...\n"); + + close_message(); + close_notify(); + close_flow(); + close_store(); + + return rc; +} diff --git a/tools/daemon/message.c b/tools/daemon/message.c new file mode 100644 index 0000000..476f79d --- /dev/null +++ b/tools/daemon/message.c @@ -0,0 +1,522 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "vma/lwip/tcp.h" /* display TCP states */ +#include "hash.h" +#include "tc.h" +#include "daemon.h" + + +int open_message(void); +void close_message(void); +int proc_message(void); + +extern int add_flow(struct store_pid *pid_value, struct store_flow *value); +extern int del_flow(struct store_pid *pid_value, struct store_flow *value); + +static int proc_msg_init(struct vma_hdr *msg_hdr, size_t size, struct sockaddr_un *peeraddr); +static int proc_msg_exit(struct vma_hdr *msg_hdr, size_t size); +static int proc_msg_state(struct vma_hdr *msg_hdr, size_t size); +static int proc_msg_flow(struct vma_hdr *msg_hdr, size_t size, struct sockaddr_un *peeraddr); + + +int open_message(void) +{ + int rc = 0; + int optval = 1; + struct sockaddr_un server_addr; + + /* Create UNIX UDP socket to receive data from VMA processes */ + memset(&server_addr, 0, sizeof(server_addr)); + server_addr.sun_family = AF_UNIX; + strncpy(server_addr.sun_path, daemon_cfg.sock_file, sizeof(server_addr.sun_path) - 1); + /* remove possible old socket */ + unlink(daemon_cfg.sock_file); + + if ((daemon_cfg.sock_fd = socket(AF_UNIX, SOCK_DGRAM, 
0)) < 0) { + log_error("Failed to call socket() errno %d (%s)\n", errno, + strerror(errno)); + rc = -errno; + goto err; + } + + optval = 1; + rc = setsockopt(daemon_cfg.sock_fd, SOL_SOCKET, SO_REUSEADDR, &optval, sizeof(optval)); + if (rc < 0) { + log_error("Failed to call setsockopt() errno %d (%s)\n", errno, + strerror(errno)); + rc = -errno; + goto err; + } + + /* bind created socket */ + if (bind(daemon_cfg.sock_fd, (struct sockaddr *)&server_addr, sizeof(server_addr)) < 0) { + log_error("Failed to call bind() errno %d (%s)\n", errno, + strerror(errno)); + rc = -errno; + goto err; + } + + /* Make the socket non-blocking */ + optval = fcntl(daemon_cfg.sock_fd, F_GETFL); + if (optval < 0) { + rc = -errno; + log_error("Failed to get socket flags errno %d (%s)\n", errno, + strerror(errno)); + goto err; + } + optval |= O_NONBLOCK; + rc = fcntl(daemon_cfg.sock_fd, F_SETFL, optval); + if (rc < 0) { + rc = -errno; + log_error("Failed to set socket flags errno %d (%s)\n", errno, + strerror(errno)); + goto err; + } + +err: + return rc; +} + +void close_message(void) +{ + if (daemon_cfg.sock_fd > 0) { + close(daemon_cfg.sock_fd); + } + unlink(daemon_cfg.sock_file); +} + +int proc_message(void) +{ + int rc = 0; + struct sockaddr_un peeraddr; + socklen_t addrlen = sizeof(peeraddr); + char msg_recv[4096]; + int len = 0; + struct vma_hdr *msg_hdr = NULL; + +again: + len = recvfrom(daemon_cfg.sock_fd, &msg_recv, sizeof(msg_recv), 0, + (struct sockaddr *) &peeraddr, &addrlen); + if (len < 0) { + if (errno == EINTR) { + goto again; + } + rc = -errno; + log_error("Failed recvfrom() errno %d (%s)\n", errno, + strerror(errno)); + goto err; + } + + /* Parse and process messages */ + while (len > 0) { + if (len < (int)sizeof(struct vma_hdr)) { + rc = -EBADMSG; + log_error("Invalid message lenght from %s as %d errno %d (%s)\n", + (addrlen > 0 ? 
peeraddr.sun_path: "n/a"), len, errno, strerror(errno)); + goto err; + } + msg_hdr = (struct vma_hdr *)&msg_recv; + log_debug("getting message ([%d] ver: %d pid: %d)\n", + msg_hdr->code, msg_hdr->ver, msg_hdr->pid); + + switch (msg_hdr->code) { + case VMA_MSG_INIT: + rc = proc_msg_init(msg_hdr, len, &peeraddr); + break; + case VMA_MSG_STATE: + rc = proc_msg_state(msg_hdr, len); + break; + case VMA_MSG_EXIT: + rc = proc_msg_exit(msg_hdr, len); + break; + case VMA_MSG_FLOW: + /* Note: special loopback logic, it + * should be added first as far as observed issue with delay + * in activation loopback filters in case two processes + * communicate locally w/o SRIOV + */ + proc_msg_flow(msg_hdr, len, NULL); + rc = proc_msg_flow(msg_hdr, len, &peeraddr); + break; + default: + rc = -EPROTO; + log_error("Received unknown message errno %d (%s)\n", errno, + strerror(errno)); + goto err; + } + if (0 < rc) { + len -= rc; + rc = 0; + } else { + goto err; + } + } + +err: + return rc; +} + +static int proc_msg_init(struct vma_hdr *msg_hdr, size_t size, struct sockaddr_un *peeraddr) +{ + struct vma_msg_init *data; + struct store_pid *value; + size_t err = 0; + + assert(msg_hdr); + assert(msg_hdr->code == VMA_MSG_INIT); + assert(size); + + data = (struct vma_msg_init *)msg_hdr; + if (size < sizeof(*data)) { + return -EBADMSG; + } + + /* Message protocol version check */ + if (data->hdr.ver > VMA_AGENT_VER) { + log_error("Protocol message mismatch (VMA_AGENT_VER = %d) errno %d (%s)\n", + VMA_AGENT_VER, errno, strerror(errno)); + err = -EBADMSG; + goto send_response; + } + + /* Allocate memory for this value in this place + * Free this memory during hash_del() call or hash_destroy() + */ + value = (void *)calloc(1, sizeof(*value)); + if (NULL == value) { + return -ENOMEM; + } + + value->pid = data->hdr.pid; + value->lib_ver = data->ver; + gettimeofday(&value->t_start, NULL); + INIT_LIST_HEAD(&value->flow_list); + + value->ht = hash_create(&free, daemon_cfg.opt.max_fid_num); + if (NULL 
== value->ht) { + log_error("Failed hash_create() for %d entries errno %d (%s)\n", + daemon_cfg.opt.max_fid_num, errno, strerror(errno)); + free(value); + return -EFAULT; + } + + if (hash_put(daemon_cfg.ht, value->pid, value) != value) { + log_error("Failed hash_put() count: %d size: %d errno %d (%s)\n", + hash_count(daemon_cfg.ht), hash_size(daemon_cfg.ht), + errno, strerror(errno)); + hash_destroy(value->ht); + free(value); + return -EFAULT; + } + + log_debug("[%d] put into the storage\n", data->hdr.pid); + +send_response: + data->hdr.code |= VMA_MSG_ACK; + data->hdr.ver = VMA_AGENT_VER; + if (0 > sys_sendto(daemon_cfg.sock_fd, data, sizeof(*data), 0, + (struct sockaddr *)peeraddr, sizeof(*peeraddr))) { + log_warn("Failed sendto() message errno %d (%s)\n", errno, + strerror(errno)); + } + + return err ? err : (sizeof(*data)); +} + +static int proc_msg_exit(struct vma_hdr *msg_hdr, size_t size) +{ + struct vma_msg_exit *data; + struct store_pid *pid_value = NULL; + + assert(msg_hdr); + assert(msg_hdr->code == VMA_MSG_EXIT); + assert(size); + + data = (struct vma_msg_exit *)msg_hdr; + if (size < sizeof(*data)) { + return -EBADMSG; + } + + pid_value = hash_get(daemon_cfg.ht, data->hdr.pid); + if (pid_value) { + struct store_flow *flow_value = NULL; + struct list_head *cur_entry = NULL; + struct list_head *tmp_entry = NULL; + list_for_each_safe(cur_entry, tmp_entry, &pid_value->flow_list) { + flow_value = list_entry(cur_entry, struct store_flow, item); + list_del_init(&flow_value->item); + del_flow(pid_value, flow_value); + free(flow_value); + } + + hash_del(daemon_cfg.ht, pid_value->pid); + } + + log_debug("[%d] remove from the storage\n", data->hdr.pid); + + return (sizeof(*data)); +} + +static int proc_msg_state(struct vma_hdr *msg_hdr, size_t size) +{ + struct vma_msg_state *data; + struct store_pid *pid_value; + struct store_fid *value; + + assert(msg_hdr); + assert(msg_hdr->code == VMA_MSG_STATE); + assert(size); + + data = (struct vma_msg_state *)msg_hdr; + if 
(size < sizeof(*data)) { + return -EBADMSG; + } + + pid_value = hash_get(daemon_cfg.ht, data->hdr.pid); + if (NULL == pid_value) { + /* Return success because this case can be valid + * if the process is terminated using abnormal way + * So no needs in acknowledgement. + */ + log_debug("Failed hash_get() for pid %d errno %d (%s). The process should be abnormal terminated\n", + data->hdr.pid, errno, strerror(errno)); + return ((int)sizeof(*data)); + } + + /* Do not store information about closed socket + * It is a protection for hypothetical scenario when number for new + * sockets are incremented instead of using number + * of closed sockets + */ + if ((CLOSED == data->state) && (SOCK_STREAM == data->type)) { + hash_del(pid_value->ht, data->fid); + + log_debug("[%d] remove fid: %d type: %d state: %s\n", + data->hdr.pid, data->fid, data->type, + (data->state < (sizeof(tcp_state_str)/sizeof(tcp_state_str[0])) ? + tcp_state_str[data->state] : "n/a")); + return (sizeof(*data)); + } + + /* Allocate memory for this value in this place + * Free this memory during hash_del() call or hash_destroy() + */ + value = (void *)calloc(1, sizeof(*value)); + if (NULL == value) { + return -ENOMEM; + } + + value->fid = data->fid; + value->type = data->type; + value->state = data->state; + value->src_ip = data->src_ip; + value->dst_ip = data->dst_ip; + value->src_port = data->src_port; + value->dst_port = data->dst_port; + + if (hash_put(pid_value->ht, value->fid, value) != value) { + log_error("Failed hash_put() count: %d size: %d errno %d (%s)\n", + hash_count(pid_value->ht), hash_size(pid_value->ht), + errno, strerror(errno)); + free(value); + return -EFAULT; + } + + log_debug("[%d] update fid: %d type: %d state: %s\n", + pid_value->pid, value->fid, value->type, + (value->state < (sizeof(tcp_state_str)/sizeof(tcp_state_str[0])) ? 
+ tcp_state_str[value->state] : "n/a")); + + return (sizeof(*data)); +} + +static int proc_msg_flow(struct vma_hdr *msg_hdr, size_t size, struct sockaddr_un *peeraddr) +{ + int rc = 0; + struct vma_msg_flow *data; + struct store_pid *pid_value; + struct store_flow *value = NULL; + struct store_flow *cur_flow = NULL; + struct list_head *cur_entry = NULL; + int value_new = 0; + int ack = 0; + + assert(msg_hdr); + assert((msg_hdr->code & ~VMA_MSG_ACK) == VMA_MSG_FLOW); + assert(size); + + data = (struct vma_msg_flow *)msg_hdr; + if (size < sizeof(*data)) { + rc = -EBADMSG; + goto err; + } + + /* Note: special loopback logic */ + if (NULL == peeraddr && + data->type == VMA_MSG_FLOW_EGRESS) { + return 0; + } + + ack = (1 == data->hdr.status); + + pid_value = hash_get(daemon_cfg.ht, data->hdr.pid); + if (NULL == pid_value) { + /* Return success because this case can be valid + * if the process is terminated using abnormal way + * So no needs in acknowledgement. + */ + log_debug("Failed hash_get() for pid %d errno %d (%s). 
The process should be abnormal terminated\n", + data->hdr.pid, errno, strerror(errno)); + return ((int)sizeof(*data)); + } + + /* Allocate memory for this value in this place + */ + value = (void *)calloc(1, sizeof(*value)); + if (NULL == value) { + rc = -ENOMEM; + goto err; + } + + value->type = data->type; + value->if_id = data->if_id; + value->tap_id = data->tap_id; + value->flow.dst_ip = data->flow.dst_ip; + value->flow.dst_port = data->flow.dst_port; + + switch (data->type) { + case VMA_MSG_FLOW_EGRESS: + case VMA_MSG_FLOW_TCP_3T: + case VMA_MSG_FLOW_UDP_3T: + break; + case VMA_MSG_FLOW_TCP_5T: + case VMA_MSG_FLOW_UDP_5T: + value->flow.t5.src_ip = data->flow.t5.src_ip; + value->flow.t5.src_port = data->flow.t5.src_port; + break; + default: + log_error("Received unknown message errno %d (%s)\n", errno, + strerror(errno)); + rc = -EPROTO; + goto err; + } + + /* Note: + * - special loopback logic when peeraddr is null + * - avoid useless rules creation in case expected 5t traffic is local + */ + if (NULL == peeraddr) { + value->if_id = sys_lo_ifindex(); + ack = 0; + if (value->if_id <= 0) { + rc = -EFAULT; + goto err; + } + } else if ((VMA_MSG_FLOW_TCP_5T == data->type || + VMA_MSG_FLOW_UDP_5T == data->type) && + sys_iplocal(value->flow.t5.src_ip)) { + rc = 0; + goto err; + } + + if (VMA_MSG_FLOW_ADD == data->action) { + list_for_each(cur_entry, &pid_value->flow_list) { + cur_flow = list_entry(cur_entry, struct store_flow, item); + if (value->type == cur_flow->type && + value->if_id == cur_flow->if_id && + value->tap_id == cur_flow->tap_id && + !memcmp(&value->flow, &cur_flow->flow, sizeof(cur_flow->flow))) { + break; + } + } + if (cur_entry == &pid_value->flow_list) { + rc = add_flow(pid_value, value); + if (rc < 0) { + goto err; + } + value_new = 1; /* mark value as new to avoid releasing */ + list_add_tail(&value->item, &pid_value->flow_list); + + log_debug("[%d] add flow handle: 0x%08X type: %d if_id: %d tap_id: %d\n", + pid_value->pid, value->handle, 
value->type, value->if_id, value->tap_id); + } + } + + if (VMA_MSG_FLOW_DEL == data->action) { + list_for_each(cur_entry, &pid_value->flow_list) { + cur_flow = list_entry(cur_entry, struct store_flow, item); + if (value->type == cur_flow->type && + value->if_id == cur_flow->if_id && + value->tap_id == cur_flow->tap_id && + !memcmp(&value->flow, &cur_flow->flow, sizeof(cur_flow->flow))) { + log_debug("[%d] del flow handle: 0x%08X type: %d if_id: %d tap_id: %d\n", + pid_value->pid, cur_flow->handle, cur_flow->type, cur_flow->if_id, cur_flow->tap_id); + list_del_init(&cur_flow->item); + rc = del_flow(pid_value, cur_flow); + free(cur_flow); + break; + } + } + } + +err: + if (ack) { + data->hdr.code |= VMA_MSG_ACK; + data->hdr.status = (rc ? 1 : 0); + if (0 > sys_sendto(daemon_cfg.sock_fd, &data->hdr, sizeof(data->hdr), 0, + (struct sockaddr *)peeraddr, sizeof(*peeraddr))) { + log_warn("Failed sendto() message errno %d (%s)\n", errno, + strerror(errno)); + } + } + + if (value && !value_new) { + free(value); + } + + return (rc ? rc : (int)sizeof(*data)); +} diff --git a/tools/daemon/nl.c b/tools/daemon/nl.c new file mode 100644 index 0000000..c9761b3 --- /dev/null +++ b/tools/daemon/nl.c @@ -0,0 +1,242 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#include +#include +#include +#include +#include + +#include "hash.h" +#include "tc.h" +#include "daemon.h" +#include "nl.h" + + +/** + * @struct nl_object + * @brief netlink container + */ +struct nl_object { + int fd; /**< the netlink socket file descriptor used for communication */ + int seq; /**< sequence number of send operation */ + char buf[81920]; /**< buffer for receive data */ +}; + +nl_t nl_create(void) +{ + nl_t nt = NULL; + int fd = -1; + + nt = (struct nl_object *)malloc(sizeof(*nt)); + if (nt) { + int sndbuf_size = 32768; + int rcvbuf_size = 32768; + struct sockaddr_nl local; + + fd = socket(AF_NETLINK, SOCK_RAW | SOCK_CLOEXEC, NETLINK_ROUTE); + if (fd < 0) { + log_error("Unable to create a netlink socket\n"); + goto err; + } + if (setsockopt(fd, SOL_SOCKET, SO_SNDBUF, &sndbuf_size, sizeof(int))) { + log_error("Unable to set SO_SNDBUF\n"); + goto err; + } + if (setsockopt(fd, SOL_SOCKET, SO_RCVBUF, &rcvbuf_size, sizeof(int))) { + log_error("Unable to set SO_RCVBUF\n"); + goto err; + } + memset(&local, 0, sizeof(local)); + local.nl_family = AF_NETLINK; + local.nl_groups = 0; + if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0) { + log_error("Unable to bind to the netlink socket\n"); + goto err; + } + + memset(nt, 0, 
sizeof(*nt)); + nt->fd = fd; + nt->seq = 0; + } + + return nt; +err: + if (fd >= 0) { + close(fd); + } + if (nt) { + free(nt); + } + nt = NULL; + + return NULL; +} + +void nl_destroy(nl_t nt) +{ + if (nt) { + close(nt->fd); + free(nt); + nt = NULL; + } +} + +int nl_send(nl_t nt, struct nlmsghdr *nl_msg) +{ + struct sockaddr_nl nladdr; + struct iovec iov; + struct msghdr msg; + int ret = -1; + + nl_msg->nlmsg_seq = nt->seq++; + + memset(&nladdr, 0, sizeof(nladdr)); + nladdr.nl_family = AF_NETLINK; + nladdr.nl_pid = 0; + nladdr.nl_groups = 0; + + iov.iov_base = nl_msg; + iov.iov_len = nl_msg->nlmsg_len; + + memset(&msg, 0, sizeof(msg)); + msg.msg_name = &nladdr; + msg.msg_namelen = sizeof(nladdr); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + log_hexdump((void *)nl_msg, nl_msg->nlmsg_len); + ret = sendmsg(nt->fd, &msg, 0); + if (ret < 0) { + log_error("Failed to send netlink message: %s (%d)\n", + strerror(errno), errno); + return ret; + } + + return ret; +} + +int nl_recv(nl_t nt, int (*cb)(struct nlmsghdr *, void *arg), void *arg) +{ + struct sockaddr_nl nladdr; + struct iovec iov; + struct msghdr msg; + int ret = 0; + int multipart = 0; + + memset(&nladdr, 0, sizeof(nladdr)); + + iov.iov_base = nt->buf; + iov.iov_len = sizeof(nt->buf); + + memset(&msg, 0, sizeof(msg)); + msg.msg_name = &nladdr; + msg.msg_namelen = sizeof(nladdr); + msg.msg_iov = &iov; + msg.msg_iovlen = 1; + + do { + struct nlmsghdr *nl_msg; + int recv_bytes = 0; + + recv_bytes = recvmsg(nt->fd, &msg, 0); + if (recv_bytes <= 0) { + if (errno == EAGAIN || errno == EWOULDBLOCK || errno == EINTR) { + continue; + } + return -1; + } + + for (nl_msg = (struct nlmsghdr *)nt->buf; + NLMSG_OK(nl_msg, (unsigned int)recv_bytes); + nl_msg = NLMSG_NEXT(nl_msg, recv_bytes)) { + if (nl_msg->nlmsg_type == NLMSG_ERROR) { + struct nlmsgerr *err_data = NLMSG_DATA(nl_msg); + + if (err_data->error < 0) { + errno = -err_data->error; + return -1; + } + /* Ack message. 
*/ + return 0; + } + /* Multi-part msgs and their trailing DONE message. */ + if (nl_msg->nlmsg_flags & NLM_F_MULTI) { + if (nl_msg->nlmsg_type == NLMSG_DONE) { + return 0; + } + multipart = 1; + } + if (cb) { + ret = cb(nl_msg, arg); + } + } + } while (multipart || (msg.msg_flags & MSG_TRUNC)); + + return ret; +} + +void nl_attr_add(struct nlmsghdr *nl_msg, unsigned short type, + const void *data, unsigned int data_len) +{ + struct rtattr *rta; + + if ((NLMSG_ALIGN(nl_msg->nlmsg_len) + RTA_ALIGN(RTA_LENGTH(data_len))) > NLMSG_BUF) { + log_error("Message size is: %d that exceeds limit: %d\n", + (NLMSG_ALIGN(nl_msg->nlmsg_len) + RTA_ALIGN(RTA_LENGTH(data_len))), NLMSG_BUF); + return ; + } + rta = (struct rtattr *)NLMSG_TAIL(nl_msg); + rta->rta_len = RTA_LENGTH(data_len); + rta->rta_type = type; + if (data && data_len) { + memcpy(RTA_DATA(rta), data, data_len); + } + nl_msg->nlmsg_len = NLMSG_ALIGN(nl_msg->nlmsg_len) + RTA_ALIGN(rta->rta_len); +} + +struct rtattr *nl_attr_nest_start(struct nlmsghdr *nl_msg, int type) +{ + struct rtattr *nest = NLMSG_TAIL(nl_msg); + + nl_attr_add(nl_msg, type, NULL, 0); + + return nest; +} + +int nl_attr_nest_end(struct nlmsghdr *nl_msg, struct rtattr *nest) +{ + nest->rta_len = (uintptr_t)NLMSG_TAIL(nl_msg) - (uintptr_t)nest; + + return nest->rta_len; +} diff --git a/tools/daemon/nl.h b/tools/daemon/nl.h new file mode 100644 index 0000000..701255a --- /dev/null +++ b/tools/daemon/nl.h @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef TOOLS_DAEMON_NL_H_ +#define TOOLS_DAEMON_NL_H_ + +#include +#include + + +/* The nl_t opaque data type + */ +typedef struct nl_object* nl_t; + +#define NLMSG_BUF (16384) +#define NLMSG_TAIL(nl_msg) \ + ((struct rtattr *) (((char *) (nl_msg)) + NLMSG_ALIGN((nl_msg)->nlmsg_len))) + +struct nl_req { + struct nlmsghdr hdr; + struct tcmsg msg; + char buf[NLMSG_BUF]; +}; + + +/** + * Initialize a netlink object for communicating with the kernel. + * + * @return + * the newly allocated netlink object. Must be freed with nl_destory. + */ +nl_t nl_create(void); + +/** + * Destroy up a netlink socket. + * + * @param nt + * The netlink object. 
+ * + * @return + * @a none + */ +void nl_destroy(nl_t nt); + +/** + * Send a message to the kernel on the netlink socket. + * + * @param nl_t nt + * The netlink object used for communication. + * @param nl_msg + * The netlink message send to the kernel. + * + * @return + * the number of sent bytes on success, -1 otherwise. + */ +int nl_send(nl_t nt, struct nlmsghdr *nl_msg); + +/** + * Receive a message from the kernel on the netlink socket. + * + * @param nl_t nt + * The netlink object used for communication. + * @param cb + * The callback function to call for each netlink message received. + * @param arg + * Custom arguments for the callback. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int nl_recv(nl_t nt, int (*cb)(struct nlmsghdr *, void *arg), void *arg); + +/** + * Append a netlink attribute to a message. + * + * @param nl_msg + * The netlink message to parse, received from the kernel. + * @param type + * The type of attribute to append. + * @param data + * The data to append. + * @param data_len + * The length of the data to append. + * + * @return + * @a none + */ +void nl_attr_add(struct nlmsghdr *nl_msg, unsigned short type, + const void *data, unsigned int data_len); + +struct rtattr *nl_attr_nest_start(struct nlmsghdr *nl_msg, int type); + +int nl_attr_nest_end(struct nlmsghdr *nl_msg, struct rtattr *nest); + +#endif /* TOOLS_DAEMON_NL_H_ */ diff --git a/tools/daemon/notify.c b/tools/daemon/notify.c new file mode 100644 index 0000000..4f44281 --- /dev/null +++ b/tools/daemon/notify.c @@ -0,0 +1,820 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include +#include +#include + +#ifdef HAVE_SYS_INOTIFY_H +#include +#endif +#ifdef HAVE_SYS_FANOTIFY_H +#include +#endif + + +#include "hash.h" +#include "tc.h" +#include "daemon.h" + +#ifndef KERNEL_O_LARGEFILE +#if defined(__aarch64__) || defined(__powerpc__) +/* Check architecture: if we are running on ARM, + * omit KERNEL_O_LARGEFILE from fanotify_init invocation because + * KERNEL_O_LARGEFILE breaks program on armv running at least kernel 4.4+ + */ +#define KERNEL_O_LARGEFILE O_LARGEFILE +#else +/* work around kernels which do not have this fix yet: + * http://git.kernel.org/cgit/linux/kernel/git/torvalds/linux.git/commit/?id=1e2ee49f7 + * O_LARGEFILE is usually 0, so hardcode it here + */ +#define KERNEL_O_LARGEFILE 00100000 +#endif +#endif + + +struct rst_info { + struct sockaddr_in local_addr; + struct sockaddr_in remote_addr; + uint32_t seqno; +}; + +#pragma pack(push, 1) +struct tcp_msg { + struct iphdr ip; + struct tcphdr tcp; + uint8_t data[8192]; +}; +#pragma pack( pop ) + +#pragma pack(push, 1) +struct pseudo_header { + uint32_t source_address; + uint32_t dest_address; + uint8_t placeholder; + uint8_t protocol; + uint16_t tcp_length; + struct tcphdr tcp; +} pseudo_header; +#pragma pack( pop ) + + +int open_notify(void); +void close_notify(void); +int proc_notify(void); + +extern int add_flow(struct store_pid *pid_value, struct store_flow *value); +extern int del_flow(struct store_pid *pid_value, struct store_flow *value); + +static int setup_notify(void); +static int create_raw_socket(void); +static int clean_process(pid_t pid); +static int check_process(pid_t pid); +static unsigned short calc_csum(unsigned short *ptr, int nbytes); +static int get_seqno(struct rst_info *rst); +static int send_rst(struct rst_info *rst); + +#ifdef HAVE_SYS_INOTIFY_H +static int open_inotify(void); +static int proc_inotify(void *buf, int size); +#endif + +#ifdef 
HAVE_SYS_FANOTIFY_H +static int open_fanotify(void); +static int proc_fanotify(void *buf, int size); +#endif + +#if !defined(HAVE_SYS_FANOTIFY_H) && !defined(HAVE_SYS_INOTIFY_H) +#error neither inotify nor fanotify is supported +#endif + +static int (*do_open_notify)(void) = NULL; +static int (*do_proc_notify)(void*, int) = NULL; + + +int open_notify(void) +{ + int rc = 0; + + rc = setup_notify(); + if (rc < 0) { + goto err; + } + + rc = create_raw_socket(); + if (rc < 0) { + goto err; + } + log_debug("setting raw socket ...\n"); + + rc = do_open_notify(); + if (rc < 0) { + goto err; + } + +err: + return rc; +} + +void close_notify(void) +{ + log_debug("closing raw socket ...\n"); + + if (daemon_cfg.notify_fd > 0) { + close(daemon_cfg.notify_fd); + } + + if (daemon_cfg.raw_fd > 0) { + close(daemon_cfg.raw_fd); + } +} + +int proc_notify(void) +{ + int rc = 0; + int len = 0; + char msg_recv[4096]; + + memset((void *)&msg_recv, 0, sizeof(msg_recv)); +again: + /* coverity[tainted_string_argument] */ + len = read(daemon_cfg.notify_fd, msg_recv, sizeof(msg_recv)); + if (len <= 0) { + if (errno == EINTR) { + goto again; + } + rc = -errno; + log_error("Failed read events() errno %d (%s)\n", errno, + strerror(errno)); + goto err; + } + + rc = do_proc_notify((void *)msg_recv, len); + if (rc < 0) { + goto err; + } + +err: + return rc; +} + +static int setup_notify(void) +{ + int fd = -1; + + /* Set method for processing + * fanotify has the highest priority because it has better + * performance + */ + errno = 0; +#if defined(HAVE_SYS_FANOTIFY_H) + fd = fanotify_init(0, KERNEL_O_LARGEFILE); + if (fd >= 0) { + do_open_notify = open_fanotify; + do_proc_notify = proc_fanotify; + close(fd); + return 0; + } else { + log_debug("fanotify_init() errno %d (%s)\n", errno, + (ENOSYS == errno ? 
"missing support for fanotify (check CONFIG_FANOTIFY=y)\n" : strerror(errno))); + } +#endif + +#if defined(HAVE_SYS_INOTIFY_H) + fd = inotify_init(); + if (fd >= 0) { + do_open_notify = open_inotify; + do_proc_notify = proc_inotify; + close(fd); + return 0; + } else { + log_debug("inotify_init() errno %d (%s)\n", errno, + (ENOSYS == errno ? "missing support for inotify (check CONFIG_INOTIFY_USER=y)\n" : strerror(errno))); + } +#endif + + log_error("Failed notify way selection, check kernel configuration errno %d (%s)\n", errno, + strerror(errno)); + return -ENOSYS; +} + +static int create_raw_socket(void) +{ + int rc = 0; + int optval = 1; + + /* Create RAW socket to use for sending RST to peers */ + daemon_cfg.raw_fd = socket(PF_INET, SOCK_RAW, IPPROTO_TCP); + if (daemon_cfg.raw_fd < 0) { + /* socket creation failed, may be because of non-root privileges */ + log_error("Failed to call socket() errno %d (%s)\n", errno, + strerror(errno)); + rc = -errno; + goto err; + } + + optval = 1; + rc = setsockopt(daemon_cfg.raw_fd, IPPROTO_IP, IP_HDRINCL, &optval, sizeof(optval)); + if (rc < 0) { + log_error("Failed to call setsockopt() errno %d (%s)\n", errno, + strerror(errno)); + rc = -errno; + goto err; + } + +err: + return rc; +} + +static int clean_process(pid_t pid) +{ + int rc = -ESRCH; + int wait = 0; + + wait = 100; + do { + /* Wait for parent process completion */ + if (!check_process(pid)) { + struct store_pid *pid_value = NULL; + + log_debug("[%d] detect abnormal termination\n", pid); + pid_value = hash_get(daemon_cfg.ht, pid); + if (pid_value) { + struct rst_info rst; + struct store_fid *fid_value = NULL; + struct store_flow *flow_value = NULL; + struct list_head *cur_entry = NULL; + struct list_head *tmp_entry = NULL; + int i, j; + + /* Cleanup flow store */ + j = 0; + list_for_each_safe(cur_entry, tmp_entry, &pid_value->flow_list) { + flow_value = list_entry(cur_entry, struct store_flow, item); + j++; + log_debug("[%d] #%d found handle: 0x%08X type: %d if_id: 
%d tap_id: %d\n", + pid_value->pid, j, + flow_value->handle, flow_value->type, flow_value->if_id, flow_value->tap_id); + list_del_init(&flow_value->item); + del_flow(pid_value, flow_value); + free(flow_value); + } + + /* Cleanup fid store */ + j = 0; + for (i = 0; (i < hash_size(pid_value->ht)) && + (j < hash_count(pid_value->ht)); i++) { + fid_value = hash_enum(pid_value->ht, i); + if (NULL == fid_value) { + continue; + } + + j++; + log_debug("[%d] #%d found fid: %d type: %d state: %d\n", + pid_value->pid, j, + fid_value->fid, fid_value->type, fid_value->state); + + if (STATE_ESTABLISHED != fid_value->state) { + log_debug("[%d] #%d skip fid: %d\n", + pid_value->pid, j, fid_value->fid); + continue; + } + + log_debug("[%d] #%d process fid: %d\n", + pid_value->pid, j, fid_value->fid); + + /* Notification is based on sending RST packet to all peers + * and looks as spoofing attacks that uses a technique + * called Sequence Number Prediction. + * This actively sends spoofed SYN packets and learns the + * SeqNo number from the answer. It then sends RST packets. + * P1 - terminated process + * P2 - peer of terminated process + * H - host (kernel) of terminated process + * 1. [P1] sends SYN to [P2] with source IP of [H]. + * 2. [P2] replies to [H] by SYN/ACK. + * 3. There is a possibility of: + * 3.1 [H] should reply to an unknown SYN/ACK by RST. + * 3.2 [P1] sends RST using SeqNo from SYN/ACK. 
+ */ + rst.local_addr.sin_family = AF_INET; + rst.local_addr.sin_port = fid_value->src_port; + rst.local_addr.sin_addr.s_addr = fid_value->src_ip; + rst.remote_addr.sin_family = AF_INET; + rst.remote_addr.sin_port = fid_value->dst_port; + rst.remote_addr.sin_addr.s_addr = fid_value->dst_ip; + rst.seqno = 1; + + if (0 == get_seqno(&rst) && daemon_cfg.opt.force_rst) { + send_rst(&rst); + } + } + + hash_del(daemon_cfg.ht, pid); + log_debug("[%d] remove from the storage\n", pid); + + /* Set OK */ + rc = 0; + } + break; + } + usleep(10000); + } while (wait--); + + return rc; +} + +static int check_process(pid_t pid) +{ + char process_file[PATH_MAX]; + int rc = 0; + + rc = snprintf(process_file, sizeof(process_file), "/proc/%d/stat", pid); + if ((0 < rc) && (rc < (int)sizeof(process_file))) { + FILE* fd = fopen(process_file, "r"); + if (fd) { + int pid_v = 0; + char name_v[32]; + char stat_v = 0; + + rc = fscanf(fd, "%d %30s %c", &pid_v, name_v, &stat_v); + fclose(fd); + if (rc == 3 && stat_v != 'Z') { + return 1; + } + } + } + + return 0; +} + +static unsigned short calc_csum(unsigned short *ptr, int nbytes) +{ + register long sum; + unsigned short oddbyte; + register short answer; + + sum = 0; + while (nbytes > 1) { + sum += *ptr++; + nbytes -= 2; + } + if (nbytes == 1) { + oddbyte = 0; + *((u_char*) &oddbyte) = *(u_char*) ptr; + sum += oddbyte; + } + + sum = (sum >> 16) + (sum & 0xffff); + sum = sum + (sum >> 16); + answer = (short) ~sum; + + return (answer); +} + +static int get_seqno(struct rst_info *rst) +{ + int rc = 0; + struct tcp_msg msg; + struct pseudo_header pheader; + int attempt = 3; /* Do maximum number of attempts */ + struct timeval t_end = TIMEVAL_INITIALIZER; + struct timeval t_now = TIMEVAL_INITIALIZER; + struct timeval t_wait = TIMEVAL_INITIALIZER; /* Defines wait interval, holds difference between t_now and t_end */ + + /* zero out the packet */ + memset(&msg, 0, sizeof(msg)); + + /* IP Header */ + msg.ip.version = 4; + msg.ip.ihl = 5; + msg.ip.tos 
= 0; + msg.ip.tot_len = sizeof(struct iphdr) + sizeof(struct tcphdr); + msg.ip.id = 0; + msg.ip.frag_off = htons(0x4000); /* Flag: "Don't Fragment" */ + msg.ip.ttl = 0x40; + msg.ip.protocol = IPPROTO_TCP; + msg.ip.check = 0; + msg.ip.saddr = rst->local_addr.sin_addr.s_addr; + msg.ip.daddr = rst->remote_addr.sin_addr.s_addr; + + /* Calculate IP header checksum */ + msg.ip.check = calc_csum((unsigned short *)&msg.ip, sizeof(msg.ip)); + + /* TCP Header */ + msg.tcp.source = rst->local_addr.sin_port; + msg.tcp.dest = rst->remote_addr.sin_port; + msg.tcp.seq = rst->seqno; + msg.tcp.ack_seq = 0; + msg.tcp.doff = 5; + msg.tcp.fin = 0; + msg.tcp.syn = 1; + msg.tcp.rst = 0; + msg.tcp.psh = 0; + msg.tcp.ack = 0; + msg.tcp.urg = 0; + msg.tcp.window = 0; + msg.tcp.check = 0; + msg.tcp.urg_ptr = 0; + + /* Calculate TCP header checksum */ + pheader.source_address = msg.ip.saddr; + pheader.dest_address = msg.ip.daddr; + pheader.placeholder = 0; + pheader.protocol = IPPROTO_TCP; + pheader.tcp_length = htons(sizeof(struct tcphdr)); + bcopy((const void *)&msg.tcp, (void *)&pheader.tcp, sizeof(struct tcphdr)); + msg.tcp.check = calc_csum((unsigned short *)&pheader, sizeof(pheader)); + + do { + /* Send invalid SYN packet */ + rc = sys_sendto(daemon_cfg.raw_fd, &msg, sizeof(msg) - sizeof(msg.data), 0, + (struct sockaddr *) &rst->remote_addr, sizeof(rst->remote_addr)); + if (rc < 0) { + goto out; + } + log_debug("send SYN to: %s\n", sys_addr2str(&rst->remote_addr)); + t_wait.tv_sec = daemon_cfg.opt.retry_interval / 1000; + t_wait.tv_usec = (daemon_cfg.opt.retry_interval % 1000) * 1000; + gettimeofday(&t_end, NULL); + + /* Account for wrapping of tv_usec, use libvma utils macro for timeradd() */ + tv_add(&t_end, &t_wait, &t_end); + + do { + struct tcp_msg msg_recv; + struct sockaddr_in gotaddr; + socklen_t addrlen = sizeof(gotaddr); + fd_set readfds; + + FD_ZERO(&readfds); + FD_SET(daemon_cfg.raw_fd, &readfds); + + /* Use t_difference to determine timeout for select so we don't wait 
/* send_rst() - build a raw TCP RST segment from rst (local/remote address
 * pair and the sequence number learned by get_seqno()) and send it to the
 * peer over the IP_HDRINCL raw socket, so the peer drops the connection of
 * the terminated process.
 * Returns 0 on success or the negative sys_sendto() result on failure.
 */
static int send_rst(struct rst_info *rst) {
	int rc = 0;
	struct tcp_msg msg;
	struct pseudo_header pheader;

	/* zero out the packet */
	memset(&msg, 0, sizeof(msg));

	/* IP Header */
	msg.ip.version = 4;
	msg.ip.ihl = 5;
	msg.ip.tos = 0;
	/* NOTE(review): tot_len is written in host byte order — assumes the
	 * Linux IP_HDRINCL convention; confirm for all target kernels. */
	msg.ip.tot_len = sizeof(struct iphdr) + sizeof(struct tcphdr);
	msg.ip.id = 0;
	msg.ip.frag_off = htons(0x4000); /* Flag: "Don't Fragment" */
	msg.ip.ttl = 0x40;
	msg.ip.protocol = IPPROTO_TCP;
	msg.ip.check = 0;
	msg.ip.saddr = rst->local_addr.sin_addr.s_addr;
	msg.ip.daddr = rst->remote_addr.sin_addr.s_addr;

	/* Calculate IP header checksum (check field must be 0 during the sum) */
	msg.ip.check = calc_csum((unsigned short *)&msg.ip, sizeof(msg.ip));

	/* TCP Header: only the RST flag is set; seqno came from the peer's
	 * SYN/ACK so the peer accepts this segment as in-window. */
	msg.tcp.source = rst->local_addr.sin_port;
	msg.tcp.dest = rst->remote_addr.sin_port;
	msg.tcp.seq = rst->seqno;
	msg.tcp.ack_seq = 0;
	msg.tcp.doff = 5;
	msg.tcp.fin = 0;
	msg.tcp.syn = 0;
	msg.tcp.rst = 1;
	msg.tcp.psh = 0;
	msg.tcp.ack = 0;
	msg.tcp.urg = 0;
	msg.tcp.window = 0;
	msg.tcp.check = 0;
	msg.tcp.urg_ptr = 0;

	/* Calculate TCP header checksum over the pseudo header (src/dst IP,
	 * protocol, TCP length) plus a copy of the TCP header itself. */
	pheader.source_address = msg.ip.saddr;
	pheader.dest_address = msg.ip.daddr;
	pheader.placeholder = 0;
	pheader.protocol = IPPROTO_TCP;
	pheader.tcp_length = htons(sizeof(struct tcphdr));
	bcopy((const void *)&msg.tcp, (void *)&pheader.tcp, sizeof(struct tcphdr));
	msg.tcp.check = calc_csum((unsigned short *)&pheader, sizeof(pheader));

	/* Send only the headers; msg.data payload is excluded. */
	rc = sys_sendto(daemon_cfg.raw_fd, &msg, sizeof(msg) - sizeof(msg.data), 0,
			(struct sockaddr *) &rst->remote_addr, sizeof(rst->remote_addr));
	if (rc < 0) {
		goto out;
	}
	log_debug("send RST to: %s\n", sys_addr2str(&rst->remote_addr));

	rc = 0;

out:
	return rc;
}
either after work completion or as result of unexpected termination + */ + if ((data->mask & FAN_CLOSE_WRITE || data->mask & FAN_CLOSE_NOWRITE) && + hash_get(daemon_cfg.ht, data->pid)) { + char buf[PATH_MAX]; + char pathname[PATH_MAX]; + + memset(buf, 0, sizeof(buf)); + memset(pathname, 0, sizeof(pathname)); + + rc = snprintf(buf, sizeof(buf) - 1, "/proc/self/fd/%d", data->fd); + if ((rc < 0 ) || (rc == (sizeof(buf) - 1) )) { + rc = -ENOMEM; + log_error("Cannot read process name errno %d (%s)\n", errno, + strerror(errno)); + goto err; + } + rc = readlink(buf, pathname, sizeof(pathname) - 1); + if (rc < 0) { + rc = -ENOMEM; + log_error("Cannot read process name errno %d (%s)\n", errno, + strerror(errno)); + goto err; + } + + log_debug("getting event ([0x%x] pid: %d fd: %d name: %s)\n", + data->mask, data->pid, data->fd, pathname); + + rc = snprintf(buf, sizeof(buf) - 1, "%s/%s.%d.pid", + daemon_cfg.notify_dir, VMA_AGENT_BASE_NAME, data->pid); + if ((rc < 0 ) || (rc == (sizeof(buf) - 1) )) { + rc = -ENOMEM; + log_error("failed allocate pid file errno %d (%s)\n", errno, + strerror(errno)); + goto err; + } + + /* Process event related pid file only */ + rc = 0; + if (!strncmp(buf, pathname, strlen(buf))) { + log_debug("[%d] check the event\n", data->pid); + + /* Check if termination is unexpected and send RST to peers + * Return status should be 0 in case we send RST and + * nonzero in case we decide that processes exited accurately + * or some internal error happens during RST send + */ + rc = clean_process(data->pid); + if (0 == rc) { + /* Cleanup unexpected termination */ + log_debug("[%d] cleanup after unexpected termination\n", data->pid); + /* To suppress TOCTOU (time-of-check, time-of-use race condition) */ + strcpy(pathname, buf); + unlink(pathname); + if (snprintf(pathname, sizeof(pathname) - 1, "%s/%s.%d.sock", + daemon_cfg.notify_dir, VMA_AGENT_BASE_NAME, data->pid) > 0) { + unlink(pathname); + } + } else if (-ESRCH == rc) { + /* No need in peer notification 
*/ + log_debug("[%d] no need in peer notification\n", data->pid); + rc = 0; + } + } else { + log_debug("[%d] skip the event\n", data->pid); + } + } + + /* Close the file descriptor of the event */ + close(data->fd); + data = FAN_EVENT_NEXT(data, len); + } + +err: + return rc; +} +#endif + +#ifdef HAVE_SYS_INOTIFY_H +static int open_inotify(void) +{ + int rc = 0; + + log_debug("selected inotify ...\n"); + + if ((daemon_cfg.notify_fd = inotify_init()) < 0) { + log_error("Cannot initialize inotify_init() errno %d (%s)\n", errno, + strerror(errno)); + rc = -errno; + goto err; + } + + rc = inotify_add_watch(daemon_cfg.notify_fd, + daemon_cfg.notify_dir, + IN_CLOSE_WRITE | IN_CLOSE_NOWRITE | IN_DELETE); + if (rc < 0) { + rc = -errno; + log_error("Failed to add watch for directory %s errno %d (%s)\n", + daemon_cfg.notify_dir, errno, strerror(errno)); + goto err; + } + +err: + return rc; +} + +static int proc_inotify(void *buffer, int nbyte) +{ + int rc = 0; + struct inotify_event *data = (struct inotify_event *)buffer; + + while ((uintptr_t)data < ((uintptr_t)buffer + nbyte)) { + pid_t pid; + + /* Monitor only events from files */ + if ((data->len > 0) && + !(data->mask & IN_ISDIR ) && + (1 == sscanf(data->name, VMA_AGENT_BASE_NAME ".%d.pid", &pid)) && + hash_get(daemon_cfg.ht, pid)) { + + char buf[PATH_MAX]; + char pathname[PATH_MAX]; + + memset(buf, 0, sizeof(buf)); + memset(pathname, 0, sizeof(pathname)); + + rc = snprintf(pathname, sizeof(pathname) - 1, "%s/%s", + daemon_cfg.notify_dir, data->name); + if ((rc < 0 ) || (rc == (sizeof(pathname) - 1) )) { + rc = -ENOMEM; + log_error("failed allocate pid file errno %d (%s)\n", errno, + strerror(errno)); + goto err; + } + + log_debug("getting event ([0x%x] pid: %d name: %s)\n", + data->mask, pid, pathname); + + rc = snprintf(buf, sizeof(buf) - 1, "%s/%s.%d.pid", + daemon_cfg.notify_dir, VMA_AGENT_BASE_NAME, pid); + if ((rc < 0 ) || (rc == (sizeof(buf) - 1) )) { + rc = -ENOMEM; + log_error("failed allocate pid file errno %d 
(%s)\n", errno, + strerror(errno)); + goto err; + } + + /* Process event related pid file only */ + rc = 0; + if (!strncmp(buf, pathname, strlen(buf))) { + log_debug("[%d] check the event\n", pid); + + /* Check if termination is unexpected and send RST to peers + * Return status should be 0 in case we send RST and + * nonzero in case we decide that processes exited accurately + * or some internal error happens during RST send + */ + rc = clean_process(pid); + if (0 == rc) { + /* Cleanup unexpected termination */ + log_debug("[%d] cleanup after unexpected termination\n", pid); + unlink(buf); + if (snprintf(buf, sizeof(buf) - 1, "%s/%s.%d.sock", + daemon_cfg.notify_dir, VMA_AGENT_BASE_NAME, pid) > 0) { + unlink(buf); + } + } else if (-ESRCH == rc) { + /* No need in peer notification */ + log_debug("[%d] no need in peer notification\n", pid); + rc = 0; + } + } else { + log_debug("[%d] skip the event\n", pid); + } + } + + /* Move to the next event */ + data = (struct inotify_event *)((uintptr_t)data + sizeof(*data) + data->len); + } + +err: + return rc; +} +#endif diff --git a/tools/daemon/store.c b/tools/daemon/store.c new file mode 100644 index 0000000..0f058fb --- /dev/null +++ b/tools/daemon/store.c @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + + +#ifdef HAVE_CONFIG_H +#include "config.h" +#endif + +#include +#include +#include +#include + + +#include "hash.h" +#include "tc.h" +#include "daemon.h" + + +int open_store(void); +void close_store(void); + +static void free_store_pid(void *ptr); + + +int open_store(void) +{ + daemon_cfg.ht = hash_create(&free_store_pid, daemon_cfg.opt.max_pid_num); + + return (NULL == daemon_cfg.ht ? -EFAULT : 0); +} + +void close_store(void) +{ + hash_destroy(daemon_cfg.ht); +} + +static void free_store_pid(void *ptr) +{ + struct store_pid *value; + + if (ptr) { + value = (struct store_pid *)ptr; + hash_destroy(value->ht); + free(value); + } +} diff --git a/tools/daemon/tc.c b/tools/daemon/tc.c new file mode 100644 index 0000000..6a50b5d --- /dev/null +++ b/tools/daemon/tc.c @@ -0,0 +1,728 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. 
You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + + +#include +#include +#include +#include +#include + +#include "hash.h" +#include "tc.h" +#include "daemon.h" +#include "nl.h" + + +/* Traffic control usage method + * 0 - tc application + * 1 - netlink api + */ +#define USE_NETLINK 1 + +/** + * @struct tc_object + * @brief tc container + */ +struct tc_object { + nl_t nl; /**< netlink object */ + struct nl_req req; /**< netlink request storage */ +}; + +#if defined(USE_NETLINK) && (USE_NETLINK == 1) +/* Use iproute2 / tc implementation as a reference + * to pack data for specific attribute + */ +static int pack_key(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask); +static int pack_key8(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask); +static int pack_key16(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask); +static int pack_key32(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask); +#endif /* USE_NETLINK */ + + +tc_t tc_create(void) +{ + tc_t tc = NULL; + + tc = (struct tc_object *)malloc(sizeof(*tc)); + if (tc) { + tc->nl = nl_create(); + if (NULL == tc->nl) { + log_error("Unable to create a netlink object\n"); + goto err; + } + memset(&tc->req, 0, sizeof(tc->req)); + } + + return tc; +err: + free(tc); + tc = NULL; + + return NULL; +} + +void tc_destroy(tc_t tc) +{ + if (tc) { + nl_destroy(tc->nl); + free(tc); + tc = NULL; + } +} + +void tc_req(tc_t tc, int ifindex, uint16_t type, uint16_t flags, struct tc_qdisc qdisc) +{ + memset(&tc->req, 0, sizeof(tc->req)); + + tc->req.hdr.nlmsg_len = NLMSG_LENGTH(sizeof(tc->req.msg)); + tc->req.hdr.nlmsg_type = type; + tc->req.hdr.nlmsg_flags = (flags ? 
/* tc_add_qdisc() - attach an ingress qdisc (handle ffff:) to the given
 * interface, either via netlink (USE_NETLINK) or by invoking the tc tool.
 * An already-existing qdisc (EEXIST) is not treated as an error.
 * Returns 0 on success, negative value on failure.
 */
int tc_add_qdisc(tc_t tc, int ifindex)
{
	int rc = 0;

	log_debug("add qdisc using if_id: %d\n", ifindex);

#if defined(USE_NETLINK) && (USE_NETLINK == 1)
	/* Ingress qdisc: handle ffff:0, parent TC_H_INGRESS, priority 0. */
	struct tc_qdisc qdisc = {TC_H_MAKE(TC_H_INGRESS, 0), TC_H_INGRESS, 0};
	struct rtattr *opts = NULL;

	tc_req(tc, ifindex, RTM_NEWQDISC,
			(NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE),
			qdisc);

	nl_attr_add(&tc->req.hdr, TCA_KIND, "ingress", sizeof("ingress"));

	/* Empty TCA_OPTIONS nest is still required by the kernel. */
	opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS);
	nl_attr_nest_end(&tc->req.hdr, opts);

	if (nl_send(tc->nl, &tc->req.hdr) < 0) {
		rc = -1;
		goto err;
	}
	/* EEXIST means the ingress qdisc is already installed - acceptable. */
	if ((nl_recv(tc->nl, NULL, NULL) < 0) && (errno != EEXIST)) {
		rc = -1;
		goto err;
	}
#else
	char *out_buf = NULL;
	char if_name[IF_NAMESIZE];

	NOT_IN_USE(tc);

	if (NULL == if_indextoname(ifindex, if_name)) {
		rc = -errno;
		goto err;
	}

	/* Shell fallback: empty output (or "0") means the command succeeded. */
	out_buf = sys_exec("tc qdisc add dev %s handle ffff: ingress "
			"> /dev/null 2>&1 || echo $?", if_name);
	if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) {
		rc = -1;
		goto err;
	}
#endif /* USE_NETLINK */

err:
	return rc;
}
/* tc_add_filter_divisor() - create a u32 hash table with 256 buckets
 * (handle <ht>:) under the ingress qdisc of the given interface, at the
 * given priority. Equivalent to `tc filter add ... u32 divisor 256`.
 * Returns 0 on success, negative value on failure.
 */
int tc_add_filter_divisor(tc_t tc, int ifindex, int prio, int ht)
{
	int rc = 0;

	log_debug("apply filter divisor using if_id: %d\n", ifindex);

#if defined(USE_NETLINK) && (USE_NETLINK == 1)
	/* Filter handle <ht>:0:0, parent ffff: (ingress), chosen priority. */
	struct tc_qdisc qdisc = {HANDLE_SET(ht, 0, 0), 0xffff0000, prio};
	char opt_kind[] = "u32";
	uint32_t opt_divisor = 256;
	struct rtattr *opts = NULL;

	tc_req(tc, ifindex, RTM_NEWTFILTER ,
			(NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE),
			qdisc);

	nl_attr_add(&tc->req.hdr, TCA_KIND, opt_kind, sizeof(opt_kind));

	opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS);
	nl_attr_add(&tc->req.hdr, TCA_U32_DIVISOR, &opt_divisor, sizeof(opt_divisor));
	nl_attr_nest_end(&tc->req.hdr, opts);

	if (nl_send(tc->nl, &tc->req.hdr) < 0) {
		rc = -1;
		goto err;
	}
	if (nl_recv(tc->nl, NULL, NULL) < 0) {
		rc = -1;
		goto err;
	}
#else
	char *out_buf = NULL;
	char if_name[IF_NAMESIZE];

	NOT_IN_USE(tc);

	if (NULL == if_indextoname(ifindex, if_name)) {
		rc = -errno;
		goto err;
	}

	/* Shell fallback: empty output (or "0") means the command succeeded. */
	out_buf = sys_exec("tc filter add dev %s parent ffff: prio %d handle %x: protocol ip u32 divisor 256 "
			"> /dev/null 2>&1 || echo $?",
			if_name, prio, ht);
	if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) {
		rc = -1;
		goto err;
	}
#endif /* USE_NETLINK */

err:
	return rc;
}
+ int rc = 0; + + log_debug("add link filter using if_id: %d\n", ifindex); + +#if defined(USE_NETLINK) && (USE_NETLINK == 1) + struct tc_qdisc qdisc = {HANDLE_SET(0, 0, id), 0xffff0000, prio}; + char opt_kind[] = "u32"; + uint32_t opt_link = HANDLE_SET(id, 0, 0); + uint32_t opt_ht = HANDLE_SET(ht, 0, 0); + struct rtattr *opts = NULL; + struct { + struct tc_u32_sel sel; + struct tc_u32_key keys[5]; + } opt_sel; + + tc_req(tc, ifindex, RTM_NEWTFILTER, + (NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE), + qdisc); + + nl_attr_add(&tc->req.hdr, TCA_KIND, opt_kind, sizeof(opt_kind)); + + opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS); + nl_attr_add(&tc->req.hdr, TCA_U32_LINK, &opt_link, sizeof(opt_link)); + nl_attr_add(&tc->req.hdr, TCA_U32_HASH, &opt_ht, sizeof(opt_ht)); + memset(&opt_sel, 0, sizeof(opt_sel)); + /* hashkey option: + * mask: 0x000000ff + * at: 20 + */ + opt_sel.sel.hmask = htonl(0x000000ff); + opt_sel.sel.hoff = 20; + /* match option for ip protocol: + * dst: 16 + * addr/mask: ip/0xffffffff + */ + pack_key32(&opt_sel.sel, ntohl(ip), 0xffffffff, 16, 0); + nl_attr_add(&tc->req.hdr, TCA_U32_SEL, &opt_sel, sizeof(opt_sel.sel) + opt_sel.sel.nkeys * sizeof(opt_sel.sel.keys[0])); + nl_attr_nest_end(&tc->req.hdr, opts); + + if (nl_send(tc->nl, &tc->req.hdr) < 0) { + rc = -1; + goto err; + } + if (nl_recv(tc->nl, NULL, NULL) < 0) { + rc = -1; + goto err; + } +#else + char *out_buf = NULL; + char if_name[IF_NAMESIZE]; + + NOT_IN_USE(tc); + + if (NULL == if_indextoname(ifindex, if_name)) { + rc = -errno; + goto err; + } + + out_buf = sys_exec("tc filter add dev %s protocol ip parent ffff: prio %d handle ::%x u32 " + "ht %x:: match ip dst %s/32 hashkey mask 0x000000ff at 20 link %x: " + "> /dev/null 2>&1 || echo $?", + if_name, prio, id, ht, sys_ip2str(ip), id); + if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) { + rc = -1; + goto err; + } +#endif /* USE_NETLINK */ + +err: + return rc; +} + +int tc_add_filter_tap2dev(tc_t tc, int 
ifindex, int prio, int id, uint32_t ip, int ifindex_to) +{ + int rc = 0; + + log_debug("add filter to redirect traffic from if_id: %d to if_id: %d\n", ifindex, ifindex_to); + +#if defined(USE_NETLINK) && (USE_NETLINK == 1) + struct tc_qdisc qdisc = {HANDLE_SET(0, 0, id), 0xffff0000, prio}; + char opt_kind[] = "u32"; + uint32_t opt_ht = HANDLE_SET(0x800, 0, 0); + struct rtattr *opts = NULL; + struct { + struct tc_u32_sel sel; + struct tc_u32_key keys[5]; + } opt_sel; + + tc_req(tc, ifindex, RTM_NEWTFILTER, + (NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE), + qdisc); + + nl_attr_add(&tc->req.hdr, TCA_KIND, opt_kind, sizeof(opt_kind)); + + /* [filter] options filling */ + opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS); + { + struct rtattr *opts_action = NULL; + + /* [action] options filling */ + opts_action = nl_attr_nest_start(&tc->req.hdr, TCA_U32_ACT); + { + int opt_prio = 0; + char opt_act_kind[] = "mirred"; + struct rtattr *opts_action_prio = NULL; + + /* [mirred] options filling */ + opts_action_prio = nl_attr_nest_start(&tc->req.hdr, ++opt_prio); + nl_attr_add(&tc->req.hdr, TCA_ACT_KIND, opt_act_kind, sizeof(opt_act_kind)); + { + struct rtattr *opts_action_prio_mirred = NULL; + struct tc_mirred opt_mirred; + + opts_action_prio_mirred = nl_attr_nest_start(&tc->req.hdr, TCA_ACT_OPTIONS); + memset(&opt_mirred, 0, sizeof(opt_mirred)); + opt_mirred.eaction = TCA_EGRESS_REDIR; + opt_mirred.action = TC_ACT_STOLEN; + opt_mirred.ifindex = ifindex_to; + nl_attr_add(&tc->req.hdr, TCA_MIRRED_PARMS, &opt_mirred, sizeof(opt_mirred)); + + nl_attr_nest_end(&tc->req.hdr, opts_action_prio_mirred); + } + + nl_attr_nest_end(&tc->req.hdr, opts_action_prio); + } + + nl_attr_nest_end(&tc->req.hdr, opts_action); + } + + nl_attr_add(&tc->req.hdr, TCA_U32_HASH, &opt_ht, sizeof(opt_ht)); + memset(&opt_sel, 0, sizeof(opt_sel)); + /* match option for ip protocol: + * dst: 16 + * addr/mask: ip/0xffffffff + */ + if (ip) { + pack_key32(&opt_sel.sel, ntohl(ip), 0xffffffff, 16, 
/* tc_add_filter_dev2tap() - install a u32 filter on the ingress of
 * `ifindex` that matches a specific TCP/UDP 5-tuple (proto, dst ip/port,
 * optionally src ip/port) on non-fragmented IPv4 packets and redirects
 * matching traffic (mirred egress redirect) to `ifindex_to` (the tap).
 * The filter is placed in hash table bucket <ht>:<bkt> with item <id>.
 * Returns 0 on success, negative value on failure.
 */
int tc_add_filter_dev2tap(tc_t tc, int ifindex, int prio, int ht, int bkt, int id,
		int proto, uint32_t dst_ip, uint16_t dst_port, uint32_t src_ip, uint16_t src_port, int ifindex_to)
{
	int rc = 0;

	log_debug("add filter to redirect traffic from if_id: %d to if_id: %d\n", ifindex, ifindex_to);

#if defined(USE_NETLINK) && (USE_NETLINK == 1)
	struct tc_qdisc qdisc = {HANDLE_SET(0, 0, id), 0xffff0000, prio};
	char opt_kind[] = "u32";
	uint32_t opt_ht = HANDLE_SET(ht, bkt, 0);
	struct rtattr *opts = NULL;
	struct {
		struct tc_u32_sel sel;
		struct tc_u32_key keys[10];	/* room for the pack_key*() matches below */
	} opt_sel;

	tc_req(tc, ifindex, RTM_NEWTFILTER,
			(NLM_F_REQUEST | NLM_F_ACK | NLM_F_EXCL | NLM_F_CREATE),
			qdisc);

	nl_attr_add(&tc->req.hdr, TCA_KIND, opt_kind, sizeof(opt_kind));

	/* [filter] options filling */
	opts = nl_attr_nest_start(&tc->req.hdr, TCA_OPTIONS);
	{
		struct rtattr *opts_action = NULL;

		/* [action] options filling */
		opts_action = nl_attr_nest_start(&tc->req.hdr, TCA_U32_ACT);
		{
			int opt_prio = 0;
			char opt_act_kind[] = "mirred";
			struct rtattr *opts_action_prio = NULL;

			/* [mirred] options filling: nested attribute index is the
			 * action priority (first action => 1). */
			opts_action_prio = nl_attr_nest_start(&tc->req.hdr, ++opt_prio);
			nl_attr_add(&tc->req.hdr, TCA_ACT_KIND, opt_act_kind, sizeof(opt_act_kind));
			{
				struct rtattr *opts_action_prio_mirred = NULL;
				struct tc_mirred opt_mirred;

				opts_action_prio_mirred = nl_attr_nest_start(&tc->req.hdr, TCA_ACT_OPTIONS);
				memset(&opt_mirred, 0, sizeof(opt_mirred));
				/* Redirect (not mirror) and steal the packet. */
				opt_mirred.eaction = TCA_EGRESS_REDIR;
				opt_mirred.action = TC_ACT_STOLEN;
				opt_mirred.ifindex = ifindex_to;
				nl_attr_add(&tc->req.hdr, TCA_MIRRED_PARMS, &opt_mirred, sizeof(opt_mirred));

				nl_attr_nest_end(&tc->req.hdr, opts_action_prio_mirred);
			}

			nl_attr_nest_end(&tc->req.hdr, opts_action_prio);
		}

		nl_attr_nest_end(&tc->req.hdr, opts_action);
	}

	nl_attr_add(&tc->req.hdr, TCA_U32_HASH, &opt_ht, sizeof(opt_ht));
	memset(&opt_sel, 0, sizeof(opt_sel));
	/* [match] protocol option (IP header byte 9) */
	pack_key8(&opt_sel.sel, proto, 0xff, 9, 0);
	/* [match] nofrag option (fragment offset bits must be zero) */
	pack_key16(&opt_sel.sel, 0, 0x3fff, 6, 0);
	if (src_port) {
		/* [match] src option */
		pack_key32(&opt_sel.sel, ntohl(src_ip), 0xffffffff, 12, 0);
		/* [match] sport option */
		pack_key16(&opt_sel.sel, ntohs(src_port), 0xffff, 20, 0);
	}
	/* [match] dst option */
	pack_key32(&opt_sel.sel, ntohl(dst_ip), 0xffffffff, 16, 0);
	/* [match] dport option */
	pack_key16(&opt_sel.sel, ntohs(dst_port), 0xffff, 22, 0);
	opt_sel.sel.flags |= TC_U32_TERMINAL;
	/* Only the populated keys are sent, hence the nkeys-sized payload. */
	nl_attr_add(&tc->req.hdr, TCA_U32_SEL, &opt_sel, sizeof(opt_sel.sel) + opt_sel.sel.nkeys * sizeof(opt_sel.sel.keys[0]));

	nl_attr_nest_end(&tc->req.hdr, opts);

	if (nl_send(tc->nl, &tc->req.hdr) < 0) {
		rc = -1;
		goto err;
	}
	if (nl_recv(tc->nl, NULL, NULL) < 0) {
		rc = -1;
		goto err;
	}
#else
	char *out_buf = NULL;
	char if_name[IF_NAMESIZE];
	char tap_name[IF_NAMESIZE];
	char str_tmp[100];

	NOT_IN_USE(tc);

	if (NULL == if_indextoname(ifindex, if_name)) {
		rc = -errno;
		goto err;
	}

	if (NULL == if_indextoname(ifindex_to, tap_name)) {
		rc = -errno;
		goto err;
	}

	if (src_port) {
		/* sys_ip2str() presumably returns a static buffer, hence the
		 * copy before the second call - TODO confirm. */
		strncpy(str_tmp, sys_ip2str(src_ip), sizeof(str_tmp));
		str_tmp[sizeof(str_tmp) - 1] = '\0';
		/* NOTE(review): sport is passed as raw src_port while dport uses
		 * ntohs(dst_port) - looks inconsistent; verify the byte order the
		 * callers store these ports in. */
		out_buf = sys_exec("tc filter add dev %s parent ffff: protocol ip "
				"prio %d handle ::%x u32 ht %x:%x: "
				"match ip protocol %d 0xff "
				"match ip nofrag "
				"match ip src %s/32 match ip sport %d 0xffff "
				"match ip dst %s/32 match ip dport %d 0xffff "
				"action mirred egress redirect dev %s "
				"> /dev/null 2>&1 || echo $?",
				if_name, prio, id, ht, bkt, proto,
				str_tmp, src_port,
				sys_ip2str(dst_ip), ntohs(dst_port), tap_name);
	} else {
		out_buf = sys_exec("tc filter add dev %s parent ffff: protocol ip "
				"prio %d handle ::%x u32 ht %x:%x: "
				"match ip protocol %d 0xff "
				"match ip nofrag "
				"match ip dst %s/32 match ip dport %d 0xffff "
				"action mirred egress redirect dev %s "
				"> /dev/null 2>&1 || echo $?",
				if_name, prio, id, ht, bkt, proto,
				sys_ip2str(dst_ip), ntohs(dst_port), tap_name);
	}
	if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) {
		rc = -1;
		goto err;
	}
#endif /* USE_NETLINK */

err:
	return rc;
}
TCA_KIND, opt_kind, sizeof(opt_kind)); + + if (nl_send(tc->nl, &tc->req.hdr) < 0) { + rc = -1; + goto err; + } + if (nl_recv(tc->nl, NULL, NULL) < 0) { + rc = -1; + goto err; + } +#else + char *out_buf = NULL; + char if_name[IF_NAMESIZE]; + + NOT_IN_USE(tc); + + if (NULL == if_indextoname(ifindex, if_name)) { + rc = -errno; + goto err; + } + + out_buf = sys_exec("tc filter del dev %s parent ffff: protocol ip prio %d handle %x:%x:%x u32 " + "> /dev/null 2>&1 || echo $?", + if_name, prio, ht, bkt, id); + if (NULL == out_buf || (out_buf[0] != '\0' && out_buf[0] != '0')) { + rc = -1; + goto err; + } +#endif /* USE_NETLINK */ + +err: + return rc; +} + +#if defined(USE_NETLINK) && (USE_NETLINK == 1) +static int pack_key(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask) +{ + int i; + + key &= mask; + + for (i = 0; i < sel->nkeys; i++) { + if ((sel->keys[i].off == off) && (sel->keys[i].offmask == offmask)) { + uint32_t intersect = mask & sel->keys[i].mask; + + if ((key ^ sel->keys[i].val) & intersect) { + return -1; + } + sel->keys[i].val |= key; + sel->keys[i].mask |= mask; + return 0; + } + } + + if (off % 4) { + return -1; + } + sel->keys[sel->nkeys].val = key; + sel->keys[sel->nkeys].mask = mask; + sel->keys[sel->nkeys].off = off; + sel->keys[sel->nkeys].offmask = offmask; + sel->nkeys++; + + return 0; +} + +static int pack_key8(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask) +{ + if ((off & 3) == 0) { + key <<= 24; + mask <<= 24; + } else if ((off & 3) == 1) { + key <<= 16; + mask <<= 16; + } else if ((off & 3) == 2) { + key <<= 8; + mask <<= 8; + } + off &= ~3; + key = htonl(key); + mask = htonl(mask); + + return pack_key(sel, key, mask, off, offmask); +} + +static int pack_key16(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask) +{ + if ((off & 3) == 0) { + key <<= 16; + mask <<= 16; + } + off &= ~3; + key = htonl(key); + mask = htonl(mask); + + return pack_key(sel, key, mask, off, offmask); 
+} + +static int pack_key32(struct tc_u32_sel *sel, uint32_t key, uint32_t mask, int off, int offmask) +{ + key = htonl(key); + mask = htonl(mask); + + return pack_key(sel, key, mask, off, offmask); +} +#endif /* USE_NETLINK */ diff --git a/tools/daemon/tc.h b/tools/daemon/tc.h new file mode 100644 index 0000000..ec7ab67 --- /dev/null +++ b/tools/daemon/tc.h @@ -0,0 +1,253 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. + * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#ifndef TOOLS_DAEMON_TC_H_ +#define TOOLS_DAEMON_TC_H_ + +#include /* for the TC_H_* macros */ +#include +#include +#include + + +/* The tc_t opaque data type + */ +typedef struct tc_object* tc_t; + +struct tc_qdisc { + uint32_t handle; + uint32_t parent; + int prio; +}; + + +#define KERNEL_HT 0x800 +#define MAX_BKT 0xFF +#define MAX_ID 0xFFE +#define HANDLE_INVALID (uint32_t)(-1) + +#define HANDLE_SET(ht, bkt, id) \ + ( \ + (((uint32_t)(ht) << 20) & 0xFFF00000) | \ + (((uint32_t)(bkt) << 12) & 0x000FF000) | \ + (((uint32_t)(id) << 0) & 0x00000FFF) \ + ) + +#define HANDLE_HT(value) ((((uint32_t)(value)) & 0xFFF00000) >> 20) /* 12bits by offset 20 */ +#define HANDLE_BKT(value) ((((uint32_t)(value)) & 0x000FF000) >> 12) /* 8bits by offset 12 */ +#define HANDLE_ID(value) ((((uint32_t)(value)) & 0x00000FFF) >> 0) /* 12bits by offset 0 */ + +/** + * Initialize a tc object. + * + * @return + * the newly allocated netlink object. Must be freed with nl_destory. + */ +tc_t tc_create(void); + +/** + * Destroy up a tc object. + * + * @param tc + * The tc object. + * + * @return + * @a none + */ +void tc_destroy(tc_t tc); + +/** + * Initialize a TC request. + * + * @param[in] tc + * The TC object. + * @param[in] ifindex + * The netdevice ifindex where the rule will be applied. + * @param[in] type + * The type of TC message to create (RTM_NEWTFILTER, RTM_NEWQDISC, etc.). + * @param[in] flags + * Overrides the default netlink flags for this msg with those specified. + * @param[in] qdisc + * Set qdisc data. + * + * @return + * @a none + */ +void tc_req(tc_t tc, int ifindex, uint16_t type, uint16_t flags, struct tc_qdisc qdisc); + +/** + * Add qdisc as a TC request. + * + * @param[in] tc + * The TC object. + * @param[in] ifindex + * The netdevice ifindex where the rule will be applied. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int tc_add_qdisc(tc_t tc, int ifindex); + +/** + * Remove qdisc as a TC request. + * + * @param[in] tc + * The TC object. 
+ * @param[in] ifindex + * The netdevice ifindex where the rule will be applied. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int tc_del_qdisc(tc_t tc, int ifindex); + +/** + * Add filter divisor for hash tables as a TC request. + * + * @param[in] tc + * The TC object. + * @param[in] ifindex + * The netdevice ifindex where the rule will be applied. + * @param[in] prio + * Priority value. + * @param[in] ht + * Hash table index. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int tc_add_filter_divisor(tc_t tc, int ifindex, int prio, int ht); + +/** + * Add filter link as a TC request. + * + * @param[in] tc + * The TC object. + * @param[in] ifindex + * The netdevice ifindex where the rule will be applied. + * @param[in] prio + * Priority value. + * @param[in] ht + * Hash table index. + * @param[in] id + * Index in link table. + * @param[in] ip + * Destination ip address. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int tc_add_filter_link(tc_t tc, int ifindex, int prio, int ht, int id, uint32_t ip); + +/** + * Add filter to redirect traffic from tap device + * to Interface device as TC request. + * + * @param[in] tc + * The TC object. + * @param[in] ifindex + * The tap device ifindex. + * @param[in] prio + * Priority value. + * @param[in] ht + * Hash table index. + * @param[in] id + * Item index. + * @param[in] proto + * Protocol type as tcp, udp etc. + * @param[in] proto + * Destination ip. + * @param[in] proto + * Destination port. + * @param[in] ifindex + * The netdevice ifindex where the rule will be applied. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int tc_add_filter_tap2dev(tc_t tc, int ifindex, int prio, int id, uint32_t ip, int ifindex_to); + +/** + * Add filter to redirect traffic from ethernet device + * to tap device using 3tuple or 5tuple as TC request. + * + * @param[in] tc + * The TC object. 
+ * @param[in] ifindex + * The netdevice ifindex where the rule will be applied. + * @param[in] prio + * Priority value. + * @param[in] ht + * Hash table index. + * @param[in] id + * Item index. + * @param[in] proto + * Protocol type as tcp, udp etc. + * @param[in] dst_ip + * Destination ip. + * @param[in] dst_port + * Destination port. + * @param[in] src_ip + * Source ip. + * @param[in] src_port + * Source port. + * @param[in] ifindex + * The tap device ifindex. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int tc_add_filter_dev2tap(tc_t tc, int ifindex, int prio, int ht, int bkt, int id, + int proto, uint32_t dst_ip, uint16_t dst_port, uint32_t src_ip, uint16_t src_port, int ifindex_to); + +/** + * Remove specific filter as a TC request. + * + * @param[in] tc + * The TC object. + * @param[in] ifindex + * The netdevice ifindex where the rule will be applied. + * @param[in] prio + * Priority value. + * @param[in] ht + * Hash table index. + * @param[in] bkt + * Bucket index. + * @param[in] ht + * Item index. + * + * @return + * 0 on success, -1 otherwise with errno set. + */ +int tc_del_filter(tc_t tc, int ifindex, int prio, int ht, int bkt, int id); + +#endif /* TOOLS_DAEMON_TC_H_ */ diff --git a/tools/media/netmap-vma/apps/README.md b/tools/media/netmap-vma/apps/README.md new file mode 100644 index 0000000..d730cd4 --- /dev/null +++ b/tools/media/netmap-vma/apps/README.md @@ -0,0 +1,58 @@ + +The sample requires some small modifications to support the netmap_vma API. + +* #include "netmap_vma.h" + The header contains the new functions that implement the netmap_vma API. 
+ +* The ioctl(NETMAP_FD(d), NIOCRXSYNC, NULL) is not needed for the netmap_vma API + +* poll input/output multiplexing + - netmap API + int poll(struct pollfd *fds, nfds_t nfds, int timeout); + poll(&pfd, nfds, -1); + + - netmap_vma API + poll_nm_vma(struct nm_desc *d, int timeout); + The poll_nm_vma API simulates the events POLLIN|POLLERR only + poll_nm_vma(d, -1); + +* Packet processing + - neatmp API + cur = rxring->cur; + while (!nm_ring_empty(rxring)) { + *slot = &rxring->slot[cur]; + *src = (uint8_t*)NETMAP_BUF(rxring, slot->buf_idx); + // process packet + handle(src, slot->len); + //next packet + cur = nm_ring_next(rxring, cur); + } + // update ring buffer + rxring->head = rxring->cur = nm_ring_next(rxring, cur); + + - netmap_vma API + while ((buf = (uint8_t*)nm_nextpkt(d, &h))) { + // process the packet + handle(h.buf, h.len); + } + The h.buf returned to the user consists of all packets in one big buffer + The h.len returns the number of packets in the buffer + +Rgarding the Multi Packet Receive Queue (MP-RQ) and the Cyclic Buffer (CB) +please refer to the User Manual +"Mellanox Messaging Accelerator (VMA) Library for Linux" 8.7 Multi Packet Receive Queue +http://www.mellanox.com/vma + +**Build and Run Instructions** + +Install the netmap framework https://github.com/luigirizzo/netmap +Install the biTStream https://code.videolan.org/videolan/bitstream.git +To build the application: +make NETMAP_SRC_DIR=/path_to_the_netmap_sources \ + BITSTREAM_DIR=/path_to_the_folder_where_bitstream_is_located + +The script run_util.sh requires some small modifications as well. +Please update the interface and the path PRELOAD if needed. +Start sample with netmap_vma API $./run-util.sh -rvma +The pkt-gen can be used to generate a traffic. 
+ diff --git a/tools/media/netmap-vma/apps/makefile b/tools/media/netmap-vma/apps/makefile new file mode 100644 index 0000000..c275dda --- /dev/null +++ b/tools/media/netmap-vma/apps/makefile @@ -0,0 +1,28 @@ + +NETMAP_SRC_DIR = /path_to_the_netmap_sources +BITSTREAM_DIR = /path_to_the_folder_where_bitstream_is_located + +VMA_TOP_DIR = ../../../.. +NETMAP_VMA_SRC_DIR = .. + +PROGS = nm-ex1-vma nm-ex1 + +CFLAGS = -O2 -Wall -Wextra -g +CFLAGS += -I$(NETMAP_SRC_DIR)/sys/net -I$(NETMAP_SRC_DIR)/apps/include -I$(NETMAP_VMA_SRC_DIR) +CFLAGS += -I$(BITSTREAM_DIR) + +LDLIBS += -lrt +LDLIBS_VMA += $(NETMAP_VMA_SRC_DIR)/netmap_vma.so +LDLIBS_VMA += -L $(VMA_TOP_DIR)/src/vma/.libs -lvma + +all: $(PROGS) + +nm-ex1-vma: LDLIBS += $(LDLIBS_VMA) +nm-ex1-vma.o: nm-ex1.c + $(CC) $(CFLAGS) -DNETMAP_VMA -c $^ -o $@ + +nm-ex1.o: nm-ex1.c + $(CC) $(CFLAGS) -c $^ -o $@ + +clean: + rm -f nm-ex1 nm-ex1.o nm-ex1-vma nm-ex1-vma.o diff --git a/tools/media/netmap-vma/apps/nm-ex1.c b/tools/media/netmap-vma/apps/nm-ex1.c new file mode 100644 index 0000000..34d4640 --- /dev/null +++ b/tools/media/netmap-vma/apps/nm-ex1.c @@ -0,0 +1,136 @@ +#include +#include +#include + +#include +#include + +#define NETMAP_WITH_LIBS +#include +#include +#include + +#ifdef NETMAP_VMA +#include "netmap_vma.h" +#endif + +/* switch for blocking or non blocking API */ +static const bool block = true; + +/* packet processing */ +static void handle(const uint8_t *pkt, size_t len) +{ +#ifndef NETMAP_VMA + if (len < ETHERNET_HEADER_LEN + IP_HEADER_MINSIZE) + { + return; + } + + if (ethernet_get_lentype(pkt) != ETHERNET_TYPE_IP) + { + return; + } + + const uint8_t *ip = &pkt[ETHERNET_HEADER_LEN]; + if (ip_get_version(ip) != 4) { + return; + } + //printf("ip_get_len=%d ip_get_proto=%d\n", ip_get_len(ip), ip_get_proto(ip)); + +#else + size_t k, packets = len; + + for (k = 0; k < packets; k++) { + + if (ethernet_get_lentype(pkt) != ETHERNET_TYPE_IP) + { + return; + } + + const uint8_t *ip = &pkt[ETHERNET_HEADER_LEN]; + if 
(ip_get_version(ip) != 4) { + return; + } + //printf("ip_get_len=%d ip_get_proto=%d\n", ip_get_len(ip), ip_get_proto(ip)); + + pkt += STRIDE_SIZE; + } +#endif +} + +static void loop(struct nm_desc *d) +{ +#ifndef NETMAP_VMA + struct pollfd pfd = { + .fd = NETMAP_FD(d), + .events = POLLIN|POLLERR, + }; + nfds_t nfds = 1; +#endif + struct timespec delay; + delay.tv_sec = 0; + delay.tv_nsec = 1000000; // 1ms +#ifndef NETMAP_VMA + struct netmap_ring *rxring = NETMAP_RXRING(d->nifp, d->cur_rx_ring); +#endif + for (;;) { + if (block) { +#ifndef NETMAP_VMA + if (poll(&pfd, nfds, -1) != (int)nfds) { +#else + if (poll_nm_vma(d, -1) < 0) { +#endif + perror("poll"); + break; + } + } else { + clock_nanosleep(CLOCK_MONOTONIC, 0, &delay, NULL); +#ifndef NETMAP_VMA + if (ioctl(NETMAP_FD(d), NIOCRXSYNC, NULL) < 0) + perror("ioctl"); +#endif + } +#ifndef NETMAP_VMA + uint32_t cur = rxring->cur; + /* process all packets */ + while (!nm_ring_empty(rxring)) { + struct netmap_slot *slot = &rxring->slot[cur]; + const uint8_t *src = (uint8_t*)NETMAP_BUF(rxring, slot->buf_idx); + /* process packet */ + handle(src, slot->len); + /* next packet */ + cur = nm_ring_next(rxring, cur); + } + /* update ring buffer */ + rxring->head = rxring->cur = nm_ring_next(rxring, cur); +#else + uint8_t *buf; + struct nm_pkthdr h; + + /* process all packets */ + while ((buf = (uint8_t*)nm_nextpkt(d, &h))) { + /* process packets */ + handle(h.buf, h.len); + } +#endif + } +} + +int main(int argc, char **argv) +{ + if (argc != 2) { + printf("Usage: %s netmap:p1p2-0/R\n", argv[0]); + return 1; + } + + /* open interface */ + struct nm_desc *d = nm_open(argv[1], NULL, 0, 0); + if (!d) { + return 2; + } + sleep(2); + loop(d); + nm_close(d); + + return 0; +} diff --git a/tools/media/netmap-vma/apps/run-util.sh b/tools/media/netmap-vma/apps/run-util.sh new file mode 100644 index 0000000..2f484d3 --- /dev/null +++ b/tools/media/netmap-vma/apps/run-util.sh @@ -0,0 +1,41 @@ +#!/bin/bash + +IF="ens4f0" + 
+VMA_TOP_DIR=../../../.. + +PRELOAD="LD_PRELOAD=${VMA_TOP_DIR}/src/vma/.libs/libvma.so" + +LD_PATH="LD_LIBRARY_PATH=.." + +APP=nm-ex1 +PORT="" + +run_test() +{ + sh -c "${PRELOAD} ${LD_PATH} ./${APP} netmap:${IF}-1/R${PORT}" +} + +usage() +{ +cat << eOm + usage:$0 -r -rvma +eOm +} + +case "$1" in + -r) + PRELOAD=""; LD_PATH="" + run_test + ;; + -rvma) + APP+="-vma"; PORT=":2000" + run_test + ;; + *) + usage + exit 0 + ;; +esac + +# diff --git a/tools/media/netmap-vma/makefile b/tools/media/netmap-vma/makefile new file mode 100644 index 0000000..f721464 --- /dev/null +++ b/tools/media/netmap-vma/makefile @@ -0,0 +1,24 @@ + +TARGET=netmap_vma.so + +VMA_TOP_DIR = ../../../ + +CPPFLAGS = -Wall -Werror -Wunused -O3 -fPIC -m64 -g +CPPFLAGS += -DHAVE_MP_RQ -DNETMAP_VMA -I$(VMA_TOP_DIR)/src/vma -I$(VMA_TOP_DIR)/src + +LDFLAGS = -shared +LIBS = -lpthread -lrt + +.PHONY : clean + +SOURCES = $(shell echo *.cpp) +HEADERS = $(shell echo *.h) +OBJECTS=$(SOURCES:.cpp=.o) + +all: $(TARGET) + +clean: + rm -f $(OBJECTS) $(TARGET) + +$(TARGET) : $(OBJECTS) + $(CC) $(CFLAGS) $(OBJECTS) -o $@ $(LDFLAGS) $(LIBS) diff --git a/tools/media/netmap-vma/netmap_vma.cpp b/tools/media/netmap-vma/netmap_vma.cpp new file mode 100644 index 0000000..96e2c16 --- /dev/null +++ b/tools/media/netmap-vma/netmap_vma.cpp @@ -0,0 +1,709 @@ +/* + * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved. + * + * This software is available to you under a choice of one of two + * licenses. You may choose to be licensed under the terms of the GNU + * General Public License (GPL) Version 2, available from the file + * COPYING in the main directory of this source tree, or the + * BSD license below: + * + * Redistribution and use in source and binary forms, with or + * without modification, are permitted provided that the following + * conditions are met: + * + * - Redistributions of source code must retain the above + * copyright notice, this list of conditions and the following + * disclaimer. 
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "vma/util/vtypes.h" +#include "vma_extra.h" + +#ifdef HAVE_MP_RQ +#define NETMAP_WITH_LIBS +#include + +/* buffer size is 2^17 = 128MB */ +#define VMA_CYCLIC_BUFFER_SIZE (1<<17) +#define VMA_CYCLIC_BUFFER_USER_PKT_SIZE 1400 +#define VMA_CYCLIC_BUFFER_MIN_PKTS 1000 +#define VMA_CYCLIC_BUFFER_MAX_PKTS 5000 +#define VMA_NM_SLEEP 1 + +#define MAX_RINGS 1000 +#define PRINT_PERIOD_SEC 5 +#define PRINT_PERIOD 1000000 * PRINT_PERIOD_SEC +#define MAX_SOCKETS_PER_RING 4096 +#define STRIDE_SIZE 2048 + +class RXSock; +class CommonCyclicRing; + +class RXSock { +public: + uint64_t statTime; + int lastPacketType; + int index; + int fd; + int ring_fd; + uint16_t rtpPayloadType; + uint16_t sin_port; + struct ip_mreqn mc; + char ipAddress[INET_ADDRSTRLEN]; + int bad_packets; +}; + +class CommonCyclicRing { +public: + unsigned long printCount; + int numOfSockets; + int ring_id; + int ring_fd; + RXSock* hashedSock[MAX_SOCKETS_PER_RING]; + std::ofstream * pOutputfile; + std::vector addr_vect; + std::vector sock_vect; + CommonCyclicRing():printCount(0),numOfSockets(0),ring_fd(0){ + for (int i=0; i < MAX_SOCKETS_PER_RING; i++) { + hashedSock[i] = 0; + } + } + 
struct vma_api_t *vma_api; + size_t min_s; + size_t max_s; + int flags; + int sleep_time; + struct vma_completion_cb_t completion; + bool is_readable; +}; + +struct flow_param { + int ring_id; + unsigned short hash; + sockaddr_in addr; +}; + +static unsigned short hashIpPort2(sockaddr_in addr) +{ + int hash = ((size_t)(addr.sin_addr.s_addr) * 59) ^ ((size_t)(addr.sin_port) << 16); + unsigned char smallHash = (unsigned char)(((unsigned char) ((hash*19) >> 24)) ^ ((unsigned char) ((hash*17) >> 16)) ^ ((unsigned char) ((hash*5) >> 8)) ^ ((unsigned char) hash)); + unsigned short mhash = ((((addr.sin_addr.s_addr >>24) & 0x7) << 8) | smallHash) ; + //printf("0x%x\n",addr.sin_addr.s_addr); + return mhash; +} + +static inline unsigned long long int time_get_usec() +{ + struct timeval tv; + gettimeofday(&tv, NULL); + return (((unsigned long long int) tv.tv_sec * 1000000LL) + + (unsigned long long int) tv.tv_usec); +} + +static int CreateRingProfile(bool CommonFdPerRing, int RingProfile, int user_id, int RxSocket) +{ + vma_ring_alloc_logic_attr profile; + profile.engress = 0; + profile.ingress = 1; + profile.ring_profile_key = RingProfile; + if (CommonFdPerRing) { + profile.user_id = user_id; + profile.comp_mask = VMA_RING_ALLOC_MASK_RING_PROFILE_KEY | + VMA_RING_ALLOC_MASK_RING_USER_ID | + VMA_RING_ALLOC_MASK_RING_INGRESS; + + // if we want several Fd's per ring, we need to assign RING_LOGIC_PER_THREAD / RING_LOGIC_PER_CORE + profile.ring_alloc_logic = RING_LOGIC_PER_USER_ID; + } else { + profile.comp_mask = VMA_RING_ALLOC_MASK_RING_PROFILE_KEY| + VMA_RING_ALLOC_MASK_RING_INGRESS; + // if we want several Fd's per ring, we need to assign RING_LOGIC_PER_THREAD / RING_LOGIC_PER_CORE + profile.ring_alloc_logic = RING_LOGIC_PER_SOCKET; + } + return setsockopt(RxSocket, SOL_SOCKET, SO_VMA_RING_ALLOC_LOGIC,&profile, sizeof(profile)); +} + +static int OpenRxSocket(int ring_id, sockaddr_in* addr, uint32_t ssm, const char *device, + struct ip_mreqn *mc, int RingProfile, bool 
CommonFdPerRing) +{ + int i_ret; + struct timeval timeout = { 0, 1 }; + int i_opt = 1; + struct ifreq ifr; + struct sockaddr_in *p_addr; + + // Create the socket + int RxSocket = socket(AF_INET, SOCK_DGRAM, IPPROTO_UDP); + + if (RxSocket < 0) { + printf("%s: Failed to create socket (%s)\n", + __func__,std::strerror(errno)); + return 0; + } + + // Enable socket reuse (for multi channels bind to a single socket) + i_ret = setsockopt(RxSocket, SOL_SOCKET, SO_REUSEADDR, + (void *) &i_opt, sizeof(i_opt)); + if (i_ret < 0) { + close(RxSocket); + RxSocket = 0; + printf("%s: Failed to set SO_REUSEADDR (%s)\n", + __func__, strerror(errno)); + return 0; + } + fcntl(RxSocket, F_SETFL, O_NONBLOCK); + CreateRingProfile(CommonFdPerRing, RingProfile, ring_id, RxSocket); + // bind to specific device + struct ifreq interface; + strncpy(interface.ifr_ifrn.ifrn_name, device, IFNAMSIZ); + //printf("%s SO_BINDTODEVICE %s\n",__func__,interface.ifr_ifrn.ifrn_name); + if (setsockopt(RxSocket, SOL_SOCKET, SO_BINDTODEVICE, + (char *) &interface, sizeof(interface)) < 0) { + printf("%s: Failed to bind to device (%s)\n", + __func__, strerror(errno)); + close(RxSocket); + RxSocket = 0; + return 0; + } + + // bind to socket + i_ret = bind(RxSocket, (struct sockaddr *)addr, sizeof(struct sockaddr)); + if (i_ret < 0) { + printf("%s: Failed to bind to socket (%s)\n",__func__,strerror(errno)); + close(RxSocket); + RxSocket = 0; + return 0; + } + + memset(&ifr, 0, sizeof(struct ifreq)); + strncpy(ifr.ifr_name, device, IFNAMSIZ); + // Get device IP + i_ret = ioctl(RxSocket, SIOCGIFADDR, &ifr); + if (i_ret < 0) { + printf("%s: Failed to obtain interface IP (%s)\n",__func__, strerror(errno)); + close(RxSocket); + RxSocket = 0; + return 0; + } + + if (((addr->sin_addr.s_addr & 0xFF) >= 224) && ((addr->sin_addr.s_addr & 0xFF) <= 239)) { + p_addr = (struct sockaddr_in *) &(ifr.ifr_addr); + if (ssm == 0) { + struct ip_mreqn mreq; + // join the multicast group on specific device + memset(&mreq, 0, 
sizeof(struct ip_mreqn)); + + mreq.imr_multiaddr.s_addr = addr->sin_addr.s_addr; + mreq.imr_address.s_addr = p_addr->sin_addr.s_addr; + *mc = mreq; + // RAFI MP_RING is created + i_ret = setsockopt(RxSocket, IPPROTO_IP, + IP_ADD_MEMBERSHIP, &mreq, + sizeof(struct ip_mreqn)); + + if (i_ret < 0) { + printf("%s: add membership to (0X%08X) on (0X%08X) failed. (%s)\n",__func__,mreq.imr_multiaddr.s_addr, + mreq.imr_address.s_addr, + strerror(errno)); + close(RxSocket); + RxSocket = 0; + return 0; + } + } else { + struct ip_mreq_source mreqs; + // join the multicast group on specific device + memset(&mreqs, 0, sizeof(struct ip_mreq_source)); + + mreqs.imr_multiaddr.s_addr = addr->sin_addr.s_addr; + mreqs.imr_interface.s_addr = p_addr->sin_addr.s_addr; + mreqs.imr_sourceaddr.s_addr = ssm; + + i_ret = setsockopt(RxSocket, IPPROTO_IP, + IP_ADD_SOURCE_MEMBERSHIP, &mreqs, + sizeof(struct ip_mreq_source)); + + if (i_ret < 0) { + printf("%s: add membership to (0X%08X), ssm (0X%08X) failed. (%s)\n",__func__, + mreqs.imr_multiaddr.s_addr, + mreqs.imr_sourceaddr.s_addr, + strerror(errno)); + close(RxSocket); + RxSocket = 0; + return 0; + } + } + } + + // Set max receive timeout + i_ret = setsockopt(RxSocket, SOL_SOCKET, SO_RCVTIMEO, &timeout, + sizeof(struct timeval)); + if (i_ret < 0) { + printf("%s: Failed to set SO_RCVTIMEO (%s)\n",__func__, + strerror(errno)); + close(RxSocket); + RxSocket = 0; + return 0; + } + + return RxSocket; +} + +#define IP_HEADER_OFFSET 14 +#define IP_HEADER_SIZE 20 +#define IP_DEST_OFFSET (IP_HEADER_OFFSET+ 16) +#define UDP_HEADER_OFFSET (IP_HEADER_SIZE + IP_HEADER_OFFSET) +#define PORT_DEST_OFFSET (UDP_HEADER_OFFSET + 2) + +static void AddFlow(flow_param flow,CommonCyclicRing* rings[], int &uniqueRings) +{ + int ring_id = flow.ring_id; + if (rings[ring_id] == NULL) { + rings[ring_id] = new CommonCyclicRing; + rings[ring_id]->ring_id =ring_id; + uniqueRings++; + } + rings[ring_id]->numOfSockets++; + sockaddr_in* pAddr = new sockaddr_in; + *pAddr = 
flow.addr; + rings[ring_id]->addr_vect.push_back(pAddr); +} + +static void destroyFlows(CommonCyclicRing* rings[]) +{ + for (int i=0; i < MAX_RINGS; i++) { + if (rings[i] != NULL) { + for (std::vector::iterator it = rings[i]->addr_vect.begin(); it!=rings[i]->addr_vect.end(); ++it) { + delete *it; + } + for (std::vector::iterator it = rings[i]->sock_vect.begin(); it!=rings[i]->sock_vect.end(); ++it) { + delete *it; + } + delete rings[i]; + } + } +} + +#define DEFAULT_PORT 2000 + +static CommonCyclicRing* pRings[MAX_RINGS]; +static void init_ring_helper(CommonCyclicRing* pRing); + +extern "C" +struct nm_desc *nm_open_vma(const char *nm_ifname, const struct nmreq *req, uint64_t flags, const struct nm_desc *arg) +{ + NOT_IN_USE(req); + NOT_IN_USE(flags); + NOT_IN_USE(arg); + + char *opts = NULL; + char *nm_ring = NULL; + char *nm_port = NULL; + char nm_mode = ' '; + char ifname[IFNAMSIZ]; + char nm_ring_val[10]; + u_int namelen; + struct nm_desc *d = NULL; + int nm_ring_len = 0; + int nm_ring_set = 0; + struct ifaddrs *ifaddr, *ifa; + char host[NI_MAXHOST]; + + d = new nm_desc(); + if (strncmp(nm_ifname, "netmap:", 7)) { + errno = 0; + printf("name not recognised\n"); + return NULL; + } + nm_ifname += 7; + opts = (char*)nm_ifname; + for (; *opts && !index("-", *opts) ; opts++); + namelen = opts - nm_ifname; + if (namelen >= sizeof(d->req.nr_name)) { + printf("name too long\n"); + return NULL; + } else { + memcpy(ifname, nm_ifname, namelen); + ifname[namelen] = '\0'; + memcpy(d->req.nr_name, nm_ifname, namelen); + d->req.nr_name[namelen] = '\0'; + } + + while(*opts) { + switch (*opts) { + case '-': + nm_ring = ++opts; + nm_ring_set = 1; + break; + case '/': + if (nm_ring_set--) nm_ring_len = opts - nm_ring; + nm_mode = *(opts +1); + break; + case ':': + if (nm_ring_set--) nm_ring_len = opts - nm_ring; + nm_port = ++opts; + break; + default: + break; + } + opts++; + } + + std::string nmring; + nmring.append(nm_ring, nm_ring_len); + + std::string nmport; + if (nm_port == 
NULL) { + std::ostringstream s; + s << DEFAULT_PORT; + nmport.append(s.str()); + printf("nm_mode=%c nm_port=%d\n", nm_mode, DEFAULT_PORT); + } else { + nmport.append(nm_port); + memcpy(nm_ring_val, nm_ring, nm_ring_len); + nm_ring_val[nm_ring_len] = '\0'; + printf("nm_ring_val=%s nm_mode=%c nm_port=%s\n", nm_ring_val, nm_mode, nm_port); + } + + if (getifaddrs(&ifaddr) == -1) { + printf("error getifaddrs\n"); + return NULL; + } + for (ifa = ifaddr; ifa != NULL; ifa = ifa->ifa_next) { + int ret; + if (ifa->ifa_addr == NULL) + continue; + ret = getnameinfo(ifa->ifa_addr, sizeof(struct sockaddr_in), host, NI_MAXHOST, NULL, 0, NI_NUMERICHOST); + if ((strcmp(ifa->ifa_name, ifname) == 0) && (ifa->ifa_addr->sa_family == AF_INET)) { + if (ret != 0) { + printf("error getnameinfo\n"); + return NULL; + } + break; + } + } + freeifaddrs(ifaddr); + + std::string opts_line; + opts_line.append(host); + opts_line = opts_line + " " + nmport + " " + nmring; + + bool ringPerFd = false; + for (int j = 0; j < MAX_RINGS; j++) { + pRings[j] = NULL; + } + + std::string ip; + std::string line; + int port; + int ring_id; + int sock_num = 0, socketRead = 0; + char HashColision[MAX_RINGS][MAX_SOCKETS_PER_RING] = {0}; + int uniqueRings; + int hash_colision_cnt = 0; + char *cnfif_file = new char[IFNAMSIZ+4]; + + snprintf(cnfif_file, IFNAMSIZ+4, "%s.txt", ifname); + + std::vector cnf_if; + + std::ifstream infile(cnfif_file); + if (infile) { + while (getline(infile, line)) { + if ((line[0] == '#') || ((line[0] == '/') && (line[1] == '/'))) { + continue; + } + cnf_if.push_back(line); + } + } else { + cnf_if.push_back(opts_line); + } + + while (!cnf_if.empty()) { + line = cnf_if.back(); + cnf_if.pop_back(); + std::istringstream iss(line); + struct flow_param flow; + if (iss >> ip >> port >> ring_id) { + socketRead++; + flow.addr.sin_family = AF_INET; + flow.addr.sin_port = htons(port); + flow.addr.sin_addr.s_addr = inet_addr(ip.c_str()); + flow.addr.sin_addr.s_addr = 
ntohl(ntohl(flow.addr.sin_addr.s_addr)); + printf("adding ip %s port %d,\n", ip.c_str(), port); + flow.hash = hashIpPort2(flow.addr); + printf("adding %s:%d hash val %d\n",ip.c_str(),port, flow.hash); + if (flow.addr.sin_addr.s_addr < 0x01000001) { + printf("Error - illegal IP %x\n", flow.addr.sin_addr.s_addr); + return NULL; + } + } else { + continue; + } + + printf("ring_id=%d\n",ring_id); + flow.ring_id = ring_id; + if (HashColision[ring_id][flow.hash] == 0) { + HashColision[ring_id][flow.hash] = 1; + // add the fd to the ring, if needed create a ring, update num of rings, and num of flows within this ring. + AddFlow(flow, pRings, uniqueRings); + } else { + hash_colision_cnt++; + printf("Hash socket colision found , socket%s:%d - dropped, total %d\n",ip.c_str(),port,hash_colision_cnt); + } + if (socketRead == sock_num) { + printf("read %d sockets from the file\n", socketRead); + break; + } + } + + d->req.nr_rx_rings = ring_id; + d->req.nr_ringid = ring_id; + + int prof = 0; + struct vma_api_t *vma_api = vma_get_api(); + vma_ring_type_attr ring; + ring.ring_type = VMA_RING_CYCLIC_BUFFER; + ring.ring_cyclicb.num = VMA_CYCLIC_BUFFER_SIZE; + ring.ring_cyclicb.stride_bytes = VMA_CYCLIC_BUFFER_USER_PKT_SIZE; + //ring.ring_cyclicb.comp_mask = VMA_RING_TYPE_MASK; + int res = vma_api->vma_add_ring_profile(&ring, &prof); + if (res) { + printf("failed adding ring profile"); + return NULL; + } + // for every ring, open sockets + for (int i=0; i< MAX_RINGS; i++) { + if (pRings[i] == NULL) { + continue; + } + std::vector::iterator it; + for (it = pRings[i]->addr_vect.begin(); + it!=pRings[i]->addr_vect.end(); ++it) { + struct ip_mreqn mc; + printf("Adding socket to ring %d\n",i); + RXSock* pSock = new RXSock; + pSock->fd = OpenRxSocket(pRings[i]->ring_id, *it, 0, ifname, &mc, prof, !ringPerFd); + if (pSock->fd <= 0) { + printf("Error OpenRxSocket failed. 
%d\n", i); + return NULL; + } + memcpy(&pSock->mc, &mc, sizeof(mc)); + pSock->statTime = time_get_usec() + 1000*i; + pSock->index = i; + pSock->bad_packets = 0; + pSock->sin_port = ntohs((*it)->sin_port); + unsigned short hash = hashIpPort2(**it); + //printf("hash value is %d\n",hash); + if (NULL != pRings[i]->hashedSock[hash]) { + printf ("Collision, reshuffle your ip addresses \n"); + return NULL; + } + pRings[i]->hashedSock[hash] = pSock; + inet_ntop(AF_INET, &((*it)->sin_addr), pSock->ipAddress, INET_ADDRSTRLEN); + pRings[i]->sock_vect.push_back(pSock); + pRings[i]->min_s = VMA_CYCLIC_BUFFER_MIN_PKTS; + pRings[i]->max_s = VMA_CYCLIC_BUFFER_MAX_PKTS; + pRings[i]->flags = MSG_DONTWAIT; + pRings[i]->vma_api = vma_get_api(); + pRings[i]->sleep_time = VMA_NM_SLEEP; + pRings[i]->is_readable = false; + init_ring_helper(pRings[i]); + } + } + return d; +} + +void init_ring_helper(CommonCyclicRing* pRing) +{ + int sock_len = pRing->numOfSockets; + for (int i = 0; i < sock_len; i++) { + int ring_fd_num = pRing->vma_api->get_socket_rings_num(pRing->sock_vect[i]->fd); + int* ring_fds = new int[ring_fd_num]; + pRing->vma_api->get_socket_rings_fds(pRing->sock_vect[i]->fd, ring_fds, ring_fd_num); + pRing->sock_vect[i]->ring_fd = *ring_fds; + pRing->ring_fd = *ring_fds; + delete[] ring_fds; + } +} + +static inline int cb_buffer_read(int ring) +{ + pRings[ring]->completion.packets = 0; + return pRings[ring]->vma_api->vma_cyclic_buffer_read(pRings[ring]->ring_fd, &pRings[ring]->completion, pRings[ring]->min_s, pRings[ring]->max_s, pRings[ring]->flags); +} + +static inline int cb_buffer_is_readable(int ring) +{ + for (int j = 0; j < 10; j++) { + pRings[ring]->completion.packets = 0; + if (pRings[ring]->vma_api->vma_cyclic_buffer_read(pRings[ring]->ring_fd, &pRings[ring]->completion, pRings[ring]->min_s, pRings[ring]->max_s, pRings[ring]->flags) < 0) { + return -1; + } + if (pRings[ring]->completion.packets) { + pRings[ring]->is_readable = true; + return 1; + } + } + 
//usleep(pRings[ring]->sleep_time); + return 0; +} + +// delay_ms(10) - 1ms +static void delay_ms(int ms) +{ + int start_time_ms, now_time_ms, time_diff; + struct timespec start; + struct timespec now; + + clock_gettime(CLOCK_REALTIME, &start); + start_time_ms = ((double)(start.tv_nsec)/1e9)*10000; // 0.1ms + + while(1) { + clock_gettime(CLOCK_REALTIME, &now); + now_time_ms = ((double)(now.tv_nsec)/1e9)*10000; + time_diff = now_time_ms - start_time_ms; + if (time_diff < 0) { + time_diff += 1000000000; + } + if (time_diff > ms) { + break; + } + usleep(0); + } +} + +extern "C" +int poll_nm_vma(struct nm_desc *d, int timeout) +{ + int ret = 0; + + int ring = d->req.nr_ringid; + pRings[ring]->is_readable = true; + + if (timeout == 0) { + return cb_buffer_is_readable(ring); + } + if (timeout > 0) { + while (timeout--) { + ret = cb_buffer_is_readable(ring); + if (ret) + return ret; + delay_ms(10); // daly 1ms + } + } else if (timeout < 0) { + while(!ret) { + ret = cb_buffer_is_readable(ring); + } + } + return ret; +} + +extern "C" +u_char *nm_nextpkt_vma(struct nm_desc *d, struct nm_pkthdr *hdr) +{ + int ring = d->req.nr_ringid; + uint8_t *data = NULL; + struct vma_completion_cb_t *completion = &pRings[ring]->completion; + + if (pRings[ring]->is_readable) { + pRings[ring]->is_readable = false; + hdr->len = hdr->caplen = completion->packets; + hdr->buf = data = ((uint8_t *)completion->payload_ptr); + + d->hdr.buf = data; + d->hdr.len = completion->packets; + d->hdr.caplen = 0; + return (u_char *)data; + } + + for (int j = 0; j < 10; j++) { + int res = cb_buffer_read(ring); + if (res == -1) { + printf("vma_cyclic_buffer_read returned -1"); + return NULL; + } + if (completion->packets == 0) { + continue; + } + hdr->len = hdr->caplen = completion->packets; + hdr->buf = data = ((uint8_t *)completion->payload_ptr); + d->hdr.buf = data; + d->hdr.len = completion->packets; + d->hdr.caplen = 0; + break; + } + return (u_char *)data; +} + +extern "C" +int nm_dispatch_vma(struct 
nm_desc *d, int cnt, nm_cb_t cb, u_char *arg)
{
    NOT_IN_USE(arg);
    int ring = d->req.nr_ringid;
    struct vma_completion_cb_t *completion = &pRings[ring]->completion;
    int c = 0, got = 0;
    d->hdr.buf = NULL;
    d->hdr.flags = NM_MORE_PKTS;
    d->hdr.d = d;

    if (cnt == 0)
        cnt = -1;
    /* cnt == -1 means infinite, but rings have a finite amount
     * of buffers and the int is large enough that we never wrap,
     * so we can omit checking for -1
     */
    for (c = 0; cnt != got; c++) {
        int res = cb_buffer_read(ring);
        if (res == -1) {
            printf("vma_cyclic_buffer_read returned -1");
            return 0;
        }
        if (completion->packets == 0) {
            /* nothing completed this round; keep polling */
            continue;
        }
        got++;
        /* NOTE(review): hdr.len/caplen are set to the completed *packet
         * count* and hdr.buf is overwritten each iteration, so only the last
         * buffer read survives the loop -- confirm the intended semantics. */
        d->hdr.len = d->hdr.caplen = completion->packets;
        d->hdr.buf = ((uint8_t *)completion->payload_ptr);
    }
    /* The callback is invoked at most once, with the last buffer read. */
    if (d->hdr.buf) {
        cb(arg, &d->hdr, d->hdr.buf);
    }
    return got;
}

/*
 * netmap nm_close() replacement: tear down all flows/rings and free the
 * descriptor. Returns 0 on success, EINVAL (positive) when d is NULL.
 * NOTE(review): destroyFlows() runs even when d == NULL -- confirm intended.
 */
extern "C"
int nm_close_vma(struct nm_desc *d)
{
    destroyFlows(pRings);
    if (d == NULL) {
        return EINVAL;
    }
    delete d;
    return 0;
}
#endif

diff --git a/tools/media/netmap-vma/netmap_vma.h b/tools/media/netmap-vma/netmap_vma.h
new file mode 100644
index 0000000..23e594d
--- /dev/null
+++ b/tools/media/netmap-vma/netmap_vma.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2001-2020 Mellanox Technologies, Ltd. All rights reserved.
+ *
+ * This software is available to you under a choice of one of two
+ * licenses. You may choose to be licensed under the terms of the GNU
+ * General Public License (GPL) Version 2, available from the file
+ * COPYING in the main directory of this source tree, or the
+ * BSD license below:
+ *
+ * Redistribution and use in source and binary forms, with or
+ * without modification, are permitted provided that the following
+ * conditions are met:
+ *
+ * - Redistributions of source code must retain the above
+ * copyright notice, this list of conditions and the following
+ * disclaimer.
+ * + * - Redistributions in binary form must reproduce the above + * copyright notice, this list of conditions and the following + * disclaimer in the documentation and/or other materials + * provided with the distribution. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, + * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF + * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND + * NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS + * BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN + * ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN + * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef NETMAP_VMA_H +#define NETMAP_VMA_H + +#ifdef NETMAP_VMA +#define STRIDE_SIZE 2048 +#define nm_open nm_open_vma +#define nm_close nm_close_vma +#define nm_nextpkt nm_nextpkt_vma +#define nm_dispatch nm_dispatch_vma +struct nm_desc *nm_open_vma(const char *ifname, const struct nmreq *req, + uint64_t flags, const struct nm_desc *arg); +int nm_close_vma(struct nm_desc *); +u_char* nm_nextpkt_vma(struct nm_desc *d, struct nm_pkthdr *hdr); +int poll_nm_vma(struct nm_desc *d, int timeout); +int nm_dispatch_vma(struct nm_desc *d, int cnt, nm_cb_t cb, u_char *arg); +#endif + +#endif /* NETMAP_VMA_H */